In [1]:
!pip install sentence-transformers faiss-cpu pandas openpyxl PyPDF2 numpy

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>

In [2]:
import os
import pickle
from pathlib import Path
from typing import List, Dict, Any
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from PyPDF2 import PdfReader
import warnings
import logging

2025-10-13 04:51:43.553257: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760331103.745937      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760331103.796911      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
warnings.filterwarnings('ignore')
logging.getLogger('sentence_transformers').setLevel(logging.ERROR)
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [4]:
class AgricultureVectorDB:
    def __init__(self, base_folder: str, model_name: str = "all-mpnet-base-v2"):
        """
        Initialize the vector database builder
        
        Args:
            base_folder: Path to agriculture folder containing csvs/, pdfs/, excel/
            model_name: Sentence transformer model (all-mpnet-base-v2 or all-MiniLM-L6-v2)
        """
        self.base_folder = Path(base_folder)
        
        print(f"Loading embedding model: {model_name}...")
        # Load model with trust_remote_code to avoid warnings
        self.model = SentenceTransformer(model_name, trust_remote_code=True)
        self.embedding_dim = self.model.get_sentence_embedding_dimension()
        self.documents = []  # Store text chunks
        self.metadata = []   # Store metadata for each chunk
        self.index = None
        
        print(f"✓ Loaded embedding model: {model_name}")
        print(f"✓ Embedding dimension: {self.embedding_dim}")
    
    def chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
        """
        Split text into overlapping chunks
        
        Args:
            text: Input text
            chunk_size: Target chunk size in characters
            overlap: Overlap between chunks
        """
        if len(text) <= chunk_size:
            return [text]
        
        chunks = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            chunk = text[start:end]
            
            # Try to break at sentence boundary
            if end < len(text):
                last_period = chunk.rfind('.')
                last_newline = chunk.rfind('\n')
                break_point = max(last_period, last_newline)
                if break_point > chunk_size * 0.5:  # At least 50% of chunk
                    chunk = chunk[:break_point + 1]
                    end = start + break_point + 1
            
            chunks.append(chunk.strip())
            start = end - overlap
        
        return chunks
    
    def process_pdf(self, pdf_path: Path) -> None:
        """Extract and chunk text from PDF files (textbooks)"""
        try:
            reader = PdfReader(str(pdf_path))
            print(f"  Processing PDF: {pdf_path.name} ({len(reader.pages)} pages)")
            
            for page_num, page in enumerate(reader.pages, 1):
                text = page.extract_text()
                if text.strip():
                    chunks = self.chunk_text(text)
                    for chunk_idx, chunk in enumerate(chunks):
                        self.documents.append(chunk)
                        self.metadata.append({
                            'source': str(pdf_path.name),
                            'type': 'pdf',
                            'page': page_num,
                            'chunk': chunk_idx,
                            'path': str(pdf_path)
                        })
        except Exception as e:
            print(f"  ✗ Error processing {pdf_path.name}: {str(e)}")
    
    def process_csv(self, csv_path: Path) -> None:
        """Process CSV with numerical data and headers"""
        try:
            df = pd.read_csv(csv_path)
            print(f"  Processing CSV: {csv_path.name} ({len(df)} rows, {len(df.columns)} columns)")
            
            # Create text representation of each row
            for idx, row in df.iterrows():
                # Combine column names with values for context
                row_text = f"Data from {csv_path.stem}:\n"
                for col in df.columns:
                    value = row[col]
                    if pd.notna(value):
                        row_text += f"{col}: {value}\n"
                
                # Chunk if row text is too long
                chunks = self.chunk_text(row_text, chunk_size=800)
                for chunk_idx, chunk in enumerate(chunks):
                    self.documents.append(chunk)
                    self.metadata.append({
                        'source': str(csv_path.name),
                        'type': 'csv',
                        'row': idx,
                        'chunk': chunk_idx,
                        'path': str(csv_path)
                    })
        except Exception as e:
            print(f"  ✗ Error processing {csv_path.name}: {str(e)}")
    
    def process_excel(self, excel_path: Path) -> None:
        """Process Excel files with numerical data"""
        try:
            # Read all sheets
            excel_file = pd.ExcelFile(excel_path)
            print(f"  Processing Excel: {excel_path.name} ({len(excel_file.sheet_names)} sheets)")
            
            for sheet_name in excel_file.sheet_names:
                df = pd.read_excel(excel_path, sheet_name=sheet_name)
                
                for idx, row in df.iterrows():
                    row_text = f"Data from {excel_path.stem} - Sheet: {sheet_name}:\n"
                    for col in df.columns:
                        value = row[col]
                        if pd.notna(value):
                            row_text += f"{col}: {value}\n"
                    
                    chunks = self.chunk_text(row_text, chunk_size=800)
                    for chunk_idx, chunk in enumerate(chunks):
                        self.documents.append(chunk)
                        self.metadata.append({
                            'source': str(excel_path.name),
                            'type': 'excel',
                            'sheet': sheet_name,
                            'row': idx,
                            'chunk': chunk_idx,
                            'path': str(excel_path)
                        })
        except Exception as e:
            print(f"  ✗ Error processing {excel_path.name}: {str(e)}")
    
    def load_all_documents(self) -> None:
        """Load and process all documents from the folder structure"""
        print("\n" + "="*60)
        print("LOADING DOCUMENTS")
        print("="*60)
        
        # Process PDFs
        pdf_folder = self.base_folder / "pdfs"
        if pdf_folder.exists():
            print("\n📄 Processing PDFs...")
            for pdf_file in pdf_folder.glob("*.pdf"):
                self.process_pdf(pdf_file)
        
        # Process CSVs
        csv_folder = self.base_folder / "csvs"
        if csv_folder.exists():
            print("\n📊 Processing CSVs...")
            for csv_file in csv_folder.glob("*.csv"):
                self.process_csv(csv_file)
        
        # Process Excel files
        excel_folder = self.base_folder / "excel"
        if excel_folder.exists():
            print("\n📈 Processing Excel files...")
            for excel_file in excel_folder.glob("*.xlsx"):
                self.process_excel(excel_file)
            for excel_file in excel_folder.glob("*.xls"):
                self.process_excel(excel_file)
        
        print(f"\n✓ Total documents loaded: {len(self.documents)}")
        print(f"✓ Total chunks created: {len(self.documents)}")
    
    def build_index(self, batch_size: int = 32) -> None:
        """Build FAISS index from documents"""
        if not self.documents:
            raise ValueError("No documents loaded. Call load_all_documents() first.")
        
        print("\n" + "="*60)
        print("BUILDING FAISS INDEX")
        print("="*60)
        
        # Generate embeddings in batches
        print(f"\nGenerating embeddings for {len(self.documents)} chunks...")
        embeddings = []
        
        for i in range(0, len(self.documents), batch_size):
            batch = self.documents[i:i + batch_size]
            batch_embeddings = self.model.encode(batch, show_progress_bar=False)
            embeddings.append(batch_embeddings)
            if (i // batch_size + 1) % 10 == 0:
                print(f"  Processed {i + len(batch)}/{len(self.documents)} chunks...")
        
        embeddings = np.vstack(embeddings).astype('float32')
        print(f"✓ Embeddings shape: {embeddings.shape}")
        
        # Create FAISS index
        print("\nCreating FAISS index...")
        self.index = faiss.IndexFlatL2(self.embedding_dim)
        self.index.add(embeddings)
        print(f"✓ FAISS index created with {self.index.ntotal} vectors")
    
    def save(self, output_dir: str = "faiss_index") -> None:
        """Save FAISS index and metadata"""
        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True)
        
        print("\n" + "="*60)
        print("SAVING INDEX")
        print("="*60)
        
        # Save FAISS index
        index_path = output_path / "faiss_index.bin"
        faiss.write_index(self.index, str(index_path))
        print(f"✓ FAISS index saved to: {index_path}")
        
        # Save documents and metadata
        data_path = output_path / "documents_metadata.pkl"
        with open(data_path, 'wb') as f:
            pickle.dump({
                'documents': self.documents,
                'metadata': self.metadata,
                'model_name': self.model.get_sentence_embedding_dimension()
            }, f)
        print(f"✓ Documents and metadata saved to: {data_path}")
        
        # Save summary
        summary_path = output_path / "index_summary.txt"
        with open(summary_path, 'w') as f:
            f.write("FAISS Vector Database Summary\n")
            f.write("="*50 + "\n\n")
            f.write(f"Total chunks: {len(self.documents)}\n")
            f.write(f"Embedding dimension: {self.embedding_dim}\n")
            f.write(f"Index size: {self.index.ntotal}\n\n")
            
            # Count by file type
            pdf_count = sum(1 for m in self.metadata if m['type'] == 'pdf')
            csv_count = sum(1 for m in self.metadata if m['type'] == 'csv')
            excel_count = sum(1 for m in self.metadata if m['type'] == 'excel')
            
            f.write(f"PDF chunks: {pdf_count}\n")
            f.write(f"CSV chunks: {csv_count}\n")
            f.write(f"Excel chunks: {excel_count}\n")
        
        print(f"✓ Summary saved to: {summary_path}")
    
    def load(self, index_dir: str = "faiss_index") -> None:
        """Load existing FAISS index"""
        index_path = Path(index_dir)
        
        # Load FAISS index
        self.index = faiss.read_index(str(index_path / "faiss_index.bin"))
        
        # Load documents and metadata
        with open(index_path / "documents_metadata.pkl", 'rb') as f:
            data = pickle.load(f)
            self.documents = data['documents']
            self.metadata = data['metadata']
        
        print(f"✓ Loaded index with {self.index.ntotal} vectors")
    
    def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """
        Search the vector database
        
        Args:
            query: Search query
            k: Number of results to return
        
        Returns:
            List of dictionaries containing document text, metadata, and similarity score
        """
        if self.index is None:
            raise ValueError("Index not built. Call build_index() or load() first.")
        
        # Generate query embedding
        query_embedding = self.model.encode([query]).astype('float32')
        
        # Search
        distances, indices = self.index.search(query_embedding, k)
        
        results = []
        for dist, idx in zip(distances[0], indices[0]):
            results.append({
                'text': self.documents[idx],
                'metadata': self.metadata[idx],
                'similarity_score': float(1 / (1 + dist))  # Convert distance to similarity
            })
        
        return results

In [5]:
if __name__ == "__main__":
    # Initialize
    db = AgricultureVectorDB(
        base_folder="/kaggle/input/agriculture/New folder",
        model_name="all-mpnet-base-v2"  # or "all-MiniLM-L6-v2" for faster processing
    )
    
    # Load all documents
    db.load_all_documents()
    
    # Build FAISS index
    db.build_index()
    
    # Save index
    db.save(output_dir="faiss_index")
    
    print("\n" + "="*60)
    print("✓ VECTOR DATABASE BUILD COMPLETE!")
    print("="*60)
    
    # Test search
    print("\n📝 Testing search functionality...")
    results = db.search("crop cultivation techniques", k=3)
    
    print("\nTop 3 results:")
    for i, result in enumerate(results, 1):
        print(f"\n{i}. Source: {result['metadata']['source']}")
        print(f"   Type: {result['metadata']['type']}")
        print(f"   Score: {result['similarity_score']:.4f}")
        print(f"   Text preview: {result['text'][:200]}...")

Loading embedding model: all-mpnet-base-v2...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Loaded embedding model: all-mpnet-base-v2
✓ Embedding dimension: 768

LOADING DOCUMENTS

📄 Processing PDFs...
  Processing PDF: agronomy_textbook.pdf (856 pages)
  Processing PDF: indian_agriculture_after_independence.pdf (447 pages)
  Processing PDF: basic_agriculture_cbse.pdf (208 pages)
  Processing PDF: ncert_agriculture_textbook.pdf (10 pages)

📊 Processing CSVs...
  Processing CSV: upag_crop_data.csv (104 rows, 12 columns)
  Processing CSV: Percentage_Participation_Report_2022_23_0.csv (70 rows, 7 columns)
  Processing CSV: des_crop_data.csv (1459 rows, 105 columns)
  Processing CSV: Animal Dataset.csv (205 rows, 16 columns)
  Processing CSV: data_core.csv (8000 rows, 9 columns)
  Processing CSV: all_agriculture_india.csv (2238 rows, 9 columns)
  Processing CSV: RS_Session_267_AU_2998_3.csv (33 rows, 8 columns)
  Processing CSV: indian_agriculture_dataset.csv (16146 rows, 80 columns)

📈 Processing Excel files...
  Processing Excel: EXPORT_OF_HORTICULTURE_PRODUCE_IN_INDIA.xls (1