## 1. Data Ingestion

#### 1.1 Data import for the year 2024

In [51]:
import pandas as pd
import re
import uuid

In [52]:
# Load the Year 2024 dataset from a local CSV file.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where the file exists at exactly this location.
df = pd.read_csv(r"D:\Data Science Projects\datasets\Year_2024_dataset.csv")
df.head(2)

Unnamed: 0,BlockName,Category,Year,Month,Day,Crop,DistrictName,QueryType,Season,Sector,StateName,QueryText,KccAns,latitude,longitude
0,PONDURU,Pulses,2024,1,1,Green Gram Moong Bean Moong,SRIKAKULAM,Nutrient Management,,AGRICULTURE,ANDHRA PRADESH,FARMER ASKED QUERY ON NUTRIENT MANAGEMENT IN g...,RECOMMENDED TO SPRAY BORAX 3 GRAMS 1 LITR OF ...,18.2949,83.8939
1,GARA,Millets,2024,1,1,Maize Makka,SRIKAKULAM,Plant Protection,,AGRICULTURE,ANDHRA PRADESH,FARMER ASKED QUERY ON USAGE OF NEEM OIL IN MAIZE,RECOMMENDED TO SPRAY AZADIRHACHTIN NEEM OIL 1...,18.2949,83.8939


#### 1.2 Clean, normalize, and split into logical “document” chunks or Q&A pairs.

####  EDA

In [53]:
print("🔹 Dataset shape:", df.shape)

🔹 Dataset shape: (3234061, 15)


In [54]:
print("\n🔹 Null values per column:")
print(df.isnull().sum())


🔹 Null values per column:
BlockName            44
Category              0
Year                  0
Month                 0
Day                   0
Crop                  0
DistrictName          0
QueryType             0
Season          3234061
Sector                0
StateName             0
QueryText            21
KccAns              750
latitude              0
longitude             0
dtype: int64


In [55]:
# Report how many distinct (non-null) values each column holds
print("\n🔹 Unique value counts:")
for column_name, distinct_count in df.nunique().items():
    print(f"{column_name}: {distinct_count}")


🔹 Unique value counts:
BlockName: 6126
Category: 22
Year: 1
Month: 12
Day: 31
Crop: 297
DistrictName: 668
QueryType: 65
Season: 0
Sector: 4
StateName: 32
QueryText: 698092
KccAns: 1292215
latitude: 649
longitude: 647


In [56]:
 ## Distribution of some key categorical columns
print("\n🔹 Top 10 States:")
print(df['StateName'].value_counts().head(10))


🔹 Top 10 States:
StateName
UTTAR PRADESH     536021
RAJASTHAN         431179
MADHYA PRADESH    345544
HARYANA           236464
MAHARASHTRA       220050
BIHAR             195627
GUJARAT           183659
WEST BENGAL       166475
TAMILNADU         157176
PUNJAB            147633
Name: count, dtype: int64


In [57]:
# Top 10 crops by number of records
print(df['Crop'].value_counts().head(10))

Crop
Others                          1300332
Paddy Dhan                       408910
Wheat                            224229
Cotton Kapas                      91734
Groundnut pea nutmung phalli      80240
Soybean bhat                      74237
Maize Makka                       66959
Green Gram Moong Bean Moong       59394
Mustard                           51492
Potato                            47092
Name: count, dtype: int64


#### Table cleaning 

In [58]:
# Drop the 'Season' column: it is null for every row (see the null counts above).
# errors='ignore' keeps this cell idempotent, so re-running it after the column
# is already gone does not raise a KeyError.
df.drop(columns=['Season'], inplace=True, errors='ignore')

print("✅ Dropped 'Season' column.")

✅ Dropped 'Season' column.


In [59]:
print("🔹 Dataset shape:", df.shape)

🔹 Dataset shape: (3234061, 14)


In [60]:
print("🧪 Any NaNs?", df.isnull().values.any())


🧪 Any NaNs? True


In [61]:
# Drop rows that contain any NaN values
df.dropna(inplace=True)

print(f"✅ Dropped rows with NaN values. New shape: {df.shape}")


✅ Dropped rows with NaN values. New shape: (3233249, 14)


In [62]:
print("🧪 Any NaNs left?", df.isnull().values.any())


🧪 Any NaNs left? False


In [63]:
df.head()

Unnamed: 0,BlockName,Category,Year,Month,Day,Crop,DistrictName,QueryType,Sector,StateName,QueryText,KccAns,latitude,longitude
0,PONDURU,Pulses,2024,1,1,Green Gram Moong Bean Moong,SRIKAKULAM,Nutrient Management,AGRICULTURE,ANDHRA PRADESH,FARMER ASKED QUERY ON NUTRIENT MANAGEMENT IN g...,RECOMMENDED TO SPRAY BORAX 3 GRAMS 1 LITR OF ...,18.2949,83.8939
1,GARA,Millets,2024,1,1,Maize Makka,SRIKAKULAM,Plant Protection,AGRICULTURE,ANDHRA PRADESH,FARMER ASKED QUERY ON USAGE OF NEEM OIL IN MAIZE,RECOMMENDED TO SPRAY AZADIRHACHTIN NEEM OIL 1...,18.2949,83.8939
2,PONDURU,Pulses,2024,1,1,Green Gram Moong Bean Moong,SRIKAKULAM,Plant Protection,AGRICULTURE,ANDHRA PRADESH,GREEN GRAM LEAF EATING CATERPILLAR MANAGEMENT,500 200,18.2949,83.8939
3,MANDASA,Vegetables,2024,1,1,Tomato,SRIKAKULAM,Fertilizer Use and Availability,HORTICULTURE,ANDHRA PRADESH,FARMER ASKED QUERY ON FERTILIZER MANAGEMENT IN...,RECOMMENDED TO FERTILISERS: UREA 30KGDAP- 50 K...,18.2949,83.8939
4,MANDASA,Vegetables,2024,1,1,Tomato,SRIKAKULAM,Cultural Practices,HORTICULTURE,ANDHRA PRADESH,FARMER ASKED QUERY ON WEED MANAGEMENT IN TOM...,RECOMMENDED TO SPRAY ATRAZINE ATRATOPSOLARO 1 ...,18.2949,83.8939


In [64]:
# Normalize text fields (lowercase, strip, basic cleaning)
def clean_text(text):
    """Normalize a free-text field.

    Lowercases the text, removes every character other than ASCII letters,
    digits, whitespace, '.' and ',', then collapses runs of whitespace into a
    single space and trims both ends.

    Args:
        text: The raw value. Missing values (None/NaN) are accepted, and
            non-string values are converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, or "" when the input is missing.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s.,]', '', text)  # basic character filtering
    # Collapse whitespace AFTER filtering: deleting a symbol that sits between
    # two spaces (e.g. "a - b") would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean the free-text question and answer columns
for text_col in ('QueryText', 'KccAns'):
    df[text_col] = df[text_col].map(clean_text)


# Normalize the categorical fields: cast to str, lowercase, trim whitespace
categorical_cols = ['Crop', 'DistrictName', 'Category', 'QueryType', 'Sector', 'StateName']
for cat_col in categorical_cols:
    as_text = df[cat_col].astype(str)
    df[cat_col] = as_text.str.lower().str.strip()

In [65]:
# Columns copied into each Q&A document's "metadata" dict (read by make_qa_doc)
metadata_cols = ['StateName', 'DistrictName', 'Crop', 'Category', 'QueryType', 'Sector', 'Year', 'Month', 'Day']

In [66]:
# Function to create a single document
def make_qa_doc(row):
    """Build one Q&A document from a dataframe row.

    The document carries a freshly generated random UUID as ``doc_id``, the
    row's ``QueryText`` and ``KccAns`` values as ``query`` and ``answer``, and
    a ``metadata`` dict with the row's value for every column listed in the
    module-level ``metadata_cols``.
    """
    qa_doc = {"doc_id": str(uuid.uuid4())}  # unique ID
    qa_doc["query"] = row['QueryText']
    qa_doc["answer"] = row['KccAns']

    row_metadata = {}
    for field in metadata_cols:
        row_metadata[field] = row[field]
    qa_doc["metadata"] = row_metadata

    return qa_doc
    
# Apply transformation
qa_docs = df.apply(make_qa_doc, axis=1).tolist()

In [67]:
qa_df = pd.DataFrame(qa_docs)
qa_df.head()

Unnamed: 0,doc_id,query,answer,metadata
0,0582d804-19ae-41be-94f6-a0ab80188561,farmer asked query on nutrient management in g...,recommended to spray borax 3 grams 1 litr of w...,"{'StateName': 'andhra pradesh', 'DistrictName'..."
1,16aff8f2-94a1-4db5-8538-6e244b2a3ec4,farmer asked query on usage of neem oil in maize,recommended to spray azadirhachtin neem oil 1 ...,"{'StateName': 'andhra pradesh', 'DistrictName'..."
2,9d7cac83-0f0c-405a-b8ef-4bfdb2d278fb,green gram leaf eating caterpillar management,500 200,"{'StateName': 'andhra pradesh', 'DistrictName'..."
3,4e73d41d-31c8-475a-a87b-4d9afe064756,farmer asked query on fertilizer management in...,recommended to fertilisers urea 30kgdap 50 kg ...,"{'StateName': 'andhra pradesh', 'DistrictName'..."
4,b332b768-87f0-47c9-89b5-77c84e70841f,farmer asked query on weed management in tomato,recommended to spray atrazine atratopsolaro 1 ...,"{'StateName': 'andhra pradesh', 'DistrictName'..."


In [68]:
# Save preprocessed Q&A pairs to file
qa_df.to_json("kcc_qa_clean.json", orient="records", lines=True)
print("✅ Q&A chunks saved to 'kcc_qa_clean.json'")

✅ Q&A chunks saved to 'kcc_qa_clean.json'


#### 1.4 Export both raw and preprocessed formats, preserving metadata fields.

In [69]:
# Save the cleaned version of the original dataset (no Q&A restructuring)
df.to_csv("kcc_cleaned_raw.csv", index=False)
print("✅ Raw cleaned data saved as 'kcc_cleaned_raw.csv'")


✅ Raw cleaned data saved as 'kcc_cleaned_raw.csv'


In [70]:
# Also save as CSV for readability
qa_df.to_csv("kcc_qa_clean.csv", index=False)
print("✅ Preprocessed Q&A data also saved as 'kcc_qa_clean.csv'")


✅ Preprocessed Q&A data also saved as 'kcc_qa_clean.csv'


## 2. Local LLM Deployment

#### 2.1 Use an open-source model via the Ollama API (e.g., Gemma 3, Deepseek).

In [71]:
import requests

In [72]:
import requests
import json

# Set up the base URL for the local Ollama API
url = "http://localhost:11434/api/chat"

# Define the payload (your input prompt)
payload = {
    "model": "gemma3:1b",  # Replace with the model name you're using
    "messages": [{"role": "user", "content": "what is the capital of Andhra Pradesh?"}]
}

# Send the HTTP POST request with streaming enabled.
# - The `with` block releases the streamed connection even if iteration stops
#   early or raises.
# - timeout=(connect, read) in seconds stops the cell from hanging forever when
#   the Ollama server is unreachable or stalls mid-stream.
with requests.post(url, json=payload, stream=True, timeout=(5, 300)) as response:
    # Check the response status
    if response.status_code == 200:
        print("Streaming response from Ollama:")
        for line in response.iter_lines(decode_unicode=True):
            if line:  # Ignore empty lines
                try:
                    # Parse each line as a JSON object
                    json_data = json.loads(line)
                    # Extract and print the assistant's message content;
                    # flush so each piece shows up as soon as it arrives.
                    if "message" in json_data and "content" in json_data["message"]:
                        print(json_data["message"]["content"], end="", flush=True)
                except json.JSONDecodeError:
                    print(f"\nFailed to parse line: {line}")
        print()  # Ensure the final output ends with a newline
    else:
        print(f"Error: {response.status_code}")
        print(response.text)

Streaming response from Ollama:
The capital of Andhra Pradesh is **Amaravati**.

While the state has had several capitals throughout its history, Amaravati is generally considered the current and most recognized capital.



#### Trying out the Ollama Python package

In [73]:
import ollama

In [74]:
client = ollama.Client()
model = "gemma3:1b"

In [75]:
response = client.generate(model, prompt = "what are the two crop seasons in Andhra Pradesh?")

In [76]:
response.response

"The two major crop seasons in Andhra Pradesh are:\n\n1.  **Puli & Panna (Rice & Sugarcane):** This is the most significant crop season in Andhra Pradesh. It’s a massive undertaking involving rice cultivation (Puli) and sugarcane harvesting (Panna). The timing of this season varies slightly each year, typically spanning from **October to March**. It's a crucial period for the state's agricultural economy.\n\n2.  **Harirama (Cotton):** This is the second major crop season in Andhra Pradesh, predominantly focused on cotton cultivation. The season typically runs from **March to September**. \n\n**Important Note:** While these are the two most prominent, other crops like lentils, pulses, and vegetables also contribute significantly to the agricultural calendar.\n\nDo you want to know more about either of these seasons – perhaps about specific crops grown during them or the economic significance of each?"

## 3. Retrieval-Augmented Generation (RAG)

#### 3.1 Generate Embeddings from Document Chunks

In [77]:
# !pip install sentence-transformers

In [80]:
from sentence_transformers import SentenceTransformer
import numpy as np
import tqdm as notebook_tqdm

In [86]:
# Draw a reproducible 100-row sample and add a combined "query answer" text chunk
sample_df = (
    qa_df
    .sample(n=100, random_state=42)
    .reset_index(drop=True)
    .assign(text=lambda frame: frame['query'] + " " + frame['answer'])
)


In [88]:
## Generate embeddings

# Load a small, fast embedding model.
# NOTE: this rebinds `model`, which earlier held the Ollama model name string.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate dense vector embeddings, one per sampled text chunk
embeddings = model.encode(sample_df['text'].tolist(), show_progress_bar=True)

# Convert to NumPy for compatibility with vector DBs
# NOTE(review): these vectors are not passed to `collection.add` further down,
# which is only given documents, metadatas and ids.
embeddings = np.array(embeddings)

# Confirm shape
print("✅ Embeddings shape:", embeddings.shape)


Batches: 100%|██████████| 4/4 [00:01<00:00,  3.64it/s]

✅ Embeddings shape: (100, 384)





#### 3.2 Store embeddings in a lightweight vector database (ChromaDB, FAISS, or MongoDB).

In [82]:
# !pip install chromadb

In [91]:
import chromadb
from chromadb.config import Settings
import chromadb.utils.embedding_functions as embedding_functions

In [93]:
# Use a persistent on-disk client rooted at ./chroma_store.
# (The in-memory `chromadb.Client()` that used to be created first was
# overwritten immediately and never used.)
chroma_client = chromadb.PersistentClient(path="./chroma_store")


In [94]:
# Optional: remove existing collection (if rerunning)
try:
    chroma_client.delete_collection("kcc_embeddings")
except Exception:
    # Best-effort cleanup: presumably the collection simply does not exist yet
    # on a first run. Catching `Exception` instead of using a bare `except:`
    # lets KeyboardInterrupt and SystemExit propagate.
    pass

In [95]:
# Create embedding function wrapper
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Create new collection
collection = chroma_client.create_collection(
    name="kcc_embeddings",
    embedding_function=embedding_fn
)

In [97]:
# Prepare documents, metadatas, and IDs
documents = sample_df['text'].tolist()
metadatas = sample_df['metadata'].tolist()  # one metadata dict per document
ids = sample_df['doc_id'].tolist()

# Add to Chroma
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

# Report the actual number of stored documents instead of a hard-coded 100,
# so the message stays correct if the sample size changes.
print(f"✅ Stored {len(documents)} embedded documents in ChromaDB.")

✅ Stored 100 embedded documents in ChromaDB.


In [None]:
## Output: Sample Query
# Retrieve the three nearest stored chunks for a sample question
sample_question = "how to control fruit borer in brinjal"
results = collection.query(query_texts=[sample_question], n_results=3)

print("🔎 Top results:")
top_documents = results['documents'][0]
for hit_text in top_documents:
    print("→", hit_text)

🔎 Top results:
→ information about shoot and fruit borer management in brinjal crop  25ec400 ml 200
→ farmer want to know information about how to control stem borercaterpillar in brinjal crop 525 09 sc 30
→ asked about management of gram pod borer in red gram suggested to spray emamectin benzoate 1gliter and 2000g200liter of water 1 2000 200


#### 3.3: Semantic Search using ChromaDB

Encode the incoming query using the same SentenceTransformer model.

Use ChromaDB’s query() method to retrieve the most relevant documents.

Return both the matched text and the associated metadata.

In [100]:
# Reload the embedding model (same model name the collection was created with)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Question to look up
user_query = "how to control leaf spot in tomato crop"

# Embed the question as a batch of one
query_embedding = model.encode([user_query])

# Retrieve the top-k nearest chunks together with their metadata
results = collection.query(
    query_embeddings=query_embedding,
    n_results=5,  # top-k
    include=['documents', 'metadatas'],
)

# Show results
matched_docs = results['documents'][0]
matched_metas = results['metadatas'][0]
for position in range(min(len(matched_docs), len(matched_metas))):
    print(f"\n🔹 Match {position + 1}")
    print(f"Text: {matched_docs[position]}")
    print("Metadata:")
    for field, value in matched_metas[position].items():
        print(f"  {field}: {value}")


🔹 Match 1
Text: farmer wants information regarding control of bacterial leaf blight disease in wheat crop 25 200 15 150 200
Metadata:
  Day: 21
  Month: 2
  StateName: chhattisgarh
  DistrictName: durg
  Crop: wheat
  Category: cereals
  Year: 2024
  QueryType: plant protection
  Sector: agriculture

🔹 Match 2
Text: information about control of late blight in potato crop 75 wp 500 200
Metadata:
  Category: vegetables
  DistrictName: barabanki
  Sector: horticulture
  Crop: potato
  StateName: uttar pradesh
  QueryType: plant protection
  Month: 12
  Day: 16
  Year: 2024

🔹 Match 3
Text: information regarding the control of leaf folder in paddybasmati 80 5 60 185 20 480 50 20 wg 170 75 1 20 100
Metadata:
  DistrictName: sangrur
  QueryType: plant protection
  Day: 26
  Crop: paddy dhan
  Sector: agriculture
  Month: 8
  StateName: punjab
  Category: cereals
  Year: 2024

🔹 Match 4
Text: information regarding to control yellowing and enhance good growth in wheat crop 191919 1 600 150200

#### 3.4 If no context meets a relevance threshold, invoke a live Internet search and clearly notify the user.

In [None]:
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
import requests

def semantic_search_with_fallback(query, model, collection, threshold=0.75, top_k=5):
    """Search the local vector store; fall back to a web search if nothing is relevant.

    The query is embedded with ``model``, the ``top_k`` nearest documents are
    fetched from ``collection``, and each is scored by cosine similarity against
    the query embedding. Documents scoring at least ``threshold`` are printed
    together with their metadata. If none qualify, the user is told so and a
    live Google search is run through SerpAPI.

    Args:
        query: Natural-language question to search for.
        model: Object exposing ``encode(list_of_texts)`` that returns one
            embedding per input text.
        collection: Object exposing a ``query(query_embeddings=..., n_results=...,
            include=...)`` method whose result has ``documents``, ``metadatas``
            and ``embeddings`` entries (one inner list per query).
        threshold: Minimum cosine similarity for a local hit to count as relevant.
        top_k: Number of nearest documents to retrieve.

    Returns:
        None. All results are printed.

    Notes:
        The SerpAPI key is read from the ``SERPAPI_API_KEY`` environment
        variable; it is never hard-coded in the notebook.
    """
    # Step 1: Embed the query
    query_embedding = model.encode([query])[0]

    # Step 2: Retrieve top-k results
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=['documents', 'metadatas', 'embeddings']
    )

    docs = results['documents'][0]
    metas = results['metadatas'][0]
    doc_embeddings = results['embeddings'][0]

    # Step 3: Compute cosine similarity (scipy's `cosine` is a distance, so invert it)
    similarities = [1 - cosine(query_embedding, emb) for emb in doc_embeddings]

    # Step 4: Filter based on threshold
    relevant_results = [
        (doc, meta, sim)
        for doc, meta, sim in zip(docs, metas, similarities)
        if sim >= threshold
    ]

    if relevant_results:
        print(f"\n✅ Found {len(relevant_results)} relevant results (threshold ≥ {threshold})")
        for i, (doc, meta, sim) in enumerate(relevant_results, 1):
            print(f"\n🔹 Match {i} (Similarity: {sim:.3f})")
            print(f"Text: {doc}")
            print("Metadata:")
            for k, v in meta.items():
                print(f"  {k}: {v}")
    else:
        print("\n⚠️ No sufficiently relevant local results.")
        print("🌐 Performing a live internet search...")
        # Step 5: Perform live internet search using SerpAPI
        _internet_search_fallback(query)


def _internet_search_fallback(query, max_results=5, timeout=15):
    """Run a Google search through SerpAPI and print the top organic results."""
    import os  # local import so this block does not depend on the import cell

    serpapi_api_key = os.environ.get('SERPAPI_API_KEY')
    if not serpapi_api_key:
        # Without a key the request cannot be authenticated, so skip the call.
        print("Error fetching search results: SERPAPI_API_KEY environment variable is not set.")
        return

    params = {
        'engine': 'google',
        'q': query,
        'api_key': serpapi_api_key,
    }
    try:
        response = requests.get('https://serpapi.com/search', params=params, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Print only the exception type: the full message can embed the request
        # URL, which contains the API key.
        print(f"Error fetching search results: {type(exc).__name__}")
        return

    if response.status_code == 200:
        search_results = response.json()
        organic_results = search_results.get('organic_results', [])
        if organic_results:
            print("\n🔎 Top Internet Search Results:")
            for i, result in enumerate(organic_results[:max_results], 1):
                print(f"\n🔹 Result {i}")
                print(f"Title: {result.get('title')}")
                print(f"Link: {result.get('link')}")
                print(f"Snippet: {result.get('snippet')}")
        else:
            print("No search results found.")
    else:
        print(f"Error fetching search results: {response.status_code}")


In [107]:
# Initialize your model and collection as before
model = SentenceTransformer('all-MiniLM-L6-v2')  # or your preferred model
# collection = your ChromaDB collection

# Perform semantic search with fallback
semantic_search_with_fallback("how to increase yield of bottle gourd", model, collection)



⚠️ No sufficiently relevant local results.
🌐 Performing a live internet search...

🔎 Top Internet Search Results:

🔹 Result 1
Title: How do I grow bigger gourds : r/gardening
Link: https://www.reddit.com/r/gardening/comments/16b6fg3/how_do_i_grow_bigger_gourds/
Snippet: Use fertilizer that is higher on potash (the "K" in NPK) once the vines get growing. Also do not be impatient to pick them. Let the vines get brown.

🔹 Result 2
Title: WHY BOTTLE GOURD FEMALE FLOWERS DYING
Link: https://www.youtube.com/live/sSxjSUH0LOI?pp=QAFIAg%3D%3D
Snippet: WHY BOTTLE GOURD FEMALE FLOWERS DYING | HOW TO INCREASE YIELD OF GOURDS In this gardening video we will give you tips and guidance how to ...

🔹 Result 3
Title: Bottle Gourd Cultivation: Maximizing Yield in Summer and ...
Link: https://agriculture.institute/production-tech-vegetables-crops/bottle-gourd-cultivation-maximizing-yield/
Snippet: Fertilization plays a vital role in the growth and yield of bottle gourd. Here is a recommended fertilizati