In [25]:
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
from langchain.embeddings.base import Embeddings
from typing import List
import numpy as np

class HybridUrduEmbeddings(Embeddings):
    def __init__(self):
        # Urdu-Optimized Model (Semantic)
        self.urdu_model = HuggingFaceEmbeddings(
            model_name="paraphrase-multilingual-mpnet-base-v2"
        )
        
        # Numeric/Table-Optimized Model
        self.num_model = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )
        
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        embeddings = []
        for text in texts:
           emb1 = self.urdu_model.embed_documents([text])[0]   # 768
           emb2 = self.num_model.embed_documents([text])[0]    # 384
           hybrid_emb = np.concatenate([emb1, emb2])           # 1152
           embeddings.append(hybrid_emb.tolist())
        return embeddings

    
    def embed_query(self, text: str) -> List[float]:
        return self.embed_documents([text])[0]

# Initialize
hybrid_embeddings = HybridUrduEmbeddings()

In [66]:


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
import pinecone
from tqdm.auto import tqdm
import hashlib
from dotenv import load_dotenv
import os

# Load API key and environment
load_dotenv()
# Make sure you have a .env file with:
# PINECONE_API_KEY=your-key-here
# PINECONE_ENV=your-env-here (e.g., "gcp-starter" or "us-west4-gcp")
PINECONE_KEY = os.getenv("PINECONE_API_KEY")

# Create Pinecone client instance (for Pinecone v3+)
pc = pinecone.Pinecone(api_key=PINECONE_KEY)

# Create index (dimension must match embedding size)
# Get the embedding dimension dynamically from the model
test_emb = hybrid_embeddings.embed_documents(["test"])


embedding_dim = 1152

index_name = "cotton-vectors"
if index_name not in pc.list_indexes():
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"Created index {index_name} with dimension {embedding_dim}")
else:
    print(f"Index {index_name} already exists")

# Connect to index
index = pc.Index(index_name)





# Text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
)

# Process the text file
def process_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Split by page markers
    pages = content.split('===== Page ')[1:]
    
    documents = []
    metadatas = []
    
    for page in pages:
        # Extract page info
        try:
            page_info, page_content = page.split(' =====', 1)
            page_name = page_info.strip()
            page_content = page_content.strip()
            
            # Split content into chunks
            chunks = text_splitter.split_text(page_content)
            
            for i, chunk in enumerate(chunks):
                # Create a unique ID for each chunk
                chunk_id = hashlib.md5(f"{page_name}_{i}".encode()).hexdigest()
                
                documents.append(chunk)
                metadatas.append({
                    "page": page_name,
                    "chunk_id": chunk_id,
                    "chunk_index": i,
                    "source": "extracted_text1.txt"
                })
                
        except ValueError:
            print(f"Skipping malformed page: {page[:100]}...")
    
    return documents, metadatas

# Path to your text file
file_path = "extracted_text1.txt"

# Process the file
documents, metadatas = process_text_file(file_path)

# Create embeddings and store in Pinecone
batch_size = 32  # Adjust based on your resources

# Use the Pinecone client instance (pc) to get the index
index = pc.Index(index_name)

# Process in batches
for i in tqdm(range(0, len(documents), batch_size)):
    batch_docs = documents[i:i+batch_size]
    batch_metas = metadatas[i:i+batch_size]
    
    # Generate embeddings
    embeddings = hybrid_embeddings.embed_documents(batch_docs)
    
    # Prepare upsert data
    upsert_data = []
    for doc, meta, embedding in zip(batch_docs, batch_metas, embeddings):
        doc_id = meta.get("chunk_id", hashlib.md5(doc.encode()).hexdigest())
        # Pinecone v3 expects a dict with "id", "values", and "metadata"
        upsert_data.append({
            "id": doc_id,
            "values": embedding,
            "metadata": {
                "text": doc,
                **meta
            }
        })
    
    # Upsert to Pinecone using the Pinecone v3 client
    index.upsert(vectors=upsert_data)

print(f"Successfully uploaded {len(documents)} document chunks to Pinecone index '{index_name}'")

Created index cotton-vectors with dimension 1152


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 25/25 [06:42<00:00, 16.12s/it]

Successfully uploaded 774 document chunks to Pinecone index 'cotton-vectors'





In [86]:
index = pc.Index("wheat-vectors")
print(index_name)

cotton-vectors


In [91]:
query = "fertilizer recommendations for weak land, specifying nutrient levels (organic matter, phosphorus, and potash) in kilograms per hectare"

In [92]:
import google.generativeai as genai
from langchain.schema import Document  # If your results are in LangChain Doc format
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY not found in environment. Please set it in your .env file.")

# Step 1: Initialize Gemini with your API key
genai.configure(api_key=GEMINI_API_KEY)

# Step 2: Load Gemini model (Gemini Pro for text tasks)
model = genai.GenerativeModel('gemini-1.5-flash')

# Step 3: Format vectorstore results
def combine_documents(docs: list[Document]) -> str:
    return "\n\n".join([f"{i+1}. {doc.page_content}" for i, doc in enumerate(docs)])

# Step 4: Generate response from Gemini
def generate_response_from_docs(query: str, docs: list[Document]) -> str:
    context = combine_documents(docs)
    prompt = f"""You are a skillfull farmer. Answer the following question based only on the provided context.

Context:
{context}

Question:
{query}

Answer:"""

    response = model.generate_content(prompt)
    return response.text


In [93]:

from langchain_pinecone import PineconeVectorStore

# Load the Pinecone vectorstore for the current index_name
pinecone_vectorstore = PineconeVectorStore(
    index=index,
    embedding=hybrid_embeddings,
    text_key="text"
)


results = pinecone_vectorstore.similarity_search(query, k=3)
final_answer = generate_response_from_docs(query, results)
print("ðŸ¤– Gemini Answer:\n", final_answer)


ðŸ¤– Gemini Answer:
 Based on the provided text, for weak land with organic matter â‰¤ 0.86%, phosphorus â‰¤ 7 ppm, and potash â‰¤ 80 ppm,  the fertilizer recommendation in kilograms per hectare is not explicitly given.  The text only states that a soil test should be conducted to determine the appropriate fertilizer application based on various factors including soil analysis results.

