In [14]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Re-export the OpenAI key only when it is actually defined. os.getenv returns
# None for a missing variable, and assigning None into os.environ raises
# TypeError, which would stop the notebook for anyone without a key configured.
openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key is not None:
    os.environ['OPENAI_API_KEY'] = openai_api_key

In [15]:
from langchain_community.document_loaders import PyPDFLoader

# Source document for the knowledge base, resolved relative to the working directory.
PDF_PATH = 'farming_threats.pdf'

# Read the PDF into a list of LangChain Document objects.
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()

# Show the loaded documents (metadata + page_content) as the cell output.
docs

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-07-15T21:35:22+05:30', 'author': 'yathi yathish', 'moddate': '2025-07-15T21:35:22+05:30', 'source': 'farming_threats.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='1. Climate Change & Extreme Weather Events \n• Droughts (water scarcity) \n• Floods (waterlogging, soil erosion) \n• Unpredictable rainfall patterns \n• Heatwaves (crop stress, reduced yields) \n• Frost & hailstorms (damage to crops) \n2. Pests & Diseases \n• Insect infestations (locusts, aphids, borers) \n• Fungal, bacterial & viral diseases (blight, rust, wilt) \n• Weed infestations (competing for nutrients) \n• Invasive species (disrupting local ecosystems) \n3. Soil Degradation & Loss of Fertility \n• Soil erosion (wind/water) \n• Salinization (excess irrigation) \n• Nutrient depletion (over-farming) \n• Acidification (excessive chemical use) \n4. Market & Economic Challenges \n• Price vo

In [16]:
# Break the loaded PDF pages into smaller, overlapping chunks so that each
# piece can be embedded and retrieved on its own.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # target size of each chunk
    chunk_overlap=200,  # amount shared between neighbouring chunks
)
documents = text_splitter.split_documents(docs)

# Preview the first five chunks only.
documents[:5]

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-07-15T21:35:22+05:30', 'author': 'yathi yathish', 'moddate': '2025-07-15T21:35:22+05:30', 'source': 'farming_threats.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='1. Climate Change & Extreme Weather Events \n• Droughts (water scarcity) \n• Floods (waterlogging, soil erosion) \n• Unpredictable rainfall patterns \n• Heatwaves (crop stress, reduced yields) \n• Frost & hailstorms (damage to crops) \n2. Pests & Diseases \n• Insect infestations (locusts, aphids, borers) \n• Fungal, bacterial & viral diseases (blight, rust, wilt) \n• Weed infestations (competing for nutrients) \n• Invasive species (disrupting local ecosystems) \n3. Soil Degradation & Loss of Fertility \n• Soil erosion (wind/water) \n• Salinization (excess irrigation) \n• Nutrient depletion (over-farming) \n• Acidification (excessive chemical use) \n4. Market & Economic Challenges \n• Price vo

In [17]:
# Vector embeddings and the Chroma vector store.
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# Embeddings need an embedding-capable model. nomic-embed-text is an
# open-source text-embedding model (not a generative one); a chat/generation
# model such as gemma3:1b is the wrong tool for this step.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Index only the first 20 chunks in Chroma.
db = Chroma.from_documents(documents=documents[:20], embedding=embeddings)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [18]:
# Sanity-check the Chroma store with a plain similarity search.
query = "what can be solution for soil erosion"
result = db.similarity_search(query=query)

# Text of the best-matching chunk.
result[0].page_content




In [19]:
# Alternative vector store: FAISS, built from the same first 20 chunks.
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings

# Same embedding model as the Chroma store, held in its own instance.
faiss_embeddings = OllamaEmbeddings(model="nomic-embed-text")
db1 = FAISS.from_documents(documents=documents[:20], embedding=faiss_embeddings)

In [20]:
# RAG pipeline: the generation side.
from langchain_community.llms import Ollama

# Local Ollama model that writes the final answer. It is constructed
# independently of any vector store; retrieval is attached later through a
# retriever and a retrieval chain.
llm = Ollama(model="gemma3:1b")

In [21]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt used to answer a question from supplied context.
# Placeholders: {context} is the text the answer must be based on, and
# {input} is the user's question (the key passed to the chain's invoke call).
prompt = ChatPromptTemplate.from_template("""
Answer the following question based on the provided context.
Think step by step before providing a detailed answer.
<context>
{context}
</context>
Question :{input} """)

In [22]:
# "Stuff" documents chain: combines the LLM with the prompt so that a set of
# documents can be passed to the model together with the question.
from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)


In [23]:
# Expose the vector store through the retriever interface.
# NOTE(review): this is built on `db` (the Chroma store), not `db1` (FAISS) —
# confirm that is the intended backend for the RAG chain.
retriever = db.as_retriever()

# Display the retriever's repr to confirm which store backs it.
retriever

VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000025240C97890>, search_kwargs={})

In [None]:
# Retrieval chain: ties the retriever and the document chain together into a
# single runnable that takes a question under the "input" key.
from langchain.chains import create_retrieval_chain

retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

# Run one question end to end; the full result is shown as the cell output.
retrieval_chain.invoke({"input": "tell about few plant disease"})


In [13]:
# Ask a question and keep the whole result so individual fields can be inspected.
payload = {"input": "Tell about plant disease"}
response = retrieval_chain.invoke(payload)

# Show only the generated answer text.
response['answer']

'Here’s a breakdown of plant diseases, based solely on the provided text:\n\n**Plant Diseases:**\n\nAccording to the text, plant diseases include:\n\n*   **Fungal, bacterial & viral diseases:** (blight, rust, wilt)\n*   **Weed infestations:** (competing for nutrients)\n*   **Insect infestations:** (locusts, aphids, borers)\n*   **Ice-related diseases:** (frost & hailstorms)\n\n'