In [1]:
! pip install -q --upgrade langchain langchain-openai langchain-core langchain_community langchain_chroma docx2txt pypdf sentence_transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.5/74.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m444.0/444.0 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m76.9 MB/s[0m eta [36m0:00:

In [11]:
import langchain
print(langchain.__version__)

0.3.27


In [13]:
# importing the openai key from the secrets in the colab
import os
from google.colab import userdata
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')


In [15]:
# importing other secrets for the langchain config
os.environ['LANGCHAIN_TRACING_V2'] = "true"
os.environ['LANGCHAIN_API_KEY'] = userdata.get('LANGCHAIN_API_KEY')
os.environ['LANGCHAIN_PROJECT'] = "langchain_world"

call llm

parse output

In [39]:
# This cell is responsible for the unzip the docs.zip which contains the dataset
import zipfile
import os

zip_path = "/content/docs.zip"
extract_to = "/content/"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("Files extracted to:", extract_to)

Files extracted to: /content/


In [26]:
# To import the files from the document folder and load it as document object
# 1. Import the libraries
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader # To the documents as document objects
from langchain_text_splitters import RecursiveCharacterTextSplitter # To split the document object as chunks
from langchain_openai import OpenAIEmbeddings # To use the openai embedding model(text-embedding-3-small)
from typing import List
from langchain_core.documents import Document

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)
# 2. Function to load documents from a folder
def load_documents(folder_path: str) -> List[Document]:
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif filename.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        else:
            print(f"Unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
    return documents

# Load documents from a folder
folder_path = "/content/docs/"
documents = load_documents(folder_path)

print(f"Loaded {len(documents)} documents from the folder.")
splits = text_splitter.split_documents(documents)
print(f"Split the documents into {len(splits)} chunks.")

Loaded 5 documents from the folder.
Split the documents into 15 chunks.


In [27]:
embeddings = OpenAIEmbeddings()

# 4. Embedding Documents

document_embeddings = embeddings.embed_documents([split.page_content for split in splits])

print(f"Created embeddings for {len(document_embeddings)} document chunks.")

Created embeddings for 15 document chunks.


In [41]:
from langchain_chroma import Chroma

embedding_function = OpenAIEmbeddings()
collection_name = "my_collection"
vectorstore = Chroma.from_documents(collection_name=collection_name, documents=splits, embedding=embedding_function, persist_directory="./chroma_db")
#db.persist()

print("Vector store created and persisted to './chroma_db'")

Vector store created and persisted to './chroma_db'


In [37]:
retriever= vectorstore.as_retriever(search_kwargs={"k":3})
result = retriever.invoke('what is headquaters of the greenland')
print(result)

[Document(id='360877d0-c12a-4046-9fdd-9b4b2c2dd891', metadata={'page_label': '1', 'source': '/content/docs/GreenGrow Innovations_ Company History - Copy (3).pdf', 'moddate': '2025-09-01T15:44:48+05:30', 'page': 0, 'author': 'Naveen kumar', 'creator': 'Microsoft® Word 2019', 'producer': 'Microsoft® Word 2019', 'total_pages': 1, 'creationdate': '2025-09-01T15:44:48+05:30'}, page_content='crop development, and AI-powered farm management systems. Despite its growth, GreenGrow \nremains committed to its original mission of promoting sustainable farming practices. The company \nregularly partners with universities and research institutions to advance the field of agricultural \ntechnology and hosts annual conferences to share knowledge with farmers and other industry \nprofessionals.'), Document(id='ea9dd395-2797-411e-97ab-f57d28eb76d9', metadata={'source': '/content/docs/GreenGrow Innovations_ Company History - Copy.docx'}, page_content='their second major product, the SoilHealth Monitor, w