In [1]:
import os
import matplotlib.pyplot as plt
import google.generativeai as genai

from tqdm import tqdm
from openai import OpenAI
from langchain import hub
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_nomic import NomicEmbeddings
from langgraph.graph import StateGraph, START
from langchain_core.documents import Document
from typing_extensions import List, TypedDict
from langchain_core.prompts import PromptTemplate
from langchain.chat_models import init_chat_model
from langchain.text_splitter import RecursiveCharacterTextSplitter

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["GOOGLE_API_KEY"] = os.getenv('GEMINI_API_KEY')
os.environ["NOMIC_API_KEY"] = os.getenv('NOMIC_API_KEY')

In [3]:
data_dir = "scraped_data"
info = [] # Each doc content is a string of words in wach file
file_paths = []

for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.endswith(".txt"):
            file_path = os.path.join(root, file)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
                info.append(text)
                file_paths.append(file_path)
docs = [Document(page_content=d, metadata={}) for d in info]

In [4]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300, 
    chunk_overlap=50,
    add_start_index=True
)

all_chunks = splitter.split_documents(docs) # metadata, page_content
texts = [chunk.page_content for chunk in all_chunks]
len(all_chunks), len(texts)

(6035, 6035)

In [5]:
import requests

url = "http://127.0.0.1:5000/api/documents/"

ids = [f"chunk-{i}" for i in range(len(all_chunks))]
documents = [chunk.page_content for chunk in all_chunks]
metadatas = [chunk.metadata for chunk in all_chunks]

batch_size = 300  

for i in range(0, len(all_chunks), batch_size):
    batch_ids = ids[i:i+batch_size]
    batch_docs = documents[i:i+batch_size]
    batch_metas = metadatas[i:i+batch_size]

    data = {
        "ids": batch_ids,
        "documents": batch_docs,
        "metadatas": batch_metas,
    }

    res = requests.post(url, json=data)
    print(f"Uploaded batch {i//batch_size + 1}: {res.json()}")

Uploaded batch 1: {'ids': ['chunk-0', 'chunk-1', 'chunk-2', 'chunk-3', 'chunk-4', 'chunk-5', 'chunk-6', 'chunk-7', 'chunk-8', 'chunk-9', 'chunk-10', 'chunk-11', 'chunk-12', 'chunk-13', 'chunk-14', 'chunk-15', 'chunk-16', 'chunk-17', 'chunk-18', 'chunk-19', 'chunk-20', 'chunk-21', 'chunk-22', 'chunk-23', 'chunk-24', 'chunk-25', 'chunk-26', 'chunk-27', 'chunk-28', 'chunk-29', 'chunk-30', 'chunk-31', 'chunk-32', 'chunk-33', 'chunk-34', 'chunk-35', 'chunk-36', 'chunk-37', 'chunk-38', 'chunk-39', 'chunk-40', 'chunk-41', 'chunk-42', 'chunk-43', 'chunk-44', 'chunk-45', 'chunk-46', 'chunk-47', 'chunk-48', 'chunk-49', 'chunk-50', 'chunk-51', 'chunk-52', 'chunk-53', 'chunk-54', 'chunk-55', 'chunk-56', 'chunk-57', 'chunk-58', 'chunk-59', 'chunk-60', 'chunk-61', 'chunk-62', 'chunk-63', 'chunk-64', 'chunk-65', 'chunk-66', 'chunk-67', 'chunk-68', 'chunk-69', 'chunk-70', 'chunk-71', 'chunk-72', 'chunk-73', 'chunk-74', 'chunk-75', 'chunk-76', 'chunk-77', 'chunk-78', 'chunk-79', 'chunk-80', 'chunk-81',

In [5]:
all_chunks[0]

Document(metadata={'start_index': 0}, page_content='Placement Statistics|Why Recruit at DAU?|Recruiters|Placement Cell: Team|Student Placement Cell|Contact us\nMessage from Placement Cell:')

In [6]:
llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")

In [7]:
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",
)

In [8]:
vector_1 = embeddings.embed_query(all_chunks[0].page_content)
vector_2 = embeddings.embed_query(all_chunks[1].page_content)

assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])

Generated vectors of length 768

[-0.05316162, 0.05178833, -0.17114258, 0.0072669983, 0.08062744, 0.0063934326, 0.07434082, 0.025527954, -0.0009188652, -0.029418945]


In [9]:
batch_size = 5000  # must be < 5461

document_ids = []
for i in tqdm(range(0, len(all_chunks), batch_size)):
    batch = all_chunks[i:i+batch_size]
    ids = vector_store.add_documents(documents=batch)
    document_ids.extend(ids)

print("Total inserted:", len(document_ids))
print("First 3 IDs:", document_ids[:3])

100%|██████████| 2/2 [00:32<00:00, 16.45s/it]

Total inserted: 6035
First 3 IDs: ['0241a455-315b-4005-a887-706430564d35', 'b5db5e29-51ea-4574-a840-b92f6dff758f', 'c665f9d1-a991-439a-b853-1b44b63906da']





In [10]:
# You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don\'t know the answer, just say that you don\'t know. Use three sentences maximum and keep the answer concise. \nQuestion: {question} \nContext: {context} \nAnswer:

In [22]:
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use five sentences maximum and keep the answer as concise as possible.

{context}

Question: {question}

Helpful Answer:"""

custom_rag_prompt = PromptTemplate.from_template(template)

example_messages = custom_rag_prompt.invoke(
    {"context": "(context goes here)", "question": "(question goes here)"}
).to_messages()

assert len(example_messages) == 1
print(example_messages[0].content)

Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use five sentences maximum and keep the answer as concise as possible.

(context goes here)

Question: (question goes here)

Helpful Answer:


In [23]:
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

In [47]:
def retrieve(state: State):
    similar_chunks = vector_store.similarity_search(state['question'], 5)
    return {
        "context": similar_chunks
    }

def generate_answers(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state['context'])
    message = custom_rag_prompt.invoke(
        {
            "question": state['question'],
            "context": docs_content
        }
    )
    print(message)
    response = llm.invoke(message)
    return {
        "answer": response.content
    }

In [48]:
graph_builder = StateGraph(State).add_sequence(
    [
        retrieve, 
        generate_answers
    ]
)
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
query = "How to get admission at DAU"

response = graph.invoke({
    "question": query
})

text="Use the following pieces of context to answer the question at the end.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\nUse five sentences maximum and keep the answer as concise as possible.\n\nDAU Scholarships to M.Sc (Data Science) Students | Dhirubhai Ambani Institute of Information and Communication Technology Admissions Enquire Now! displayNone displayNone Accessibility Menu Contrast View Text Resize Skip to Main Content Screen Reader Access Pause Animations Pause Cursor Admissions\n\nDAU Scholarships to M.Sc (Data Science) Students | Dhirubhai Ambani Institute of Information and Communication Technology Admissions Enquire Now! displayNone displayNone Accessibility Menu Contrast View Text Resize Skip to Main Content Screen Reader Access Pause Animations Pause Cursor Admissions\n\nDAU Scholarships to M.Sc (Data Science) Students | Dhirubhai Ambani Institute of Information and Communication Technology Admissions Enquire Now! display

: 

In [46]:
print(response)

{'question': 'How to get admission at DAU', 'context': [Document(id='6143f129-5d3d-4737-a720-ea0f21a38347', metadata={'start_index': 0}, page_content='DAU Scholarships to M.Sc (Data Science) Students | Dhirubhai Ambani Institute of Information and Communication Technology Admissions Enquire Now! displayNone displayNone Accessibility Menu Contrast View Text Resize Skip to Main Content Screen Reader Access Pause Animations Pause Cursor Admissions'), Document(id='0f6f5f42-a26f-4402-883e-65f479971d20', metadata={'start_index': 0}, page_content='DAU Scholarships to M.Sc (Data Science) Students | Dhirubhai Ambani Institute of Information and Communication Technology Admissions Enquire Now! displayNone displayNone Accessibility Menu Contrast View Text Resize Skip to Main Content Screen Reader Access Pause Animations Pause Cursor Admissions'), Document(id='c29350d3-12dd-4438-9c1c-b8fe44265bb5', metadata={'start_index': 0}, page_content='DAU Scholarships to M.Sc (Data Science) Students | Dhirub