In [None]:
pip install langchain langchain-community langchain-huggingface sentence-transformers faiss-cpu transformers torch pypdf docx2txt huggingface_hub

Collecting langchain-community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.2.0-py3-none-any.whl.metadata (941 bytes)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pypdf
  Downloading pypdf-5.5.0-py3-none-any.whl.metadata (7.2 kB)
Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Collecting langchain-core<1.0.0,>=0.3.35 (from langchain)
  Downloading langchain_core-0.3.59-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting h

In [None]:
import os
import time
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import HuggingFacePipeline
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login


In [None]:

# Step 1: Set Hugging Face token (optional for public models, required for gated models)
HF_TOKEN = "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"
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    print("Warning: HF_TOKEN not set. Public models will work, but gated models may fail.")


In [None]:
# Step 2: Define directory and load multiple document types
directory_path = "/content/drive/MyDrive/documents/"
def create_loader(file_path):
    if file_path.endswith(".txt"):
        return TextLoader(file_path, encoding="utf8")
    elif file_path.endswith(".pdf"):
        return PyPDFLoader(file_path)
    elif file_path.endswith(".docx"):
        return Docx2txtLoader(file_path)
    return None

print("🔁 Loading documents from", directory_path, "...")
documents = []
for file_name in os.listdir(directory_path):
    file_path = os.path.join(directory_path, file_name)
    loader = create_loader(file_path)
    if loader:
        try:
            documents.extend(loader.load())
        except Exception as e:
            print(f"Error loading {file_name}: {e}")
if not documents:
    raise ValueError("No valid documents found in the directory.")


🔁 Loading documents from /content/drive/MyDrive/documents/ ...


In [None]:
# Step 3: Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
text_chunks = text_splitter.split_documents(documents)
print(f"Created {len(text_chunks)} document chunks.")


Created 529 document chunks.


In [None]:
# Step 4: Create vector embeddings using HuggingFace
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Step 4: Create vector embeddings using HuggingFace
# embedding_model = HuggingFaceEmbeddings(model_name="meta-llama/Llama-2-7b-chat-hf")


In [None]:

# Step 5: Store in FAISS vector database and persist
faiss_index_path = "faiss_index"
if os.path.exists(faiss_index_path):
    print("🔁 Loading existing FAISS index...")
    vectorstore = FAISS.load_local(faiss_index_path, embedding_model, allow_dangerous_deserialization=True)
else:
    print("🔁 Creating new FAISS index...")
    vectorstore = FAISS.from_documents(text_chunks, embedding_model)
    vectorstore.save_local(faiss_index_path)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})


🔁 Creating new FAISS index...


In [None]:

# Step 6: Define Prompt Template
prompt_template = """
You are a precise question-answering assistant for a Retrieval-Augmented Generation system.
Answer the question based solely on the provided context, without using external knowledge.
If the context does not contain enough information to answer, respond with "I don't have enough information to answer."
Provide a direct, concise answer in no more than five sentences, using clear and neutral language.
Do not repeat the question or include unnecessary details.

Question: {question}
Context: {context}
Answer:
"""
prompt = ChatPromptTemplate.from_template(prompt_template)


In [None]:

# Step 7: Load TinyLlama locally using HuggingFace
print("🔁 Loading TinyLlama model (this may take some time on first run)...")
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
    llm_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )
except Exception as e:
    print(f"Error loading model: {e}")
    raise


🔁 Loading TinyLlama model (this may take some time on first run)...


Device set to use cuda:0


In [None]:

# Step 8: Create LangChain pipeline
llm_model = HuggingFacePipeline(pipeline=llm_pipeline)
output_parser = StrOutputParser()
rag_chain = (
    {"context": retriever | (lambda docs: "\n".join([doc.page_content for doc in docs])), "question": RunnablePassthrough()}
    | prompt
    | llm_model
    | output_parser
)


In [None]:

# Step 9: Query with retry logic
def query_rag(question, max_retries=3, base_delay=5):
    retries = 0
    while retries < max_retries:
        try:
            response = rag_chain.invoke(question)
            answer_start = response.find("Answer:") + len("Answer:")
            return response[answer_start:].strip()
        except Exception as e:
            print(f"Error during query: {e}. Retrying {retries + 1}/{max_retries}...")
            retries += 1
            time.sleep(base_delay * (2 ** retries))
    raise Exception("Max retries exceeded.")


In [26]:

# Step 10: Test query
question = "what is object detection in Deap Learning?"
print("\n🔍 Query:", question)
print("📝 Response:", query_rag(question))


🔍 Query: what is object detection in Deap Learning?
📝 Response: Object detection is a technique used in computer vision to classify and recognize objects in an image or video. It involves identifying the key features of an object, such as its shape, size, color, and texture, and assigning it a class or label.

YOLO (You Only Look Once) is a popular object detection method that uses an convolutional neural network (CNN) to extract feature maps from the input image. The feature maps are then processed by a multi-layer perceptron (MLP) to identify the most likely class for each pixel.

Design:
YOLO is a two-stage system, with the first stage being a CNN to extract feature maps from the input image. The feature maps are then processed by a MLP to identify the most likely class for each pixel. The second stage is a softmax function that calculates the probability of each class.

Loss function:
YOLO uses a cross-entropy loss function to train the network. The loss function calculates the di

In [27]:
question = "What is one shot learning in Deep Learning?"
print("\n🔍 Query:", question)
print("📝 Response:", query_rag(question))


🔍 Query: What is one shot learning in Deep Learning?
📝 Response: One-shot learning is a deep learning technique that allows a machine learning model to learn from one example, called a "shot." One-shot learning can be used in a wide range of applications, including image classification, object detection, and natural language processing. One-shot learning is different from two-shot learning, where the model needs to learn from two or more examples in a single training session. One-shot learning is also different from transfer learning, where the model is trained on a large dataset and then fine-tuned on a new task. In one-shot learning, the model is trained on a single example, which is called a "shot." This technique has been shown to be effective in a variety of domains, including object detection and image classification. One-shot learning has also been used in natural language processing, where the model is trained on a single example of a sentence or paragraph. One-shot learning h

In [28]:
question = "What action is the U.S. taking to address rising gas prices?"
print("\n🔍 Query:", question)
print("📝 Response:", query_rag(question))


🔍 Query: What action is the U.S. taking to address rising gas prices?
📝 Response: The U.S. Has released 60 Million barrels from our own Strategic Petroleum Reserve, and we're working with 30 other countries to release additional barrels.

The actions we're taking are designed to help blunt gas prices here at home and make sure that the pain of our sanctions is targeted at Russia's economy.

We've also taken steps to help our allies, including increasing the amount of crude oil we're selling to them.

We're working with other countries to help them maintain the supply of oil they need to keep their economies moving.

This is a global problem, and we're taking steps to address it as a global community.

The goal is to bring down the price of gasoline and diesel.

I know the news about what's happening can seem alarming.

But I want you to know that we are going to be okay.

When the history of this era is written, Putin's war on Ukraine will have left Russia weaker and the rest of the w

In [None]:
# Step 10: Evaluate Model Performance
# Define test questions and expected document chunks (for retrieval evaluation)
import numpy as np
test_cases = [
    {
        "question": "How is the United States supporting Ukraine economically and militarily?",
        "expected_chunk_keywords": ["Ukraine", "aid", "military", "economic"],  # Keywords in relevant chunks
    },
    {
        "question": "What are the key economic policies mentioned?",
        "expected_chunk_keywords": ["economic", "policy", "tax", "budget"],
    },
    {
        "question": "What is the stance on climate change?",
        "expected_chunk_keywords": ["climate", "environment", "energy"],
    },
]

# Function to evaluate retrieval precision
def evaluate_retrieval(retriever, test_cases, k=3):
    retrieval_results = []
    for test in test_cases:
        question = test["question"]
        expected_keywords = test["expected_chunk_keywords"]
        retrieved_docs = retriever.get_relevant_documents(question)[:k]

        # Check if retrieved chunks contain expected keywords
        relevant = []
        for doc in retrieved_docs:
            is_relevant = any(keyword.lower() in doc.page_content.lower() for keyword in expected_keywords)
            relevant.append(1 if is_relevant else 0)

        retrieval_results.append(relevant)

    # Calculate precision@k
    retrieval_precision = [np.mean(results) for results in retrieval_results]
    avg_precision = np.mean(retrieval_precision)
    return retrieval_precision, avg_precision

# Function to evaluate generation accuracy (manual scoring)
def evaluate_generation(rag_chain, test_cases):
    print("Manual Evaluation: Score each answer from 0 (incorrect) to 1 (correct).")
    generation_scores = []

    for test in test_cases:
        question = test["question"]
        response = query_rag(question)
        print(f"\nQuestion: {question}")
        print(f"Response: {response}")
        score = float(input("Enter score (0 to 1): "))
        generation_scores.append(score)

    avg_generation_score = np.mean(generation_scores)
    return generation_scores, avg_generation_score

# Function to measure latency
def measure_latency(rag_chain, test_cases):
    latencies = []
    for test in test_cases:
        start_time = time.time()
        query_rag(test["question"])
        end_time = time.time()
        latencies.append(end_time - start_time)

    avg_latency = np.mean(latencies)
    return latencies, avg_latency

# Run evaluation
print("\n🔍 Evaluating Model Performance...")
retrieval_precision, avg_retrieval_precision = evaluate_retrieval(retriever, test_cases)
generation_scores, avg_generation_score = evaluate_generation(rag_chain, test_cases)
latencies, avg_latency = measure_latency(rag_chain, test_cases)

# Print results
print("\n📊 Evaluation Results:")
print(f"Retrieval Precision per Question: {retrieval_precision}")
print(f"Average Retrieval Precision: {avg_retrieval_precision:.2f}")
print(f"Generation Scores per Question: {generation_scores}")
print(f"Average Generation Score: {avg_generation_score:.2f}")
print(f"Latencies per Question (seconds): {[round(l, 2) for l in latencies]}")
print(f"Average Latency: {avg_latency:.2f} seconds")

# Step 11: Test query
question = "How is the United States supporting Ukraine economically and militarily?"
print("\n🔍 Query:", question)
print("📝 Response:", query_rag(question))


🔍 Evaluating Model Performance...
Manual Evaluation: Score each answer from 0 (incorrect) to 1 (correct).

Question: How is the United States supporting Ukraine economically and militarily?
Response: I do not know the answer to this question. Can you provide a summary of the United States' support for Ukraine economically and militarily, including direct assistance, military assistance, and humanitarian assistance?
Enter score (0 to 1): 0

Question: What are the key economic policies mentioned?
Response: 1. The key economic policies mentioned are increasing the productive capacity of our economy.

2. Economists call it “increasing the productive capacity of our economy.”

3. My plan to fight inflation will lower your costs and lower the deficit.

4. My plan will lower the deficit.

5. The 17 Nobel laureates in economics say my plan will ease long-term inflationary pressures.

6. Business leaders and most Americans support my plan.

7. My plan is a plan to bring fundamental change to a

In [30]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [31]:
import time
import numpy as np
from sklearn.metrics import precision_score
from bert_score import score as bert_score
import psutil
from langchain_community.vectorstores import FAISS

# Define test cases with ground-truth relevance and reference answers
test_cases = [
    {
        "question": "How is the United States supporting Ukraine economically and militarily?",
        "relevant_chunk_indices": [],  # Add indices of relevant chunks after inspection
        "reference_answer": "The U.S. provides $50 billion in economic aid and military equipment like Javelin missiles to Ukraine."
    },
    {
        "question": "What are the key economic policies mentioned?",
        "relevant_chunk_indices": [],  # Add indices
        "reference_answer": "Key economic policies include tax reforms and increased infrastructure spending."
    },
    {
        "question": "What is the stance on climate change?",
        "relevant_chunk_indices": [],  # Add indices
        "reference_answer": "The U.S. prioritizes renewable energy and carbon emission reductions."
    },
]

# Function to get document chunk indices (for relevance annotation)
def get_chunk_indices(vectorstore, documents):
    print("Document Chunks for Annotation:")
    for i, chunk in enumerate(documents):
        print(f"Chunk {i}: {chunk.page_content[:100]}...")
    return list(range(len(documents)))

# Function to evaluate retrieval (P@k, MRR)
def evaluate_retrieval(retriever, test_cases, text_chunks, k=3):
    precisions = []
    reciprocal_ranks = []

    for test in test_cases:
        question = test["question"]
        relevant_indices = test["relevant_chunk_indices"]

        # Get top-k retrieved documents
        retrieved_docs = retriever.get_relevant_documents(question)[:k]
        retrieved_indices = [text_chunks.index(doc) if doc in text_chunks else -1 for doc in retrieved_docs]

        # Calculate Precision@k
        relevant = [1 if idx in relevant_indices else 0 for idx in retrieved_indices]
        precision = np.mean(relevant) if relevant else 0
        precisions.append(precision)

        # Calculate Reciprocal Rank
        for rank, idx in enumerate(retrieved_indices, 1):
            if idx in relevant_indices:
                reciprocal_ranks.append(1 / rank)
                break
        else:
            reciprocal_ranks.append(0)

    avg_precision = np.mean(precisions)
    mrr = np.mean(reciprocal_ranks)
    return precisions, avg_precision, reciprocal_ranks, mrr

# Function to evaluate generation (BERTScore)
def evaluate_generation(rag_chain, test_cases):
    generated_answers = []
    reference_answers = []

    for test in test_cases:
        question = test["question"]
        response = query_rag(question)
        generated_answers.append(response)
        reference_answers.append(test["reference_answer"])

    # Calculate BERTScore
    P, R, F1 = bert_score(generated_answers, reference_answers, lang="en", verbose=True)
    bert_f1_scores = F1.numpy()
    avg_bert_f1 = np.mean(bert_f1_scores)

    return bert_f1_scores, avg_bert_f1

# Function to measure latency and memory usage
def measure_latency_and_memory(rag_chain, test_cases):
    latencies = []
    memory_usages = []
    process = psutil.Process()

    for test in test_cases:
        start_time = time.time()
        query_rag(test["question"])
        end_time = time.time()
        latencies.append(end_time - start_time)
        memory_usages.append(process.memory_info().rss / 1024 ** 2)  # Memory in MB

    avg_latency = np.mean(latencies)
    avg_memory = np.mean(memory_usages)
    return latencies, avg_latency, memory_usages, avg_memory

# Run evaluation
print("\n🔍 Preparing for Evaluation...")
# Annotate chunk indices (run once to identify relevant chunks)
chunk_indices = get_chunk_indices(vectorstore, text_chunks)
print("Please update 'relevant_chunk_indices' in test_cases with relevant chunk indices.")

# Example: Manually set relevant_chunk_indices after inspection
# test_cases[0]["relevant_chunk_indices"] = [0, 2, 5]  # Example indices
# test_cases[1]["relevant_chunk_indices"] = [1, 3]
# test_cases[2]["relevant_chunk_indices"] = [4, 6]

print("\n🔍 Evaluating Model Performance...")
precisions, avg_precision, reciprocal_ranks, mrr = evaluate_retrieval(retriever, test_cases, text_chunks)
bert_f1_scores, avg_bert_f1 = evaluate_generation(rag_chain, test_cases)
latencies, avg_latency, memory_usages, avg_memory = measure_latency_and_memory(rag_chain, test_cases)

# Print results
print("\n📊 Evaluation Results:")
print(f"Precision@k per Question: {precisions}")
print(f"Average Precision@k: {avg_precision:.2f}")
print(f"Reciprocal Ranks per Question: {reciprocal_ranks}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.2f}")
print(f"BERTScore F1 per Question: {[round(score, 2) for score in bert_f1_scores]}")
print(f"Average BERTScore F1: {avg_bert_f1:.2f}")
print(f"Latencies per Question (seconds): {[round(l, 2) for l in latencies]}")
print(f"Average Latency: {avg_latency:.2f} seconds")
print(f"Memory Usage per Question (MB): {[round(m, 2) for m in memory_usages]}")
print(f"Average Memory Usage: {avg_memory:.2f} MB")


🔍 Preparing for Evaluation...
Document Chunks for Annotation:
Chunk 0: Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and th...
Chunk 1: Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he ...
Chunk 2: Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers tur...
Chunk 3: Please rise if you are able and show that, Yes, we the United States of America stand with the Ukrai...
Chunk 4: The United States is a member along with 29 other nations. 

It matters. American diplomacy matters....
Chunk 5: We prepared extensively and carefully. 

We spent months building a coalition of other freedom-lovin...
Chunk 6: Along with twenty-seven members of the European Union including France, Germany, Italy, as well as c...
Chunk 7: We are cutting off Russia’s largest banks from the international financial system.  

Preventing Rus...
Chunk 8: The U.S. Department of J

  retrieved_docs = retriever.get_relevant_documents(question)[:k]
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.57 seconds, 5.30 sentences/sec

📊 Evaluation Results:
Precision@k per Question: [0.0, 0.0, 0.0]
Average Precision@k: 0.00
Reciprocal Ranks per Question: [0, 0, 0]
Mean Reciprocal Rank (MRR): 0.00
BERTScore F1 per Question: [0.82, 0.86, 0.78]
Average BERTScore F1: 0.82
Latencies per Question (seconds): [3.82, 27.78, 17.07]
Average Latency: 16.23 seconds
Memory Usage per Question (MB): [3022.07, 3016.88, 3016.88]
Average Memory Usage: 3018.61 MB
