# <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Experiment 1: COT Full Finetuned LlaMa vs COT Gemini</p>

**Description:** This experiment is about comparing COT Finetuned LlaMa-3B-Instruct model and COT Google Gemini based on the SQUAD's validation dataset. 

## <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Imports</p>

In [5]:
import re
import os
import sys
import json
import time
import string
import itertools

from collections import Counter
from typing import Callable, Dict
from google.api_core.exceptions import ResourceExhausted

from pathlib import Path
import warnings
warnings.filterwarnings(
    "ignore",
    message="Convert_system_message_to_human will be deprecated!"
)

# LangChain components
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.agents import initialize_agent, AgentType

from langchain_core.documents import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant

## <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">RAG</p>

### 1. Define Data Source Path

In [6]:
INPUT_FILE = Path("experiments/data/validation.json")
OUTPUT_FILE = Path("experiments/data/context.txt")

### 2. Define Extract Method

In [7]:
def extract(input_path):
    context_set = set()
    with input_path.open("r", encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            ctx = record.get("context")
            if ctx and ctx not in context_set:
                context_set.add(ctx)
                yield ctx

def write_contexts_to_file(contexts, output_path):
    with output_path.open("w", encoding="utf-8") as f:
        for i, ctx in enumerate(contexts, start=1):
            f.write(ctx)
            f.write("\n\n")
    print(f"Wrote {i} contexts to {output_path}")

In [8]:
contexts = extract(INPUT_FILE)
write_contexts_to_file(contexts, OUTPUT_FILE)

Wrote 2067 contexts to experiments\data\context.txt


### 3. Store Data in Qdrant DB

In [9]:
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

In [10]:
def get_fastembed_model():
    return FastEmbedEmbeddings()

def create_embeddings():
    return get_fastembed_model()

def load_text_files(txt_file: str) -> list[Document]:
    text = Path(txt_file).read_text(encoding="utf-8")
    blocks = [blk.strip() for blk in text.split("\n\n") if blk.strip()]
    docs = []
    for idx, blk in enumerate(blocks, start=1):
        docs.append(
            Document(
                page_content=blk,
                metadata={
                    "source": Path(txt_file).name,
                    "block_index": idx,
                },
            )
        )
    return docs

def build_qdrant_index(docs: list[Document], embeddings):
    return Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        location=":memory:",
        collection_name="text_chunks",
    )

def generate_retriever(txt_dir: str = "./data/all_contexts.txt"):
    raw_docs = load_text_files(txt_dir)
    print(f"Loaded {len(raw_docs)} documents from {txt_dir}")
    embeddings = create_embeddings()
    vectorstore = build_qdrant_index(raw_docs, embeddings)
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5},
    )

    return retriever

In [12]:
retriever = generate_retriever(txt_dir="./experiments/data/context.txt")

Loaded 2067 documents from ./experiments/data/context.txt


Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]


Example

In [13]:
query = "What is ABC?"
results = retriever.get_relevant_documents(query)
# get text from the first 5 results
print(f"Found {results[0].page_content} relevant documents.")

  results = retriever.get_relevant_documents(query)


Found Since its inception, ABC has had many affiliated stations, which include WABC-TV and WPVI-TV, the first two stations to carry the network's programming. As of March 2015[update], ABC has eight owned-and-operated stations, and current and pending affiliation agreements with 235 additional television stations encompassing 49 states, the District of Columbia, four U.S. possessions, Bermuda and Saba; this makes ABC the largest U.S. broadcast television network by total number of affiliates. The network has an estimated national reach of 96.26% of all households in the United States (or 300,794,157 Americans with at least one television set). relevant documents.


## <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Evaluation Functions</p>

### 1. Define Evaluation Functions

In [38]:
def f1_score(prediction: str, ground_truth: str) -> float:
    """
    Compute the token-level F1 score between prediction and ground_truth.
    """
    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

def evaluate_model(
    predict_fn: Callable[[str, str], str],
    data_path: Path,
    max_rows: int = 50
) -> Dict[str, float]:
    """
    Evaluate a model over the first `max_rows` entries of a JSONL file with fields: context, question, answers.text.
    Returns average F1 score and number of samples evaluated.
    """
    scores = []
    with data_path.open('r', encoding='utf-8') as f:
        for line in itertools.islice(f, max_rows):
            record = json.loads(line)
            question = record.get('question', '')
            docs = retriever.get_relevant_documents(question)
            context = "\n".join([doc.page_content for doc in docs])
            gold_texts = record.get('answers', {}).get('text', [])
            
            # get prediction
            pred = predict_fn(question, context)
            
            # compute best F1 against all golds
            if gold_texts:
                sample_f1 = max(f1_score(pred, gt) for gt in gold_texts)
            else:
                sample_f1 = 0.0
            scores.append(sample_f1)

    num_samples = len(scores)
    avg_f1 = sum(scores) / num_samples if num_samples else 0.0
    return {"average_f1": avg_f1, "num_samples": num_samples}

## <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">LLM Model 1: Zero-Shot Google Gemini</p>

### 1. Define Model & API keys

In [23]:
from dotenv import load_dotenv
load_dotenv()

MODEL_NAME = "gemini-1.5-flash-8b"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
LANGCHAIN_PROJECT = os.getenv("LANGCHAIN_PROJECT")

### 2. Instantiate Model

In [24]:
llm = ChatGoogleGenerativeAI(
    model=MODEL_NAME,
    google_api_key=GEMINI_API_KEY,
    temperature=0.7,
)

memory = ConversationBufferWindowMemory(
    memory_key="chat_history",
    input_key="input",
    output_key="output",
    return_messages=True,
    k=5,
)

agent = initialize_agent(
    tools=[],  
    llm=llm,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    memory=memory,
    verbose=True,
    max_iterations=3,
)

Example

In [None]:
context = "South Africa is a country located at the southern tip of Africa. It is known for its diverse culture, languages, and history. The country has a rich heritage and is famous for its wildlife, natural beauty, and landmarks such as Table Mountain and Kruger National Park."
question = "What is the capital of South Africa?"
result = agent.run(f"Context: {context}\nQuestion: {question}")
print(result)

### 3. Evaluate

In [26]:
def predict_model1(question: str, context: str) -> str:
    for attempt in range(5):
        try:
            prompt_input = f"Context:\n{context}\n\nQuestion:\n{question}"
            result = agent.run(prompt_input)
            return result
        except ResourceExhausted as e:
            wait = 60  # sleep long enough to clear the per-minute bucket
            print(f"Quota exceeded, sleeping for {wait}s… (attempt {attempt+1}/5)")
            time.sleep(wait)
    raise RuntimeError("Max retries hit for model1 due to quota limits")

In [20]:
data_file = Path('experiments/data/validation.json')
print('Evaluating Model 1...')
res1 = evaluate_model(predict_model1, data_file)

Evaluating Model 1...


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "Denver Broncos"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "Carolina Panthers"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "Denver Broncos"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "Gold"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash-8b"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 56
}
].


[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "2015"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
Quota exceeded, sleeping for 60s… (attempt 1/5)


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "2015"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "Santa Clara, California"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "Levi's Stadium"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "24–10"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": 

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash-8b"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 41
}
].


[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "8"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
Quota exceeded, sleeping for 60s… (attempt 1/5)


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "1995"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "The Carolina Panthers defeated the Arizona Cardinals."
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "The New England Patriots"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "The Arizona Cardinals"
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash-8b"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 27
}
].


[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "The Carolina Panthers beat the Arizona Cardinals in the NFC Championship game."
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
Quota exceeded, sleeping for 60s… (attempt 1/5)


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "The Panthers have been to the Super Bowl two times."
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "Final Answer",
  "action_input": "The Denver Broncos beat the New England Patriots in the AFC Championship game."
}
```[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
  "action": "tool_code",
  "action_input": "Search the internet for '2015 NFL MVP'"
}
```[0m
Observation: tool_code is not a valid tool, try one of [].
Thought:[32;1m[1;3m```json
{
  "action": "Final 

In [21]:
print(res1)

{'average_f1': 0.7083780206721383, 'num_samples': 50}


## <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">LLM Model 2: Zero-Shot Finetuned LlaMa-3B-Instruct Model</p>

### 1. Load the model and tokenizer

In [27]:
from llms.llama_3b_instruct_finetuned_llm import load_model, generate_response

In [28]:
tokenizer, model = load_model()

In [29]:
DEFAULT_SYSTEM_PROMPT = """
You are an expert QA assistant. Use ReAct-style reasoning:
- Start each step with \"Thought: <what you’re thinking>\"
- If you would call a tool, write \"Action: <tool_name>(<input>)\" (no actual tool is run)
- After reasoning, conclude with \"Final Answer: <your answer>\"

Be sure to:
1. Show your chain-of-thought step by step.
2. Then output exactly one \"Final Answer:\" line with the answer.
"""

Example

In [30]:
input =  "What is the capital of France?",
context = "The capital of France is Paris."

final_answer: str = generate_response(
    model=model,
    tokenizer=tokenizer,
    context=context,
    question=input,
    system_prompt=DEFAULT_SYSTEM_PROMPT,
)

print(final_answer)

 ('Paris')

    ### Output: 
    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris

    Final Answer: Paris




### 2. Evaluate

In [31]:
PROMPT = PromptTemplate(
    input_variables=["input", "context", "chat_history"],
    template="""
### Instructions
You are an expert QA assistant. You must think through the reasoning step-by-step (chain-of-thought) before giving the final answer. Show your intermediate reasoning clearly.

### Context:
{context}

### User question:
{input}

### Reasoning (think step-by-step):
"""
)

In [42]:
def predict_model2(question: str, context: str) -> str:
    result = generate_response(model=model, tokenizer=tokenizer, context=context, question=question, system_prompt=PROMPT)
    return result

In [43]:
data_file = Path('experiments/data/validation.json')
print('Evaluating Model 2...')
res2 = evaluate_model(predict_model2, data_file)
print(res2)

Evaluating Model 2...
{'average_f1': 0.10473273657113578, 'num_samples': 50}
