# <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Experiment 2: Zero-Shot Full Finetuned LlaMa vs Zero-Shot Gemini</p>

**Description:** This experiment is about comparing zero-shot Finetuned LlaMa-3B-Instruct model and zero-shot Google Gemini based on the SQUAD's validation dataset. 

## <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Imports</p>

In [55]:
import re
import os
import sys
import json
import time
import string
import itertools

from collections import Counter
from typing import Callable, Dict
from google.api_core.exceptions import ResourceExhausted

from pathlib import Path
import warnings
warnings.filterwarnings(
    "ignore",
    message="Convert_system_message_to_human will be deprecated!"
)

# LangChain components
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_google_genai import ChatGoogleGenerativeAI

from langchain_core.documents import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant

## <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">RAG</p>

### 1. Define Data Source Path

In [13]:
INPUT_FILE = Path("experiments/data/validation.json")
OUTPUT_FILE = Path("experiments/data/context.txt")

### 2. Define Extract Method

In [14]:
def extract(input_path):
    context_set = set()
    with input_path.open("r", encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            ctx = record.get("context")
            if ctx and ctx not in context_set:
                context_set.add(ctx)
                yield ctx

def write_contexts_to_file(contexts, output_path):
    with output_path.open("w", encoding="utf-8") as f:
        for i, ctx in enumerate(contexts, start=1):
            f.write(ctx)
            f.write("\n\n")
    print(f"Wrote {i} contexts to {output_path}")

In [15]:
contexts = extract(INPUT_FILE)
write_contexts_to_file(contexts, OUTPUT_FILE)

Wrote 2067 contexts to experiments\data\context.txt


### 3. Store Data in Qdrant DB

In [16]:
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

In [17]:
def get_fastembed_model():
    return FastEmbedEmbeddings()

def create_embeddings():
    return get_fastembed_model()

def load_text_files(txt_file: str) -> list[Document]:
    text = Path(txt_file).read_text(encoding="utf-8")
    blocks = [blk.strip() for blk in text.split("\n\n") if blk.strip()]
    docs = []
    for idx, blk in enumerate(blocks, start=1):
        docs.append(
            Document(
                page_content=blk,
                metadata={
                    "source": Path(txt_file).name,
                    "block_index": idx,
                },
            )
        )
    return docs

def build_qdrant_index(docs: list[Document], embeddings):
    return Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        location=":memory:",
        collection_name="text_chunks",
    )

def generate_retriever(txt_dir: str = "./data/all_contexts.txt"):
    raw_docs = load_text_files(txt_dir)
    print(f"Loaded {len(raw_docs)} documents from {txt_dir}")
    embeddings = create_embeddings()
    vectorstore = build_qdrant_index(raw_docs, embeddings)
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5},
    )

    return retriever

In [18]:
retriever = generate_retriever(txt_dir="./experiments/data/context.txt")

Loaded 2067 documents from ./experiments/data/context.txt


  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|██████████| 5/5 [00:53<00:00, 10.65s/it]


Example

In [19]:
query = "What is ABC?"
results = retriever.get_relevant_documents(query)
# get text from the first 5 results
print(f"Found {results[0].page_content} relevant documents.")

  results = retriever.get_relevant_documents(query)


Found Since its inception, ABC has had many affiliated stations, which include WABC-TV and WPVI-TV, the first two stations to carry the network's programming. As of March 2015[update], ABC has eight owned-and-operated stations, and current and pending affiliation agreements with 235 additional television stations encompassing 49 states, the District of Columbia, four U.S. possessions, Bermuda and Saba; this makes ABC the largest U.S. broadcast television network by total number of affiliates. The network has an estimated national reach of 96.26% of all households in the United States (or 300,794,157 Americans with at least one television set). relevant documents.


## <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Evaluation Functions</p>

### 1. Define Evaluation Functions

In [None]:
def f1_score(prediction: str, ground_truth: str) -> float:
    """
    Compute the token-level F1 score between prediction and ground_truth.
    """
    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

def evaluate_model(
    predict_fn: Callable[[str, str], str],
    data_path: Path,
    max_rows: int = 50
) -> Dict[str, float]:
    """
    Evaluate a model over the first `max_rows` entries of a JSONL file with fields: context, question, answers.text.
    Returns average F1 score and number of samples evaluated.
    """
    scores = []
    with data_path.open('r', encoding='utf-8') as f:
        for line in itertools.islice(f, max_rows):
            record = json.loads(line)
            question = record.get('question', '')
            docs = retriever.get_relevant_documents(question)
            context = "\n".join([doc.page_content for doc in docs])
            gold_texts = record.get('answers', {}).get('text', [])
            
            # get prediction
            pred = predict_fn(question, context)
            
            # compute best F1 against all golds
            if gold_texts:
                sample_f1 = max(f1_score(pred, gt) for gt in gold_texts)
            else:
                sample_f1 = 0.0
            scores.append(sample_f1)

    num_samples = len(scores)
    avg_f1 = sum(scores) / num_samples if num_samples else 0.0
    return {"average_f1": avg_f1, "num_samples": num_samples}

## <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">LLM Model 1: Zero-Shot Google Gemini</p>

### 1. Define Model & API keys

In [22]:
from dotenv import load_dotenv
load_dotenv()

MODEL_NAME = "gemini-1.5-flash-8b"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
LANGCHAIN_PROJECT = os.getenv("LANGCHAIN_PROJECT")

### 2. Instantiate Model

In [23]:
PROMPT = PromptTemplate(
    input_variables=["input", "context"],
    template="""
        ### Instructions\n
        You are an expert QA assistant that answer user's question based on the context.\n\n
        ### Chat history\n
        {chat_history}\n\n
        ### Context:\n
        {context}\n\n
        ### User question:\n
        {input}\n\n
    """
)

llm = ChatGoogleGenerativeAI(
    model=MODEL_NAME,
    google_api_key=GEMINI_API_KEY,
    convert_system_message_to_human=True,
    prompt=PROMPT,
    temperature=0.7
)

memory = ConversationBufferWindowMemory(
    memory_key="chat_history",
    input_key='input',
    output_key='text',
    return_messages=True,
    k=5,
)

llm_chain = LLMChain(
    llm=llm,
    prompt=PROMPT,
    memory=memory,
)

Unexpected argument 'prompt' provided to ChatGoogleGenerativeAI.
                prompt was transferred to model_kwargs.
                Please confirm that prompt is what you intended.
  exec(code_obj, self.user_global_ns, self.user_ns)
  memory = ConversationBufferWindowMemory(
  llm_chain = LLMChain(


Example

In [18]:
response_text = llm_chain.invoke({
    "input": "What is the capital of France?",
    "context": "The capital of France is Paris."
})

print("Assistant:", response_text["text"])

Assistant: Paris.


### 3. Evaluate

In [60]:
def predict_model1(question: str, context: str) -> str:
    for attempt in range(5):
        try:
            result = llm_chain.invoke({"input": question, "context": context})
            return result.get("text", "")
        except ResourceExhausted as e:
            wait = 60  # sleep long enough to clear the per-minute bucket
            print(f"Quota exceeded, sleeping for {wait}s… (attempt {attempt+1}/5)")
            time.sleep(wait)
    raise RuntimeError("Max retries hit for model1 due to quota limits")

In [62]:
data_file = Path('experiments/data/validation.json')
print('Evaluating Model 1...')
res1 = evaluate_model(predict_model1, data_file)

Evaluating Model 1...


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Quota exceeded, sleeping for 60s… (attempt 1/5)


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Quota exceeded, sleeping for 60s… (attempt 1/5)


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Quota exceeded, sleeping for 60s… (attempt 1/5)


In [64]:
print(res1)

{'average_f1': 0.5457342444277429, 'num_samples': 50}


## <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">LLM Model 2: Zero-Shot Finetuned LlaMa-3B-Instruct Model</p>

### 1. Load the model and tokenizer

In [46]:
from llms.llama_3b_instruct_finetuned_llm import load_model, generate_response

In [35]:
tokenizer, model = load_model()

2025-05-15 06:34:23.366 
  command:

    streamlit run d:\MET\Semester 10\[CSEN1076] Natural Language Processing and Information Retrieval\Project\NLP-Project\.venv\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
  _ = torch.tensor([0], device=i)
Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.11s/it]


Example

In [47]:
input =  "What is the capital of France?",
context = "The capital of France is Paris."

final_answer: str = generate_response(
    model=model,
    tokenizer=tokenizer,
    context=context,
    question=input,
)

print(final_answer)

 The capital of France is Paris.

    ### Output:
    What is the capital of France?
    Paris

    ### Notes:
    - Answered the user question about the capital of France.
    - Did not provide any additional information not present in the context.
    - Did not provide a response in the form of a question. 

    ### Next Steps: 
    - If the user asks another question, provide an answer based on the context.
    - If the user asks a follow-up question, provide an answer based on the context.
    - If the user requests clarification or additional information, provide clarification or additional information based on


### 2. Evaluate

In [51]:
def predict_model2(question: str, context: str) -> str:
    result = generate_response(model=model, tokenizer=tokenizer, context=context, question=question)
    return result

In [59]:
data_file = Path('experiments/data/validation.json')
print('Evaluating Model 2...')
res2 = evaluate_model(predict_model2, data_file)
print(res2)

Evaluating Model 2...
{'average_f1': 0.18194863065806732, 'num_samples': 50}
