In [1]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [2]:
%pip install -U ipywidgets jupyterlab notebook


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
os.environ["TQDM_NOTEBOOK"] = "0"   # set before importing tqdm
from tqdm.auto import tqdm


In [4]:
from tqdm.auto import tqdm
from tqdm import TqdmWarning
import warnings
warnings.filterwarnings("ignore", category=TqdmWarning)

In [5]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

model_handle = "jinaai/jina-embeddings-v2-small-en"
model = SentenceTransformer(model_handle, trust_remote_code=True, device="cpu")  # use "cuda" only if you have room
dim = model.get_sentence_embedding_dimension()

client = QdrantClient(host="localhost", port=6333)
COLL = "qa_qdrant_jina_small"

if client.collection_exists(COLL):
    client.delete_collection(COLL)

client.create_collection(
    collection_name=COLL,
    vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
)
# Optional if you filter by course:
# client.create_payload_index(COLL, field_name="course", field_schema="keyword")


True

In [6]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# If you haven't already:
# model = SentenceTransformer(model_handle, trust_remote_code=True, device="cpu")

# Optional: cap sequence length to reduce RAM/VRAM
try:
    model.max_seq_length = 1024  # lower if needed (e.g., 512)
except Exception:
    pass


In [7]:
from qdrant_client.models import PointStruct

def batched(seq, n):
    for i in range(0, len(seq), n):
        yield seq[i:i+n]

UPSERT_BATCH = 64    # try 64; drop to 32/16 if memory is tight
ENCODE_BATCH = 8     # encoder micro-batch; drop to 4/2 if needed
MAX_CHARS    = 1500  # truncate very long docs to keep memory predictable

for docs_chunk in batched(documents, UPSERT_BATCH):
    texts = [f"{d['question']} {d['text']}"[:MAX_CHARS] for d in docs_chunk]

    embs = model.encode(
        texts,
        batch_size=ENCODE_BATCH,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    )

    points = [
        PointStruct(
            id=str(doc["id"]),
            vector=embs[i].tolist(),
            payload={
                "id": str(doc["id"]),
                "question": doc["question"],
                "text": doc["text"],
                "course": doc["course"],
            },
        )
        for i, doc in enumerate(docs_chunk)
    ]

    client.upsert(collection_name=COLL, points=points)

    # If you still flirt with OOM (especially on GPU), uncomment:
    # import gc, torch
    # gc.collect()
    # if torch.cuda.is_available(): torch.cuda.empty_cache()


UnexpectedResponse: Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Format error in JSON body: value c02e79ef is not a valid point ID, valid values are either an unsigned integer or a UUID"},"time":0.0}'

In [1]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
import numpy as np

In [None]:
# Model + data
model_handle = "jinaai/jina-embeddings-v2-small-en"
model = SentenceTransformer(model_handle, trust_remote_code=True)

texts_qA = [f"{d['question']} {d['text']}" for d in documents]
embs = model.encode(texts_qA, normalize_embeddings=True)

# Qdrant collection (512-d cosine for this model)
client = QdrantClient(host="localhost", port=6333)
COLL = "qa_qdrant_jina_small"
client.recreate_collection(
    collection_name=COLL,
    vectors_config=VectorParams(size=512, distance=Distance.COSINE),
)

points = [
    PointStruct(
        id=str(doc["id"]),
        vector=embs[i].tolist(),
        payload={"id": str(doc["id"]), "course": doc["course"], "question": doc["question"], "text": doc["text"]},
    )
    for i, doc in enumerate(documents)
]
client.upsert(collection_name=COLL, points=points)

In [12]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# dataset URLs (same as earlier)
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'

documents = pd.read_json(docs_url).to_dict(orient='records')
df_ground_truth = pd.read_csv(ground_truth_url)
df_results = pd.read_csv(results_url)

print("Rows in results:", len(df_results))
df_results.head(2)


Rows in results: 1830


Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp


In [13]:
# Peek at available columns
print(df_results.columns.tolist())

# Helper to find the first existing column from a candidate list
def pick_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    raise ValueError(f"None of the columns found: {candidates}")

# Common names used in different notebooks/repos
llm_candidates  = ["answer_llm", "llm_answer", "generated_answer", "answer_model", "response"]
ref_candidates  = ["answer_orig", "original_answer", "reference_answer", "ground_truth_answer", "true_answer", "answer"]

llm_col = pick_col(df_results, llm_candidates)
ref_col = pick_col(df_results, ref_candidates)

print("Using LLM column:    ", llm_col)
print("Using reference col: ", ref_col)


['answer_llm', 'answer_orig', 'document', 'question', 'course']
Using LLM column:     answer_llm
Using reference col:  answer_orig


In [14]:
# Clean up text (ensure strings, handle missing)
answers_llm = df_results[llm_col].fillna("").astype(str).tolist()
answers_ref = df_results[ref_col].fillna("").astype(str).tolist()

# Fit TF-IDF + SVD on the joint corpus (simple, consistent with Embeddings section)
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

pipeline.fit(answers_llm + answers_ref)


0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [16]:
# Create embeddings for each side
U = pipeline.transform(answers_llm)   # model-generated answers
V = pipeline.transform(answers_ref)   # ground-truth answers

# Your cosine() function
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

# Vectorized cosine for all pairs (row-wise)
# (U and V are 2D arrays; compute per-row dot products and norms)
num = np.sum(U * V, axis=1)
den = np.linalg.norm(U, axis=1) * np.linalg.norm(V, axis=1)
cos_all = num / np.where(den == 0, 1e-12, den)

avg_cosine = float(np.mean(cos_all))
print(f"Average cosine similarity: {avg_cosine:.4f}")


Average cosine similarity: 0.7711


In [17]:
!pip install -q rouge==1.0.1

import numpy as np
from rouge import Rouge



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [18]:
# Reuse if already defined; otherwise detect columns
def pick_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    raise ValueError(f"None of the columns found: {candidates}")

llm_col = pick_col(df_results, ["answer_llm", "llm_answer", "generated_answer", "answer_model", "response"])
ref_col = pick_col(df_results, ["answer_orig", "original_answer", "reference_answer", "ground_truth_answer", "true_answer", "answer"])

llm_col, ref_col


('answer_llm', 'answer_orig')

In [19]:
rouge_scorer = Rouge()

r = df_results.iloc[10]  # index 10
scores_10 = rouge_scorer.get_scores(str(r[llm_col]), str(r[ref_col]))[0]
scores_10, scores_10["rouge-1"]["f"]


({'rouge-1': {'r': 0.45454545454545453,
   'p': 0.45454545454545453,
   'f': 0.45454544954545456},
  'rouge-2': {'r': 0.21621621621621623,
   'p': 0.21621621621621623,
   'f': 0.21621621121621637},
  'rouge-l': {'r': 0.3939393939393939,
   'p': 0.3939393939393939,
   'f': 0.393939388939394}},
 0.45454544954545456)

In [20]:
rouge_scorer = Rouge()

hyp_list = df_results[llm_col].fillna("").astype(str).tolist()
ref_list = df_results[ref_col].fillna("").astype(str).tolist()

rouge1_f = []
for hyp, ref in zip(hyp_list, ref_list):
    sc = rouge_scorer.get_scores(hyp, ref)[0]
    rouge1_f.append(sc["rouge-1"]["f"])

avg_rouge1_f1 = float(np.mean(rouge1_f))
print(f"Average ROUGE-1 F1: {avg_rouge1_f1:.4f}")


Average ROUGE-1 F1: 0.3517
