In [1]:
import pandas as pd
import ollama
import numpy as np
from rank_bm25 import BM25Okapi

In [2]:
# initialize embedding and language models
embedding_model = 'snowflake-arctic-embed'
language_model = 'llama3.1:8b'

In [3]:
# load dataset
df = pd.read_excel('../combined_slang_data_multi.xlsx')

# create a string per unique word by combining all rows for that word
combined_rows_as_strings_dict = {}
for i, row in df.iterrows():
    # append or create string entry for each word
    word = row['slang']
    row_string = "\n".join(f"{k}: {v}" for k, v in row.items())
    combined_rows_as_strings_dict[word] = combined_rows_as_strings_dict.get(word, "") + "\n" + row_string
combined_rows_as_strings = list(combined_rows_as_strings_dict.values())

VECTOR_DB = [] # a list of (text, embedding) tuples

def add_text_to_db(text):
    embedding = ollama.embed(model=embedding_model, input=text)['embeddings'][0]
    VECTOR_DB.append((text, embedding))

for row_text in combined_rows_as_strings:
    add_text_to_db(row_text)

In [4]:
df_eval = pd.read_json('eval_reverse.jsonl', lines=True)

In [5]:
def tokenize(text: str):
    # simple tokenizer; you can make this smarter if you want
    return text.lower().split()

# texts in the same order as VECTOR_DB
doc_texts = [text for text, _ in VECTOR_DB]
tokenized_docs = [tokenize(t) for t in doc_texts]

bm25 = BM25Okapi(tokenized_docs)

In [6]:
EMB_MATRIX = np.vstack([np.asarray(emb, dtype=np.float32) for _, emb in VECTOR_DB])
EMB_NORMS = np.linalg.norm(EMB_MATRIX, axis=1) + 1e-9
DOC_TEXTS = [text for text, _ in VECTOR_DB]  # keep alignment

def hybrid_search(query_text, top_k=10, alpha=0.4):
    q_embedding = np.asarray(
        ollama.embed(model=embedding_model, input=query_text)['embeddings'][0],
        dtype=np.float32
    )
    q_norm = np.linalg.norm(q_embedding) + 1e-9

    # vectorized dense scores
    dense_scores = (EMB_MATRIX @ q_embedding) / (EMB_NORMS * q_norm)

    # BM25
    query_tokens = tokenize(query_text)
    bm25_scores = np.array(bm25.get_scores(query_tokens), dtype=np.float32)

    # min-max normalize
    def minmax_normalize(x):
        x_min, x_max = x.min(), x.max()
        if x_max == x_min:
            return np.zeros_like(x)
        return (x - x_min) / (x_max - x_min)

    dense_norm = minmax_normalize(dense_scores)
    bm25_norm = minmax_normalize(bm25_scores)

    hybrid_scores = alpha * dense_norm + (1 - alpha) * bm25_norm

    # fast top-k with argpartition
    top_idx = np.argpartition(-hybrid_scores, top_k - 1)[:top_k]
    # sort those top-k
    top_idx = top_idx[np.argsort(-hybrid_scores[top_idx])]

    results = []
    for rank, idx in enumerate(top_idx, start=1):
        text = DOC_TEXTS[idx]
        results.append({
            "rank": rank,
            "index": int(idx),
            "hybrid_score": float(hybrid_scores[idx]),
            "dense_score": float(dense_scores[idx]),
            "bm25_score": float(bm25_scores[idx]),
            "text": text,
        })
    return results

In [7]:
#==================== f1 score ====================
import re
from typing import Dict, List, Any

_token_re = re.compile(r"\w+")

def _tokenize(text: str) -> List[str]:
    return _token_re.findall(text.lower())

# calculates f1 score between prediction and target
def explanation_token_f1(pred: str, target: str) -> float:
    """
    Simple token-level F1 between prediction and target.
    """
    pred_tokens = _tokenize(pred)
    tgt_tokens  = _tokenize(target)

    if not pred_tokens or not tgt_tokens:
        return 0.0

    pred_counts: Dict[str, int] = {}
    tgt_counts: Dict[str, int] = {}
    for t in pred_tokens:
        pred_counts[t] = pred_counts.get(t, 0) + 1
    for t in tgt_tokens:
        tgt_counts[t] = tgt_counts.get(t, 0) + 1

    overlap = 0
    for t, c in pred_counts.items():
        if t in tgt_counts:
            overlap += min(c, tgt_counts[t])

    precision = overlap / max(len(pred_tokens), 1)
    recall    = overlap / max(len(tgt_tokens), 1)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

#==================== exact match ====================

def exact_match(pred: str, target: str) -> bool:
    """
    Check if prediction exactly matches target (case-insensitive, whitespace-normalized).
    """
    return pred.strip().lower() == target.strip().lower()

#==================== average word length ====================

def avg_word_length(texts: List[str]) -> float:
    """
    Compute average word count across all texts.
    """
    if not texts:
        return 0.0
    total_words = sum(len(_tokenize(text)) for text in texts)
    return total_words / len(texts)

#==================== bertscore ====================

# compute bertscore between prediction and target lists
def compute_bertscore_f1(preds: List[str], targets: List[str]) -> float | None:
    """
    Compute mean BERTScore F1 over (pred, target) pairs.
    Requires `bert_score` package. If not available, returns None.
    """
    if not preds or not targets:
        return 0.0

    try:
        from bert_score import score as bertscore
    except ImportError:
        # You can pip install bert_score to enable this
        print("[warn] bert_score not installed; skipping BERTScore metric.")
        return None

    P, R, F1 = bertscore(
        preds,
        targets,
        lang="en",
        rescale_with_baseline=True,  # common setting
    )
    return float(F1.mean().item())

In [10]:
def build_rag_prompt(pre_rag_prompt, context):
    prompt = f"\nContext:\n{context}\n{pre_rag_prompt}"
    return prompt

def build_rag_context(prompt, top_k=5):
    results = hybrid_search(prompt, top_k=top_k)
    context = "\n".join([r['text'] for r in results])
    return context

def evaluate_rag_model(df_eval, top_k=5):
    token_f1_scores = []
    exact_matches = []
    preds = []
    targets = []
    prompts = []
    responses = []
    
    num_questions = len(df_eval)
    for i in range(len(df_eval)):
        row = df_eval.iloc[i]
        pre_rag_prompt = row['prompt']
        context = build_rag_context(pre_rag_prompt, top_k=top_k)

        rag_prompt = build_rag_prompt(pre_rag_prompt, context)
        response = ollama.chat(
            model=language_model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that explains the meanings of sentences containing slang in standard English."},
                {"role": "user", "content": rag_prompt},
            ]
        )
        response_text = response.message.content

        # handle potential None response
        if response_text is None:
            response_text = ""
        
        prompts.append(rag_prompt)
        responses.append(response_text)
        
        target_response = row['target']

        # calculate metrics
        f1 = explanation_token_f1(response_text, target_response)
        token_f1_scores.append(f1)
        exact_matches.append(exact_match(response_text, target_response))
        preds.append(response_text)
        targets.append(target_response)

        if (i + 1) % 10 == 0:
            print(i)

    # compute final metrics
    mean_f1 = float(np.mean(token_f1_scores)) if token_f1_scores else 0.0
    exact_match_rate = float(np.mean(exact_matches)) if exact_matches else 0.0
    avg_pred_len = avg_word_length(preds)
    mean_bertscore_f1 = compute_bertscore_f1(preds, targets)
    
    metrics = {
        "mean_f1": mean_f1,
        "mean_bertscore_f1": mean_bertscore_f1,
        "exact_match_rate": exact_match_rate,
        "avg_pred_length_words": avg_pred_len,
        "num_examples": num_questions,
    }
    
    return metrics, prompts, responses

In [11]:
metrics, prompts, responses = evaluate_rag_model(df_eval, top_k=5)
print("Final evaluation metrics:")
print(metrics)

9
19
29
39
49
59
69
79
89
99
109
119
129
139
149
159
169
179
189
199
209
219
229
239
249
259
269
279
289
299
309
319
329
339
349
359
369
379
389


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Final evaluation metrics:
{'mean_f1': 0.11844365862181162, 'mean_bertscore_f1': 0.018581148236989975, 'exact_match_rate': 0.0, 'avg_pred_length_words': 90.31979695431473, 'num_examples': 394}
