In [1]:
import pandas as pd
import ollama
import numpy as np
from rank_bm25 import BM25Okapi

In [2]:
# Ollama model names: one is used for embeddings, the other for chat answers.
embedding_model = 'snowflake-arctic-embed'
language_model = 'llama3.1:8b'

# Read the slang spreadsheet and report how many rows it holds.
df = pd.read_excel('../combined_slang_data_multi.xlsx')
print(f'Loaded {len(df)} entries')

Loaded 547 entries


In [3]:
# Multiple-choice evaluation set. Later cells read the columns 'sentence',
# 'option_a' .. 'option_d' and 'correct_answer' from this frame.
df_eval = pd.read_csv('eval_mcq.csv')

In [4]:
# Build one text document per distinct 'slang' value. Each row is rendered as
# "column: value" lines; rows sharing a slang term are concatenated, with every
# rendered row preceded by a newline.
combined_rows_as_strings_dict = {}
for i, row in df.iterrows():
    word = row['slang']
    row_string = "\n".join(f"{column}: {value}" for column, value in row.items())
    if word in combined_rows_as_strings_dict:
        # term already seen: extend its document
        combined_rows_as_strings_dict[word] += "\n" + row_string
    else:
        # first occurrence of this term: start its document
        combined_rows_as_strings_dict[word] = "\n" + row_string
combined_rows_as_strings = [*combined_rows_as_strings_dict.values()]

In [5]:
# In-memory vector store: a list of (text, embedding) tuples.
VECTOR_DB = []

def add_text_to_db(text):
    """Embed ``text`` with ``embedding_model`` and append it to ``VECTOR_DB``.

    The stored entry is the tuple ``(text, embedding)``, where ``embedding`` is
    the first item of the 'embeddings' field returned by ``ollama.embed``.
    """
    embed_response = ollama.embed(model=embedding_model, input=text)
    vector = embed_response['embeddings'][0]
    VECTOR_DB.append((text, vector))

# Index every combined per-slang document.
for row_text in combined_rows_as_strings:
    add_text_to_db(row_text)

In [6]:
def tokenize(text: str):
    """Lowercase ``text`` and split it on runs of whitespace.

    This is deliberately minimal: punctuation stays attached to the word it
    touches, and an empty or whitespace-only string yields an empty list.
    """
    lowered = text.lower()
    return lowered.split()

# Sparse (BM25) index over the stored documents, kept in VECTOR_DB order so
# that BM25 scores and embeddings refer to the same positions.
doc_texts = [entry[0] for entry in VECTOR_DB]
tokenized_docs = list(map(tokenize, doc_texts))

bm25 = BM25Okapi(tokenized_docs)

In [7]:
# Dense index: one float32 row per VECTOR_DB entry, in VECTOR_DB order.
EMB_MATRIX = np.vstack(
    [np.asarray(entry[1], dtype=np.float32) for entry in VECTOR_DB]
)
# Per-row L2 norms; the small epsilon keeps later divisions away from zero.
EMB_NORMS = 1e-9 + np.linalg.norm(EMB_MATRIX, axis=1)
# Document texts at the same positions as the rows of EMB_MATRIX.
DOC_TEXTS = [entry[0] for entry in VECTOR_DB]

def hybrid_search(query_text, top_k=10, alpha=0.4):
    """Rank the indexed documents for ``query_text`` with a dense/BM25 blend.

    The dense score is the cosine similarity between the query embedding and
    each row of ``EMB_MATRIX``; the sparse score comes from ``bm25``. Both score
    vectors are min-max normalized to [0, 1] and combined as
    ``alpha * dense + (1 - alpha) * bm25``.

    Args:
        query_text: Text to embed and tokenize for the search.
        top_k: Maximum number of results to return. Values larger than the
            number of indexed documents are clamped; values <= 0 return an
            empty list without calling the embedding model.
        alpha: Weight of the normalized dense score in the blend.

    Returns:
        A list of dicts ordered by descending hybrid score, each with the keys
        "rank" (1-based), "index", "hybrid_score", "dense_score", "bm25_score"
        (the last two are the raw, un-normalized scores) and "text".
    """
    # Nothing requested: avoid a pointless embedding round-trip.
    if top_k <= 0:
        return []

    q_embedding = np.asarray(
        ollama.embed(model=embedding_model, input=query_text)['embeddings'][0],
        dtype=np.float32
    )
    q_norm = np.linalg.norm(q_embedding) + 1e-9

    # vectorized dense scores (cosine similarity against every document)
    dense_scores = (EMB_MATRIX @ q_embedding) / (EMB_NORMS * q_norm)

    # BM25
    query_tokens = tokenize(query_text)
    bm25_scores = np.array(bm25.get_scores(query_tokens), dtype=np.float32)

    # min-max normalize
    def minmax_normalize(x):
        x_min, x_max = x.min(), x.max()
        if x_max == x_min:
            # constant vector: no ranking information, contribute nothing
            return np.zeros_like(x)
        return (x - x_min) / (x_max - x_min)

    dense_norm = minmax_normalize(dense_scores)
    bm25_norm = minmax_normalize(bm25_scores)

    hybrid_scores = alpha * dense_norm + (1 - alpha) * bm25_norm

    # np.argpartition raises ValueError when kth is out of bounds, so never
    # ask for more results than there are documents.
    k = min(top_k, hybrid_scores.shape[0])

    # fast top-k with argpartition
    top_idx = np.argpartition(-hybrid_scores, k - 1)[:k]
    # sort those top-k
    top_idx = top_idx[np.argsort(-hybrid_scores[top_idx])]

    results = []
    for rank, idx in enumerate(top_idx, start=1):
        text = DOC_TEXTS[idx]
        results.append({
            "rank": rank,
            "index": int(idx),
            "hybrid_score": float(hybrid_scores[idx]),
            "dense_score": float(dense_scores[idx]),
            "bm25_score": float(bm25_scores[idx]),
            "text": text,
        })
    return results

In [8]:
def build_prompt(question, options):
    """Format a multiple-choice question as a plain-text prompt.

    Options are labelled with consecutive lowercase letters starting at "a",
    one per line, and the prompt ends with "Answer:" for the model to complete.
    """
    option_lines = []
    for position, option in enumerate(options):
        letter = chr(ord("a") + position)
        option_lines.append(f"{letter}) {option}")
    options_text = "\n".join(option_lines)
    return f"Question: {question}\nOptions:\n{options_text}\nAnswer:"

# Build the prompt for the first evaluation question. It is kept in `prompt`
# for inspection; the chat call below does not send it.
prompt = build_prompt(
    question=df_eval.iloc[0]['sentence'],
    options=[
        df_eval.iloc[0][column]
        for column in ('option_a', 'option_b', 'option_c', 'option_d')
    ],
)

# Smoke-test language_model with a fixed true/false question.
response = ollama.chat(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that answers multiple choice questions with just the correct letter.",
        },
        {
            "role": "user",
            "content": 'the sun is a star. A) true, B) false',
        },
    ],
    model=language_model,
)

In [9]:
# now with RAG

def build_rag_prompt(question, options, context):
    """Format a multiple-choice question with retrieved context in front.

    The layout is a leading newline, a "Context:" section holding ``context``,
    then the question, the options labelled "a)", "b)", ... one per line, and
    a trailing "Answer:" for the model to complete.
    """
    labelled_options = []
    for position, option in enumerate(options):
        labelled_options.append(f"{chr(97 + position)}) {option}")
    options_text = "\n".join(labelled_options)
    return f"\nContext:\n{context}\nQuestion: {question}\nOptions:\n{options_text}\nAnswer:"

def build_rag_context(prompt, top_k=5):
    """Retrieve the ``top_k`` best documents for ``prompt`` and join their text.

    Uses ``hybrid_search`` for retrieval and returns the "text" field of each
    hit, in ranked order, separated by newlines.
    """
    hits = hybrid_search(prompt, top_k=top_k)
    return "\n".join(hit['text'] for hit in hits)

def evaluate_rag_model(df_eval, top_k=5):
    """Answer every question in ``df_eval`` with the RAG pipeline and score it.

    For each row, the question and options are used to retrieve context via
    ``build_rag_context``, the context-augmented prompt is sent to
    ``language_model``, and the first non-whitespace character of the reply is
    compared case-insensitively with the row's correct answer.

    Args:
        df_eval: DataFrame with the columns 'sentence', 'option_a',
            'option_b', 'option_c', 'option_d' and 'correct_answer'.
        top_k: Number of retrieved documents to place in the context.

    Returns:
        A tuple ``(accuracy, prompts, responses)``: the fraction of correctly
        answered questions (0.0 for an empty frame), the prompts sent to the
        model, and the extracted one-character answers (an empty string when
        the model returned no text).
    """
    option_columns = ['option_a', 'option_b', 'option_c', 'option_d']
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that answers multiple choice questions with just the correct letter.",
    }

    prompts = []
    responses = []
    num_correct = 0
    num_questions = len(df_eval)
    for i in range(num_questions):
        row = df_eval.iloc[i]  # look the row up once instead of per field
        question = row['sentence']
        options = [row[opt] for opt in option_columns]

        pre_rag_prompt = build_prompt(question, options)
        context = build_rag_context(pre_rag_prompt, top_k=top_k)

        rag_prompt = build_rag_prompt(question, options, context)
        response = ollama.chat(
            model=language_model,
            messages=[
                system_message,
                {"role": "user", "content": rag_prompt},
            ]
        )
        # The reply can be empty or start with whitespace; indexing [0]
        # directly would raise IndexError or grade a blank character.
        content = response.message.content or ""
        answer = content.strip()[:1]
        # str() guards against non-string cells (e.g. a missing value).
        correct_answer = str(row['correct_answer']).strip()
        if answer and answer.lower() == correct_answer.lower():
            num_correct += 1

        # logging
        if i % 10 == 0:
            print(f"Processed {i + 1} questions, accuracy: {num_correct / (i + 1):.2%}")

        prompts.append(rag_prompt)
        responses.append(answer)

    # An empty evaluation set has no meaningful accuracy; report 0.0 rather
    # than dividing by zero.
    accuracy = num_correct / num_questions if num_questions else 0.0
    return accuracy, prompts, responses

In [10]:
# Run the full RAG evaluation over df_eval, retrieving 5 documents per
# question. Keeps the overall accuracy plus the prompts sent and the
# one-character answers extracted from the model's replies.

accuracy_rag, prompts, responses = evaluate_rag_model(df_eval, top_k=5)
print(f"RAG Model Accuracy: {accuracy_rag:.2%}")

Processed 1 questions, accuracy: 100.00%
Processed 11 questions, accuracy: 90.91%
Processed 21 questions, accuracy: 90.48%
Processed 31 questions, accuracy: 83.87%
Processed 41 questions, accuracy: 85.37%
Processed 51 questions, accuracy: 84.31%
Processed 61 questions, accuracy: 86.89%
Processed 71 questions, accuracy: 85.92%
Processed 81 questions, accuracy: 85.19%
Processed 91 questions, accuracy: 86.81%
Processed 101 questions, accuracy: 88.12%
Processed 111 questions, accuracy: 88.29%
Processed 121 questions, accuracy: 89.26%
Processed 131 questions, accuracy: 87.79%
Processed 141 questions, accuracy: 86.52%
Processed 151 questions, accuracy: 86.09%
Processed 161 questions, accuracy: 86.34%
Processed 171 questions, accuracy: 86.55%
Processed 181 questions, accuracy: 86.19%
Processed 191 questions, accuracy: 85.34%
Processed 201 questions, accuracy: 85.07%
Processed 211 questions, accuracy: 85.78%
Processed 221 questions, accuracy: 85.97%
Processed 231 questions, accuracy: 84.85%
Pr