In [None]:
import os
import json
import pandas as pd
import numpy as np
import string
import pickle

from rank_bm25 import BM25Okapi

from sklearn.metrics import ndcg_score, precision_recall_fscore_support

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

from torch.utils.data import DataLoader, Dataset
import torch

from openai import OpenAI

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

stemmer=PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Paths
data_path = "answer_retrieval"
subset_answers_path = os.path.join(data_path, "subset_answers.json")

In [None]:
# Function to load data from jsonl. this one makes everything into one dataframe. 
def load_subset_data(folder):
    file_path = os.path.join(data_path, folder, "subset_data.jsonl")
    return pd.read_json(file_path, lines=True)

# Load data for train, test, and val
train_data = load_subset_data("train")
test_data = load_subset_data("test")
val_data = load_subset_data("val")
display(train_data.head()) 

print(f"Train examples: {len(train_data)}, Test examples: {len(test_data)}, Val examples: {len(val_data)}")

Unnamed: 0,id,text,title,timestamp,score,views,favorite,user_id,user_questions,user_answers,tags,rel_ids,rel_scores,rel_timestamps,best_answer
0,academia_100305,What are CNRS research units and how are they ...,What are CNRS research units and how are they ...,2017-12-11 16:30:20,14,2484,2.0,1106095,"[workplace_40845, workplace_40899, workplace_9...","[travel_45926, travel_46391, travel_47403, tra...","[funding, france]",[academia_100217],[1],"[1512814966, 1513014615, 1513020822]",academia_100217
1,academia_100456,Is there a free (as in freedom) alternative to...,Is there a free (as in freedom) alternative to...,2017-12-13 19:02:32,13,1117,2.0,1106095,"[workplace_40845, workplace_40899, workplace_9...","[travel_45926, travel_46391, travel_47403, tra...","[peer-review, open-access]",[academia_100462],[1],"[1513205016, 1536615064, 1553005541, 1615097827]",academia_100462
2,academia_103390,Search for StackExchange citations with Google...,Search for StackExchange citations with Google...,2018-02-06 16:40:59,2,157,1.0,1532620,"[writers_27613, writers_29562, sound_42166, so...","[skeptics_39944, philosophy_3098, philosophy_9...","[citations, google-scholar]",[academia_103391],[1],[1517936080],academia_103391
3,academia_10481,Reproducible research and corporate identity M...,Reproducible research and corporate identity,2013-06-06 09:11:05,18,372,1.0,1106095,"[academia_1698, academia_1772, academia_1911, ...","[academia_1699, academia_1700, academia_1701, ...","[copyright, creative-commons]",[academia_10499],[1],"[1370596608, 1370601095]",academia_10499
4,academia_10649,Advantages of second marking In the UK a porti...,Advantages of second marking,2013-06-17 12:24:37,6,1235,2.0,1106095,"[academia_1698, academia_1772, academia_1911, ...","[academia_1699, academia_1700, academia_1701, ...",[assessment],[academia_10650],[1],"[1371477146, 1371477156, 1371552185]",academia_10650


Train examples: 10000, Test examples: 100, Val examples: 100


In [5]:
# Load subset_answers
def load_subset_answers() -> dict:
    with open(subset_answers_path) as file:
        data = json.load(file)
    return data


answers_dict: dict = load_subset_answers()
print(f"Total answers loaded: {len(answers_dict)}")

Total answers loaded: 9398


In [6]:
# Extract questions and answers from subset_data.jsonl
def extract_questions_and_answers(data):
    questions = data[['text']]
    answers = data[['best_answer']]
    return questions, answers

train_questions, train_answers = extract_questions_and_answers(train_data)
test_questions, test_answers = extract_questions_and_answers(test_data)
val_questions, val_answers = extract_questions_and_answers(val_data)

display(test_questions)


Unnamed: 0,text
0,After what George was Georgetown University na...
1,Can someone explain why Garou made Saitama do ...
2,Why did Madara want to resurrect if with the E...
3,What is the ~/Applications directory for? I wa...
4,How to make notification but no noise when tim...
...,...
95,What are the main criteria used by the Europea...
96,What happened to most of Five Star Movement (M...
97,How did Dumbledore know what Ron saw in the Mi...
98,Do the residents of Facade talk in a language ...


In [7]:
punctuations = list(string.punctuation)
def preprocess_text(text):
    text = text.lower()
    text = "".join(char for char in text if char not in punctuations) # Removing punctuations from the text
    text = word_tokenize(text)
    text = [word for word in text if word not in stop_words]
    return text

In [8]:
saved_bm25_model_path = os.path.join("models", "bm25_model.pkl")
if os.path.exists(saved_bm25_model_path):
    with open(saved_bm25_model_path, "rb") as file:
        bm25 = pickle.load(file)
        print("BM25 model successfully loaded")
else:
    tokenized_corpus = []
    for key, text in answers_dict.items():
        tokenized_corpus.append(preprocess_text(text))
    print(f"BM25 corpus size: {len(tokenized_corpus)}")

    bm25 = BM25Okapi(tokenized_corpus)



BM25 model successfully loaded


In [9]:
# BM25 retrieval function
def bm25_retrieve(query, top_n=5):
    tokenized_query = preprocess_text(query)
    scores = bm25.get_scores(tokenized_query)
    top_indices = np.argsort(scores)[::-1][:top_n]
    top_answers = [(list(answers_dict.keys())[idx], scores[idx]) for idx in top_indices]
    return top_answers


In [10]:
# Example retrieval
query_example = test_questions.iloc[0][0]
top_answers = bm25_retrieve(query_example)

print("Query:", str(query_example))
print("Top retrieved answers:")
for top_id, top_score in top_answers:
    print(f"Answer ID: {top_id}, Text: {answers_dict[top_id][:50]}...")

Query: After what George was Georgetown University named? After what George was Georgetown University named? Or was it named that because of where it is located?
Top retrieved answers:
Answer ID: academia_185179, Text: Georgetown University is named after the village G...
Answer ID: skeptics_37836, Text: No, this Huff Post article* which repeats the clai...
Answer ID: academia_16379, Text: Here is one side effect of a university having a f...
Answer ID: movies_116156, Text: They both being from the same neighbourhood helps ...
Answer ID: islam_17206, Text: Abul Qasim means father of Qasim which Was our pro...


  query_example = test_questions.iloc[0][0]


In [11]:
# Load pretrained BERT model and tokenizer
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
# Dataset for training
class RankingDataset(Dataset):
    def __init__(self, queries, documents, labels):
        self.queries = queries
        self.documents = documents
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.queries[idx], self.documents[idx], self.labels[idx]

In [13]:
# Data preparation for reranking
def prepare_rerank_data(questions, answer_ids, bm25_scores):
    pairs, labels = [], []
    for query, true_id, top_ids in zip(questions, answer_ids, bm25_scores):
        for rank, (answer_id, score) in enumerate(top_ids):
            pairs.append((query, answers_dict[answer_id]))
            labels.append(1 if answer_id == true_id else 0)
    return pairs, labels


In [14]:
bm25_scores_path = os.path.join("models", "bm25_scores_train.pkl")

# Since it takes time to compute the bm25 scores everytime for each training data, I computed it once and stored it in a pickle file
if os.path.exists(bm25_scores_path):
    with open(bm25_scores_path, "rb") as file:
        bm25_scores_train = pickle.load(file) 
    print("Successfully 'loaded' the bm25 scores of the train queries")
else:
    bm25_scores_train = []
    for query in tqdm(train_questions["text"]):
        top_answers = bm25_retrieve(query)
        bm25_scores_train.append(top_answers)

    with open(bm25_scores_path, "wb") as file:
        pickle.dump(bm25_scores_train, file)

    print("Successfully 'calculated' and 'stored' the bm25 scores of the train queries")


Successfully 'loaded' the bm25 scores of the train queries


In [15]:
# Prepare train and val data
bm25_scores_train = bm25_scores_train
pairs_train, labels_train = prepare_rerank_data(
    train_data['text'], train_data['best_answer'], bm25_scores_train
)

In [16]:
# Tokenize pairs
def collate_fn(batch):
    queries, documents, labels = zip(*batch)
    inputs = tokenizer(list(queries), list(documents), padding=True, truncation=True, return_tensors="pt")
    labels = torch.tensor(labels, dtype=torch.float32)
    return inputs, labels

In [17]:
train_dataset = RankingDataset(*zip(*pairs_train), labels_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

In [18]:
# Fine-tune BERT
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.BCEWithLogitsLoss()



In [19]:
transformers.logging.set_verbosity_error()

In [None]:
model_path = os.path.join("models", "fine_tuned_model.pth")
if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path))
    model.eval()  # Set the model to evaluation mode
    print("Successfully 'loaded' the fine_tuned_model")

else:
    for epoch in range(3):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader):
            inputs, labels = batch
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(**inputs).logits.squeeze(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

    torch.save(model.state_dict(), "fine_tuned_model.pth")

    print("Successfully 'calculated' and 'stored' the fine_tuned_model")

  model.load_state_dict(torch.load(model_path))


Successfully 'loaded' the fine_tuned_model


In [21]:
def evaluate_rerank_model(model, questions, true_answers_id, bm25_scores):
    model.eval()
    all_labels, all_preds = [], []

    for query, true_id, top_ids in zip(questions, true_answers_id, bm25_scores):
        pairs = [(query, answers_dict[answer_id]) for answer_id, _ in top_ids]
        inputs = tokenizer(
            [p[0] for p in pairs], [p[1] for p in pairs],
            padding=True, truncation=True, return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            scores = model(**inputs).logits.squeeze(-1).cpu().numpy()


        all_preds.append(scores)
        all_labels.append([1 if answer_id == true_id else 0 for answer_id, _ in top_ids])

    y_true = [l for labels in all_labels for l in labels]
    y_pred = [int(p > 0) for preds in all_preds for p in preds]
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true,
        y_pred,
        average="binary"
    )

    # Correct MAP calculation
    def average_precision(labels, preds):
        ap = 0.0
        relevant_count = 0
        for i, p in enumerate(preds):
            if labels[i] == 1:
                relevant_count += 1
                ap += relevant_count / (i + 1)
        if relevant_count == 0:
            return 0.0
        return ap / relevant_count

    map_score = sum(average_precision(labels, preds) for labels, preds in zip(all_labels, all_preds)) / len(all_labels)
    ndcg = ndcg_score(all_labels, all_preds)

    return precision, recall, map_score, ndcg

In [37]:
bm25_scores_val = []

for query in tqdm(val_questions["text"]):
    top_answers = bm25_retrieve(query)
    bm25_scores_val.append(top_answers)


100%|██████████| 100/100 [00:15<00:00,  6.58it/s]


In [38]:
# Evaluate on validation set
precision, recall, map_score, ndcg = evaluate_rerank_model(
    model, val_data['text'], val_data['best_answer'], bm25_scores_val
)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, MAP: {map_score:.4f}, nDCG: {ndcg:.4f}")

Precision: 0.8611, Recall: 0.7209, MAP: 0.7590, nDCG: 0.8202


In [36]:
# Read the CSV file into a DataFrame
df = pd.read_csv(os.path.join(data_path, "questions_with_answer.csv"))
df = df[["Id", "Tags"]]
df.head()

Unnamed: 0,Id,Tags
0,writers_1,<resources><first-time-author>
1,writers_2,<fiction><grammatical-person><third-person>
2,writers_3,<publishing><novel><agent>
3,writers_7,<fiction><genre><categories>
4,writers_11,<terminology><preparation>


In [32]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def personalize_results(query, tags):
    prompt = f"""
    Modify and expand the following query: "{query}". 
    With using the following relevant tags: {tags}. Only write the query as the output.
    """

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        store=True,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    return completion.choices[0].message.content

In [47]:
first_row = test_data.iloc[0]
tags = df[df['Id'] == first_row["id"]]["Tags"]
tags = str(tags.astype("string")).replace(str(first_row), "").strip()
query = first_row["text"]
print("Original query: " + query)
print("----------------------")
print("Personalized and modfied/expanded query: " + personalize_results(query, tags))

Original query: After what George was Georgetown University named? After what George was Georgetown University named? Or was it named that because of where it is located?
----------------------
Personalized and modfied/expanded query: "After what George was Georgetown University named? Specifically, can you provide details about the individual or historical figure associated with its name? Additionally, was the university's name influenced by its geographical location, and if so, how does that connection play a role in its history?" 


In [50]:
bm25_test_scores_path = os.path.join("models", "bm25_scores_test_personalized.pkl")

# Since we are using OpenAI's GPT API, we just test the results on a small subset of the data to not pass the limit.
if os.path.exists(bm25_test_scores_path):
    with open(bm25_test_scores_path, "rb") as file:
        bm25_scores_test_personalized = pickle.load(file) 
    print("Successfully 'loaded' the bm25 personalized scores of the test queries")
else:
    bm25_scores_test_personalized = []
    for index, row in test_data.iterrows():
        tags = df[df['Id'] == row["id"]]["Tags"]
        tags = str(tags).replace(str(index), "").strip()
        personalized_query = personalize_results(row["text"], tags)
        top_answers = bm25_retrieve(personalized_query, top_n=3)
        bm25_scores_test_personalized.append(top_answers)

        if index == 50:
            break
    
    with open(bm25_test_scores_path, "wb") as file:
        pickle.dump(bm25_scores_test_personalized, file)

    print("bm25_scores_test_personalized calculated and saved successfully.")

bm25_scores_test_personalized calculated and saved successfully.


In [54]:
# Evaluate on validation set
personalized_precision, personalized_recall, personalized_map_score, personalized_ndcg = evaluate_rerank_model(
    model, test_data['text'], test_data['best_answer'], bm25_scores_test_personalized
)

print(f"Precision: {personalized_precision:.4f}, Recall: {personalized_recall:.4f}, MAP: {personalized_map_score:.4f}, nDCG: {personalized_ndcg:.4f}")

Precision: 0.9211, Recall: 0.7778, MAP: 0.8268, nDCG: 0.8581
