In [1]:
import os
import json
import pandas as pd
import numpy as np
import string
import pickle

from rank_bm25 import BM25Okapi

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, ndcg_score, precision_recall_fscore_support

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

from torch.utils.data import DataLoader, Dataset
import torch

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

stemmer=PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Paths
data_path = "answer_retrieval"
subset_answers_path = os.path.join(data_path, "subset_answers.json")

In [4]:
# Function to load data from jsonl. this one makes everything into one dataframe. i changed it to a data frame, it should work better in theory
def load_subset_data(folder):
    file_path = os.path.join(data_path, folder, "subset_data.jsonl")
    return pd.read_json(file_path, lines=True)

# Load data for train, test, and val
train_data = load_subset_data("train")
test_data = load_subset_data("test")
val_data = load_subset_data("val")
display(train_data.head()) # - to print in a nice way the first rows of the matrix

print(f"Train examples: {len(train_data)}, Test examples: {len(test_data)}, Val examples: {len(val_data)}")

Unnamed: 0,id,text,title,timestamp,score,views,favorite,user_id,user_questions,user_answers,tags,rel_ids,rel_scores,rel_timestamps,best_answer
0,academia_100305,What are CNRS research units and how are they ...,What are CNRS research units and how are they ...,2017-12-11 16:30:20,14,2484,2.0,1106095,"[workplace_40845, workplace_40899, workplace_9...","[travel_45926, travel_46391, travel_47403, tra...","[funding, france]",[academia_100217],[1],"[1512814966, 1513014615, 1513020822]",academia_100217
1,academia_100456,Is there a free (as in freedom) alternative to...,Is there a free (as in freedom) alternative to...,2017-12-13 19:02:32,13,1117,2.0,1106095,"[workplace_40845, workplace_40899, workplace_9...","[travel_45926, travel_46391, travel_47403, tra...","[peer-review, open-access]",[academia_100462],[1],"[1513205016, 1536615064, 1553005541, 1615097827]",academia_100462
2,academia_103390,Search for StackExchange citations with Google...,Search for StackExchange citations with Google...,2018-02-06 16:40:59,2,157,1.0,1532620,"[writers_27613, writers_29562, sound_42166, so...","[skeptics_39944, philosophy_3098, philosophy_9...","[citations, google-scholar]",[academia_103391],[1],[1517936080],academia_103391
3,academia_10481,Reproducible research and corporate identity M...,Reproducible research and corporate identity,2013-06-06 09:11:05,18,372,1.0,1106095,"[academia_1698, academia_1772, academia_1911, ...","[academia_1699, academia_1700, academia_1701, ...","[copyright, creative-commons]",[academia_10499],[1],"[1370596608, 1370601095]",academia_10499
4,academia_10649,Advantages of second marking In the UK a porti...,Advantages of second marking,2013-06-17 12:24:37,6,1235,2.0,1106095,"[academia_1698, academia_1772, academia_1911, ...","[academia_1699, academia_1700, academia_1701, ...",[assessment],[academia_10650],[1],"[1371477146, 1371477156, 1371552185]",academia_10650


Train examples: 10000, Test examples: 100, Val examples: 100


**CHANGE**: I modified the following block of code to avoid opening the same file twice

In [5]:
# Load subset_answers, this time I uploaded it as a list, cause i had trouble working with it as a dictionary
def load_subset_answers() -> dict:
    with open(subset_answers_path) as file:
        data = json.load(file)
    return data

def convert_subset_answers_to_list(subset_answers: dict) -> list:
    return [(k, v) for k, v in subset_answers.items()]

#also as a dictionary cause at the end we need to be able to retrieve the answers from the keys
answers_dict: dict = load_subset_answers()
answers_list: list = convert_subset_answers_to_list(answers_dict)

print(f"Total answers loaded: {len(answers_list)}")

Total answers loaded: 9398


In [6]:
# Extract questions and answers from subset_data.jsonl
def extract_questions_and_answers(data):
    questions = data[['text']]
    answers = data[['best_answer']]
    return questions, answers

train_questions, train_answers = extract_questions_and_answers(train_data)
test_questions, test_answers = extract_questions_and_answers(test_data)
val_questions, val_answers = extract_questions_and_answers(val_data)

display(test_questions)
print(type(test_questions))


Unnamed: 0,text
0,After what George was Georgetown University na...
1,Can someone explain why Garou made Saitama do ...
2,Why did Madara want to resurrect if with the E...
3,What is the ~/Applications directory for? I wa...
4,How to make notification but no noise when tim...
...,...
95,What are the main criteria used by the Europea...
96,What happened to most of Five Star Movement (M...
97,How did Dumbledore know what Ron saw in the Mi...
98,Do the residents of Facade talk in a language ...


<class 'pandas.core.frame.DataFrame'>


In [7]:

punctuations = list(string.punctuation)
def preprocess_text(text):
    text = text.lower()
    text = "".join(char for char in text if char not in punctuations) # Removing punctuations from the text
    text = word_tokenize(text)
    text = [word for word in text if word not in stop_words]
    return text

for key, text in answers_dict.items():
    answers_dict[key] = preprocess_text(text)

In [8]:
tokenized_corpus = list(answers_dict.values())
print(f"BM25 corpus size: {len(tokenized_corpus)}")

BM25 corpus size: 9398


In [9]:
bm25 = BM25Okapi(tokenized_corpus)

In [10]:
# BM25 retrieval function
def bm25_retrieve(query, top_n=10):
    tokenized_query = preprocess_text(query)
    scores = bm25.get_scores(tokenized_query)
    top_indices = np.argsort(scores)[::-1][:top_n]
    top_ids = [list(answers_dict.keys())[idx] for idx in top_indices]
    return top_ids, scores[top_indices]

# Example retrieval
query_example = test_questions.iloc[0][0]
top_ids, top_scores = bm25_retrieve(query_example, top_n=5)

print("Query:", str(query_example))
print("Top retrieved answers:")
for ans_id in top_ids:
    print(f"Answer ID: {ans_id}, Text: {answers_dict[ans_id]}")

Query: After what George was Georgetown University named? After what George was Georgetown University named? Or was it named that because of where it is located?
Top retrieved answers:
Answer ID: academia_185179, Text: ['georgetown', 'university', 'named', 'village', 'georgetown', 'close', 'georgetown', 'predates', 'creation', 'washington', 'dc', 'significant', 'margin', 'one', 'speculate', 'like', 'many', 'places', 'east', 'coast', 'united', 'states', 'founded', 'settlers', 'britain', 'named', 'sorts', 'places', 'british', 'kings', 'queens', 'localities', 'origins', 'name', 'also', 'possible', 'course', 'linked', 'wikipedia', 'article', 'states', 'thissince', 'georgetown', 'founded', 'reign', 'george', 'ii', 'great', 'britain', 'speculate', 'town', 'named', 'another', 'theory', 'town', 'named', 'founders', 'george', 'gordon', 'george', 'beall']
Answer ID: skeptics_37836, Text: ['huff', 'post', 'article', 'repeats', 'claim', 'corrected', 'say', 'considers', 'university', 'california', 

  query_example = test_questions.iloc[0][0]


In [11]:
def evaluate_retrieval(retrieved_answers, test_answer_ids):
    precision_list, recall_list = [], []
    for retrieved, true_id in zip(retrieved_answers, test_answer_ids):
        relevance = [1 if ans_id == true_id else 0 for ans_id in retrieved]
        precision = sum(relevance) / len(relevance)
        recall = sum(relevance) / 1  # Only 1 relevant document
        precision_list.append(precision)
        recall_list.append(recall)
    return np.mean(precision_list), np.mean(recall_list)

In [12]:
# Evaluate BM25
def evaluate_bm25(test_questions, test_answer_ids, top_n=10):
    precision_list, recall_list = [], []
    for query, true_answer_id in zip(test_questions, test_answer_ids):
        # Retrieve top-k answers
        top_ids, _ = bm25_retrieve(query, top_n)

        # Check if the true answer ID is in top-k
        relevance = [1 if ans_id == true_answer_id else 0 for ans_id in top_ids]

        # Precision and Recall
        precision = sum(relevance) / len(relevance)
        recall = sum(relevance) / 1  # Only 1 relevant document

        precision_list.append(precision)
        recall_list.append(recall)

    return np.mean(precision_list), np.mean(recall_list)

In [13]:
test_questions_list = list(test_questions["text"])
best_test_answer_ids_list = list(test_answers["best_answer"])

precision, recall = evaluate_bm25(test_questions_list, best_test_answer_ids_list, top_n=3)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}")

Precision: 0.2633, Recall: 0.7900


In [24]:
# Load pretrained BERT model and tokenizer
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
# Dataset for training
class RankingDataset(Dataset):
    def __init__(self, queries, documents, labels):
        self.queries = queries
        self.documents = documents
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.queries[idx], self.documents[idx], self.labels[idx]

In [16]:
# Data preparation for reranking
def prepare_rerank_data(questions, answer_texts, answer_ids, bm25_scores):
    pairs, labels = [], []
    for query, true_id, top_ids in zip(questions, answer_ids, bm25_scores):
        for rank, (answer_id, score) in enumerate(top_ids):
            pairs.append((query, answers_dict[answer_id]))
            labels.append(1 if answer_id == true_id else 0)
    return pairs, labels


In [17]:
# bm25_scores_train = []

# for query in train_questions["text"]:
#     # Tokenize the query
#     tokenized_query = preprocess_text(query)

#     # Get BM25 scores for all answers
#     scores = bm25.get_scores(tokenized_query)

#     # Retrieve the top-N results
#     top_n = 10  # Adjust as needed
#     top_indices = np.argsort(scores)[::-1][:top_n]
#     top_answers = [(list(answers_dict.keys())[idx], scores[idx]) for idx in top_indices]

#     bm25_scores_train.append(top_answers)

# Since it takes time to compute the bm25 scores everytime for each training data, I computed it once and in a pickle file that I sent to you
with open("bm25_scores_train.pkl", "rb") as file:
    bm25_scores_train = pickle.load(file)
 

In [18]:
# Prepare train and val data
bm25_scores_train = bm25_scores_train  # Use BM25 results from part 1
pairs_train, labels_train = prepare_rerank_data(
    train_data['text'], answers_list, train_data['best_answer'], bm25_scores_train
)

In [19]:
# Tokenize pairs
def collate_fn(batch):
    queries, documents, labels = zip(*batch)
    inputs = tokenizer(list(queries), list(documents), padding=True, truncation=True, return_tensors="pt")
    labels = torch.tensor(labels, dtype=torch.float32)
    return inputs, labels

In [20]:
train_dataset = RankingDataset(*zip(*pairs_train), labels_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

In [21]:
# Fine-tune BERT
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.BCEWithLogitsLoss()



In [23]:
transformers.logging.set_verbosity_error()

In [None]:
for epoch in range(2):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        inputs, labels = batch
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(**inputs).logits.squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

## Part 3

In [39]:
# Precision, Recall, MAP, and nDCG Evaluation
def evaluate_rerank_model(model, questions, true_answers, bm25_scores):
    model.eval()
    all_labels, all_preds = [], []

    for query, true_id, top_ids in zip(questions, true_answers, bm25_scores):
        pairs = [(query, answers_dict[answer_id]) for answer_id, _ in top_ids]
        inputs = tokenizer(
            [p[0] for p in pairs], [p[1] for p in pairs],
            padding=True, truncation=True, return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            scores = model(**inputs).logits.squeeze(-1).cpu().numpy()

        all_preds.append(scores)
        all_labels.append([1 if answer_id == true_id else 0 for answer_id, _ in top_ids])

    precision, recall, f1, _ = precision_recall_fscore_support(
        [l for labels in all_labels for l in labels],
        [int(p > 0) for preds in all_preds for p in preds],
        average="binary"
    )
    map_score = sum([sum(l * p for l, p in zip(labels, sorted(preds, reverse=True))) for labels, preds in zip(all_labels, all_preds)]) / len(all_labels)
    ndcg = ndcg_score(all_labels, all_preds)

    return precision, recall, map_score, ndcg

In [56]:
bm25_scores_val = []

for query in val_questions["text"]:
    # Tokenize the query
    tokenized_query = preprocess_text(query)

    # Get BM25 scores for all answers
    scores = bm25.get_scores(tokenized_query)

    # Retrieve the top-N results
    top_n = 5
    top_indices = np.argsort(scores)[::-1][:top_n]
    top_answers = [(list(answers_dict.keys())[idx], scores[idx]) for idx in top_indices]

    bm25_scores_val.append(top_answers)


In [57]:
# Evaluate on validation set
bm25_scores_val = bm25_scores_val  # BM25 results for validation
precision, recall, map_score, ndcg = evaluate_rerank_model(
    model, list(val_data['text']), list(val_data['best_answer']), bm25_scores_val
)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, MAP: {map_score:.4f}, nDCG: {ndcg:.4f}")

Precision: 0.0000, Recall: 0.0000, MAP: -1.9950, nDCG: 0.4867


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
