# Project - Part 3

## Some imports and Preprocessing

In [15]:
# A few imports needed
import os
import csv
import string
import re
import random

# Setting drive
from google.colab import drive 
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/ProjP3')

# Getting the post reader
from post_parser_record import PostParserRecord
post_reader = PostParserRecord("SPosts.xml")

# Installing ranx and importing necessary libraries
!pip install -U ranx
from ranx import Qrels, Run, evaluate, compare, fuse

# Installing sentence transformer
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, util, models, evaluation

# Torch
from torch.utils.data import DataLoader

# Setting GPU card
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
!nvidia-smi -L  # Checking which GPU is mounted

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



## Step 1: Preparing Training Data

### Function to clean the text
Removes HTML Tags and removes newlines

In [16]:
# Cleans the text
def clean_string(s):
  # Removes HTML tags
  CLEANR = re.compile('<.*?>') 
  s = re.sub(CLEANR, '', s)
  # Removes newlines and adds a space so that words do not combine
  s = s.replace('\n', ' ') 
  return s

### Functions to create Triples
This training data are in the form of triples (question title, answer, label), where
the answer is the selected answer (as chosen by the question asker), and labels are 0 and 1. The negative samples are the answer
chosen randomly from other questions.

In [17]:
def get_list_of_questions_with_accepted_answer(post_parser):
    # This method takes in post_parser and returns list of question ids that has an accepted answer
    lst_question_ids = []
    for question_id in post_parser.map_questions:
        question = post_parser.map_questions[question_id]
        if question.accepted_answer_id is not None:
            lst_question_ids.append(question_id)
    return lst_question_ids


def create_triples(post_file_path):
    # This methods takes in the file path of Posts.xml file and returns a list of triplets
    # These triplets are in form of (query, text, label) where we have two labels of 0 and 1 for positive and
    # negative candidates
    training_triplets = {}
    post_parser = PostParserRecord(post_file_path)
    list_questions_with_accepted_answer = get_list_of_questions_with_accepted_answer(post_parser)

    # List filled with the question ids that's been used for evaluating systems
    except_these_question_ids = [13530,11320,430,6079,8659,7255,7028,9204,23226,1549,20274,45317,35792,17727,52888,12635,204685,91796,69252,171648]

    answer_ids = list(post_parser.map_just_answers.keys())
    random.seed = 4
    for question_id in list_questions_with_accepted_answer:
        question = post_parser.map_questions[question_id]
        query = question.title
        positive_text = post_parser.map_just_answers[question.accepted_answer_id].body
        
        # Removing html tags
        positive_text = clean_string(positive_text)
        
        lst_of_answer_ids_to_question = []
        for answer in question.answers:
            lst_of_answer_ids_to_question.append(answer.post_id)
        random.shuffle(answer_ids)
        for answer_id in answer_ids:
            if answer_id in lst_of_answer_ids_to_question:
                continue
            else:
                negative_text = clean_string(post_parser.map_just_answers[answer_id].body)
                break
        if 0 in training_triplets:
            training_triplets[0].append([query, negative_text, 0])
        else:
            training_triplets[0] = [[query, negative_text, 0]]
        if 1 in training_triplets:
            training_triplets[1].append([query, positive_text, 1])
        else:
            training_triplets[1] = [[query, positive_text, 1]]
    
    return training_triplets

### Actually making the triples

In [None]:
triplets = create_triples("SPosts.xml")

## Step 2: Using a Pre-trained model

### Functions used for pre-trained model retrieval:
#### read_corpus, get_query_text, retrieval

In [None]:
# Functions used for the pre-trained model training/retrieval of 20 queries

def read_corpus(post_file_path):
    # Takes in Posts.xml file and returns dictionary of {answer id: answer body}
    # This is the collection you will search over
    post_parser = PostParserRecord(post_file_path)
    dict_answer_id_answer = {}
    for answer_id in post_parser.map_just_answers:
        dict_answer_id_answer[answer_id] = post_parser.map_just_answers[answer_id].body
    return dict_answer_id_answer

# Helper to get query text
def get_query_text(query_id):
    question = post_reader.map_questions[query_id]
    query_text = clean_string(question.body)
    return query_text

# Retrieval method for any model 
def retrieval(model_name, post_file_path):
    final_result = {}
    print("model loaded")
    model = SentenceTransformer(model_name)
    model.max_seq_length = 512
    "This is an important part"
    candidates = read_corpus(post_file_path)
    print("corpus read")
    corpus_embeddings = model.encode(list(candidates.values()), convert_to_tensor=True)
    print("corpus encoded")

    # Build queries dict of my 20 queries
    queries = ({1 : get_query_text(13530),
                2 : get_query_text(11320),
                3 : get_query_text(430),
                4 : get_query_text(6079),
                5 : get_query_text(8659),
                6 : get_query_text(7255),
                7 : get_query_text(7028),
                8 : get_query_text(9204),
                9 : get_query_text(23226),
                10 : get_query_text(1549),
                11 : get_query_text(20274),
                12 : get_query_text(45317),
                13 : get_query_text(35792),
                14 : get_query_text(17727),
                15 : get_query_text(52888),
                16 : get_query_text(12635),
                17 : get_query_text(204685),
                18 : get_query_text(91796),
                19 : get_query_text(69252),
                20 : get_query_text(171648),
                })
    
    for topic_id in queries:
        temp_dic = {}
        query = queries[topic_id]
        query_embedding = model.encode(query, convert_to_tensor=True)
        # semantic search for each query embedding
        hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=1000)
        hits = hits[0]  # Get the hits for the first query
        for hit in hits:
            index = hit['corpus_id']
            answer_id = list(candidates.keys())[index]
            score = hit['score']
            temp_dic[answer_id] = score
        final_result[topic_id] = temp_dic
    return final_result

Uses the bi-encoding architecture of the 'all-MiniLM-L12-v2' pretrained SBERT model to retrieve the top 1000 results for each of the 20 queries that are defined in the retrieval method and then write them to a csv

In [None]:
# Defining model 
model_name = 'all-MiniLM-L12-v2'
# Doing the retrieval
retrieval_results = retrieval(model_name, 'SPosts.xml')
# Path to save results
result_file_path = "sbert_result.tsv"
# Name for the run 
run_name = "sbrt"

# Saving the top-1000 results
with open(result_file_path, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for topic_id in retrieval_results:
        result_map = retrieval_results[topic_id]
        result_map = dict(sorted(result_map.items(), key=lambda item: item[1], reverse=True))
        rank = 1
        for post_id in result_map:
            score = result_map[post_id]
            csv_writer.writerow([topic_id, "Q0",  post_id, str(rank), str(score), run_name])
            rank += 1
            if rank > 1000:
                break

In [7]:
retrieval_results

{1: {14084: 0.784925639629364,
  144227: 0.6167720556259155,
  36935: 0.5733959674835205,
  185939: 0.560631275177002,
  27280: 0.5584015250205994,
  104326: 0.4886060059070587,
  113529: 0.47456493973731995,
  50861: 0.4669647812843323,
  99098: 0.4487757682800293,
  231433: 0.44824668765068054,
  136083: 0.4464132785797119,
  48626: 0.4406738877296448,
  12481: 0.4396369159221649,
  116247: 0.4374510943889618,
  72846: 0.4368029534816742,
  59667: 0.433768630027771,
  120077: 0.432645708322525,
  5204: 0.43083932995796204,
  31076: 0.43070730566978455,
  128390: 0.43038058280944824,
  134846: 0.42992815375328064,
  107301: 0.4283308684825897,
  175178: 0.42508164048194885,
  6073: 0.42506080865859985,
  2086: 0.42464199662208557,
  107305: 0.4246067404747009,
  75567: 0.42374187707901,
  83460: 0.4231877326965332,
  36188: 0.42154666781425476,
  108496: 0.4204306900501251,
  100574: 0.4199388027191162,
  238872: 0.41983041167259216,
  109669: 0.4195180833339691,
  64935: 0.4165632128

## Step 3: Using a fine-tuned model

### Functions used for fine-tuned model training

In [None]:
def split_data(data):
    # takes in list as the input data and return split of data (splitting into 10 pieces)
    length = int(len(data) / 10)  # length of each fold
    pieces = []
    for i in range(9):
        pieces.append(data[i * length: (i + 1) * length])
    pieces.append(data[9 * length:len(data)])
    return pieces


def fine_tuning(post_xml_file_path):
    # Pre-trained model that is used
    model = SentenceTransformer('all-MiniLM-L12-v2_finetuned15')     # !!!! Params are updated to use the right model for additional epoch training

    # The name of model to save after fine-tuning
    MODEL = "all-MiniLM-L12-v2_finetuned20"                # !!!! Params are updated to save the right model after tuning (to be used again)

    # This fine-tuning uses two loss functions and for each we consider one list to save the triplets
    train_samples_MultipleNegativesRankingLoss = []
    train_samples_ConstrativeLoss = []

    # parallel lists used for defining the validation set
    evaluator_samples_1 =[]
    evaluator_samples_2 = []
    evaluator_samples_score = []

    # Parameters for fine-tuning
    num_epochs = 5
    margin = 0.5
    train_batch_size = 64
    model.max_seq_length = 256
    distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE

    # Samples for fine-tuning
    dic_instances = triplets

    #################################
    # Preparing the data for Sentence-BERT in the required format
    for label in dic_instances:
        lst_instances = dic_instances[label]
        # Randomly choosing the validation split
        random.shuffle(lst_instances)
        splits = split_data(lst_instances)
        # The last split is used as validation set
        validation = splits[-1]
        training = splits[:-1]

        # Preparing the training set
        for split in training:
            for triples in split:
                query = triples[0]
                candidate = triples[1]
                label = triples[2]
                train_samples_ConstrativeLoss.append(InputExample(texts=[query, candidate], label=label))
                train_samples_MultipleNegativesRankingLoss.append(
                        InputExample(texts=[query, candidate], label=label))
        # Preparing the validation set
        for triples in validation:
            evaluator_samples_1.append(triples[0])
            evaluator_samples_2.append(triples[1])
            evaluator_samples_score.append(triples[2])

    # Create data loader and loss for MultipleNegativesRankingLoss
    train_dataset_MultipleNegativesRankingLoss = SentencesDataset(train_samples_MultipleNegativesRankingLoss,
                                                                  model=model)
    train_dataloader_MultipleNegativesRankingLoss = DataLoader(train_dataset_MultipleNegativesRankingLoss, shuffle=True,
                                                               batch_size=train_batch_size)
    train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(model)

    # Create data loader and loss for OnlineContrastiveLoss
    train_dataset_ConstrativeLoss = SentencesDataset(train_samples_ConstrativeLoss, model=model)
    train_dataloader_ConstrativeLoss = DataLoader(train_dataset_ConstrativeLoss, shuffle=True,
                                                  batch_size=train_batch_size)
    train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric,
                                                              margin=margin)

    # Setting up the validation set
    evaluator = evaluation.EmbeddingSimilarityEvaluator(evaluator_samples_1, evaluator_samples_2, evaluator_samples_score, write_csv="evaluation_epoch.csv")

    # Fine-tune the model
    model.fit(
        train_objectives=[(train_dataloader_MultipleNegativesRankingLoss, train_loss_MultipleNegativesRankingLoss),
                          (train_dataloader_ConstrativeLoss, train_loss_ConstrativeLoss)],
        evaluator=evaluator,
        epochs=num_epochs,
        warmup_steps=1000,
        output_path=MODEL,
        show_progress_bar=True,
        callback=callback
        )


def callback(score, epoch, steps):
    # This method is used to write the loss for each epoch on the validation set
    csv_writer_Epochs.writerow([score, epoch, steps])

Uses the same pre-trained model as before: 'all-MiniLM-L12-v2' to be fine-tuned using the training data that was created in step 1.

The tuning parameters that are used are:

num epochs = 5

margin = 0.5

train batch size = 64

max seq length = 256

distance metric = Cosine Distance

In [None]:
# Method to do the fine-tuning and then save the epoch info into a tsv
def fine_tune_and_save_epochs (post_file):
  epoch_csv_file = "epochs.tsv"
  post_xml_file = post_file
  with open(epoch_csv_file, mode='w', newline='') as csv_file:
      csv_writer_Epochs = csv.writer(csv_file, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
      fine_tuning(post_xml_file)
    

In [None]:
fine_tune_save_epochs("SPosts.xml")

In [None]:
# Method to do the retrieval of the fine-tune model and save top 1000 results in a csv
def fine_tuned_retrieval (model_name):
  # Setting up model and paths
  model_name = model_name
  retrieval_results = retrieval(model_name, 'SPosts.xml')
  result_file_path = "finetunedsbert_result2.tsv"
  run_name = "ftsbrt"
  
  # Saving the top-1000 results
  with open(result_file_path, mode='w', newline='') as csv_file:
      csv_writer = csv.writer(csv_file, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
      for topic_id in retrieval_results:
          result_map = retrieval_results[topic_id]
          result_map = dict(sorted(result_map.items(), key=lambda item: item[1], reverse=True))
          rank = 1
          for post_id in result_map:
              score = result_map[post_id]
              csv_writer.writerow([topic_id, "Q0",  post_id, str(rank), str(score), run_name])
              rank += 1
              if rank > 1000:
                  break

In [8]:
fine_tuned_retrieval("all-MiniLM-L12-v2_finetuned20")

model loaded
corpus read
corpus encoded


## Step 4: Evaluation and Comparison

### Precision @ 5 and NDCG @ 5 for the models

In [8]:
# The qrel file that will be used for evaluation
qrel = Qrels.from_file("qrel.tsv", kind="trec")

In [9]:
# First run is for the pre-trained model
run1 = Run.from_file("sbert_result.tsv", kind="trec")
evaluate(qrel, run1, ["precision@5", "ndcg@5"])

{'precision@5': 0.6, 'ndcg@5': 0.6968200092529474}

In [10]:
# Second run is for the fine-tuned model
run2 = Run.from_file("finetunedsbert_result2.tsv", kind="trec")
evaluate(qrel, run2, ["precision@5", "ndcg@5"])

{'precision@5': 0.7000000000000001, 'ndcg@5': 0.7430542999315266}

### Per query precision/nDCG for the models

In [11]:
# Pre-trained
evaluate(qrel, run1, ["precision@5", "ndcg@5"], return_mean = False)

{'precision@5': array([1. , 0.6, 0.8, 0.6, 0.4, 0.6, 0.6, 0.8, 0.6, 0.2, 0.8, 0.6, 0.8,
        1. , 0.4, 0.4, 0.2, 0.4, 0.4, 0.8]),
 'ndcg@5': array([1.        , 0.63793652, 0.69178318, 0.94690243, 0.86708701,
        0.71483595, 0.72696463, 0.68758352, 0.68435155, 0.63092975,
        0.85393165, 0.77346804, 0.68753997, 0.92696583, 0.64217558,
        0.30078518, 0.56154438, 0.27727343, 0.49392169, 0.8304199 ])}

In [12]:
# Fine-tuned
evaluate(qrel, run2, ["precision@5", "ndcg@5"], return_mean = False)

{'precision@5': array([1. , 0.8, 0.8, 0.6, 0.4, 0.6, 1. , 0.8, 0.4, 0.2, 1. , 0.8, 0.6,
        1. , 0.4, 1. , 0.4, 0.6, 0.8, 0.8]),
 'ndcg@5': array([0.91520995, 0.71297956, 0.70353906, 0.71226307, 0.760622  ,
        0.68753997, 0.90156296, 0.69974013, 0.55314647, 1.        ,
        0.92696583, 0.85162983, 0.59412316, 0.93439746, 0.64217558,
        0.91520995, 0.70193047, 0.36437885, 0.5632605 , 0.7204112 ])}

### Significance Test

In [14]:
report = compare(
    qrels = qrel,
    runs = [run1,run2],
    metrics=["precision@5", "ndcg@5", "map@100"],
    max_p=0.01,  # P-value threshold
    stat_test="fisher",
    rounding_digits=3,
)
print(report)

#    Model      P@5    NDCG@5    MAP@100
---  -------  -----  --------  ---------
a    sbrt       0.6     0.697      0.575
b    ftsbrt     0.7     0.743      0.593


### Example Query:

In [None]:
# Pre-trained
pretrained_result = retrieval('all-MiniLM-L12-v2_finetuned20', 'SPosts.xml')

In [None]:
# Fine-tuned
finetuned_result = retrieval('all-MiniLM-L12-v2_finetuned20', 'SPosts.xml')