# L2 ReRanker Retrieval

## Imports

In [1]:
%pip install --upgrade tqdm==4.66.5 blingfire einops accelerate>=0.26.0 datasets transformers[torch] sentence-transformers

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

from tqdm import tqdm
import json
import glob

import pandas as pd
import numpy as np

In [2]:
# Json Encoder with numpy support
import numpy as np

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy arrays and NumPy scalars.

    Arrays become (nested) lists; NumPy integer, floating and boolean
    scalars become the matching built-in Python type. Anything else is
    delegated to ``json.JSONEncoder.default``, which raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        Raises:
            TypeError: if ``obj`` is not a supported NumPy type (raised by
                the base class).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Cover every scalar width (float16/32/64, int8..int64, uint*, bool_),
        # not just float32.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

## Data Loading

In [None]:
# Fetch the ObliQA dataset repository; the cells below read files from the
# resulting ObliQADataset/ directory.
!git clone https://github.com/RegNLP/ObliQADataset.git
# Fetch the trec_eval sources and build them with make; the last cell of the
# notebook runs the trec_eval/trec_eval executable.
!git clone https://github.com/usnistgov/trec_eval.git
!cd trec_eval && make

In [4]:
def load_json_files_from_directory(directory_path):
    """Load every ``*.json`` file directly inside ``directory_path``.

    Files are visited in sorted path order so the result is deterministic
    (``glob`` alone returns them in filesystem order). Files are decoded as
    UTF-8 rather than with the platform's locale encoding.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A list of ``(file_path, parsed_json)`` tuples, one per file that
        parsed successfully. Files containing invalid JSON are reported on
        stdout and skipped.
    """
    json_data_list = []
    for json_file in sorted(glob.glob(directory_path + "/*.json")):
        with open(json_file, 'r', encoding='utf-8') as f:
            try:
                json_data = json.load(f)
            except json.JSONDecodeError as e:
                # Best effort: report the broken file and keep loading the rest.
                print(f"Error decoding JSON in file {json_file}: {e}")
                continue
        json_data_list.append((json_file, json_data))
    return json_data_list

# Directory holding the structured regulatory documents, one JSON file each.
directory_path = "ObliQADataset/StructuredRegulatoryDocuments"
json_data_list = load_json_files_from_directory(directory_path)

# Collapse the per-file lists into a single flat list of passage records.
flattened_json_data_list = [
    passage
    for _path, file_passages in json_data_list
    for passage in file_passages
]

# Look-up from "<DocumentID>:<PassageID>" (spaces turned into underscores)
# to the passage's "ID" field.
dp_id_to_id = {
    f'{passage["DocumentID"]}:{passage["PassageID"]}'.replace(' ', '_'): passage["ID"]
    for passage in flattened_json_data_list
}

In [None]:
BATCH_SIZE = 1024

# Passage ID -> passage text, accumulated over every loaded document file.
all_sentences = {}

for file, data in tqdm(json_data_list):
    sentences = {passage['ID']: passage['Passage'] for passage in data}
    all_sentences.update(sentences)

In [6]:
# Name of the split to evaluate; it is interpolated into the question file
# name ObliQADataset/ObliQA_{eval_set}.json when the questions are loaded.
eval_set = "test"
# Space-separated run file that is re-ranked below.
# NOTE(review): absolute Kaggle input path -- adjust when running elsewhere.
rrf_inference_trec_file_path = "/kaggle/input/regnlp-test-l2/rrf.trec"

In [None]:
# QuestionID -> question text for the selected evaluation split.
questions = {}
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding.
with open(f"ObliQADataset/ObliQA_{eval_set}.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

# The file handle is no longer needed once the JSON is parsed, so the loop
# runs outside the `with` block.
for each_question in tqdm(data):
    questions[each_question['QuestionID']] = each_question['Question']

In [8]:
# The run file has no header row and is space-separated; name its six columns.
df = pd.read_csv(
    rrf_inference_trec_file_path,
    sep=" ",
    names=["qid", "_", "pid", "ind", "score", "alg"],
)

# Resolve the IDs of every candidate row to the actual texts.
df['passage'] = df['pid'].apply(lambda passage_id: all_sentences[passage_id])
df['question'] = df['qid'].apply(lambda question_id: questions[question_id])

# Each distinct (question, passage) text pair only needs to be scored once.
unique_pairs = df[['question', 'passage']].drop_duplicates()
dtest = unique_pairs.to_numpy()

## Initialization

In [12]:
# Tokenizer and model are loaded from the same cross-encoder checkpoint.
checkpoint_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)

# Move the model to the GPU and put it in evaluation mode.
model = model.to('cuda')
model = model.eval()

## Inference

In [None]:
# Score every unique (question, passage) pair in `dtest` with the cross-encoder.
#
# `scores` ends up with exactly one entry per row of `dtest`, in the same
# order; each entry is that pair's logits vector. Keeping it per pair rather
# than per batch means it stays aligned with `dtest` for any `batch_size`.
scores = []
batch_size = 1  # can be raised: inputs are padded/truncated per batch below
device = model.device  # follow wherever the model lives instead of hard-coding it
with torch.no_grad():
    for start in tqdm(range(0, len(dtest), batch_size)):
        batch = dtest[start:start + batch_size]
        batch_questions = [pair[0] for pair in batch]
        batch_passages = [pair[1] for pair in batch]

        # Tokenise once: pad to the longest pair in the batch and cap at 512
        # tokens, instead of tokenising and then re-tokenising long inputs.
        tokens = tokenizer(
            batch_questions,
            batch_passages,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt',
        ).to(device)

        batch_logits = model(**tokens).logits.cpu().numpy()
        # Iterating a 2-D array yields its rows, so this appends one logits
        # vector per pair.
        scores.extend(batch_logits)

## TREC File Generation

In [None]:
# Reduce the model outputs to one scalar per row of `dtest`.
# `scores` may hold per-batch arrays of shape (batch, num_labels) or per-pair
# vectors of shape (num_labels,); `atleast_2d` normalises both to 2-D so that
# stacking gives one row per pair. The first logit is used as the score.
if len(scores) > 0:
    pair_scores = np.concatenate([np.atleast_2d(chunk) for chunk in scores])[:, 0]
else:
    pair_scores = np.empty(0, dtype=np.float32)

# Fail loudly instead of letting zip() silently drop unmatched pairs.
if len(pair_scores) != len(dtest):
    raise ValueError(
        f"Got {len(pair_scores)} scores for {len(dtest)} (question, passage) pairs"
    )

# "question:passage" -> plain Python float, so the score is written to the
# run file as a number rather than as the repr of a 1-element array.
final_scores = {
    ":".join(pair): float(score)
    for pair, score in zip(dtest, pair_scores)
}

df['l2_score'] = [
    final_scores[question + ":" + passage]
    for question, passage in zip(df['question'], df['passage'])
]

# Write the re-scored run in the same column order as the input run file,
# keeping a single row per (qid, pid).
df[['qid', '_', 'pid', 'ind', 'l2_score', 'alg']].drop_duplicates(subset=["qid", "pid"]).to_csv("l2.trec", header=False, index=False, sep=" ")

In [None]:
# Evaluate the re-ranked run (./l2.trec) against the ground-truth qrels with
# the trec_eval binary built earlier, requesting the recall.10 and map_cut.10
# measures.
!trec_eval/trec_eval -m recall.10 -m map_cut.10 /kaggle/input/regnlp-test-l2/gt.qrels ./l2.trec