# Dense Retrieval

## Imports

In [None]:
# Install the notebook's third-party dependencies; only tqdm is pinned to an
# exact version, the other packages resolve to whatever pip selects.
%pip install tqdm==4.66.5 sentence-transformers einops

In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer

from torch import Tensor
import torch
import torch.nn.functional as F

from tqdm import tqdm
import numpy as np
import json
import glob

In [None]:
# Checkpoint name passed to sentence-transformers. The resulting `model` is a
# module-level object that get_embedding() reads when encoding text.
# NOTE(review): the E5 model card asks for "query: " / "passage: " prefixes on
# every input; this notebook encodes raw text - confirm that is intentional.
MODEL_NAME="intfloat/e5-base-v2"
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Clamp the encoder's sequence length to at most 1024; a smaller configured
# limit is left as it is.
model.max_seq_length = min(model.max_seq_length, 1024)

## Data Loading

In [None]:
# Clone the ObliQA dataset and the trec_eval sources into the working
# directory, then build trec_eval with make; the final cell of this notebook
# runs the resulting trec_eval/trec_eval binary.
!git clone https://github.com/RegNLP/ObliQADataset.git
!git clone https://github.com/usnistgov/trec_eval.git
!cd trec_eval && make

In [5]:
def load_json_files_from_directory(directory_path):
    """Load every ``*.json`` file located directly inside a directory.

    Files are read in sorted path order so that the result (and everything
    derived from it, such as passage ordering) is reproducible across runs
    and filesystems. Files that are not valid JSON are reported on stdout
    and skipped.

    Args:
        directory_path: Directory to scan; sub-directories are not searched.

    Returns:
        A list of ``(file_path, parsed_json)`` tuples, one per file that
        parsed successfully. Empty when no ``*.json`` file matches.
    """
    json_data_list = []
    # glob yields matches in arbitrary, OS-dependent order; sort for determinism.
    for json_file in sorted(glob.glob(directory_path + "/*.json")):
        # Read as UTF-8 explicitly instead of relying on the platform locale.
        with open(json_file, "r", encoding="utf-8") as f:
            try:
                json_data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file {json_file}: {e}")
            else:
                json_data_list.append((json_file, json_data))
    return json_data_list

# Parse every structured regulatory document shipped with the dataset.
directory_path = "ObliQADataset/StructuredRegulatoryDocuments"
json_data_list = load_json_files_from_directory(directory_path)

# Merge the per-file record lists into one flat list of passage records.
flattened_json_data_list = []
for _, records in json_data_list:
    flattened_json_data_list.extend(records)

# Map "<DocumentID>:<PassageID>" (spaces replaced by underscores) to the
# record's ID.
dp_id_to_id = {}
for element in flattened_json_data_list:
    composite_key = f'{element["DocumentID"]}:{element["PassageID"]}'.replace(' ', '_')
    dp_id_to_id[composite_key] = element["ID"]

BATCH_SIZE = 1024

# Collect the text to embed, keyed by record ID. Passages that split into
# 10 or fewer space-separated tokens are left out.
all_sentences = {}
for file, data in tqdm(json_data_list):
    sentences = {}
    for passage in data:
        if len(passage['Passage'].split(" ")) <= 10:
            continue
        passage_key = passage['ID']
        sentences[passage_key] = f"Article: {passage['PassageID']}\n {passage['Passage']}"
    all_sentences.update(sentences)

## Initialization

In [6]:
def get_embedding(e, task=None):
    """Encode texts with the module-level ``model`` and L2-normalise them.

    Args:
        e: Collection of texts to encode.
        task: Accepted but not used by this function.

    Returns:
        A NumPy array of the encodings, normalised to unit L2 norm along
        dimension 1. An empty ``e`` is returned unchanged.
    """
    if len(e) == 0:
        return e

    raw_vectors = model.encode(list(e), batch_size=32, show_progress_bar=False)
    unit_vectors = F.normalize(torch.Tensor(raw_vectors), p=2, dim=1)
    return np.array(unit_vectors)

def compute_batch_embeddings(sentences, batch_size=32):
    """Embed ``sentences`` slice by slice and stack the results.

    Args:
        sentences: Sequence of texts; it is sliced, so it must support
            ``len`` and slicing (e.g. a list).
        batch_size: Number of texts passed to ``get_embedding`` per call.

    Returns:
        A NumPy array built by concatenating the per-slice embeddings along
        axis 0, in input order. An empty input yields an empty array of
        shape ``(0, 0)``.

    Raises:
        Exception: Whatever ``get_embedding`` raises is re-raised unchanged
            after the offending slice has been printed.
    """
    if len(sentences) == 0:
        # np.concatenate rejects an empty list and there is nothing to embed.
        return np.empty((0, 0), dtype=np.float32)

    output_embedding = []
    for start in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[start:start + batch_size]
        try:
            output_embedding.append(get_embedding(batch))
        except Exception:
            # Show the inputs that broke the encoder, then surface the real
            # error with its original type and traceback.
            print(batch)
            raise

    return np.concatenate(output_embedding)

In [None]:
# Pair each passage ID with its embedding row; both sequences come from
# all_sentences, so they line up by position.
embeddings = dict(
    zip(
        all_sentences.keys(),
        compute_batch_embeddings(list(all_sentences.values())),
    )
)

## Inference

In [11]:
# Dataset split to run retrieval on; the inference cell interpolates it into
# the question file name ObliQADataset/ObliQA_{eval_set}.json.
eval_set = "test"

In [None]:
# Number of ranked passages written to the run file for each question.
TOP_K = 200

# Read the evaluation questions for the selected split.
with open(f'ObliQADataset/ObliQA_{eval_set}.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

questions = [each_question['Question'] for each_question in data]

questions_embeddings = compute_batch_embeddings(questions)
# Stack the passage vectors into one matrix; inverse_index maps a matrix row
# back to its passage ID (both follow the iteration order of `embeddings`).
paragraph_embeddings = np.array(list(embeddings.values()))
inverse_index = dict(enumerate(embeddings))

# Open the run file only once everything needed is computed, and let the
# context manager close it even if scoring or writing fails part-way.
with open("dense_retriever.trec", "w", encoding="utf-8") as pred_rels:
    for each_embedding, each_question in tqdm(
        zip(questions_embeddings, data), total=len(data)
    ):
        # get_embedding() L2-normalises its output, so this dot product is
        # the cosine similarity between the question and every passage.
        relevant_passages = each_embedding.dot(paragraph_embeddings.T)
        # argsort is ascending: keep the last TOP_K indices and reverse them
        # so the highest-scoring passage comes first.
        top_k_passages = np.argsort(relevant_passages)[-TOP_K:][::-1]

        # One line per hit: question id, constant 0, passage id, 1-based
        # rank, score, run tag.
        for i, e in enumerate(top_k_passages):
            line = f"{each_question['QuestionID']} 0 {inverse_index[e]} {i+1} {relevant_passages[e]} alg"
            pred_rels.write(line + "\n")

In [None]:
# Score the run file with trec_eval, requesting the recall.10 and map_cut.10
# measures.
# NOTE(review): the qrels path is hard-coded to a Kaggle input location and
# does not follow `eval_set` - adjust it when running elsewhere or on another
# split.
!trec_eval/trec_eval -m recall.10 -m map_cut.10 /kaggle/input/regnlp-test-l2/gt.qrels ./dense_retriever.trec