In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import IterableDataset
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, evaluation, losses, InputExample
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import logging
import os
import glob
import torch
import json

In [None]:
# Send INFO-level (and above) log records through the LoggingHandler imported
# from sentence_transformers. The message format has no %(asctime)s field, so
# the datefmt value is accepted but does not show up in the emitted lines.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format='- %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

class TripletsDataset(IterableDataset):
    """Stream (query, positive, negative) text triplets as ``InputExample`` objects.

    Args:
        model: Stored on the instance as ``self.model``; no method of this
            class reads it.
        corpus: Mapping from string ids to texts. Ids taken from the triplets
            are converted with ``str()`` before the lookup.
        train_triplets: Sized iterable of 3-item sequences
            ``(query_id, positive_id, negative_id)``.
    """

    def __init__(self, model, corpus, train_triplets):
        self.model = model
        self.corpus = corpus
        self.train_triplets = train_triplets

    def __len__(self):
        """Return the number of triplets."""
        return len(self.train_triplets)

    def __iter__(self):
        """Yield one ``InputExample`` per triplet, in ``train_triplets`` order."""
        for qid, pos_id, neg_id in self.train_triplets:
            # Look the three texts up in query, positive, negative order.
            texts = [self.corpus[str(doc_id)] for doc_id in (qid, pos_id, neg_id)]
            yield InputExample(texts=texts)
    
def get_triplets(Passage_dict):
    """Expand a passage mapping into a flat list of ``[key, x, y]`` triplets.

    Args:
        Passage_dict: Mapping whose values are indexable; ``value[0]`` and
            ``value[1]`` are iterables. Per the original inline note, the
            roles of a triplet are (query, same_api, diff_api).

    Returns:
        A list holding one ``[key, x, y]`` entry for every pairing of an item
        ``x`` from ``value[0]`` with an item ``y`` from ``value[1]``. Keys are
        visited in mapping order and ``y`` varies fastest.
    """
    return [
        [query_id, same_api, diff_api]
        for query_id, groups in Passage_dict.items()
        for same_api in groups[0]
        for diff_api in groups[1]
    ]



In [None]:
# Hyperparameters and paths used by the cells below.
data_folder = 'generated5'             # sub-folder of ./data/ that the JSON files are read from
model_save_path = 'models/bienc-exp7'  # fit() output path; also loaded when `pretrained` is True
pretrained = False                     # True: load model_save_path, False: build from model_name
model_name = 'distilroberta-base'      # transformer name handed to models.Transformer
batch_size = 256                       # batch size of the training DataLoader

In [None]:
# Load the corpus / passage mappings, the question table and the training split.
# Every JSON file is opened explicitly as UTF-8 so that decoding does not
# depend on the platform's default locale encoding.
with open(f'./data/{data_folder}/Corpus_dict.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)

with open(f'./data/{data_folder}/Passage_dict.json', 'r', encoding='utf-8') as f:
    passage = json.load(f)

# Question table, re-indexed by its 'index' column so rows can be fetched by id.
df = pd.read_json(f'data/{data_folder}/dataset.json')
df = df.set_index('index')

# training data
with open(f'./data/{data_folder}/train_queries.json', 'r', encoding='utf-8') as f:
    train_queries_idx = json.load(f)

with open(f'./data/{data_folder}/train_passage.json', 'r', encoding='utf-8') as f:
    train_passage = json.load(f)

with open(f'./data/{data_folder}/train_corpus.json', 'r', encoding='utf-8') as f:
    train_corpus = json.load(f)

# One [query, same_api, diff_api] triplet per positive/negative pairing.
train_triplets = get_triplets(train_passage)

In [None]:
# evaluation data
# JSON files are opened explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open(f'./data/{data_folder}/evaluate_queries.json', 'r', encoding='utf-8') as f:
    val_queries_idx = json.load(f)

with open(f'./data/{data_folder}/evaluate_rel_doc.json', 'r', encoding='utf-8') as f:
    val_rel_doc_raw = json.load(f)

# Membership is tested once per passage key, so use a set instead of scanning
# the loaded container each time.
# NOTE(review): assumes val_queries_idx holds hashable ids (ints) — confirm.
val_query_id_set = set(val_queries_idx)

# query id (str) -> question title, restricted to the validation queries
val_queries = {
    str(k): df.loc[int(k)]['Question Title']
    for k in passage
    if int(k) in val_query_id_set
}

# document id (str) -> question title for every relevant document
val_corpus = {}
for rel_docs in val_rel_doc_raw.values():
    # rel_docs is a single element list
    for rel_doc in rel_docs[0]:
        # Check with the same string key that is stored; the previous lookup
        # used the raw id, which never matches the stored key when ids are not
        # strings, and also treated an empty title as "missing".
        doc_id = str(rel_doc)
        if doc_id not in val_corpus:
            val_corpus[doc_id] = df.loc[int(rel_doc)]['Question Title']

# query id -> set of relevant document ids (as strings)
val_rel_doc = {
    query: {str(r) for r in rel_doc[0]}
    for query, rel_doc in val_rel_doc_raw.items()
}

In [None]:
# Either assemble a fresh SentenceTransformer from a transformer encoder plus a
# pooling layer, or reload the one previously saved under model_save_path.
if not pretrained:
    word_embedding_model = models.Transformer(model_name, max_seq_length=350)
    embedding_dim = word_embedding_model.get_word_embedding_dimension()
    pooling_model = models.Pooling(embedding_dim)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
else:
    model = SentenceTransformer(model_save_path)

In [None]:
# Retrieval evaluator built from the validation queries, corpus and relevance sets.
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    val_queries,
    val_corpus,
    val_rel_doc,
    name='distilroberta-train_eval',
)

# Triplet loss bound to the model being trained.
train_loss = losses.TripletLoss(model=model)

# TripletsDataset is an IterableDataset, so the loader receives examples in the
# order get_triplets produced them; shuffle stays off.
train_dataset = TripletsDataset(model=model, corpus=train_corpus, train_triplets=train_triplets)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Score of the model on the retrieval evaluator before any fine-tuning in this run.
model.evaluate(ir_evaluator)

In [None]:
# Fine-tune with the triplet objective. The evaluator runs every 750 steps and,
# with save_best_model=True, the model is written to model_save_path.
#
# Warm-up covers 10% of all training steps. It is derived from the same
# num_epochs value that is passed to fit(); the formula used to hard-code
# 5 epochs while fit() trained for 12, so the two had drifted apart.
num_epochs = 12
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=ir_evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    evaluation_steps=750,
    save_best_model=True,
    use_amp=True
)

In [None]:
# Evaluator score of the in-memory model after training.
results = model.evaluate(ir_evaluator)

In [None]:
# Drop the notebook-level reference to the model and ask PyTorch to release its
# cached CUDA memory. After this cell `model` is unbound, so any later cell that
# needs a model has to load one again.
del model
torch.cuda.empty_cache()

In [None]:
# Evaluate every saved checkpoint matching ./models/bi* with the shared evaluator.
model_paths = sorted(glob.glob('./models/bi*'))
for model_path in model_paths:
    model = SentenceTransformer(model_path)
    # The last path component is used as the experiment label.
    experiment = os.path.split(model_path)[1]
    # NOTE(review): the name suggests MAP@100 — confirm against the evaluator's main score.
    map100 = model.evaluate(ir_evaluator)
    print(f'Experiment {experiment}: {map100:.3f}')
    # Free this checkpoint before the next one is loaded.
    del model
    torch.cuda.empty_cache()

In [None]:
# Load the validation passages and corpus and expand them into triplets.
# Files are opened explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open(f'./data/{data_folder}/val_passage.json', 'r', encoding='utf-8') as f:
    val_passage = json.load(f)

# NOTE(review): this rebinds `val_corpus`, which earlier held the dict passed to
# the InformationRetrievalEvaluator. Re-running the evaluator-construction cell
# after this one would pick up this file's contents instead.
with open(f'./data/{data_folder}/val_corpus.json', 'r', encoding='utf-8') as f:
    val_corpus = json.load(f)

val_triplets = get_triplets(val_passage)

In [None]:
# Resolve each validation triplet (query, positive, negative) to question titles.
# The title column is selected once, outside the loop, and all three ids are
# cast to int before the lookup (previously only the query id was cast, so
# string-valued positive/negative ids would not have matched the index).
titles = df['Question Title']

queries = []
poss = []
negs = []
for query_id, pos_id, neg_id in tqdm(val_triplets):
    queries.append(titles.loc[int(query_id)])
    poss.append(titles.loc[int(pos_id)])
    negs.append(titles.loc[int(neg_id)])

In [None]:
# `model` has been deleted by the clean-up and checkpoint-comparison cells
# above, so encoding would fail with NameError on a top-to-bottom run.
# Reload the model that fit() wrote to model_save_path before encoding.
model = SentenceTransformer(model_save_path)

# Embed the query, positive and negative titles in three passes.
q_embs = model.encode(queries, batch_size=1024, show_progress_bar=True)
p_embs = model.encode(poss, batch_size=1024, show_progress_bar=True)
n_embs = model.encode(negs, batch_size=1024, show_progress_bar=True)

In [None]:
# Cosine similarity of each query embedding to its positive and to its negative,
# followed by the mean / standard deviation of both groups.
positives, negatives = [], []
triplet_embs = zip(q_embs, p_embs, n_embs)
for q_emb, p_emb, n_emb in tqdm(triplet_embs, total=len(q_embs)):
    # Element 0 of the array returned for this single pair is kept.
    positives.append(util.pytorch_cos_sim(q_emb, p_emb).numpy()[0])
    negatives.append(util.pytorch_cos_sim(q_emb, n_emb).numpy()[0])

print(f'Pos mean: {np.mean(positives):.3f} Neg mean: {np.mean(negatives):.3f}')
print(f'Pos std: {np.std(positives):.3f} Neg std: {np.std(negatives):.3f}')