In [1]:
import json
import pandas as pd # type: ignore
import random

In [2]:
!wget http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json
# !wget http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json
# !wget http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json

!mkdir hotpot_qa
!mv hotpot_dev_distractor_v1.json hotpot_qa/
# !mv hotpot_dev_fullwiki_v1.json hotpot_qa/
# !mv hotpot_train_v1.1.json hotpot_qa/

--2024-09-12 17:50:53--  http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json
Resolving curtis.ml.cmu.edu (curtis.ml.cmu.edu)... 128.2.204.193
Connecting to curtis.ml.cmu.edu (curtis.ml.cmu.edu)|128.2.204.193|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46320117 (44M) [application/json]
Saving to: 'hotpot_dev_distractor_v1.json'


2024-09-12 17:50:56 (18.1 MB/s) - 'hotpot_dev_distractor_v1.json' saved [46320117/46320117]



In [3]:
# Load the HotpotQA dev (distractor) examples downloaded above.
# JSON is UTF-8 by specification; the encoding is pinned so the read does not depend
# on the platform's locale default.
path = "hotpot_qa/hotpot_dev_distractor_v1.json"
with open(path, encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Map every question to the set of document headings named in its supporting facts
# (the first field of each supporting-fact entry), then tabulate the mapping.
# A question that occurs more than once keeps the headings of its last occurrence.
q_gd = {
    example['question']: {fact[0] for fact in example['supporting_facts']}
    for example in data
}

q_gd_df = pd.DataFrame(q_gd.items(), columns=['question', 'pos_doc_heads'])

In [5]:
# Preview the first rows of the question -> positive-heading table.
q_gd_df.head()

Unnamed: 0,question,pos_doc_heads
0,Were Scott Derrickson and Ed Wood of the same ...,"{Scott Derrickson, Ed Wood}"
1,What government position was held by the woman...,"{Shirley Temple, Kiss and Tell (1945 film)}"
2,"What science fantasy young adult series, told ...","{The Hork-Bajir Chronicles, Animorphs}"
3,Are the Laleli Mosque and Esma Sultan Mansion ...,"{Laleli Mosque, Esma Sultan Mansion}"
4,"The director of the romantic comedy ""Big Stone...","{Adriana Trigiani, Big Stone Gap (film)}"


In [6]:
# Build a heading -> text lookup over every context document of every example.
# Each context entry is a (heading, sentences) pair; the sentences are concatenated
# without a separator. A heading that appears in several examples keeps the text
# from its last occurrence.
mp = {}
for example in data:
    for heading, sentences in example['context']:
        mp[heading] = ''.join(sentences)

In [7]:
# One row per unique document: its heading and its concatenated text.
doc_records = list(mp.items())
doc_df = pd.DataFrame(doc_records, columns=['heading', 'context'])
doc_df.head()

Unnamed: 0,heading,context
0,Ed Wood (film),Ed Wood is a 1994 American biographical period...
1,Scott Derrickson,"Scott Derrickson (born July 16, 1966) is an Am..."
2,"Woodson, Arkansas",Woodson is a census-designated place (CDP) in ...
3,Tyler Bates,"Tyler Bates (born June 5, 1965) is an American..."
4,Ed Wood,"Edward Davis Wood Jr. (October 10, 1924 – Dece..."


In [8]:
# Add a column `pos_queries` to doc_df: the list of questions for which the document
# is a positive (supporting) document.
# An inverted index (heading -> questions) is built in one pass over q_gd_df, so the
# question table is not re-scanned for every document heading.
heading_to_questions = {}
for question, pos_heads in zip(q_gd_df['question'], q_gd_df['pos_doc_heads']):
    for head in pos_heads:
        heading_to_questions.setdefault(head, []).append(question)

# Every row gets its own list (a copy), in q_gd_df row order; documents that support
# no question get an empty list.
doc_df['pos_queries'] = doc_df['heading'].apply(
    lambda heading: list(heading_to_questions.get(heading, ()))
)

In [9]:
# Display doc_df including the pos_queries column (pandas elides the middle rows when rendering).
doc_df


Unnamed: 0,heading,context,pos_queries
0,Ed Wood (film),Ed Wood is a 1994 American biographical period...,[]
1,Scott Derrickson,"Scott Derrickson (born July 16, 1966) is an Am...",[Were Scott Derrickson and Ed Wood of the same...
2,"Woodson, Arkansas",Woodson is a census-designated place (CDP) in ...,[]
3,Tyler Bates,"Tyler Bates (born June 5, 1965) is an American...",[]
4,Ed Wood,"Edward Davis Wood Jr. (October 10, 1924 – Dece...",[Were Scott Derrickson and Ed Wood of the same...
...,...,...,...
66576,Analog Devices,"Analog Devices, Inc., also known as ADI or Ana...",[Blackfin is a family of processors developed ...
66577,Zet (hardware),Zet is a clone x86 processor where its machine...,[]
66578,Xetal,Xetal is the name of a family of non commercia...,[]
66579,XAP processor,The XAP processor is a RISC processor architec...,[]


In [10]:
# Build one evaluation example per question, as a dict with the keys
#   'query'   - the question text
#   'pos_doc' - texts of the question's positive (supporting) documents
#   'neg_doc' - texts of randomly sampled documents that are not positives
# With x positive documents, 20 - x negatives are drawn so each example has 20
# candidates; if x exceeds 20 no negatives are added (the count is clamped at 0
# instead of turning into a negative slice bound).
NUM_CANDIDATES = 20
NEG_DOC_SEED = 0
# A dedicated generator makes the draw repeatable without touching the global
# `random` state used elsewhere in the notebook.
neg_doc_rng = random.Random(NEG_DOC_SEED)

# Loop-invariant: the pool of all document headings (unique, since they are dict keys of mp).
all_headings = doc_df['heading'].tolist()

test_data = []
for qu, pdh in q_gd_df.itertuples(index=False, name=None):
    pdh = sorted(pdh)  # fixed order, independent of set iteration order
    pos_set = set(pdh)
    pdh_context = [mp[x] for x in pdh]

    n_neg = max(0, NUM_CANDIDATES - len(pdh))
    if n_neg:
        # Draw n_neg + len(positives) distinct headings: even if every positive is
        # drawn, at least n_neg non-positives remain after filtering. The survivors
        # keep their random draw order, so the first n_neg form a uniform sample.
        draw_size = min(len(all_headings), n_neg + len(pos_set))
        drawn = neg_doc_rng.sample(all_headings, draw_size)
        ndh = [h for h in drawn if h not in pos_set][:n_neg]
    else:
        ndh = []
    ndh_context = [mp[x] for x in ndh]

    test_data.append(
        {
            'query': qu,
            'pos_doc': pdh_context,
            'neg_doc': ndh_context
        }
    )

In [11]:
# Build one evaluation example per document that is positive for at least one
# question, as a dict with the keys
#   'doc'     - the document text
#   'pos_que' - all questions for which this document is positive (x of them)
#   'neg_que' - randomly sampled questions for which it is not positive (20 - x)
# If x exceeds 20 no negatives are added (the count is clamped at 0 instead of
# turning into a negative slice bound).
NUM_CANDIDATES = 20
NEG_QUERY_SEED = 0
# A dedicated generator makes the draw repeatable without touching the global
# `random` state used elsewhere in the notebook.
neg_query_rng = random.Random(NEG_QUERY_SEED)

# Loop-invariant: the pool of all questions (unique, since they are dict keys of q_gd).
all_questions = q_gd_df['question'].tolist()

test_doc_data = []
for c, doc_pos_queries in zip(doc_df['context'], doc_df['pos_queries']):
    pos_queries = list(doc_pos_queries)
    if not pos_queries:
        # Documents that support no question cannot be evaluated.
        continue
    pos_set = set(pos_queries)

    len_neg_queries = max(0, NUM_CANDIDATES - len(pos_queries))
    if len_neg_queries:
        # Draw enough distinct questions that at least len_neg_queries non-positives
        # survive the filter; survivors keep their random draw order.
        draw_size = min(len(all_questions), len_neg_queries + len(pos_set))
        drawn = neg_query_rng.sample(all_questions, draw_size)
        neg_queries = [q for q in drawn if q not in pos_set][:len_neg_queries]
    else:
        neg_queries = []

    test_doc_data.append(
        {
            'doc': c,
            'pos_que': pos_queries,
            'neg_que': neg_queries
        }
    )

    

In [12]:
len(test_data)

7405

In [13]:
len(test_doc_data)

13783

In [14]:

import torch
from torch import nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel, BertForMaskedLM

import torch
from torch import nn

class ContrastiveLoss(nn.Module):
    """
        Given a list of scores s1,s2,..sn, calculates -log(e^s1/(e^s1+e^s2+...+e^sn)),
        i.e. the negative log-softmax of the first score.

        Args:
            temperature: positive divisor applied to the scores before the softmax.
                The default of 1.0 leaves the scores unchanged.

        Raises:
            ValueError: if `temperature` is not strictly positive.
    """
    def __init__(self, temperature=1.0):
        super().__init__()
        if temperature <= 0:
            raise ValueError("temperature must be > 0, got {}".format(temperature))
        self.temperature = temperature

    def forward(self, scores):
        """
            Args:
                scores: tensor of scores whose first entry (`scores[0]`) belongs to
                    the positive item.

            Returns:
                log(sum(exp(scores / T))) - scores[0] / T, where the sum runs over
                every element of `scores`.
        """
        scaled_scores = scores / self.temperature
        # torch.logsumexp is numerically stable; flattening makes the reduction cover
        # all elements regardless of the input's shape.
        log_sum_exp = torch.logsumexp(scaled_scores.reshape(-1), dim=0)
        return log_sum_exp - scaled_scores[0]
    
    
class BertClsFFN(nn.Module):
    """
        Produces one score per input sequence by feeding BERT's pooled output
        through a small feed-forward head (768 -> 32 -> 8 -> 1).

        The BERT embeddings and the encoder layers selected by `layer[:11]` are
        frozen at construction time.
    """
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Layer order is kept as-is: the nn.Sequential indices are part of the
        # state_dict keys used by saved checkpoints.
        head_layers = [
            nn.Linear(768, 32),
            nn.ReLU(),
            nn.LayerNorm(32),
            nn.Linear(32, 8),
            nn.ReLU(),
            nn.LayerNorm(8),
            nn.Linear(8, 1),
        ]
        self.ffn = nn.Sequential(*head_layers)
        self.freeze_bert()

    def freeze_bert(self):
        """Disable gradients for the embeddings and for encoder layers `layer[:11]`."""
        self.bert.embeddings.requires_grad_(False)
        for frozen_layer in self.bert.encoder.layer[:11]:
            frozen_layer.requires_grad_(False)

    def forward(self, input_tokens):
        """
            Args:
                input_tokens: mapping of keyword arguments forwarded to BERT.

            Returns:
                The head's outputs flattened to a 1-D tensor.
        """
        pooled = self.bert(**input_tokens).pooler_output
        return self.ffn(pooled).flatten()



class BertLogitScorer(nn.Module):
    """
        If the input format is [CLS] `sent1` [SEP] `sent2` [SEP], we sum the log_probs of tokens of `sent2` to get a representation of a score

        For each sequence, the masked-LM head's log-softmax at every position strictly
        between the first and the second separator is read at that position's own
        token id, and these values are summed.

        Args:
            sep_token_id: id of the separator token in `input_ids`. Defaults to 102,
                the value this scorer previously hard-coded.
    """
    def __init__(self, sep_token_id=102):
        super().__init__()
        self.bert = BertForMaskedLM.from_pretrained('bert-base-uncased')
        # Plain attribute (not a parameter/buffer), so state_dict keys are unchanged.
        self.sep_token_id = sep_token_id
        self.freeze_bert()

    def freeze_bert(self):
        """Disable gradients for the embeddings and for encoder layers `layer[:10]`."""
        self.bert.bert.embeddings.requires_grad_(False)
        for param in self.bert.bert.encoder.layer[:10].parameters():
            param.requires_grad = False

    def forward(self, input_tokens):
        """
            Args:
                input_tokens: mapping with an 'input_ids' entry indexed as
                    (batch, position); all entries are forwarded to the model.

            Returns:
                1-D tensor with one summed log-probability per sequence.

            Raises:
                IndexError: if a sequence contains fewer than two separator tokens
                    (for example when truncation removed the second segment).
        """
        input_ids = input_tokens['input_ids']
        batch_size = input_ids.shape[0]
        logits = self.bert(**input_tokens).logits
        log_probs = F.log_softmax(logits, dim=-1)

        sums = torch.zeros(batch_size, device=logits.device)

        for i in range(batch_size):
            sep_indices = (input_ids[i] == self.sep_token_id).nonzero(as_tuple=True)[0]
            if sep_indices.numel() < 2:
                # Same exception type the bare indexing used to raise, with an
                # actionable message.
                raise IndexError(
                    "sequence {} has {} separator token(s) (id {}); two are required "
                    "to delimit the second segment".format(
                        i, sep_indices.numel(), self.sep_token_id)
                )
            idx1, idx2 = sep_indices[0].item(), sep_indices[1].item()
            token_ids_in_range = input_ids[i, idx1 + 1:idx2]
            log_probs_in_range = log_probs[i, idx1 + 1:idx2]
            # Pick, at every position of the span, the log-prob of the token actually there.
            gathered_log_probs = torch.gather(
                log_probs_in_range, dim=1, index=token_ids_in_range.unsqueeze(-1)
            ).squeeze(-1)

            sums[i] = torch.sum(gathered_log_probs)

        return sums


In [15]:
class _SepPairScorer(nn.Module):
    """
        Shared implementation of the four scorer wrappers defined below.

        An example dict holds one anchor text plus lists of positive and negative
        candidate texts. Every candidate is turned into the string
        "<anchor> [SEP] <candidate>" (positives first, then negatives), the strings
        are tokenized as one batch and scored by `self.bert_scorer`.

        Subclasses set the three dict keys and implement `_build_scorer`.

        NOTE(review): with truncation=True a long anchor may push the literal [SEP]
        and the candidate out of the tokenizer's window - confirm how such inputs
        should be handled.
    """
    anchor_key = None  # key of the text placed before ' [SEP] '
    pos_key = None     # key of the list of positive candidate texts
    neg_key = None     # key of the list of negative candidate texts

    def __init__(self, model_path=None):
        """
            Args:
                model_path: optional path of a state_dict checkpoint loaded into the
                    scorer; None keeps the scorer as constructed.
        """
        super().__init__()
        self.bert_scorer = self._build_scorer()
        if model_path is not None:
            # map_location='cpu' lets a checkpoint saved from GPU tensors load on any
            # machine; a later `.to(device)` on the wrapper moves the weights.
            # NOTE(review): torch.load unpickles the file - only load trusted
            # checkpoints (consider weights_only=True if the installed torch supports it).
            state_dict = torch.load(model_path, map_location='cpu')
            self.bert_scorer.load_state_dict(state_dict)
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def _build_scorer(self):
        """Return the scoring module to wrap (implemented by subclasses)."""
        raise NotImplementedError

    def forward(self, d):
        """
            Args:
                d: example dict providing `anchor_key`, `pos_key` and `neg_key`.

            Returns:
                The scorer's output for the batch, ordered positives first, then negatives.
        """
        anchor = d[self.anchor_key]
        candidates = list(d[self.pos_key]) + list(d[self.neg_key])
        texts = [anchor + ' [SEP] ' + candidate for candidate in candidates]
        # Follow the scorer's device instead of assuming CUDA is available.
        device = next(self.bert_scorer.parameters()).device
        input_tokens = self.bert_tokenizer(
            texts, padding=True, truncation=True, return_tensors='pt'
        ).to(device)
        return self.bert_scorer(input_tokens)


class DocLH_CLS(_SepPairScorer):
    """Scores "<query> [SEP] <document>" texts with BertClsFFN."""
    anchor_key, pos_key, neg_key = 'query', 'pos_doc', 'neg_doc'

    def _build_scorer(self):
        return BertClsFFN()


class DocLH_Logit(_SepPairScorer):
    """Scores "<query> [SEP] <document>" texts with BertLogitScorer."""
    anchor_key, pos_key, neg_key = 'query', 'pos_doc', 'neg_doc'

    def _build_scorer(self):
        return BertLogitScorer()


class QueryLH_CLS(_SepPairScorer):
    """Scores "<document> [SEP] <query>" texts with BertClsFFN."""
    anchor_key, pos_key, neg_key = 'doc', 'pos_que', 'neg_que'

    def _build_scorer(self):
        return BertClsFFN()


class QueryLH_Logit(_SepPairScorer):
    """Scores "<document> [SEP] <query>" texts with BertLogitScorer."""
    anchor_key, pos_key, neg_key = 'doc', 'pos_que', 'neg_que'

    def _build_scorer(self):
        return BertLogitScorer()

In [16]:
# !ls /kaggle/input/doclh_logit/transformers/default/1/model_checkpoints/model_task1_epoch_2.pth

In [17]:
# Instantiate the four scorers from their checkpoint files and move each to the GPU.
# NOTE(review): the DocLH_CLS path starts with '//' and uses the folder 'doclh_-ls',
# unlike the other three - confirm it is the intended dataset path.
DEVICE = 'cuda'
CHECKPOINT_PATHS = {
    'DocLH_Logit': '/kaggle/input/doclh_logits/transformers/default/1/model_checkpoints/DocLH_Logits_epoch2.pth',
    'DocLH_CLS': '//kaggle/input/doclh_-ls/transformers/default/1/model_checkpoints/DocLH_CLS_epoch2.pth',
    'QueryLH_Logit': '/kaggle/input/querylh_logits/transformers/default/1/model_checkpoints/QueryLH_Logits_epoch1.pth',
    'QueryLH_CLS': '/kaggle/input/querylh_cls/transformers/default/1/model_checkpoints/QueryLH_CLS_epoch2.pth',
}

model_Doc_log = DocLH_Logit(model_path=CHECKPOINT_PATHS['DocLH_Logit']).to(DEVICE)
model_Doc_cls = DocLH_CLS(model_path=CHECKPOINT_PATHS['DocLH_CLS']).to(DEVICE)
model_que_log = QueryLH_Logit(model_path=CHECKPOINT_PATHS['QueryLH_Logit']).to(DEVICE)
model_que_cls = QueryLH_CLS(model_path=CHECKPOINT_PATHS['QueryLH_CLS']).to(DEVICE)



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  self.bert_scorer.load_state_dict(torch.load(model_path))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  self.bert_scorer.load_state_dict(torch.load(model_path))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  self.bert_scorer.load_state_dict(torch.load(model_path))
  self.bert_scorer.load_state_dict(torch.load(model_path))


In [18]:
def prec(scores, k, gd):
    """
    Precision@k together with the best precision@k that is achievable.

    Args:
        scores: ranked list of (score, label) pairs, best first; label is 1 for a
            relevant item and 0 otherwise.
        k: cut-off rank; must be positive.
        gd: number of relevant items for this example.

    Returns:
        (precision_at_k, max_precision_at_k) as floats, where the second value is
        min(k, gd) / k.

    Raises:
        ValueError: if k is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer, got {}".format(k))
    # Slicing tolerates rankings shorter than k: missing ranks count as non-relevant.
    count = sum(item[1] for item in scores[:k])
    return float(count / k), float(min(k, gd) / k)

def mrr(scores):
    """
    Reciprocal rank of the first relevant item.

    Args:
        scores: ranked list of (score, label) pairs, best first; label 1 marks a
            relevant item.

    Returns:
        1 / rank of the first pair whose label is 1 (ranks start at 1), or 0.0 when
        the ranking holds no relevant item, so callers can always accumulate the result.
    """
    for rank, item in enumerate(scores, start=1):
        if item[1] == 1:
            return 1.0 / rank
    return 0.0
        
def map_(scores, gd):
    """
    Average precision of one ranking.

    Args:
        scores: ranked list of (score, label) pairs, best first; label 1 marks a
            relevant item.
        gd: number of relevant items for this example (the normaliser).

    Returns:
        The sum of precision@rank over the ranks holding a relevant item, divided by
        gd; 0.0 when gd is not positive.
    """
    if gd <= 0:
        return 0.0
    hits = 0
    precision_sum = 0.0  # named so the builtin `sum` is not shadowed
    for rank, item in enumerate(scores, start=1):
        if item[1] == 1:
            hits += 1
            precision_sum += hits / rank
    return float(precision_sum / gd)
    
    

In [19]:
# Models scored by the next evaluation cell; model_names[i] labels models[i].
doc_rankers = [
    ('DocLH_Logit', model_Doc_log),
    ('DocLH_CLS', model_Doc_cls),
]
model_names = [name for name, _ in doc_rankers]
models = [ranker for _, ranker in doc_rankers]


In [None]:
from tqdm import tqdm

# Evaluate every model in `models` on test_data (one example per question).
# For each example the candidates are ranked by model score, then precision@1,
# precision@10 (each with its achievable maximum), reciprocal rank and average
# precision are accumulated; the per-model averages are stored in eval_scores.
eval_scores = []
for mo_name, model in zip(model_names, models):
    model.eval()
    prec1 = 0.0
    prec10 = 0.0
    max_prec1 = 0.0
    max_prec10 = 0.0
    mrr_ = 0.0
    map__ = 0.0
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for test_d in tqdm(test_data, desc="Processing"):
            n_pos = len(test_d['pos_doc'])
            # Gold labels follow the model's input order: positives first, then negatives.
            gold_d = [1] * n_pos + [0] * len(test_d['neg_doc'])

            scores = model(test_d).tolist()
            my_pairs = sorted(zip(scores, gold_d), key=lambda x: x[0], reverse=True)

            p1, best_p1 = prec(my_pairs, 1, n_pos)
            p10, best_p10 = prec(my_pairs, 10, n_pos)
            prec1 += p1
            max_prec1 += best_p1
            prec10 += p10
            max_prec10 += best_p10
            mrr_ += mrr(my_pairs)
            map__ += map_(my_pairs, n_pos)

    n_examples = len(test_data)
    scores_data = {}
    scores_data['model_name'] = mo_name
    scores_data['prec1'] = float(prec1/n_examples)
    scores_data['prec10'] = float(prec10/n_examples)
    scores_data['mrr'] = float(mrr_/n_examples)
    scores_data['map'] = float(map__/n_examples)
    scores_data['max_prec1'] = float(max_prec1/n_examples)
    scores_data['max_prec10'] = float(max_prec10/n_examples)

    print(scores_data)

    eval_scores.append(scores_data)

    

In [None]:
# Show the metric dicts collected in eval_scores so far.
print(eval_scores)

In [22]:
# Models scored by the next evaluation cell; model_names[i] labels models[i].
query_rankers = [
    ('QueryLH_Logit', model_que_log),
    ('QueryLH_CLS', model_que_cls),
]
model_names = [name for name, _ in query_rankers]
models = [ranker for _, ranker in query_rankers]

In [None]:
from tqdm import tqdm

# Evaluate every model in `models` on test_doc_data (one example per document).
# For each example the candidate questions are ranked by model score, then
# precision@1, precision@10 (each with its achievable maximum), reciprocal rank and
# average precision are accumulated over the examples that could be scored; the
# per-model averages are appended to eval_scores.
for mo_name, model in zip(model_names, models):
    model.eval()
    prec1 = 0.0
    prec10 = 0.0
    max_prec1 = 0.0
    max_prec10 = 0.0
    mrr_ = 0.0
    map__ = 0.0
    total = 0
    skipped = {}  # exception type name -> number of examples skipped because of it
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for test_d in tqdm(test_doc_data, desc="Processing"):
            try:
                n_pos = len(test_d['pos_que'])
                # Gold labels follow the model's input order: positives first, then negatives.
                gold_d = [1] * n_pos + [0] * len(test_d['neg_que'])

                scores = model(test_d).tolist()
                my_pairs = sorted(zip(scores, gold_d), key=lambda x: x[0], reverse=True)

                p1, best_p1 = prec(my_pairs, 1, n_pos)
                p10, best_p10 = prec(my_pairs, 10, n_pos)
                rr = mrr(my_pairs) or 0.0
                ap = map_(my_pairs, n_pos)
            except Exception as err:
                # Best effort: an example that cannot be scored is skipped, but it is
                # counted and reported below. KeyboardInterrupt is not an Exception,
                # so the run can still be interrupted.
                err_name = type(err).__name__
                skipped[err_name] = skipped.get(err_name, 0) + 1
                continue

            # Accumulate only after every metric was computed, so a failing example
            # can never contribute to some sums but not to `total`.
            prec1 += p1
            max_prec1 += best_p1
            prec10 += p10
            max_prec10 += best_p10
            mrr_ += rr
            map__ += ap
            total += 1

    if skipped:
        print("{}: skipped {} example(s) that could not be scored: {}".format(
            mo_name, sum(skipped.values()), skipped))
    if total == 0:
        raise RuntimeError("{}: no example could be scored, metrics are undefined".format(mo_name))

    scores_data = {}
    scores_data['model_name'] = mo_name
    scores_data['prec1'] = float(prec1/total)
    scores_data['prec10'] = float(prec10/total)
    scores_data['mrr'] = float(mrr_/total)
    scores_data['map'] = float(map__/total)
    scores_data['max_prec1'] = float(max_prec1/total)
    scores_data['max_prec10'] = float(max_prec10/total)

    print(scores_data)

    eval_scores.append(scores_data)

    

In [None]:
# Show the metric dicts collected in eval_scores so far.
print(eval_scores)

In [25]:
import json

# Persist the collected metric dicts to evaluate.json as 4-space indented JSON.
serialized_scores = json.dumps(eval_scores, indent=4)
with open('evaluate.json', 'w') as out_file:
    out_file.write(serialized_scores)