In [None]:
import os

# Switch the working directory to the IS-BERT folder of the "one-file" input.
# NOTE(review): presumably done so the later `sentence_transformers` import
# resolves to the copy shipped in that folder — confirm.
IS_BERT_DIR = "../input/one-file/IS-BERT"
os.chdir(IS_BERT_DIR)

In [None]:
! pip install transformers

In [None]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
from importlib import reload
import pickle
import codecs
import torch
from transformers import AdamW
from tqdm.autonotebook import tqdm, trange

In [None]:
# Route INFO-level log records through sentence-transformers' LoggingHandler,
# prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

In [None]:
# Batch size for the training DataLoader; also used to derive the warm-up steps.
train_batch_size = 32

# Kept only so that the working directory seen by later cells is unchanged;
# nothing in this cell depends on it any more.
os.chdir('..')

# The checkpoint is addressed by absolute path. The previous relative form
# ('kaggle/input/...') only resolved when the working directory happened to be
# '/', which is not the case after a fresh Restart & Run All.
# NOTE(review): assumes the standard Kaggle mount point /kaggle/input — confirm.
word_embedding_model = models.Transformer('/kaggle/input/distilBERTQuine_lr_4e-5_epochs_20')

# CNN layer stacked on top of the transformer's token embeddings.
cnn = models.CNN(in_word_embedding_dimension=word_embedding_model.get_word_embedding_dimension())

# Apply mean pooling to get one fixed sized sentence vector.
pooling_model = models.Pooling(cnn.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

# Full pipeline: transformer -> CNN -> mean pooling.
model = SentenceTransformer(modules=[word_embedding_model, cnn, pooling_model])


#model.encode("telepathy is nice", convert_to_tensor=True)

In [None]:
# Kept only so that the working directory seen by later cells is unchanged;
# the corpus below is addressed by absolute path and does not depend on it.
os.chdir('..')

# Read the corpus, one sentence per line. The file is opened in a `with` block
# so the handle is closed, and addressed by absolute path because the relative
# form ('kaggle/input/...') only resolved when the working directory was '/'.
# NOTE(review): assumes the standard Kaggle mount point /kaggle/input — confirm.
with open('/kaggle/input/one-file/IS-BERT/quinev05_input_word.txt', 'r') as corpus_file:
    sentences = corpus_file.readlines()

# One single-text example per corpus line; every example gets the same
# placeholder label (4).
train_samples = [
    InputExample(texts=[s.strip().split('\n')[0]], label=4)
    for s in sentences
]

# Longest sequence kept per example; anything beyond it is cut off in place.
max_input_length = 512

train_dataset = SentencesDataset(train_samples, model=model)
for elem in train_dataset:
    # presumably the token ids of the example's only text — confirm
    ids = elem[0][0]
    if len(ids) > max_input_length:
        del ids[max_input_length:]

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MutualInformationLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension())

# Configure the training: warm up over the first 10% of all training steps.
num_epochs = 13
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)

# Sanity check: the in-place truncation above must have stuck.
for elem in train_dataset:
    if len(elem[0][0]) > max_input_length:
        print("Indexing error")

In [None]:
# Train with the mutual-information objective, using a linear warm-up schedule
# and no weight decay.
optimizer_settings = {'lr': 2e-8, 'eps': 2e-9, 'correct_bias': False}
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    scheduler='warmuplinear',
    warmup_steps=warmup_steps,
    optimizer_params=optimizer_settings,
    weight_decay=0.00,
)

In [None]:
#fit(model)

In [None]:
#model.encode('telepathy')

In [None]:
# Drop the trailing newline of every corpus line, then embed the whole corpus
# in one call; row i of `embeddings` corresponds to line i of the corpus.
stripped = [line.strip('\n') for line in sentences]
print(len(stripped))
embeddings = model.encode(stripped, convert_to_tensor=True)


In [None]:
def testISqueryrun(test_annot, query, n, docid, embeddings):

    model.max_seq_length = 512


    a = codecs.open(test_annot, encoding="utf-8")
    annot_corpus = a.readlines()
    annot_corpus = [i.split() for i in annot_corpus]

    apscore10_eval = []
    apscore100_eval = []
    apscore500_eval = []
    f10_eval = []
    f100_eval = []
    f500_eval = []
    pc10_eval = []
    pc100_eval = []
    pc500_eval = []
    pcr_eval = []
    pcr_r_eval = []
    pcr_m_eval = []
    pcr_i_eval = []
    avgrank_all_eval = []
    avgrank_500_eval = []
    avgdist_all_eval = []
    avgdist_500_eval = []

    for i in range(n):
        apscore10, apscore100, apscore500, pc10, pc100, pc500, f10, f100, f500, pcr, pcr_r, pcr_m, pcr_i, avgrank_all, avgrank_500, avgdist_all, avgdist_500 = testISquery(annot_corpus, model, embeddings, query, docid)
        #print(apscore10, apscore100, apscore500, pc10, pc100, pc500, f10, f100, f500, pcr, pcr_r, pcr_m, pcr_i, avgrank_all, avgrank_500, avgdist_all, avgdist_500)
        apscore10_eval.append(apscore10)
        apscore100_eval.append(apscore100)
        apscore500_eval.append(apscore500)
        f10_eval.append(f10)
        f100_eval.append(f100)
        f500_eval.append(f500)
        pc10_eval.append(pc10)
        pc100_eval.append(pc100)
        pc500_eval.append(pc500)
        pcr_eval.append(pcr)
        pcr_r_eval.append(pcr_r)
        pcr_m_eval.append(pcr_m)
        pcr_i_eval.append(pcr_i)
        avgrank_all_eval.append(avgrank_all)
        avgrank_500_eval.append(avgrank_500)
        avgdist_all_eval.append(avgdist_all)
        avgdist_500_eval.append(avgdist_500)

    print(pcr)

In [None]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def most_similar(emb_matrix, qvector, n):
    """Rank the rows of ``emb_matrix`` by cosine similarity to ``qvector``.

    Args:
        emb_matrix: Candidate embeddings, one row per candidate.
        qvector: Query embedding.
        n: Maximum number of results to return.

    Returns:
        A list of ``(index, score)`` tuples in decreasing order of similarity.
        Both entries are 0-dim tensors on the CPU. At most ``n`` tuples are
        returned, fewer when there are fewer than ``n`` candidates.
    """
    cos_scores = util.pytorch_cos_sim(qvector, emb_matrix)[0].cpu()

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    top_k = min(n, cos_scores.shape[0])
    scores, indices = torch.topk(cos_scores, k=top_k)
    return list(zip(indices, scores))

def testISquery(annot_corpus, model, embeddings, query, docid):
    #Performs a single query and computes evaluation metrics over it. Supports summed query vectors (add), inferred query vectors (inf) and taking a trained document vector as a query vector (trn).
    #If a docID is provided, it will use that as a query, otherwise it will use 'query'.

    annot_hits = []
    annot_hits_m = []
    annot_hits_r = []
    annot_hits_i = []
    for i, line in enumerate(annot_corpus):
        if line:
            if '@' in line[0]:
                if '@1' in line[0]:
                    annot_hits.append(i)
                    annot_hits_r.append(i)
                if '@0' in line[0]:
                    annot_hits.append(i)
                    annot_hits_m.append(i)
                if '@-1' in line[0]:
                    annot_hits_i.append(i)
    #print('annot_hits', annot_hits)
    if docid:
        qvector = embeddings[docid]

    else:
        
        qvector = model.encode(query, convert_to_tensor=True)

    #logger.info('10 most similar words to query vector = {}'.format(model.similar_by_vector(qvector, topn=10)))

    sims = most_similar(embeddings, qvector, len(stripped))
    sims10 = sims[:10]
    sims100 = sims[:100]
    
    matches = {}
    matches_r = {}
    matches_m = {}
    matches_i = {}
    total = 0
    top10 = 0
    top1 = 0
    top100 = 0
    top500 = 0
    top_r = 0 #For computing precision @ R (recall, the number of target documents)
    eval_avgrank = []
    eval_avg_distance_between = []
    eval_avgrank_all = []
    eval_avg_distance_between_all = []

    total_r = 0
    top10_r = 0
    top1_r = 0
    top100_r = 0
    top500_r = 0
    top_r_r = 0
    eval_avgrank_r = []
    eval_avg_distance_between_r = []
    eval_avgrank_all_r = []
    eval_avg_distance_between_all_r = []

    total_m = 0
    top10_m = 0
    top1_m = 0
    top100_m = 0
    top500_m = 0
    top_r_m = 0
    eval_avgrank_m = []
    eval_avg_distance_between_m = []
    eval_avgrank_all_m = []
    eval_avg_distance_between_all_m = []

    total_i = 0
    top10_i = 0
    top1_i = 0
    top100_i = 0
    top500_i = 0
    top_r_i = 0
    eval_avgrank_i = []
    eval_avg_distance_between_i = []
    eval_avgrank_all_i = []
    eval_avg_distance_between_all_i = []

    total_match = len(annot_hits)
    total_match_r = len(annot_hits_r)
    total_match_m = len(annot_hits_m)
    total_match_i = len(annot_hits_i)
    total = len(annot_corpus)

    #for line in annot_hits:
    #    annotation = line.split(',')
    rank = 1
    for match in sims:
        if match[0] in annot_hits:
            #Count total hits of relevant passages
            matches[match[0]] = (rank,match[1])
            eval_avgrank_all.append(rank)
            eval_avg_distance_between_all.append(match[1])
            if rank < 501:
                eval_avgrank.append(rank)
                eval_avg_distance_between.append(match[1])
                top500 = top500 + 1
                if rank < 101:
                    top100 = top100 + 1
                    if rank < 11:
                        top10 = top10 + 1
                        if rank == 1:
                            top1 = top1 + 1
            if rank < total_match:
                top_r = top_r + 1

        if match[0] in annot_hits_r:
            matches_r[match[0]] = (rank,match[1])
            eval_avgrank_all_r.append(rank)
            eval_avg_distance_between_all_r.append(match[1])
            if rank < 501:
                eval_avgrank_r.append(rank)
                eval_avg_distance_between_r.append(match[1])
                top500_r = top500_r + 1
                if rank < 101:
                    top100_r = top100_r + 1
                    if rank < 11:
                        top10_r = top10_r + 1
                        if rank == 1:
                            top1_r = top1_r + 1
            if rank < total_match_r:
                top_r_r = top_r_r + 1

        if match[0] in annot_hits_m:
            matches_m[match[0]] = (rank,match[1])
            eval_avgrank_all_m.append(rank)
            eval_avg_distance_between_all_m.append(match[1])
            if rank < 501:
                eval_avgrank_m.append(rank)
                eval_avg_distance_between_m.append(match[1])
                top500_m = top500_m + 1
                if rank < 101:
                    top100_m = top100_m + 1
                    if rank < 11:
                        top10_m = top10_m + 1
                        if rank == 1:
                            top1_m = top1_m + 1
            if rank < total_match_m:
                top_r_m = top_r_m + 1
        #rank = rank + 1

        if match[0] in annot_hits_i:
            matches_i[match[0]] = (rank,match[1])
            eval_avgrank_all_i.append(rank)
            eval_avg_distance_between_all_i.append(match[1])
            if rank < 501:
                eval_avgrank_i.append(rank)
                eval_avg_distance_between_i.append(match[1])
                top500_i = top500_i + 1
                if rank < 101:
                    top100_i = top100_i + 1
                    if rank < 11:
                        top10_i = top10_i + 1
                        if rank == 1:
                            top1_i = top1_i + 1
            if rank < total_match_i:
                top_r_i = top_r_i + 1
        rank = rank + 1
        
    i = 0
  
    #top_r = top_r_r + top_r_m
    print(top_r,top_r_r , top_r_m, top_r_i)
    #Compute Average Precision metric for the total, scientistic and modest hits
    ap = []
    ap_r = []
    ap_m = []
    ap_i = []
    for rank in sorted(eval_avgrank):
        i=i+1
        ap.append(i/rank)
        if rank in eval_avgrank_r:
            ap_r.append(i/rank)
        if rank in eval_avgrank_m:
            ap_m.append(i/rank)
        if rank in eval_avgrank_i:
            ap_i.append(i/rank)

    if ap:
        apscore = sum(ap)/float(len(ap))
    else:
        apscore = 0
    if ap_r:
        apscore_r = sum(ap_r)/float(len(ap_r))
    else:
        apscore_r = 0
    if ap_m:
        apscore_m = sum(ap_m)/float(len(ap_m))
    else:
        apscore_m = 0
    if ap_i:
        apscore_i = sum(ap_i)/float(len(ap_i))
    else:
        apscore_i = 0

    i = 0
    ap100 = []
    ap100_r = []
    ap100_m = []
    ap100_i = []
    for rank in sorted(eval_avgrank):
        if rank < 101:
            i=i+1
            ap100.append(i/rank)
            if rank in eval_avgrank_r:
                ap100_r.append(i/rank)
            if rank in eval_avgrank_m:
                ap100_m.append(i/rank)
            if rank in eval_avgrank_i:
                ap100_i.append(i/rank)
    if ap100:
        apscore100 = sum(ap100)/float(len(ap100))
    else:
        apscore100 = 0
    if ap100_r:
        apscore100_r = sum(ap100_r)/float(len(ap100_r))
    else:
        apscore100_r = 0
    if ap100_m:
        apscore100_m = sum(ap100_m)/float(len(ap100_m))
    else:
        apscore100_m = 0
    if ap100_i:
        apscore100_i = sum(ap100_i)/float(len(ap100_i))
    else:
        apscore100_i = 0


    i = 0
    ap10 = []
    ap10_r = []
    ap10_m = []
    ap10_i = []
    for rank in sorted(eval_avgrank):
        if rank < 11:
            i=i+1
            ap10.append(i/rank)
            if rank in eval_avgrank_r:
                ap10_r.append(i/rank)
            if rank in eval_avgrank_m:
                ap10_m.append(i/rank)
            if rank in eval_avgrank_i:
                ap10_i.append(i/rank)
    if ap10:
        apscore10 = sum(ap10)/float(len(ap10))
    else:
        apscore10 = 0
    if ap10_r:
        apscore10_r = sum(ap10_r)/float(len(ap10_r))
    else:
        apscore10_r = 0
    if ap10_m:
        apscore10_m = sum(ap10_m)/float(len(ap10_m))
    else:
        apscore10_m = 0
    if ap10_i:
        apscore10_i = sum(ap10_i)/float(len(ap10_i))
    else:
        apscore10_i = 0

    #Compute overall precision
    pc10 = top10/float(10)
    pc100 = top100/float(100)
    pc500 = top500/float(500)
    pcr = top_r/float(total_match) # R-Precision

    #Compute overall recall
    rc10 = top10/float(total_match)
    rc100 = top100/float(total_match)
    rc500 = top500/float(total_match)

    #Compute specific precision
    pcr_r = top_r_r/float(total_match_r)
    pcr_m = top_r_m/float(total_match_m)
    pcr_i = top_r_i/float(total_match_i)

    #Compute F-scores
    if pc10+rc10:
        f10 = 2*(pc10*rc10)/(pc10+rc10)
    else:
        f10 = 0
    if pc100+rc100:
        f100 = 2*(pc100*rc100)/(pc100+rc100)
    else:
        f100 = 0
    if pc500+rc500:
        f500 = 2*(pc500*rc500)/(pc500+rc500)
    else:
        f500 = 0



    if eval_avgrank:
        #print(eval_avgrank)
        return apscore10, apscore100, apscore, pc10, pc100, pc500, f10, f100, f500, pcr, pcr_r, pcr_m, pcr_i, sum(eval_avgrank_all)/float(len(eval_avgrank_all)), sum(eval_avgrank)/float(len(eval_avgrank)), sum(eval_avg_distance_between_all)/float(len(eval_avg_distance_between_all)), sum(eval_avg_distance_between)/float(len(eval_avg_distance_between))
    else:
        return apscore10, apscore100, apscore, pc10, pc100, pc500, f10, f100, f500, pcr, pcr_r, pcr_m, pcr_i, sum(eval_avgrank_all)/float(len(eval_avgrank_all)), 0, sum(eval_avg_distance_between_all)/float(len(eval_avg_distance_between_all)), 0







In [None]:
# Kept only so that the working directory seen by later cells is unchanged;
# the annotation file below is addressed by absolute path.
os.chdir('..')

# Evaluate the query 'telepathy' once against the annotated corpus.
# The relative form of this path ('kaggle/input/...') only resolved when the
# working directory happened to be '/'.
# NOTE(review): assumes the standard Kaggle mount point /kaggle/input — confirm.
testISqueryrun('/kaggle/input/one-file/IS-BERT/quinev05_annotU_input_word.txt', 'telepathy', 1, None, embeddings)

In [None]:
# Write the trained model into the current working directory.
# NOTE(review): the destination depends on the chdir calls in earlier cells —
# confirm this is the intended output location.
save_dir = './'
model.save(save_dir)