In [None]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
import pandas as pd

# Score every sentence pair in the dataset with both the base and the
# fine-tuned sentence-transformer, record the two cosine similarities and
# their absolute difference on each row, and export the negative pairs
# (score == '0.0') sorted by that difference to an Excel sheet.

dataset_path = 'from_all_entries_shuffled.tsv'
all_entries_path = 'sorted_by_diff/Negative_all_term_diff.xlsx'

# Print a progress line every this many rows (and once more on the last row)
# instead of once per row, which floods the notebook output.
progress_every = 1000

positive_pairs = []
negative_pairs = []

fine_model = SentenceTransformer('fine-tuned_model/fine-tuning-distilroberta-base-paraphrase-v1-2021-05-10_14-17-46')
base_model = SentenceTransformer('distilroberta-base-paraphrase-v1')


def _pair_similarity(model, sentence1, sentence2):
    """Return the cosine-similarity tensor of the two sentences' embeddings under *model*."""
    embedding1 = model.encode(sentence1, convert_to_tensor=True)
    embedding2 = model.encode(sentence2, convert_to_tensor=True)
    return util.pytorch_cos_sim(embedding1, embedding2)


with open(dataset_path, 'rt', encoding='utf8') as fIn:
    # Read all rows up front so the progress percentage is based on the real
    # row count rather than a hard-coded total.
    rows = list(csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE))

total_rows = len(rows)

# NOTE(review): each sentence is encoded on its own, which is slow for a large
# file; encode() also accepts a list of sentences if batching is ever needed.
for flag, row in enumerate(rows, start=1):
    cosine_score_base = _pair_similarity(base_model, row['sentence1'], row['sentence2'])
    cosine_score_fine = _pair_similarity(fine_model, row['sentence1'], row['sentence2'])

    row['Base_model'] = float(cosine_score_base)
    row['Fine_model'] = float(cosine_score_fine)
    # Subtract the tensors (not the Python floats) so the stored value is
    # computed exactly as before.
    row['difference'] = float(abs(cosine_score_base-cosine_score_fine))

    if flag % progress_every == 0 or flag == total_rows:
        print(str(round((flag/total_rows)*100,3))+'% completed')

    # Rows whose score is neither '1.0' nor '0.0' are left out of both lists.
    if row['score'] == '1.0':
        positive_pairs.append(row)
    elif row['score'] == '0.0':
        negative_pairs.append(row)

ascending = pd.DataFrame(negative_pairs)
if not ascending.empty:
    # With no negative pairs there is no 'difference' column to sort on.
    ascending = ascending.sort_values(by=['difference'])

# Only the writer that is actually used is opened, and only once the data is
# ready: a failed run no longer leaves a truncated workbook behind, and no
# unused writers are left open. The `with` block closes (and thereby saves)
# the workbook; ExcelWriter.save() no longer exists in pandas 2.x.
os.makedirs(os.path.dirname(all_entries_path), exist_ok=True)
with pd.ExcelWriter(all_entries_path, engine='xlsxwriter') as writer_all_entries:
    ascending.to_excel(writer_all_entries, sheet_name='Negative_all_pairs_diff', index=False)

# Disabled code: the triple-quoted block below is a bare string expression, so
# nothing inside it runs. It holds an export that keeps the first 5000 rows of
# each `difference` ordering (ascending and descending) for the positive and
# the negative pairs and writes them through the writer_positive_asc,
# writer_positive_des, writer_negative_asc and writer_negative_des
# ExcelWriter objects, which must be open for this code to work.
# NOTE(review): the first statement slices the existing `ascending` frame,
# which the live code builds from negative_pairs, yet it is written to the
# 'Positive_pairs_ascending' sheet -- it probably should be rebuilt from
# positive_pairs; confirm before re-enabling.
# NOTE(review): ExcelWriter.save() was removed in pandas 2.0; use close() or a
# `with` block if this is revived.
'''
ascending = ascending.head(5000)
descending = pd.DataFrame(positive_pairs).sort_values(by=['difference'], ascending=False)
descending = descending.head(5000)
ascending.to_excel(writer_positive_asc, sheet_name='Positive_pairs_ascending',index=False) 
descending.to_excel(writer_positive_des, sheet_name='Positive_pairs_descending',index=False) 

ascending_ = pd.DataFrame(negative_pairs).sort_values(by=['difference'])
ascending_ = ascending_.head(5000)
descending_ = pd.DataFrame(negative_pairs).sort_values(by=['difference'], ascending=False)
descending_ = descending_.head(5000)
ascending_.to_excel(writer_negative_asc, sheet_name='Negative_pairs_ascending',index=False) 
descending_.to_excel(writer_negative_des, sheet_name='Negative_pairs_descending',index=False) 


writer_positive_asc.save()
writer_positive_des.save()
writer_negative_asc.save()
writer_negative_des.save()'''


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Load the exported similarity table; the plots below read its 'Base_model',
# 'Fine_model' and 'split' columns.
positive_diff_path = 'sorted_by_diff/positive_all_term_diff.xlsx'
df = pd.read_excel(positive_diff_path)


def plot_dist(model, bins=100):
    """Draw a histogram of *model* using equal-width bins over its value range.

    Parameters:
        model: numeric data exposing ``hist(bins=..., figsize=..., legend=...)``;
            the callers in this file pass pandas Series.
        bins: number of equal-width bins between the smallest and largest
            value (default 100).

    Returns:
        None. The histogram is drawn by ``model.hist``.
    """
    # NaNs are ignored when working out the range.
    values = np.asarray(model, dtype=float)
    values = values[~np.isnan(values)]

    if values.size == 0 or values.min() == values.max():
        # No data, or a single distinct value: there is no range to divide
        # (a zero step would make explicit edges impossible), so pass the bin
        # count and let the plotting backend choose the edges.
        bin_values = bins
    else:
        # linspace yields exactly bins + 1 edges and ends exactly on the
        # maximum; arange with a float step can stop just short of it and
        # drop the largest value from the histogram.
        bin_values = np.linspace(values.min(), values.max(), bins + 1)

    model.hist(bins=bin_values, figsize=[14,6], legend=True)

# Overall distributions of both models, drawn on the current figure.
for score_column in ('Base_model', 'Fine_model'):
    plot_dist(df[score_column])

# Dataset partitions, in the order their histograms are drawn; the legend
# labels below follow the same order.
split_names = ('train', 'dev', 'test')

# Fine-tuned model scores, one histogram per split.
plt.figure()
for split_name in split_names:
    rows_in_split = df[df['split']==split_name]
    plot_dist(rows_in_split['Fine_model'])
labels= ["FineModel-Positive Train","FineModel-Positive Dev", "FineModel-Positive Test"]
plt.legend(labels)

# Base model scores, one histogram per split.
plt.figure()
for split_name in split_names:
    rows_in_split = df[df['split']==split_name]
    plot_dist(rows_in_split['Base_model'])
labels= ["BaseModel-Positive Train","BaseModel-Positive Dev", "BaseModel-Positive Test"]
plt.legend(labels)