# Sentence similarity - SBERT
 
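This notebook computes the cosine similarity between SBERT-encoded claims and SBERT-encoded evidence, scores thresholded matches on the dev set, and writes thresholded evidence predictions for the test set.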

In [2]:
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [3]:
# --- Raw inputs ---
# Evidence corpus: the JSON object is read into a one-column frame ('evidence')
# whose index holds the object's keys.
with open('../data/evidence.json', 'r') as evidence_file:
    evidence = json.load(evidence_file)
eviden = pd.DataFrame.from_dict(evidence, orient='index', columns=['evidence'])

# Dev claims: transposed so that every top-level JSON key becomes one row.
with open('../data/dev-claims.json', 'r') as dev_claims_file:
    df_dev = pd.DataFrame(json.load(dev_claims_file)).T

# --- Pre-computed SBERT encodings ---
# Both files are read with np.load, so they are presumably NumPy binary dumps
# despite the .json extension -- TODO confirm with the encoding notebook.
# Encoded evidence matrix.
with open('../data/SBERT_2/encoded_evidence.json', 'rb') as encoded_evidence_file:
    enc_ev = np.load(encoded_evidence_file)
# Encoded dev-claim matrix.
with open('../data/SBERT_2/encoded_dev_claims.json', 'rb') as encoded_dev_file:
    enc_dv = np.load(encoded_dev_file)

### Measure similarity in dev set

In [1]:
# Cosine similarity between every evidence encoding and each dev-claim encoding.
# (This cell takes a long time to run.)
# Result: `df` has an 'id' column (the evidence index) plus one similarity
# column per dev claim, named after the claim's index label in `df_dev`.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
tot = enc_dv.shape[0]
# The evidence matrix is the same for every claim, so wrap it in a tensor once
# instead of on every iteration.
ev_tensor = torch.from_numpy(enc_ev)
# Gather the columns in a dict and build the frame in a single step: inserting
# columns into an existing DataFrame one by one fragments it and makes pandas
# emit PerformanceWarning once many columns have been added.
sim_columns = {'id': np.array(eviden.index)}
for i in range(tot):
    print(f'{i} of {tot} iterantions')
    similarity = cos(ev_tensor, torch.from_numpy(enc_dv[i]))
    # Row i of enc_dv is assumed to correspond to df_dev.index[i] -- the code
    # relies on both being in the same order.
    sim_columns[df_dev.index[i]] = similarity.detach().numpy().reshape(-1)
df = pd.DataFrame(sim_columns)
# Write results
df.to_csv('../data/SBERT_2/sim.csv')

In [86]:
# Score the dev-set similarities using thresholds.
# For each threshold, the evidence whose similarity to a claim is >= threshold
# is treated as retrieved and compared against that claim's gold 'evidences'
# list; the mean of the per-claim F1 values is stored in `f_score[thresh]`.
f_score = {}
for thresh in range(990, 1000, 3):  # i.e. 0.990, 0.993, 0.996, 0.999 after /1000
    F1 = []
    for claim in df_dev.index:
        gold = df_dev.loc[claim, 'evidences']
        # Filter the 'id' column directly with the claim's similarity column.
        # This avoids calling df.set_index('id') (which builds a new frame from
        # the whole similarity table) once per claim and per threshold.
        retrieved = df.loc[df[claim] >= thresh/1000, 'id']
        TP = int(retrieved.isin(gold).sum())
        FP = len(retrieved) - TP
        FN = len(gold) - TP

        # The 1E-10 terms guard against division by zero when nothing is
        # retrieved or nothing matches.
        precision = TP/(TP+FP + 1E-10)
        recall = TP/(TP+FN + 1E-10)
        F1.append((2 * precision * recall)/(precision + recall + 1E-10))
    mn = np.mean(F1)
    f_score[thresh] = mn
    print(f'thresh: {thresh} F1: {mn}')

thresh: 990 F1: 0.0001251826994173842
thresh: 993 F1: 0.00014719771697847464
thresh: 996 F1: 0.00016811728171091595
thresh: 999 F1: 0.0004276801047330198


### Measure similarity in test set

In [60]:
# Unlabelled test claims: transposed so that every top-level JSON key becomes one row.
with open('../data/test-claims-unlabelled.json', 'r') as test_claims_file:
    df_test = pd.DataFrame(json.load(test_claims_file)).T


# Pre-computed SBERT encoding matrix for the test claims. Read with np.load,
# so presumably a NumPy binary dump despite the .json extension -- TODO confirm.
with open('../data/SBERT_2/encoded_ts_claims.json', 'rb') as encoded_test_file:
    enc_ts = np.load(encoded_test_file)

In [2]:
# Cosine similarity between every evidence encoding and each test-claim encoding.
# (This cell takes a long time to run.)
# Result: `df` has an 'id' column (the evidence index) plus one similarity
# column per test claim, named after the claim's index label in `df_test`.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
tot = enc_ts.shape[0]
# The evidence matrix is the same for every claim, so wrap it in a tensor once
# instead of on every iteration.
ev_tensor = torch.from_numpy(enc_ev)
# Gather the columns in a dict and build the frame in a single step: inserting
# columns into an existing DataFrame one by one fragments it and makes pandas
# emit PerformanceWarning once many columns have been added.
sim_columns = {'id': np.array(eviden.index)}
for i in range(tot):
    print(f'{i} of {tot} iterantions')
    similarity = cos(ev_tensor, torch.from_numpy(enc_ts[i]))
    # Row i of enc_ts is assumed to correspond to df_test.index[i] -- the code
    # relies on both being in the same order.
    sim_columns[df_test.index[i]] = similarity.detach().numpy().reshape(-1)
df = pd.DataFrame(sim_columns)
# Write results
df.to_csv('../data/SBERT_2/ts_sim.csv')

In [157]:
# Apply the similarity threshold to pick evidence for every test claim.
# A piece of evidence is kept for a claim when its cosine similarity to that
# claim is at least SIM_THRESHOLD.
SIM_THRESHOLD = 0.995
# Filter the 'id' column directly with each claim's similarity column instead
# of calling df.set_index('id') (which builds a new frame from the whole
# similarity table) once per claim.
pr_ev = [
    list(df.loc[df[claim] >= SIM_THRESHOLD, 'id'])
    for claim in df_test.index
]
df_test['evidences'] = pr_ev
# Write results
# NOTE(review): to_json is called with its default orientation and a path in the
# current working directory; confirm this is the layout/location consumers expect.
df_test.to_json('evidence_test_2.json')