# Get the limit for BertScore

In [1]:
import glob

In [2]:
glob.glob("*cnn_dm/cnn_dm/*")

['cnn_dm/cnn_dm/test.source',
 'cnn_dm/cnn_dm/test.target',
 'cnn_dm/cnn_dm/train.source',
 'cnn_dm/cnn_dm/train.target',
 'cnn_dm/cnn_dm/val.source',
 'cnn_dm/cnn_dm/val.target']

In [3]:
source = []
with open("cnn_dm/cnn_dm/train.source") as reader:
    for line in reader:
        source.append(line)

In [4]:
target = []
with open("cnn_dm/cnn_dm/train.target") as reader:
    for line in reader:
        target.append(line)

## Function definitions

In [5]:
import numpy as np
import random
from tqdm.notebook import tqdm
np.random.seed(7)
random.seed(7)

In [6]:
def pull(source,target,amount):  
    selector = random.sample(range(len(source)), amount)
    subset_source = [source[i] for i in selector]
    subset_target= [target[i] for i in selector]
    return subset_source, subset_target

def print_results(P,R,F1):
    print(f"mean P: {np.mean(P.numpy())}, var P: {np.var(P.numpy())}, min P: {np.min(P.numpy())}, max P: {np.max(P.numpy())}, lower 3%: {np.percentile(P.numpy(),5)}, lower 10%: {np.percentile(P.numpy(),10)}, 1. quantil: {np.percentile(P.numpy(),25)}")
    print(f"mean R: {np.mean(R.numpy())}, var R: {np.var(R.numpy())}, min R: {np.min(R.numpy())}, max R: {np.max(R.numpy())}, lower 3%: {np.percentile(R.numpy(),5)}, lower 10%: {np.percentile(R.numpy(),10)}, 1. quantil: {np.percentile(R.numpy(),25)}")
    print(f"mean F1: {np.mean(F1.numpy())}, var F1: {np.var(F1.numpy())}, min F1: {np.min(F1.numpy())}, max F1: {np.max(F1.numpy())}, lower 5%: {np.percentile(F1.numpy(),5)}, lower 10%: {np.percentile(F1.numpy(),10)}, 1. quantil: {np.percentile(F1.numpy(),25)}")
    

def print_results_rouge(P,R,F1):
    Pnp = np.array(P)
    Rnp = np.array(R)
    F1np = np.array(F1)
    print(f"mean P: {np.mean(Pnp)}, var P: {np.var(Pnp)}, min P: {np.min(Pnp)}, max P: {np.max(Pnp)}, lower 3%: {np.percentile(Pnp,5)}, lower 10%: {np.percentile(Pnp,10)}, 1. quantil: {np.percentile(Pnp,25)}")
    print(f"mean R: {np.mean(Rnp)}, var R: {np.var(Rnp)}, min R: {np.min(Rnp)}, max R: {np.max(Rnp)}, lower 3%: {np.percentile(Rnp,5)}, lower 10%: {np.percentile(Rnp,10)}, 1. quantil: {np.percentile(Rnp,25)}")
    print(f"mean F1: {np.mean(F1np)}, var F1: {np.var(F1np)}, min F1: {np.min(F1np)}, max F1: {np.max(F1np)}, lower 5%: {np.percentile(F1np,5)}, lower 10%: {np.percentile(F1np,10)}, 1. quantil: {np.percentile(F1np,25)}")
    
def cal_limit(R):
    import torch
    r_np = R.numpy()
    r_np = r_np[[r_np > 0]]
    limit_np = r_np[[(r_np <= np.percentile(r_np,20)) & (r_np >= np.percentile(r_np, 1))]]
    del r_np
    torch.cuda.empty_cache()
    return np.mean(limit_np)
    
def cal_limit_rouge(R):
    r_np = np.array(R)
    r_np = r_np[[r_np > 0]]
    limit_np = r_np[[(r_np <= np.percentile(r_np,3)) & (r_np >= np.percentile(r_np, 1))]]
    return np.mean(limit_np)
        
def cal_bert(subset_target, subset_source):
    import bert_score
    return bert_score.score(subset_target, subset_source, model_type="xlnet-base-cased",batch_size=2, lang="en", device="cuda:0")

def cal_rouge(subset_target, subset_source):
    from rouge import Rouge
    rouge_score = Rouge()
    scores_r1 = {}
    scores_r1["f1"] = []
    scores_r1["p"] = []
    scores_r1["r"] = []

    scores_r2 = {}
    scores_r2["f1"] = []
    scores_r2["p"] = []
    scores_r2["r"] = []

    scores_rl = {}
    scores_rl["f1"] = []
    scores_rl["p"] = []
    scores_rl["r"] = []
    scores = rouge_score.get_scores(subset_target, subset_source)
    
    scor_a = np.array(scores)
    for l in tqdm(scor_a):
        scores_r1["f1"].append(l["rouge-1"]["f"])
        scores_r1["p"].append(l["rouge-1"]["p"])
        scores_r1["r"].append(l["rouge-1"]["r"])

        scores_r2["f1"].append(l["rouge-2"]["f"])
        scores_r2["p"].append(l["rouge-2"]["p"])
        scores_r2["r"].append(l["rouge-2"]["r"])

        scores_rl["f1"].append(l["rouge-l"]["f"])
        scores_rl["p"].append(l["rouge-l"]["p"])
        scores_rl["r"].append(l["rouge-l"]["r"])

    return scores_r1, scores_r2, scores_rl

## Pull 40

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
limits_p = []
limits_r = []

In [9]:
subsets_source, subsets_target = zip(*[pull(source, target, 700) for i in range(40)])

## Calculate Bert Score 

In [10]:
for i in tqdm(range(40)):
    subset_source = subsets_source[i]
    subset_target = subsets_target[i]
    P,R,F1 = cal_bert(subset_target, subset_source)
    #print_results(P,R,F1)
    limits_p.append(cal_limit(P))
    limits_r.append(cal_limit(R))

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))

PyTorch version 1.6.0 available.
TensorFlow version 2.3.1 available.





#### Cal mean and Std deviation and final limit value

#### Precision 

In [11]:
limits_mean = np.mean(limits_p)
limits_std = np.std(limits_p)

final_limit = limits_mean - limits_std
print(limits_mean)
print(limits_std)
print(f"Precision_limit: {final_limit}")

0.658549
0.0048365477
Precision_limit: 0.6537124514579773


#### Recall 

In [12]:
limits_mean = np.mean(limits_r)
limits_std = np.std(limits_r)


print(limits_mean)
print(limits_std)
final_limit = limits_mean - limits_std
print(f"Recall_limit: {final_limit}")

0.39873487
0.004283182
Recall_limit: 0.39445167779922485


# Same for rouge 

In [13]:
limits_r1_p = []
limits_r2_p = []
limits_rl_p = []

limits_r1_r = []
limits_r2_r = []
limits_rl_r = []
for i in tqdm(range(40)):
    subset_source = subsets_source[i]
    subset_target = subsets_target[i]
    r1, r2, rl = cal_rouge(subset_target, subset_source)
    #print_results_rouge(r1["p"], r1["r"], r1["f1"])
    #print_results(P,R,F1)
    limits_r1_p.append(cal_limit_rouge(r1["p"]))
    limits_r2_p.append(cal_limit_rouge(r2["p"]))
    limits_rl_p.append(cal_limit_rouge(rl["p"]))
    
    limits_r1_r.append(cal_limit_rouge(r1["r"]))
    limits_r2_r.append(cal_limit_rouge(r2["r"]))
    limits_rl_r.append(cal_limit_rouge(rl["r"]))

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=700.0), HTML(value='')))





####  R1

In [14]:
limits_mean = np.mean(limits_r1_p)
limits_std = np.std(limits_r1_p)
print(limits_mean)
print(limits_std)
final_limit = limits_mean - limits_std
print(f"precision_limit: {final_limit}")

limits_mean = np.mean(limits_r1_r)
limits_std = np.std(limits_r1_r)
print(limits_mean)
print(limits_std)
final_limit = limits_mean - limits_std
print(f"recall_limit: {final_limit}")

0.569719205348336
0.016337348014151126
precision_limit: 0.5533818573341849
0.01852583165198726
0.0012511073804912854
recall_limit: 0.017274724271495973


#### R2 

In [15]:
limits_mean = np.mean(limits_r2_p)
limits_std = np.std(limits_r2_p)
print(limits_mean)
print(limits_std)
final_limit = limits_mean - limits_std
print(f"precision_limit: {final_limit}")

limits_mean = np.mean(limits_r2_r)
limits_std = np.std(limits_r2_r)
print(limits_mean)
print(limits_std)
final_limit = limits_mean - limits_std
print(f"recall_limit: {final_limit}")

0.1090482428628583
0.00910108982379119
precision_limit: 0.09994715303906711
0.0046600786819169385
0.0005462835388596387
recall_limit: 0.0041137951430572995


#### Rl

In [16]:
limits_mean = np.mean(limits_rl_p)
limits_std = np.std(limits_rl_p)
print(limits_mean)
print(limits_std)
final_limit = limits_mean - limits_std
print(f"precision_limit: {final_limit}")

limits_mean = np.mean(limits_rl_r)
limits_std = np.std(limits_rl_r)
print(limits_mean)
print(limits_std)
final_limit = limits_mean - limits_std
print(f"recall_limit: {final_limit}")


0.5137022440113049
0.01474642137350945
precision_limit: 0.4989558226377955
0.03305311223751688
0.0022340717102069963
recall_limit: 0.030819040527309882


In [1]:
import pandas as pd

In [5]:
pd.DataFrame(index=["Experiment 1 means", "Experiment 2 means", "Experiment 3 means"], data={"Bert-Score":[0.6586, 0.6595, 0.6585], "Rouge-1":[0.5669, 0.5664, 0.5533], "Rogue-2":[0.1079, 0.1098, 0.1090], "Rouge-L": [0.5127, 0.5135, 0.5137]})

Unnamed: 0,Bert-Score,Rouge-1,Rogue-2,Rouge-L
Experiment 1 means,0.6586,0.5669,0.1079,0.5127
Experiment 2 means,0.6595,0.5664,0.1098,0.5135
Experiment 3 means,0.6585,0.5533,0.109,0.5137


In [3]:
pd.DataFrame(index=["Experiment 1 Std. Dev.", "Experiment 2 Std. Dev.", "Experiment 3 Std. Dev."], data={"Bert-Score":[0.0037, 0.0035, 0.0048], "Rouge-1":[0.0154, 0.0145, 0.0170], "Rogue-2":[0.0089, 0.0113, 0.0091], "Rouge-L": [0.0061, 0.0137, 0.0147]})

Unnamed: 0,Bert-Score,Rouge-1,Rogue-2,Rouge-L
Experiment 1 Std. Dev.,0.0037,0.0154,0.0089,0.0061
Experiment 2 Std. Dev.,0.0035,0.0145,0.0113,0.0137
Experiment 3 Std. Dev.,0.0048,0.017,0.0091,0.0147


In [7]:
pd.DataFrame(index=["limit (precision) Experiment 1", "limit (precision) Experiment 2", "limit (precision) Experiment 3", "threshold limit"], data={"Bert-Score":[0.6548, 0.6559, 0.6537, 0.65], "Rouge-1":[0.5514, 0.5518, 0.5533, 0.5], "Rogue-2":[0.0989, 0.0985, 0.0999, 0.1], "Rouge-L": [0.4989, 0.4998, 0.4989, 0.5]})

Unnamed: 0,Bert-Score,Rouge-1,Rogue-2,Rouge-L
limit (precision) Experiment 1,0.6548,0.5514,0.0989,0.4989
limit (precision) Experiment 2,0.6559,0.5518,0.0985,0.4998
limit (precision) Experiment 3,0.6537,0.5533,0.0999,0.4989
threshold limit,0.65,0.5,0.1,0.5
