In [1]:
from pathlib import Path
import torch
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import pandas as pd
from tqdm import tqdm
import numpy as np

## Sentence Transformer score

In [2]:
# --- Run configuration ------------------------------------------------------
# Sub-directory (named by timestamp) under finetuned-models/ that is loaded below.
timestamp = "2023-02-17_15-02-13"

# Project root; the dataset and the timestamped checkpoint live underneath it.
project_base_path = Path("Guided Research WS22")
negation_dataset = project_base_path / "data/negation_dataset_labeled.tsv"

# Hub id of the pretrained encoder; the part after the slash names the fine-tuned variant.
base_model = "sentence-transformers/all-mpnet-base-v2"
output_model_name = base_model.split('/')[1] + "-negation"  # TODO.

finetuned_dir = project_base_path / f"finetuned-models/{timestamp}/{output_model_name}"
model_save_path = str(finetuned_dir)
# Second fine-tuned checkpoint, given relative to the working directory.
model_save_path_wmt = "finetuned-models/all-mpnet-base-v2-negation_wmt"

In [3]:
# Load the two fine-tuned checkpoints and the pretrained baseline.
finetuned_model = SentenceTransformer(model_save_path)
finetuned_model_wmt = SentenceTransformer(model_save_path_wmt)
# NOTE: `base_model` holds the checkpoint id string up to this line; afterwards it is
# the loaded model, so re-running only this cell passes a model object instead of an id.
base_model = SentenceTransformer(base_model)

# Prefer the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

for _st_model in (base_model, finetuned_model):
    _st_model.to(device)
# Kept as the cell's last expression so the notebook displays the module summary.
finetuned_model_wmt.to(device)

cuda:0


SentenceTransformer(
  (0): Transformer({'max_seq_length': 75, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [6]:
def cos_score(reference: str, candidate: str, model:SentenceTransformer) -> float:
    """Return the cosine similarity of two sentences under `model`.

    Both sentences are encoded separately with `model.encode`; the similarity of
    the two embeddings is returned as a plain Python float.
    """
    ref_vec, cand_vec = (model.encode(text) for text in (reference, candidate))
    similarity = util.cos_sim(ref_vec, cand_vec)
    return similarity.item()

def cos_score_batched(references: list, candidates: list, model: SentenceTransformer, batch_size=8) -> torch.Tensor:
    """Cosine similarity of each (reference, candidate) pair under `model`.

    Args:
        references: Reference sentences.
        candidates: Candidate sentences; `candidates[i]` is compared to `references[i]`.
        model: Encoder exposing `encode(sentences, batch_size=...)`.
        batch_size: Batch size forwarded to `model.encode`.

    Returns:
        1-D tensor with one similarity per pair (empty for empty input).

    Raises:
        AssertionError: If the two lists differ in length.

    Note:
        A later cell of this notebook defines another function with the same
        name for masked-LM models; whichever cell ran last wins.
    """
    assert len(references) == len(candidates), "Number of references and candidates must be equal"
    if len(references) == 0:
        return torch.empty(0)
    emb_ref = torch.as_tensor(model.encode(references, batch_size=batch_size))
    emb_cand = torch.as_tensor(model.encode(candidates, batch_size=batch_size))
    # Row-wise cosine: only the N matching pairs are compared, instead of building
    # the full N x N similarity matrix and discarding everything but its diagonal.
    return torch.nn.functional.cosine_similarity(emb_ref, emb_cand, dim=1)

# Qualitative check: three pairs whose meaning is flipped (antonym or negation)
# followed by one paraphrase pair.
sents1 = ["It's rather hot in here.", "This is a red cat with a hat.", "This is a red cat with a hat.", "Today is a beautiful day."]
sents2 = ["It's rather cold in here.", "This isn't a red cat with a hat.", "This is not a red cat with a hat.", "Today is a wonderful day."]

# Label printed in front of each score -> encoder that produces it.
scorers = (("Base", base_model), ("FT", finetuned_model), ("FT WMT", finetuned_model_wmt))

for s1, s2 in zip(sents1, sents2):
    print(s1, s2, sep="\n")
    for tag, encoder in scorers:
        print(tag, cos_score_batched([s1], [s2], encoder))

It's rather hot in here.
It's rather cold in here.
Base tensor([0.6409])
FT tensor([0.3928])
FT WMT tensor([0.8731])
This is a red cat with a hat.
This isn't a red cat with a hat.
Base tensor([0.8470])
FT tensor([0.5079])
FT WMT tensor([0.8432])
This is a red cat with a hat.
This is not a red cat with a hat.
Base tensor([0.8495])
FT tensor([0.4682])
FT WMT tensor([0.8455])
Today is a beautiful day.
Today is a wonderful day.
Base tensor([0.8489])
FT tensor([0.8935])
FT WMT tensor([0.9507])


### NegBERT score

In [17]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Hub id shared by the tokenizer and the masked-LM weights.
# (Alternative kept for reference: tokenizer "BERTNOT/output", model "BERTNOT/output_model".)
negbert_checkpoint = "joey234/cuenb"

tokenizer = AutoTokenizer.from_pretrained(negbert_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(negbert_checkpoint)

In [18]:
# Sanity check: tokenize one sentence and look at the shape of the masked-LM logits.
tok = tokenizer("This is a red cat with a hat.", return_tensors='pt')
probe_logits = model(**tok).logits
probe_logits.shape

torch.Size([1, 11, 50265])

In [19]:
# Decode with special tokens kept, to see which markers the tokenizer wraps around the sentence.
tokenizer.batch_decode(tok['input_ids'], skip_special_tokens=False)

['<s>This is a red cat with a hat.</s>']

In [23]:
def cos_score_batched(references: list, candidates: list, model: AutoModelForMaskedLM, batch_size=8) -> torch.Tensor:
    """Score sentence pairs by cosine similarity of masked-LM logits at the last real token.

    Each sentence is represented by the vocabulary logits the model emits at its
    last non-padding position. Uses the notebook-level `tokenizer`.

    Args:
        references: Reference sentences.
        candidates: Candidate sentences; `candidates[i]` is compared to `references[i]`.
        model: Masked-LM whose output exposes `.logits`.
        batch_size: Number of sentence pairs sent through the model per forward pass.

    Returns:
        1-D tensor with one similarity per pair (empty for empty input). The
        result is computed without gradient tracking.

    Raises:
        AssertionError: If the two lists differ in length.

    Note:
        This shadows the SentenceTransformer-based function of the same name
        defined earlier in the notebook.
    """
    assert len(references) == len(candidates), "Number of references and candidates must be equal"
    # Plain lists keep the slicing below positional even if a pandas Series is passed.
    references, candidates = list(references), list(candidates)

    def _last_token_logits(sentences: list) -> torch.Tensor:
        """Logits at the last attended position of every sentence in the batch."""
        enc = tokenizer(sentences, return_tensors='pt', padding=True)
        logits = model(**enc).logits
        mask = enc['attention_mask'].to(logits.device)
        # With padding, position -1 is a pad token for every sentence shorter than the
        # longest one. Locate the last attended position per row instead; flipping the
        # mask makes this work for left- and right-padded batches alike.
        last = mask.shape[1] - 1 - mask.flip(dims=[1]).argmax(dim=1)
        rows = torch.arange(logits.shape[0], device=logits.device)
        return logits[rows, last]

    scores = []
    # Pure scoring: no autograd graph is needed, and building one wastes memory.
    with torch.no_grad():
        for start in range(0, len(references), batch_size):
            stop = start + batch_size
            emb_ref = _last_token_logits(references[start:stop])
            emb_cand = _last_token_logits(candidates[start:stop])
            # Row-wise cosine instead of the diagonal of a full similarity matrix.
            scores.append(torch.nn.functional.cosine_similarity(emb_ref, emb_cand, dim=1))
    if not scores:
        return torch.empty(0)
    return torch.cat(scores)
# Same qualitative pairs as in the SentenceTransformer section, scored with the masked LM.
sents1 = ["It's rather hot in here.", "This is a red cat with a hat.", "This is a red cat with a hat.", "Today is a beautiful day."]
sents2 = ["It's rather cold in here.", "This isn't a red cat with a hat.", "This is not a red cat with a hat.", "Today is a wonderful day."]

for s1, s2 in zip(sents1, sents2):
    print(s1, s2, sep="\n")
    pair_score = cos_score_batched([s1], [s2], model)
    print(pair_score)

It's rather hot in here.
It's rather cold in here.
tensor([0.9931], grad_fn=<DiagBackward0>)
This is a red cat with a hat.
This isn't a red cat with a hat.
tensor([0.8886], grad_fn=<DiagBackward0>)
This is a red cat with a hat.
This is not a red cat with a hat.
tensor([0.7516], grad_fn=<DiagBackward0>)
Today is a beautiful day.
Today is a wonderful day.
tensor([0.9886], grad_fn=<DiagBackward0>)


### CrossEncoder

In [3]:
from sentence_transformers.cross_encoder import CrossEncoder

# NOTE: rebinds the notebook-level name `model` (bound to the masked LM in the NegBERT
# section) to a CrossEncoder; the path is presumably a local checkpoint directory.
model = CrossEncoder("finetuned-models/distilroberta-negation_old_wmt")

In [11]:
# Same qualitative pairs as above; each pair is scored jointly by the cross-encoder.
sents1 = ["It's rather hot in here.", "This is a red cat with a hat.", "This is a red cat with a hat.", "Today is a beautiful day."]
sents2 = ["It's rather cold in here.", "This isn't a red cat with a hat.", "This is not a red cat with a hat.", "Today is a wonderful day."]

for s1, s2 in zip(sents1, sents2):
    print(s1, s2, sep="\n")
    pair_score = model.predict([s1, s2])
    print(pair_score)

You are fat.
You are not fat.


NameError: name 'model' is not defined

In [9]:
# Probe the score of a bare "." paired with an ordinary sentence; "." is also the
# placeholder candidate used for the baseline scores in demetr_accuracy_cross_encoder.
model.predict([".", "this is a test sentence."])

6.0731967e-05

In [None]:
import numpy as np
import torch
from tqdm import tqdm
def demetr_accuracy_cross_encoder(dataset: pd.DataFrame, model:CrossEncoder) -> "tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]":
    """Score a DEMETR-style dataset with a cross-encoder and compute detection accuracy.

    Every `eng_sent` is paired with its `mt_sent`, its `pert_sent` and a bare "."
    placeholder. Each of the three pairings is scored in a single batched
    `model.predict` call instead of one call per row.

    Args:
        dataset: Frame with `eng_sent`, `mt_sent` and `pert_sent` columns.
        model: Object whose `predict(pairs)` returns one score per sentence pair.

    Returns:
        `(accuracy, t_scores, hat_scores, empty_scores)`. `accuracy` is a 0-dim
        tensor holding the fraction of rows where the `mt_sent` score is strictly
        greater than the `pert_sent` score (NaN for an empty dataset); the other
        three are 1-D tensors with one score per row.
    """
    sources = list(dataset.eng_sent)

    def _score(candidates) -> torch.Tensor:
        """Score every (source, candidate) pair in one batched call."""
        pairs = [[src, cand] for src, cand in zip(sources, candidates)]
        if not pairs:
            return torch.empty(0)
        return torch.as_tensor(model.predict(pairs)).reshape(-1)

    t_scores = _score(dataset.mt_sent)
    hat_scores = _score(dataset.pert_sent)
    # "." stands in for an empty candidate when computing the baseline.
    empty_scores = _score(["."] * len(sources))

    accuracy = torch.greater(t_scores, hat_scores).float().mean()
    return accuracy, t_scores, hat_scores, empty_scores


def demetr_ratio_bleurt(dataset: pd.DataFrame, model:CrossEncoder) -> None:
    """Print detection accuracy and mean score ratio for a cross-encoder.

    Despite its name this variant scores with a CrossEncoder; a later cell
    redefines the same name for BLEURT scorers. The ratio of a row is
    (t - hat) / (t - empty), averaged over the dataset.
    """
    accuracy, ref_scores, pert_scores, floor_scores = demetr_accuracy_cross_encoder(dataset, model)
    print(f"Detection accuracy: {accuracy}")
    per_row = (ref_scores - pert_scores) / (ref_scores - floor_scores)
    mean_ratio = sum(per_row) / len(dataset)
    print(f"Ratio: {mean_ratio}")


def eval_models_on_dataset_cross_encoder(dataset: pd.DataFrame) -> None:
    """Print accuracy and ratio of the notebook-level `model` on one dataset.

    Only the fine-tuned model is evaluated; no base-model comparison is run here.
    """
    header = "** Fine-tuned model"
    print(header)
    demetr_ratio_bleurt(dataset, model)


# NOTE: `perturbation_datasets` is built in the "Eval on DEMETR data" cells further
# down the notebook; those cells have to run before this loop.
for pert_name in perturbation_datasets:
    pert_data = perturbation_datasets[pert_name]
    print("* ", pert_name.capitalize())
    eval_models_on_dataset_cross_encoder(pert_data)
    print("\n")

## Seq2seq score

In [64]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

In [62]:
# Checkpoint directory of the seq2seq model used in this section.
model_dir = Path("Guided Research WS22/finetuned-models/010/flan-t5-negate/checkpoint-2000")

# NOTE: rebinds the notebook-level `tokenizer` and `model` used by earlier sections.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

In [None]:
# sents1 are the inputs (prefixed with "negate: " below); sents2 are the target
# sentences used as labels in the loss computation.
sents1 = ["It's rather hot in here.", "This is a red cat with a hat."]
sents2 = ["It's rather cold in here.", "This isn't a red cat with a hat."]

In [68]:
# Prefix every source sentence with the task prompt, then tokenize as one padded batch.
prompts = ["negate: " + sent for sent in sents1]
inputs = tokenizer(prompts, return_tensors="pt", padding=True)

# Deterministic beam search (sampling options penalty_alpha / top_k are left unset).
beam_search = GenerationConfig(do_sample=False, num_beams=4)
output = model.generate(
    **inputs,
    max_length=512,
    generation_config=beam_search,
    num_return_sequences=4,
)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)

print(decoded_output)

["It isn't rather hot in here.", "It's rather hot in here.", "It's not rather hot in here.", "It's rather cold in here.", "This isn't a red cat with a hat.", 'This is not a red cat with a hat.', 'This is a black cat with a hat.', 'This is a white cat with a hat.']


In [80]:
max_source_length = 512
max_target_length = 128
task_prefix = "negate: "

# Tokenizer options shared by the source and the target side.
shared_tok_kwargs = dict(padding="longest", truncation=True, return_tensors="pt")

# Source side: task prefix + sentence.
encoding = tokenizer(
    [task_prefix + sequence for sequence in sents1],
    max_length=max_source_length,
    **shared_tok_kwargs,
)
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

# Target side: the expected outputs.
target_encoding = tokenizer(sents2, max_length=max_target_length, **shared_tok_kwargs)
labels = target_encoding.input_ids

# Padding positions are set to -100 so the loss ignores them.
labels.masked_fill_(labels == tokenizer.pad_token_id, -100)

# Forward pass with labels; the model output carries the loss.
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs.loss
loss.item()

0.16529177129268646

## BLEURT score

In [3]:
import sys
sys.path.append("bleurtMaster")
import tensorflow as tf

In [2]:
from tensorflow.python.client import device_lib
# List the devices TensorFlow can see (the saved output below shows a single CPU device).
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 13331510361615484227
 xla_global_id: -1]

In [20]:
from bleurtMaster.bleurt.score import BleurtScorer

# Load three fine-tuned BLEURT checkpoints; the suffix (200 / 500 / 1000) matches the
# "steps" labels printed by eval_models_on_dataset_bleurt.
# NOTE(review): `bleurt_scorer_ft` is only assigned in commented-out lines here, yet later
# cells call `bleurt_scorer_ft.score(...)`; on a fresh kernel those cells raise NameError
# unless one of the commented assignments is re-enabled.
#bleurt_scorer_ft = BleurtScorer(checkpoint='bleurtMaster/neg_bleurt_checkpoint/export/bleurt_best/1680768470')
bleurt_scorer_ft_200 = BleurtScorer(checkpoint='bleurtMaster/neg_bleurt_new/export/bleurt_best/1683261322')
bleurt_scorer_ft_500 = BleurtScorer(checkpoint='bleurtMaster/neg_bleurt_new_500/export/bleurt_best/1683263275')
bleurt_scorer_ft_1000 = BleurtScorer(checkpoint='bleurtMaster/neg_bleurt_new_1000/export/bleurt_best/1683266066')
#with tf.device("/GPU:0")
#bleurt_scorer_ft = BleurtScorer(checkpoint='bleurtMaster/neg_-1_bleurt_checkpoint/export/bleurt_best/1680782762')
#bleurt_scorer_ft = BleurtScorer(checkpoint='bleurtMaster/neg_bleurt_all_years/export/bleurt_best/1682672013')
#bleurt_scorer_ft = BleurtScorer(checkpoint='bleurtMaster/neg_bleurt_22/export/bleurt_best/1682678300')
#bleurt_scorer_ft = BleurtScorer(checkpoint='bleurtMaster/neg_bleurt_21/export/bleurt_best/1682684660')
#bleurt_scorer_orig = BleurtScorer(checkpoint='bleurtMaster/bleurt/BLEURT-20')

INFO:tensorflow:Reading checkpoint bleurtMaster/neg_bleurt_new/export/bleurt_best/1683261322.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... sp_model:None
INFO:tensorflow:... dynamic_seq_length:False
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.
INFO:tensorflow:Reading checkpoint bleurtMaster/neg_bleurt_new_500/export/bleurt_best/1683263275.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Load

In [18]:
# One reference compared against a negation, an antonym and a negated antonym.
sents1 = ["You are fat."] * 3
sents2 = ["You are not fat.", "You are thin.", "You are not quite thin."]

# NOTE: relies on `bleurt_scorer_ft`, which no uncommented line of the loading cell assigns.
for s1, s2 in zip(sents1, sents2):
    print(s1, s2, sep="\n")
    pair_score = bleurt_scorer_ft.score(references=[s1], candidates=[s2])
    print(pair_score)

You are fat.
You are not fat.
[0.156449556350708]
You are fat.
You are thin.
[-0.6134893298149109]
You are fat.
You are not quite thin.
[-0.6198661923408508]


In [23]:
import numpy as np
import torch
def demetr_accuracy_bleurt(dataset: pd.DataFrame, bleurt_scorer:BleurtScorer) -> "tuple[torch.Tensor, torch.Tensor, torch.Tensor]":
    """Score a DEMETR-style dataset with a BLEURT scorer and compute detection accuracy.

    Args:
        dataset: Frame with `eng_sent`, `mt_sent` and `pert_sent` columns.
        bleurt_scorer: Object whose `score(references=..., candidates=...)` returns
            one score per pair.

    Returns:
        `(accuracy, t_scores, hat_scores)`. `accuracy` is a 0-dim tensor holding the
        fraction of rows where the `mt_sent` score is strictly greater than the
        `pert_sent` score (NaN for an empty dataset); the other two are 1-D tensors
        with one score per row.
    """
    references = list(dataset.eng_sent)
    t_scores = torch.tensor(bleurt_scorer.score(references=references, candidates=list(dataset.mt_sent)))
    hat_scores = torch.tensor(bleurt_scorer.score(references=references, candidates=list(dataset.pert_sent)))
    # Vectorised mean; avoids a Python-level sum over tensor elements and the
    # ZeroDivisionError the manual division raised for an empty dataset.
    accuracy = torch.greater(t_scores, hat_scores).float().mean()
    return accuracy, t_scores, hat_scores


def demetr_ratio_bleurt(dataset: pd.DataFrame, bleurt_scorer:BleurtScorer) -> float:
    """Print detection accuracy and mean score ratio for a BLEURT scorer.

    The ratio of a row is (t - hat) / (t - empty), where `empty` is the score of
    the reference against a bare "." candidate. Returns the mean ratio as a float.
    """
    accuracy, ref_scores, pert_scores = demetr_accuracy_bleurt(dataset, bleurt_scorer)
    print(f"Detection accuracy: {accuracy}")

    n_rows = len(dataset)
    floor = bleurt_scorer.score(references=dataset.eng_sent, candidates=["."] * n_rows)
    floor_scores = torch.tensor(floor)

    per_row = (ref_scores - pert_scores) / (ref_scores - floor_scores)
    mean_ratio = sum(per_row) / n_rows
    print(f"Ratio: {mean_ratio}")
    return mean_ratio.item()


def eval_models_on_dataset_bleurt(dataset: pd.DataFrame) -> dict:
    """Evaluate the three fine-tuned BLEURT checkpoints on one dataset.

    Returns a dict mapping "model_200" / "model_500" / "model_1000" to the mean
    ratio reported by `demetr_ratio_bleurt` for that checkpoint.
    """
    checkpoints = (
        ("200", bleurt_scorer_ft_200),
        ("500", bleurt_scorer_ft_500),
        ("1000", bleurt_scorer_ft_1000),
    )
    dataset_scores = {}
    for steps, scorer in checkpoints:
        print(f"** {steps} steps")
        dataset_scores[f"model_{steps}"] = demetr_ratio_bleurt(dataset, scorer)
    return dataset_scores

# Per-perturbation results: perturbation name -> {checkpoint label -> mean ratio}.
demetr_scores = {}
for pert_name in perturbation_datasets:
    pert_data = perturbation_datasets[pert_name]
    print("* ", pert_name.capitalize())
    dem_rat = eval_models_on_dataset_bleurt(pert_data)
    demetr_scores[pert_name] = dem_rat
    print("\n")

*  Base_id33_empty
** 200 steps
Detection accuracy: 1.0
Ratio: 1.0
** 500 steps
Detection accuracy: 0.9990000128746033
Ratio: 1.0
** 1000 steps
Detection accuracy: 0.9959999918937683
Ratio: 1.0


*  Base_id33_shuffle_trans
** 200 steps
Detection accuracy: 1.0
Ratio: 1.2333643436431885
** 500 steps
Detection accuracy: 1.0
Ratio: 1.4536385536193848
** 1000 steps
Detection accuracy: 1.0
Ratio: 1.4887135028839111


*  Base_id35_reference
** 200 steps
Detection accuracy: 0.004000000189989805
Ratio: -0.4277395009994507
** 500 steps
Detection accuracy: 0.004000000189989805
Ratio: -2.3391990661621094
** 1000 steps
Detection accuracy: 0.006000000052154064
Ratio: -0.822628915309906


*  Critical_id10_numbers_replaced
** 200 steps
Detection accuracy: 0.35499998927116394
Ratio: 0.04136919602751732
** 500 steps
Detection accuracy: 0.35600000619888306
Ratio: 0.05029292777180672
** 1000 steps
Detection accuracy: 0.3529999852180481
Ratio: 0.05777598172426224


*  Critical_id11_gender
** 200 steps
Dete

In [24]:
# Inspect the results collected by the evaluation loop.
# NOTE(review): the saved output shows None for every key, which does not match the
# current eval_models_on_dataset_bleurt (it returns a dict) - presumably stale; re-run.
demetr_scores

{'base_id33_empty': None,
 'base_id33_shuffle_trans': None,
 'base_id35_reference': None,
 'critical_id10_numbers_replaced': None,
 'critical_id11_gender': None,
 'critical_id20_shuffled': None,
 'critical_id21_adj_adv_removed': None,
 'critical_id22_verb_removed': None,
 'critical_id23_noun_removed': None,
 'critical_id24_subj_removed': None,
 'critical_id25_ne_removed': None,
 'critical_id4_codemix': None,
 'critical_id6_addition': None,
 'critical_id7_antonym': None,
 'critical_id8_negation': None,
 'critical_id9_ne_replaced': None,
 'major_id17_tense': None,
 'major_id18_aspect': None,
 'major_id19_question': None,
 'major_id3_hypernym': None,
 'major_id5_pp_removed': None,
 'minor_id12_conj_removed': None,
 'minor_id13_pos_shift': None,
 'minor_id14_word_swap': None,
 'minor_id15_case': None,
 'minor_id16_function_word': None,
 'minor_id1_repeat2': None,
 'minor_id26_misspelled': None,
 'minor_id27_char_removed': None,
 'minor_id28_final_punc_removed': None,
 'minor_id29_punc_addi

## Eval on DEMETR data

In [9]:
import pandas as pd
def load_demetr_dataset(data_path:str) -> pd.DataFrame:
    """Read one DEMETR JSON file into a DataFrame.

    Args:
        data_path: File name relative to the notebook-level `demetr_data_path`.

    Returns:
        The parsed file as a DataFrame.
    """
    # Join with pathlib so the result is correct whether or not
    # `demetr_data_path` ends with a path separator.
    return pd.read_json(Path(demetr_data_path) / data_path)

demetr_data_path = "demetr-main/dataset/"

# Display name -> DEMETR file for the hand-picked perturbations.
# Disabled entries: "gender" (critical_id11_gender.json), "repeat4" (minor_id2_repeat4.json).
_demetr_files = {
    "negation": "critical_id8_negation.json",
    "antonym": "critical_id7_antonym.json",
    "baseline_shuffle": "base_id33_shuffle_trans.json",
    "verb removed": "critical_id22_verb_removed.json",
    "hypernym": "major_id3_hypernym.json",
}
perturbation_datasets = {
    label: load_demetr_dataset(file_name) for label, file_name in _demetr_files.items()
}

In [22]:
import os

# Load every DEMETR file in the dataset directory, keyed by file name without extension.
# Only *.json entries are read (stray files such as READMEs or OS metadata are skipped),
# and they are sorted so the evaluation order is the same on every machine.
perturbation_datasets = {
    json_file.stem: load_demetr_dataset(json_file.name)
    for json_file in sorted(Path(demetr_data_path).glob("*.json"))
}

In [7]:
"""def demetr_accuracy_sent_transform(dataset: pd.DataFrame, model:SentenceTransformer, score_function) -> (float, np.array, np.array):
    t_scores = score_function(dataset.eng_sent, dataset.mt_sent, model)
    hat_scores = score_function(dataset.eng_sent, dataset.pert_sent, model)
    return sum(torch.greater(t_scores, hat_scores)) / len(dataset), t_scores, hat_scores
"""
batch_size = 16
def batchwise_score(sents1, sents2, model, score_function):
    """Apply `score_function` to consecutive chunks of `batch_size` sentence pairs.

    Args:
        sents1: First sentences (any iterable, e.g. a list or a pandas Series).
        sents2: Second sentences, aligned with `sents1`.
        model: Passed through unchanged as the third argument of `score_function`.
        score_function: Callable `(chunk1, chunk2, model) -> scores`.

    Returns:
        List with one `score_function` result per chunk, in input order.
    """
    # Work on plain lists: a sliced pandas Series keeps its original integer labels,
    # so positional indexing inside a scorer (e.g. `chunk[0]`) would fail from the
    # second chunk onwards.
    sents1, sents2 = list(sents1), list(sents2)
    scores = []
    for i in range(0, len(sents1), batch_size):
        scores.append(score_function(sents1[i:i+batch_size], sents2[i:i+batch_size], model))
    return scores

def demetr_accuracy(dataset: pd.DataFrame, model, score_function) -> "tuple[torch.Tensor, torch.Tensor, torch.Tensor]":
    """Score a DEMETR-style dataset batch-wise and compute detection accuracy.

    Args:
        dataset: Frame with `eng_sent`, `mt_sent` and `pert_sent` columns.
        model: Passed through to `score_function`.
        score_function: Callable `(sents1, sents2, model) -> scores for that chunk`.

    Returns:
        `(accuracy, t_scores, hat_scores)`. `accuracy` is a 0-dim tensor holding the
        fraction of rows where the `mt_sent` score is strictly greater than the
        `pert_sent` score; the other two are 1-D tensors with one score per row.
    """
    def _flatten(batched) -> torch.Tensor:
        """Turn the per-chunk results of batchwise_score into one 1-D tensor."""
        if isinstance(batched, torch.Tensor):
            return batched.reshape(-1)
        parts = [torch.as_tensor(chunk).reshape(-1) for chunk in batched]
        return torch.cat(parts) if parts else torch.empty(0)

    # batchwise_score returns a Python list with one result per chunk; comparing two
    # such lists with torch.greater raises TypeError, so flatten them first.
    t_scores = _flatten(batchwise_score(dataset.eng_sent, dataset.mt_sent, model, score_function))
    hat_scores = _flatten(batchwise_score(dataset.eng_sent, dataset.pert_sent, model, score_function))
    accuracy = torch.greater(t_scores, hat_scores).float().mean()
    return accuracy, t_scores, hat_scores

def demetr_ratio(dataset: pd.DataFrame, model, score_function) -> None:
    """Print detection accuracy and mean score ratio for a generic scorer.

    The ratio of a row is (t - hat) / (t - empty), where `empty` is the score of
    `eng_sent` against an empty-string candidate; the printed value is the mean
    over the dataset.

    Args:
        dataset: Frame with `eng_sent`, `mt_sent` and `pert_sent` columns.
        model: Passed through to `score_function`.
        score_function: Callable `(sents1, sents2, model) -> scores for that chunk`.
    """
    acc, t_scores, hat_scores = demetr_accuracy(dataset, model, score_function)
    print(f"Detection accuracy: {acc}")
    # Score the empty-candidate baseline in chunks as well, like the other two score
    # sets, instead of sending the whole dataset through the scorer in one call.
    empty_scores = batchwise_score(dataset.eng_sent, [""] * len(dataset), model, score_function)
    if not isinstance(empty_scores, torch.Tensor):
        empty_scores = torch.cat([torch.as_tensor(chunk).reshape(-1) for chunk in empty_scores])
    ratio = (t_scores - hat_scores) / (t_scores - empty_scores)
    ratio = sum(ratio) / len(dataset)
    print(f"Ratio: {ratio}")

In [None]:
def eval_models_on_dataset(dataset:pd.DataFrame, score_function) -> None:
    """Print accuracy and ratio of the notebook-level `model` on one dataset.

    NOTE(review): the header reads "Base model" but the model evaluated is whatever
    `model` is currently bound to - confirm which one is intended.
    """
    header = "** Base model"
    print(header)
    demetr_ratio(dataset, model, score_function)

# Evaluate every loaded perturbation set with whichever cos_score_batched was defined last.
for pert_name in perturbation_datasets:
    pert_data = perturbation_datasets[pert_name]
    print("* ", pert_name.capitalize())
    eval_models_on_dataset(pert_data, cos_score_batched)
    print("\n")

*  Negation
** Base model


## Semantic similarity: choose the right antonym

In [4]:
import json
import pandas as pd
# The file is read as JSON Lines: every line is parsed as one JSON record.
with open("SemAntoNeg_v1.0.json") as file_obj:
    data_list = [json.loads(line) for line in file_obj]

semsim = pd.DataFrame(data_list)
semsim

Unnamed: 0,idx,label,input,sentences
0,0,2,You're not fat.,"[You're not thin., You're fat., You're thin.]"
1,1,2,You're not fat.,"[You're not nonfat., You're fat., You're nonfat.]"
2,2,2,It's not healthy.,"[It's not unhealthy., It's healthy., It's unhe..."
3,3,2,That's not acceptable.,"[That's not unacceptable., That's acceptable.,..."
4,4,2,I'm not guilty.,"[I'm not innocent., I'm guilty., I'm innocent.]"
...,...,...,...,...
3147,3147,2,I know it is possible.,"[I know it is impossible., I know it is not po..."
3148,3148,2,I know it is possible.,"[I know it is actual., I know it is not possib..."
3149,3149,2,"No, it's a good idea, no.","[No, it's a bad idea, no., No, it's not a good..."
3150,3150,2,"No, it's a good idea, no.","[No, it's an evil idea, no., No, it's not a go..."


In [8]:
from tqdm import tqdm
import torch

# Number of times the input is repeated so it lines up with the candidate list
# (the rows displayed above each list three candidates).
num_samples = 3
base_prefs = []
ft_prefs = []

# For every item, record the index of the candidate the fine-tuned BLEURT scorer
# rates highest. Base-model and SentenceTransformer variants of this loop are disabled.
item_pairs = zip(semsim.input.values, semsim.sentences.values)
for inp, sents in tqdm(item_pairs, total=len(semsim)):
    repeated_input = [inp] * num_samples
    raw_scores = bleurt_scorer_ft.score(references=repeated_input, candidates=sents)
    ft_scores = torch.tensor(raw_scores)
    ft_prefs.append(torch.argmax(ft_scores).item())

100%|██████████| 3152/3152 [06:18<00:00,  8.33it/s]


In [10]:
from sklearn.metrics import accuracy_score
# Compare against as many gold labels as there are predictions.
n_scored = len(ft_prefs)
labels = semsim.label.values[:n_scored]
ft_accuracy = accuracy_score(labels, ft_prefs)
print("Fine-tuned model accuracy: ", ft_accuracy)

Fine-tuned model accuracy:  0.015545685279187817


## EvalEval perturbations

In [16]:
import pandas as pd
# Human scores for machine translation: one row per perturbation type and one
# column per annotator (see the table displayed below).
human_score_mt = pd.read_csv("EvalEvalMain/data/MachineTranslation.csv")
human_score_mt

Unnamed: 0,Perturbations,annotator 1,annotator 2,annotator 3,annotator 4,annotator 5,annotator 6,annotator 7,annotator 8,annotator 9,annotator 10,annotator 11,annotator 12,annotator 13,annotator 14,annotator 15
0,Remove punctuation,0,2,1,1,1.0,1,0,2,3,1,0,2,1,0,0
1,Spelling mistake/typos,2,2,4,3,2.0,4,1,4,2,2,4,3,1,2,1
2,Missing (nltk) stopwords,4,5,2,3,3.0,1,4,5,5,2,5,4,3,3,0
3,Subject-verb disagreement,4,2,1,2,2.0,1,1,4,1,1,3,1,4,3,0
4,Jumbling words,10,9,5,10,7.0,8,10,6,6,5,10,8,9,9,4
5,Adding Negations,9,10,10,10,9.5,10,10,7,10,10,10,8,10,8,10
6,Change number values,6,10,9,8,8.0,10,9,4,10,9,10,6,10,8,9
7,Change names,7,10,7,8,9.0,6,5,7,10,8,10,5,10,8,6
8,Removing named entities,4,7,2,5,7.0,2,3,6,7,3,10,5,8,8,0
9,Retain only stop words,9,10,10,10,9.5,10,10,10,10,10,10,9,10,9,10
