In [15]:
from pathlib import Path
import torch
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import pandas as pd
from tqdm import tqdm
import numpy as np

## Sentence Transformer score

In [2]:
# Identifier of the fine-tuning run; it names the sub-folder the checkpoint is read from.
timestamp = "2023-02-17_15-02-13"

# Every project artefact referenced below is addressed relative to this directory.
project_base_path = Path("Guided Research WS22")
negation_dataset = project_base_path.joinpath("data", "negation_dataset_labeled.tsv")

# Identifier of the pretrained model in "<organisation>/<model>" form.
base_model = "sentence-transformers/all-mpnet-base-v2"
# Keep only the "<model>" part of the identifier and tag it as the negation variant.  TODO.
output_model_name = base_model.split("/")[1] + "-negation"
model_save_path = str(project_base_path.joinpath("finetuned-models", timestamp, output_model_name))

In [3]:
# Use the first CUDA device when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# `model_save_path` is the fine-tuned checkpoint directory.  `base_model` holds the
# pretrained model's identifier string when this cell starts and is rebound to the loaded
# model object here.
# NOTE(review): because of that rebinding, re-running this cell without first re-running
# the configuration cell passes a model object instead of the identifier string.
finetuned_model = SentenceTransformer(model_save_path)
base_model = SentenceTransformer(base_model)
print(device)
base_model.to(device)
finetuned_model.to(device)

cuda:0


SentenceTransformer(
  (0): Transformer({'max_seq_length': 75, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [41]:
def cos_score(reference: str, candidate: str, model:SentenceTransformer) -> float:
    """Return the cosine similarity of two sentences under ``model``.

    Each sentence is embedded by its own ``model.encode`` call (reference first), the two
    embeddings are compared with ``util.cos_sim`` and the single resulting value is
    returned as a Python number via ``.item()``.

    Args:
        reference: The reference sentence.
        candidate: The sentence compared against the reference.
        model: Object whose ``encode`` method produces the sentence embeddings.
    """
    reference_vec, candidate_vec = (model.encode(text) for text in (reference, candidate))
    similarity = util.cos_sim(reference_vec, candidate_vec)
    return similarity.item()

def cos_score_batched(references: list, candidates: list, model: "SentenceTransformer", batch_size=8) -> torch.Tensor:
    """Cosine similarity between each reference and the candidate at the same position.

    Only the N aligned pairs are scored; no N x N similarity matrix is built, so time and
    memory grow linearly with the number of pairs.

    Args:
        references: Sized iterable of reference sentences (e.g. a list or a pandas Series).
        candidates: Sized iterable of candidate sentences, same length as ``references``.
        model: Object whose ``encode(sentences, batch_size=...)`` returns one embedding
            row per sentence.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        1-D tensor with one similarity per pair; an empty tensor when no pairs are given.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    # Work on plain lists so elements are always addressed by position, whatever sized
    # iterable was passed in (for instance a pandas Series whose index is not 0..n-1).
    references, candidates = list(references), list(candidates)
    assert len(references) == len(candidates), "Number of references and candidates must be equal"
    if not references:
        # Nothing to encode: return an empty score vector instead of calling the model.
        return torch.empty(0)

    emb_ref = torch.as_tensor(model.encode(references, batch_size=batch_size))
    emb_cand = torch.as_tensor(model.encode(candidates, batch_size=batch_size))

    # Cosine similarity = dot product of the L2-normalised rows.
    emb_ref = torch.nn.functional.normalize(emb_ref, p=2, dim=1)
    emb_cand = torch.nn.functional.normalize(emb_cand, p=2, dim=1)
    return (emb_ref * emb_cand).sum(dim=1)

# Two sentence pairs: an antonym swap and an explicit negation.
sents1 = [
    "It's rather hot in here.",
    "This is a red cat with a hat.",
]
sents2 = [
    "It's rather cold in here.",
    "This isn't a red cat with a hat.",
]

labelled_models = (
    ("Base model score", base_model),
    ("Fine-tuned model score", finetuned_model),
)

# Single-pair scoring on the first pair only ...
for label, st_model in labelled_models:
    print(label, cos_score(sents1[0], sents2[0], st_model))
print("\n")
# ... then batched scoring on both pairs.
for label, st_model in labelled_models:
    print(label, cos_score_batched(sents1, sents2, st_model))

Base model score 0.6408535838127136
Fine-tuned model score 0.3927837014198303


Base model score tensor([0.6409, 0.8470])
Fine-tuned model score tensor([0.3928, 0.5079])


## Seq2seq score

In [64]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

In [62]:
# Directory of the seq2seq checkpoint; tokenizer and model are both read from it.
model_dir = Path("Guided Research WS22") / "finetuned-models" / "010" / "flan-t5-negate" / "checkpoint-2000"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

In [None]:
# Source sentences (sents1) and the corresponding target sentences (sents2)
# used by the seq2seq cells below.
sents1 = [
    "It's rather hot in here.",
    "This is a red cat with a hat.",
]
sents2 = [
    "It's rather cold in here.",
    "This isn't a red cat with a hat.",
]

In [68]:
# Prefix every source sentence with the task prompt before tokenising.
prompts = [f"negate: {sent}" for sent in sents1]
inputs = tokenizer(prompts, return_tensors="pt", padding=True)

# No sampling, 4 beams; all 4 sequences are returned for each input sentence.
beam_search_config = GenerationConfig(do_sample=False, num_beams=4)
output = model.generate(
    **inputs,
    max_length=512,
    generation_config=beam_search_config,
    num_return_sequences=4,
)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)

print(decoded_output)

["It isn't rather hot in here.", "It's rather hot in here.", "It's not rather hot in here.", "It's rather cold in here.", "This isn't a red cat with a hat.", 'This is not a red cat with a hat.', 'This is a black cat with a hat.', 'This is a white cat with a hat.']


In [80]:
# Truncation limits (in tokens) for the source and target sequences.
max_source_length = 512
max_target_length = 128

# encode the inputs
task_prefix = "negate: "

encoding = tokenizer(
    [task_prefix + sequence for sequence in sents1],
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)

input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

# encode the targets
target_encoding = tokenizer(sents2,
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
)
# Clone before masking: writing -100 straight into target_encoding.input_ids would leave
# the stored target ids corrupted for any later use of `target_encoding`.
labels = target_encoding.input_ids.clone()

# replace padding token id's of the labels by -100 so it's ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100

# forward pass; only the scalar loss is read, so gradient tracking is switched off
with torch.no_grad():
    loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
loss.item()

0.16529177129268646

## Eval on DEMETR data

In [57]:
def load_demetr_dataset(data_path:str) -> pd.DataFrame:
    """Parse ``data_path`` with ``pd.read_json`` and return the resulting DataFrame."""
    return pd.read_json(data_path)

demetr_data_path = "demetr-main/dataset/"

# Perturbation name -> JSON file (inside `demetr_data_path`) holding its examples.
_perturbation_files = {
    "negation": "critical_id8_negation.json",
    "antonym": "critical_id7_antonym.json",
    "verb removed": "critical_id22_verb_removed.json",
    "hypernym": "major_id3_hypernym.json",
}

# Load every file once; keys keep the order of the table above.
perturbation_datasets = {
    name: load_demetr_dataset(demetr_data_path + file_name)
    for name, file_name in _perturbation_files.items()
}

In [53]:
def demetr_accuracy(dataset: pd.DataFrame, model: "SentenceTransformer", score_function) -> "tuple[torch.Tensor, torch.Tensor, torch.Tensor]":
    """Fraction of rows in which ``mt_sent`` scores strictly higher than ``pert_sent``.

    Both sentences are scored against the same row's ``eng_sent`` via
    ``score_function(references, candidates, model)``.
    NOTE(review): presumably ``mt_sent`` is the unperturbed translation and ``pert_sent``
    its perturbed version - confirm against the DEMETR data description.

    Args:
        dataset: DataFrame with the columns ``eng_sent``, ``mt_sent`` and ``pert_sent``.
        model: Passed through unchanged as the third argument of ``score_function``.
        score_function: Callable returning one score per row as a tensor.

    Returns:
        ``(accuracy, t_scores, hat_scores)``: ``accuracy`` is a 0-dim tensor, the other
        two are the score tensors for ``mt_sent`` and ``pert_sent``. For an empty
        ``dataset`` the accuracy is NaN.
    """
    t_scores = score_function(dataset.eng_sent, dataset.mt_sent, model)
    hat_scores = score_function(dataset.eng_sent, dataset.pert_sent, model)
    # Count the detections with a single tensor reduction; the built-in sum() would
    # iterate over the tensor element by element in Python.
    detected = torch.greater(t_scores, hat_scores)
    accuracy = detected.sum() / len(dataset)
    return accuracy, t_scores, hat_scores

def demetr_ratio(dataset: pd.DataFrame, model:SentenceTransformer, score_function) -> None:
    """Print the detection accuracy and the mean score-drop ratio for ``dataset``.

    For every row the ratio is ``(t - hat) / (t - empty)``, where ``t`` and ``hat`` are
    the scores returned by ``demetr_accuracy`` and ``empty`` is the score of the row's
    ``eng_sent`` against the empty string. The printed ratio is the mean over all rows.

    Args:
        dataset: DataFrame with the columns read by ``demetr_accuracy`` (``eng_sent``,
            ``mt_sent``, ``pert_sent``).
        model: Passed through unchanged to ``score_function``.
        score_function: Callable ``(references, candidates, model)`` returning one score
            per row as a tensor.
    """
    acc, t_scores, hat_scores = demetr_accuracy(dataset, model, score_function)
    print(f"Detection accuracy: {acc}")
    # Score of each reference against an empty candidate; used as the denominator's baseline.
    empty_scores = score_function(dataset.eng_sent, [""] * len(dataset), model)
    # NOTE: a row where t_scores equals empty_scores divides by zero and yields inf/nan.
    ratio = (t_scores - hat_scores) / (t_scores - empty_scores)
    # Reduce on the tensor itself; the built-in sum() would loop over it in Python.
    ratio = ratio.sum() / len(dataset)
    print(f"Ratio: {ratio}")

In [60]:
def eval_models_on_dataset(dataset:pd.DataFrame, score_function) -> None:
    """Print the ``demetr_ratio`` report for the base and then the fine-tuned model.

    Uses the notebook-level ``base_model`` and ``finetuned_model`` objects; each report
    is preceded by a heading line naming the model.
    """
    reports = (
        ("** Base model", base_model),
        ("** Fine-tuned model", finetuned_model),
    )
    for heading, sentence_model in reports:
        print(heading)
        demetr_ratio(dataset, sentence_model, score_function)

# One report per perturbation type, in the order the datasets were registered.
for pert_name in perturbation_datasets:
    pert_data = perturbation_datasets[pert_name]
    print("* ", pert_name.capitalize())
    eval_models_on_dataset(pert_data, cos_score_batched)
    print("\n")

*  Negation
** Base model
Detection accuracy: 0.972000002861023
Ratio: 0.13973526656627655
** Fine-tuned model
Detection accuracy: 0.9929999709129333
Ratio: 0.49932998418807983


*  Antonym
** Base model
Detection accuracy: 0.925000011920929
Ratio: 0.06956008076667786
** Fine-tuned model
Detection accuracy: 0.9459999799728394
Ratio: 0.2047116905450821


*  Verb removed
** Base model
Detection accuracy: 0.8149999976158142
Ratio: 0.03189469873905182
** Fine-tuned model
Detection accuracy: 0.8140000104904175
Ratio: 0.034169603139162064


*  Hypernym
** Base model
Detection accuracy: 0.7850000262260437
Ratio: 0.03764893859624863
** Fine-tuned model
Detection accuracy: 0.8069999814033508
Ratio: 0.03380375728011131




In [50]:
import numpy as np
# Scratch check: element-wise "greater than" between two tensors gives a boolean tensor.
a = torch.tensor([0.5, 0.2, 0.7])
b = torch.full_like(a, 0.4)
a > b

tensor([ True, False,  True])

In [51]:
# Scratch check: dividing a float tensor by an integer tensor is element-wise true division.
a = torch.tensor([0.5, 0.2, 0.7])
b = torch.tensor([2, 2, 2])
torch.div(a, b)

tensor([0.2500, 0.1000, 0.3500])