## Loading the dataset

### Copied from preprocessing.ipynb

In [102]:
import pandas as pd
import numpy as np
import datasets
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
train_data_path = "./data/English dataset/train.jsonl"
test_data_path = "./data/English dataset/test.jsonl"

def preprocess_text(text): # From the labs
	# Tokenize the text into words
	words = word_tokenize(text.lower())  # Convert text to lowercase

	# Remove punctuation
	table = str.maketrans('', '', string.punctuation)
	words = [word.translate(table) for word in words if word.isalpha()]

	# Remove stopwords
	stop_words = set(stopwords.words('english'))
	words = [word for word in words if word not in stop_words]

	# Lemmatization
	lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

	# Join the words back into a string
	preprocessed_text = ' '.join(lemmatized_words)
	return preprocessed_text

train_data = pd.DataFrame(datasets.load_dataset("json", data_files=train_data_path)["train"])
test_dataset = pd.DataFrame(datasets.load_dataset("json", data_files=test_data_path)["train"])

label_map = {"Contradiction": 1, "Entailment": 0, "NotMentioned": 0}
train_data["label"] = train_data["label"].map(label_map)
test_dataset["label"] = test_dataset["label"].map(label_map)

train_data = train_data.drop("doc_id", axis=1)
train_data = train_data.drop("key", axis=1)
test_dataset = test_dataset.drop("doc_id", axis=1)
test_dataset = test_dataset.drop("key", axis=1)

train_data["label"].value_counts(normalize=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


label
0    0.883048
1    0.116952
Name: proportion, dtype: float64

In [None]:
def get_data():
    train_data = pd.DataFrame(datasets.load_dataset("json", data_files=train_data_path)["train"])
    test_dataset = pd.DataFrame(datasets.load_dataset("json", data_files=test_data_path)["train"])

In [103]:
def text_splitter(p, text_lim, overlap=250, allowed_delimiters=" ,.?!"):
    i = text_lim
    while not p[i] in allowed_delimiters and i > 0:
        i -= 1
    p1 = p[:i]

    d = len(p) - i
    if d < text_lim-overlap:
        j = text_lim-overlap
        while not p[j] in allowed_delimiters and j > 0:
            j += 1
        p2 = p[j:]
        return [p1,p2]
    else:
        return p1 + text_splitter(p[(i-overlap):], text_lim, overlap)

def dataset_splitter(df, token_lim = 768, overlap=50, allowed_delimiters=" ,.?!"): # ensures the premises are roughly within the token limit
    text_lim = int(token_lim * 3.5)
    overlap = int(overlap*3.5)
    to_app = []
    for i,r in df.iterrows():
        if len(r['premise']) > text_lim:
            seg = text_splitter(r['premise'], text_lim, overlap=overlap, allowed_delimiters=allowed_delimiters)

            for s in seg:
                nr = {'premise': s, 'hypothesis': r['hypothesis'], 'label': r['label']}
                to_app.append(nr)
            
            df.drop(i)
    
    df.append(pd.DataFrame(to_app))
    return df

In [104]:
from datasets import Dataset

ds = Dataset.from_pandas(train_data)
ds = ds.select_columns(["hypothesis", "premise", "label"])
ds = ds.select_columns(["hypothesis", "premise", "label"])

dss = ds.train_test_split(0.2, seed=42)
train_dataset = dss['train']
valid_dataset = dss['test']
test_dataset = Dataset.from_pandas(test_dataset)

test_corpus = test_dataset['premise']
test_hypothesis = test_dataset['hypothesis']
print(len(test_hypothesis))

2091


In [106]:
print(len(set(ds['hypothesis'])), len(ds))

17 7191


In [110]:
print(len(set(test_dataset['hypothesis'])), len(test_dataset))

17 2091


## Creating a model

### Evaluation

#### Evaluators

In [119]:
# Custom evaluator
from sentence_transformers.evaluation import SentenceEvaluator
from collections import defaultdict
from sentence_transformers.util import cos_sim
import torch

class MyRecallEval(SentenceEvaluator):
    def structure(test_dataset):
        corpus = dict(zip(test_dataset['premise'], test_dataset['premise']))
        queries = dict(zip(test_dataset['hypothesis'], test_dataset['hypothesis']))
        relevant_docs = defaultdict(list)

        for k in range(len(test_dataset)):
            if test_dataset['label'][k] > 0:
                relevant_docs[test_dataset['hypothesis'][k]].append(test_dataset['premise'][k])

        return (queries, corpus, relevant_docs)
        
    def __init__(self, data, recall_ks=(5, 10, 20, 50), cluster_k=50,cluster_min_hits=10,name: str = ""):
        super().__init__()
        hypotheses, premises, relevant_premises = MyRecallEval.structure(data)
        self.hypotheses = hypotheses
        self.premises = premises
        self.relevant_premises = relevant_premises
        self.recall_ks = recall_ks
        self.cluster_k = cluster_k
        self.cluster_min_hits = cluster_min_hits
        self.name = name

        self.greater_is_better = True
        self.primary_metric = f"recall@{max(recall_ks)}"

        # Fixed ordering (important!)
        self.hyp_ids = list(hypotheses.keys())
        self.premise_ids = list(premises.keys())

    def __call__(self, model, output_path=None, epoch=-1, steps=-1):
        # 1. Encode
        hyp_texts = [self.hypotheses[h] for h in self.hyp_ids]
        prem_texts = [self.premises[p] for p in self.premise_ids]

        hyp_emb = model.encode(hyp_texts, convert_to_tensor=True, normalize_embeddings=True)
        prem_emb = model.encode(prem_texts, convert_to_tensor=True, normalize_embeddings=True)

        # 2. Similarity matrix
        # TODO should we use cos_sim?
        scores = cos_sim(hyp_emb, prem_emb)  # shape: [num_hyp, num_prem]

        recalls = {k: [] for k in self.recall_ks}
        normrecalls = {k: [] for k in self.recall_ks}

        cluster_success = []

        # 3. Per-hypothesis evaluation
        for i, hyp_id in enumerate(self.hyp_ids):
            relevant = self.relevant_premises[hyp_id]
            if not relevant:
                continue

            relevant_idx = {self.premise_ids.index(pid) for pid in relevant}

            ranked = torch.argsort(scores[i], descending=True)

            for k in self.recall_ks:
                topk = ranked[:k].tolist()
                hits = len(set(topk) & relevant_idx)
                normrecalls[k].append(hits / min(k,len(relevant_idx)))
                recalls[k].append(hits / len(relevant_idx))

            # Cluster recall
            top_cluster = ranked[: self.cluster_k].tolist()
            hits = len(set(top_cluster) & relevant_idx)
            cluster_success.append(hits >= self.cluster_min_hits)

        # 4. Aggregate metrics
        metrics = {
            f"recall@{k}": float(np.mean(recalls[k])) for k in self.recall_ks
        }
        for k in self.recall_ks:
            metrics[f"normalized_recall@{k}"] = float(np.mean(normrecalls[k]))
        metrics["cluster_recall"] = float(np.mean(cluster_success))

        # Optional: store in model card
        self.store_metrics_in_model_card_data(model, metrics, epoch, steps)

        return self.prefix_name_to_metrics(metrics, self.name)

In [60]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from collections import defaultdict

def get_ret_eval(test_dataset):
    corpus = dict(zip(test_dataset['premise'], test_dataset['premise']))
    queries = dict(zip(test_dataset['hypothesis'], test_dataset['hypothesis']))
    relevant_docs = defaultdict(list)

    for k in range(len(test_dataset)):
        if test_dataset['label'][k] > 0:
            relevant_docs[test_dataset['hypothesis'][k]].append(test_dataset['premise'][k])
    

    inf_ret_ev = InformationRetrievalEvaluator(
        queries= queries,
        corpus = corpus,
        relevant_docs = relevant_docs,
        #similarity_fn_names= ["cosine"],
        show_progress_bar=True,
        batch_size= 16,
        #main_score_function="Recall@10"
    )

    return inf_ret_ev


In [61]:
from sentence_transformers.evaluation import BinaryClassificationEvaluator

def get_bin_eval(test_dataset):
    """
        BinnaryClassification returns: F1, Percision, Recall, Avg Percision, Matthews Correlation, 
    """
    bin_acc_ev = BinaryClassificationEvaluator(
        sentences1= test_dataset['hypothesis'],
        sentences2= test_dataset['premise'],
        labels= test_dataset['label'],
        similarity_fn_names= ["cosine", "dot"],
        show_progress_bar= True,
        batch_size= 16
    )
    return bin_acc_ev

#### Methods

### Base model
- Straight from the box, unmodified
-  [msmarco-MiniLM-L6-cos-v5](https://huggingface.co/sentence-transformers/msmarco-MiniLM-L6-cos-v5) Trained specificly for query-passage retrieval

In [73]:
from sentence_transformers import SentenceTransformer

model_name = "models\jina-embeddings-v2-small-en" 
base_model = SentenceTransformer(model_name)

Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

 float16 should speed up the model, while having minimal impact on preformance: [documentation](https://www.sbert.net/docs/sentence_transformer/usage/efficiency.html)

### Fine tuning the base model

#### Loss

In [74]:
"""
 I | I i
I I| L
"""

'\n I | I i\nI I| L\n'

In [75]:
from sentence_transformers import SentenceTransformerTrainer

fine_model = base_model

##### Setup of Trainers

In [76]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

k = 10

args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/tuned_model",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-5,
    seed=42,
    metric_for_best_model=f"eval_cosine_recall@10",
    #greater_is_better=False,
  	load_best_model_at_end=True,
  	weight_decay=0.01,
    
    #warmup_ratio=0.1,
    #fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    #bf16=False,  # Set to True if you have a GPU that supports BF16
    #batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=50, # how often we eval
    #save_strategy="best",
    torch_empty_cache_steps = None,
    save_steps=50,
    save_total_limit=2,
    logging_steps=100,
    run_name="mpnet-base-all-nli-triplet",  # Will be used in W&B if `wandb` is installed
)

**Contrastive loss** Used for binary labled pairs

In [77]:
from sentence_transformers.losses import ContrastiveLoss, MultipleNegativesRankingLoss

def trainer_cl(m, train_dataset, valid_dataset, args): # bad trainer, assumes a hypothesis has only one correct answer
    td = {''}
    loss = ContrastiveLoss(m)

    evaluator = get_ret_eval(valid_dataset)

    trainer = SentenceTransformerTrainer(
        model = m,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        loss=loss,
        evaluator=evaluator,
        args=args
    )

    return trainer

In [92]:
from sentence_transformers.losses import MultipleNegativesRankingLoss

def trainer_mnr(m, train_dataset, valid_dataset, args):
    td = {''}
    loss = MultipleNegativesRankingLoss(m)

    evaluator = get_ret_eval(valid_dataset)

    trainer = SentenceTransformerTrainer(
        model = m,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        loss=loss,
        evaluator=evaluator,
        args=args
    )

    return trainer

##### Training

Run this if using a trainer

In [93]:
import torch
print(torch.cuda.is_available())

True


In [94]:
print(set(test_dataset['label']))

{0, 1}


In [95]:
trainer = trainer_mnr(base_model, train_dataset, valid_dataset, args)
ev = get_ret_eval(test_dataset)
test_dataset = test_dataset.select_columns(["hypothesis", "premise", "label"])

ret = trainer.evaluate(eval_dataset=test_dataset)
for k, v in ret.items():
    print(k,v)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:45<00:00, 45.02s/it]

eval_loss 4.124613285064697
eval_model_preparation_time 0.001
eval_cosine_accuracy@1 0.1111111111111111
eval_cosine_accuracy@3 0.2222222222222222
eval_cosine_accuracy@5 0.4444444444444444
eval_cosine_accuracy@10 0.4444444444444444
eval_cosine_precision@1 0.1111111111111111
eval_cosine_precision@3 0.14814814814814814
eval_cosine_precision@5 0.17777777777777776
eval_cosine_precision@10 0.15555555555555556
eval_cosine_recall@1 0.0031746031746031746
eval_cosine_recall@3 0.021869488536155203
eval_cosine_recall@5 0.06739248405915071
eval_cosine_recall@10 0.09457332790666123
eval_cosine_ndcg@10 0.1673362937459519
eval_cosine_mrr@10 0.2222222222222222
eval_cosine_map@100 0.10373468805936607
eval_runtime 350.3169
eval_samples_per_second 5.969
eval_steps_per_second 0.374





In [96]:
print(ret)

{'eval_loss': 4.124613285064697, 'eval_model_preparation_time': 0.001, 'eval_cosine_accuracy@1': 0.1111111111111111, 'eval_cosine_accuracy@3': 0.2222222222222222, 'eval_cosine_accuracy@5': 0.4444444444444444, 'eval_cosine_accuracy@10': 0.4444444444444444, 'eval_cosine_precision@1': 0.1111111111111111, 'eval_cosine_precision@3': 0.14814814814814814, 'eval_cosine_precision@5': 0.17777777777777776, 'eval_cosine_precision@10': 0.15555555555555556, 'eval_cosine_recall@1': 0.0031746031746031746, 'eval_cosine_recall@3': 0.021869488536155203, 'eval_cosine_recall@5': 0.06739248405915071, 'eval_cosine_recall@10': 0.09457332790666123, 'eval_cosine_ndcg@10': 0.1673362937459519, 'eval_cosine_mrr@10': 0.2222222222222222, 'eval_cosine_map@100': 0.10373468805936607, 'eval_runtime': 350.3169, 'eval_samples_per_second': 5.969, 'eval_steps_per_second': 0.374}


In [97]:
print(len((valid_dataset['hypothesis'])))

1439


In [98]:
trainer = trainer_mnr(fine_model, train_dataset, valid_dataset, args)
trainer.train()

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
50,No log,2.555478,0.222222,0.222222,0.333333,0.444444,0.222222,0.148148,0.155556,0.166667,0.005311,0.010623,0.02828,0.053482,0.171464,0.265873,0.109325
100,2.677200,2.279555,0.333333,0.444444,0.444444,0.444444,0.333333,0.333333,0.288889,0.255556,0.009427,0.03651,0.057342,0.086818,0.280941,0.388889,0.170619
150,2.677200,2.203582,0.333333,0.444444,0.444444,0.444444,0.333333,0.333333,0.311111,0.255556,0.009427,0.03651,0.061457,0.087758,0.283055,0.388889,0.194654
200,2.259500,2.162162,0.333333,0.444444,0.444444,0.555556,0.333333,0.296296,0.266667,0.233333,0.009427,0.034373,0.045937,0.10516,0.259081,0.382716,0.182064
250,2.259500,2.047812,0.333333,0.444444,0.555556,0.555556,0.333333,0.37037,0.311111,0.222222,0.009427,0.040625,0.159185,0.177097,0.29377,0.398148,0.209145
300,2.122500,2.012819,0.333333,0.444444,0.555556,0.555556,0.333333,0.333333,0.311111,0.211111,0.009427,0.03651,0.159185,0.17496,0.28627,0.411111,0.209028
350,2.122500,2.009597,0.333333,0.444444,0.555556,0.555556,0.333333,0.37037,0.311111,0.211111,0.009427,0.040625,0.159185,0.17496,0.287672,0.411111,0.208326


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:09<00:00,  9.85s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:10<00:00, 10.93s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:08<00:00,  8.56s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:34<00:00, 34.33s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:34<00:00, 34.23s/it]


TrainOutput(global_step=360, training_loss=2.3068416171603734, metrics={'train_runtime': 1271.6642, 'train_samples_per_second': 4.523, 'train_steps_per_second': 0.283, 'total_flos': 0.0, 'train_loss': 2.3068416171603734, 'epoch': 1.0})

In [99]:
ev = get_ret_eval(test_dataset)
test_dataset = test_dataset.select_columns(["hypothesis", "premise", "label"])

ret = trainer.evaluate(eval_dataset=test_dataset)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:35<00:00, 35.71s/it]


In [100]:
for k, v in ret.items():
    print(k,v)

eval_loss 2.1264894008636475
eval_cosine_accuracy@1 0.3333333333333333
eval_cosine_accuracy@3 0.4444444444444444
eval_cosine_accuracy@5 0.5555555555555556
eval_cosine_accuracy@10 0.5555555555555556
eval_cosine_precision@1 0.3333333333333333
eval_cosine_precision@3 0.3703703703703704
eval_cosine_precision@5 0.3111111111111111
eval_cosine_precision@10 0.2222222222222222
eval_cosine_recall@1 0.00942658164880387
eval_cosine_recall@3 0.04062542395875729
eval_cosine_recall@5 0.15918464251797584
eval_cosine_recall@10 0.17709718265273822
eval_cosine_ndcg@10 0.2937700524782729
eval_cosine_mrr@10 0.39814814814814814
eval_cosine_map@100 0.20914505536318057
eval_runtime 272.435
eval_samples_per_second 7.675
eval_steps_per_second 0.481


In [121]:
ev = MyRecallEval(test_dataset)
metrics = ev(trainer.model)
print(metrics)

{'recall@5': 0.026174972314507196, 'recall@10': 0.05922702104097453, 'recall@20': 0.12510077519379842, 'recall@50': 0.2801063122923588, 'normalized_recall@5': 0.2, 'normalized_recall@10': 0.22999999999999998, 'normalized_recall@20': 0.23499999999999996, 'normalized_recall@50': 0.3085714285714286, 'cluster_recall': 0.4}


In [123]:
def hihi(test_dataset):
    corpus = dict(zip(test_dataset['premise'], test_dataset['premise']))
    queries = dict(zip(test_dataset['hypothesis'], test_dataset['hypothesis']))
    relevant_docs = defaultdict(list)

    for k in range(len(test_dataset)):
        if test_dataset['label'][k] > 0:
            relevant_docs[test_dataset['hypothesis'][k]].append(test_dataset['premise'][k])
    
    for k, v in relevant_docs.items():
        print(len(v))

hihi(test_dataset)

88
36
25
30
3
25
4
1
7
1
