## Loading the dataset

### Copied from preprocessing.ipynb

In [2]:
import pandas as pd
import numpy as np
import datasets
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
train_data_path = "./data/English dataset/train.jsonl"
test_data_path = "./data/English dataset/test.jsonl"

def preprocess_text(text): # From the labs
	# Tokenize the text into words
	words = word_tokenize(text.lower())  # Convert text to lowercase

	# Remove punctuation
	table = str.maketrans('', '', string.punctuation)
	words = [word.translate(table) for word in words if word.isalpha()]

	# Remove stopwords
	stop_words = set(stopwords.words('english'))
	words = [word for word in words if word not in stop_words]

	# Lemmatization
	lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

	# Join the words back into a string
	preprocessed_text = ' '.join(lemmatized_words)
	return preprocessed_text

train_data = pd.DataFrame(datasets.load_dataset("json", data_files=train_data_path)["train"])
test_dataset = pd.DataFrame(datasets.load_dataset("json", data_files=test_data_path)["train"])

label_map = {"Contradiction": 1, "Entailment": 0, "NotMentioned": 0}
train_data["label"] = train_data["label"].map(label_map)
test_dataset["label"] = test_dataset["label"].map(label_map)

train_data = train_data.drop("doc_id", axis=1)
train_data = train_data.drop("key", axis=1)
test_dataset = test_dataset.drop("doc_id", axis=1)
test_dataset = test_dataset.drop("key", axis=1)

train_data["label"].value_counts(normalize=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


label
0    0.883048
1    0.116952
Name: proportion, dtype: float64

This section was already in tims file so when you join the files you can just delete the upper preprocessing section

### My adition

In [3]:
def text_splitter(p, text_lim, overlap=250, allowed_delimiters=" ,.?!"):
    i = text_lim
    while not p[i] in allowed_delimiters and i > 0:
        i -= 1
    p1 = p[:i]

    d = len(p) - i
    if d < text_lim-overlap:
        j = text_lim-overlap
        while not p[j] in allowed_delimiters and j > 0:
            j += 1
        p2 = p[j:]
        return [p1,p2]
    else:
        return p1 + text_splitter(p[(i-overlap):], text_lim, overlap)

def dataset_splitter(df, token_lim = 768, overlap=50, allowed_delimiters=" ,.?!"): # ensures the premises are roughly within the token limit
    text_lim = int(token_lim * 3.5)
    overlap = int(overlap*3.5)
    to_app = []
    for i,r in df.iterrows():
        if len(r['premise']) > text_lim:
            seg = text_splitter(r['premise'], text_lim, overlap=overlap, allowed_delimiters=allowed_delimiters)

            for s in seg:
                nr = {'premise': s, 'hypothesis': r['hypothesis'], 'label': r['label']}
                to_app.append(nr)
            
            df.drop(i)
    
    df.append(pd.DataFrame(to_app))
    return df

In [4]:
from datasets import Dataset

ds = Dataset.from_pandas(train_data)
ds = ds.select_columns(["hypothesis", "premise", "label"])
ds = ds.select_columns(["hypothesis", "premise", "label"])

dss = ds.train_test_split(0.2, seed=42)
train_dataset = dss['train']
valid_dataset = dss['test']
test_dataset = Dataset.from_pandas(test_dataset)

test_corpus = test_dataset['premise']
test_hypothesis = test_dataset['hypothesis']
print(len(test_hypothesis))

2091


In [5]:
print(len(train_dataset), len(valid_dataset))

5752 1439


## Creating a model

### Evaluation

#### Evaluators

In [6]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from collections import defaultdict

def get_ret_eval(test_dataset):
    corpus = dict(zip(test_dataset['premise'], test_dataset['premise']))
    queries = dict(zip(test_dataset['hypothesis'], test_dataset['hypothesis']))
    relevant_docs = defaultdict(list)

    for k in range(len(test_dataset)):
        if test_dataset['label'][k] > 0:
            relevant_docs[test_dataset['hypothesis'][k]].append(test_dataset['premise'][k])
    

    inf_ret_ev = InformationRetrievalEvaluator(
        queries= queries,
        corpus = corpus,
        relevant_docs = relevant_docs,
        #similarity_fn_names= ["cosine"],
        show_progress_bar=True,
        batch_size= 8,
        #main_score_function="Recall@10"
    )

    return inf_ret_ev


In [7]:
from sentence_transformers.evaluation import BinaryClassificationEvaluator

def get_bin_eval(test_dataset):
    """
        BinnaryClassification returns: F1, Percision, Recall, Avg Percision, Matthews Correlation, 
    """
    bin_acc_ev = BinaryClassificationEvaluator(
        sentences1= test_dataset['hypothesis'],
        sentences2= test_dataset['premise'],
        labels= test_dataset['label'],
        similarity_fn_names= ["cosine", "dot"],
        show_progress_bar= True,
        batch_size= 8
    )
    return bin_acc_ev

#### Methods

In [8]:
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from collections import defaultdict

def eval_full_inbuilt(model, test_dataset):
    """
        BinnaryClassification returns: F1, Percision, Recall, Avg Percision, Matthews Correlation, 
    """

    bin_acc_ev = get_bin_eval(test_dataset)

    inf_ret_ev = get_ret_eval(test_dataset)

    result = {}
    result["BinaryClassificaton"] = bin_acc_ev(model)
    result["InformationRetrieval"] = inf_ret_ev(model)

    for meth, d in result.items():
        print()
        print(meth, ": ")
        for k, v in d.items():
            print(k, ": ",v)




### Base model
- Straight from the box, unmodified
-  [msmarco-MiniLM-L6-cos-v5](https://huggingface.co/sentence-transformers/msmarco-MiniLM-L6-cos-v5) Trained specificly for query-passage retrieval

In [9]:
from sentence_transformers import SentenceTransformer, util
import torch

model_name = ".\models\jina-embeddings-v2-small-en" 
base_model = SentenceTransformer(model_name, model_kwargs={"dtype": "float16"})

Some weights of BertModel were not initialized from the model checkpoint at .\models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.d

 float16 should speed up the model, while having minimal impact on preformance: [documentation](https://www.sbert.net/docs/sentence_transformer/usage/efficiency.html)

### Fine tuning the base model

#### Loss

In [10]:
"""
 I | I i
I I| L
"""

'\n I | I i\nI I| L\n'

In [11]:
from sentence_transformers import SentenceTransformerTrainer

fine_model = base_model

##### Setup of Trainers

In [12]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

k = 10

args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/tuned_model",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    seed=42,
    metric_for_best_model=f"eval_recall@{k}",
    greater_is_better=False,
  	load_best_model_at_end=True,
  	weight_decay=0.01,
    
    #warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    #batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=50, # how often we eval
    #save_strategy="best",
    torch_empty_cache_steps = None,
    save_steps=50,
    save_total_limit=2,
    logging_steps=100,
    run_name="mpnet-base-all-nli-triplet",  # Will be used in W&B if `wandb` is installed
)

**Contrastive loss** Used for binary labled pairs

In [13]:
from sentence_transformers.losses import OnlineContrastiveLoss

def trainer_cl(m, train_dataset, valid_dataset, args):
    td = {''}
    loss = OnlineContrastiveLoss(m)

    evaluator = get_ret_eval(valid_dataset)

    trainer = SentenceTransformerTrainer(
        model = m,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        loss=loss,
        args=args
    )

    return trainer

##### Training

Run this if using a trainer

In [14]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


In [None]:
import torch
print(torch.cuda.is_available())

False


In [None]:
trainer = trainer_cl(fine_model, train_dataset, valid_dataset, args)
trainer.train()



Step,Training Loss,Validation Loss
