## Loading the dataset

### Copied from preprocessing.ipynb

In [None]:
import pandas as pd
import numpy as np
import datasets
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string


# Fetch the NLTK resources the preprocessing below relies on: the tokenizer
# models, the stop-word lists, and the WordNet data used by the lemmatizer.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the tokenizer tables from that package; without it `word_tokenize` fails with
# a LookupError on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Single lemmatizer instance shared by every call to `preprocess_text`.
lemmatizer = WordNetLemmatizer()

# Locations of the JSONL splits, relative to the notebook's working directory.
train_data_path = "./data/English dataset/train.jsonl"
test_data_path = "./data/English dataset/test.jsonl"

def _english_stop_words():
	"""Return the NLTK English stop words as a set, reading the corpus only once."""
	cached = getattr(_english_stop_words, "_cache", None)
	if cached is None:
		cached = frozenset(stopwords.words('english'))
		_english_stop_words._cache = cached
	return cached


def preprocess_text(text): # From the labs
	"""Normalise raw text into a space-separated string of lemmatized words.

	Steps, in order: lower-case and tokenize, strip punctuation characters,
	keep only purely alphabetic tokens, drop English stop words, lemmatize.

	Args:
		text: The string to clean.

	Returns:
		The surviving lemmas joined by single spaces (an empty string when
		nothing survives the filters).
	"""
	# Tokenize the lower-cased text into words
	tokens = word_tokenize(text.lower())

	# Remove punctuation BEFORE the alphabetic filter. Filtering first made the
	# translate step a no-op and discarded whole tokens such as
	# "non-disclosure" instead of keeping their letters.
	table = str.maketrans('', '', string.punctuation)
	stripped = (token.translate(table) for token in tokens)
	# isalpha() also rejects the empty strings left by punctuation-only tokens.
	words = [token for token in stripped if token.isalpha()]

	# Remove stopwords (the set is cached, not rebuilt on every call)
	stop_words = _english_stop_words()
	words = [word for word in words if word not in stop_words]

	# Lemmatization
	lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

	# Join the words back into a string
	return ' '.join(lemmatized_words)

# Binary target: 1 marks a contradiction, every other class collapses to 0.
label_map = {"Contradiction": 1, "Entailment": 0, "NotMentioned": 0}

# The "json" loader files everything under a "train" key, whichever file it
# reads, so both splits are pulled from that key.
train_data = pd.DataFrame(datasets.load_dataset("json", data_files=train_data_path)["train"])
test_data = pd.DataFrame(datasets.load_dataset("json", data_files=test_data_path)["train"])

# Re-encode the labels and discard the identifier columns in one pass per split.
train_data = train_data.assign(label=train_data["label"].map(label_map)).drop(columns=["doc_id", "key"])
test_data = test_data.assign(label=test_data["label"].map(label_map)).drop(columns=["doc_id", "key"])

# Class balance of the training labels, as fractions.
train_data["label"].value_counts(normalize=True)

This section was already in Tim's file, so when you join the files you can just delete the preprocessing section above.

### My addition

In [None]:
from datasets import Dataset

# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42

# Keep only the sentence pair and its label for fine-tuning.
ds = Dataset.from_pandas(train_data)
ds = ds.select_columns(["hypothesis", "premise", "label"])

# Hold out 30% of the training data for validation.
dss = ds.train_test_split(test_size=0.3, seed=SPLIT_SEED)
train_dataset = dss['train']
valid_dataset = dss['test']
test_dataset = Dataset.from_pandas(test_data)

# Column views of the test split used later on.
test_corpus = test_dataset['premise']
test_hypothesis = test_dataset['hypothesis']
print(len(test_hypothesis))

In [None]:
# Report how many examples landed in each split: training first, then validation.
print(*(len(split) for split in (train_dataset, valid_dataset)))

## Creating a model

### Base model
- Straight from the box, unmodified
-  [msmarco-MiniLM-L6-cos-v5](https://huggingface.co/sentence-transformers/msmarco-MiniLM-L6-cos-v5), trained specifically for query-passage retrieval

In [None]:
from sentence_transformers import SentenceTransformer, util

# Local directory holding the pretrained checkpoint.
model_name = "./models/msmarco-MiniLM-L6-cos-v5"

# Load the weights in half precision.
# NOTE(review): some transformers releases spell this key `torch_dtype` rather
# than `dtype` — confirm against the installed version.
base_model = SentenceTransformer(
    model_name,
    model_kwargs=dict(dtype="float16"),
)

 float16 should speed up the model while having minimal impact on performance: [documentation](https://www.sbert.net/docs/sentence_transformer/usage/efficiency.html)

### Fine tuning the base model

#### Loss

In [None]:
"""
 I | I i
I I| L
"""

In [None]:
import copy

from sentence_transformers import SentenceTransformerTrainer

# Fine-tune an independent copy. A plain `fine_model = base_model` only binds a
# second name to the same object, so training would overwrite the weights of
# the "unmodified" baseline and leave nothing to compare against.
fine_model = copy.deepcopy(base_model)

##### Setup of Trainers

**Contrastive loss**: used for binary-labeled pairs

In [None]:
from sentence_transformers.losses import ContrastiveLoss

def trainer_cl(m, train_dataset, valid_dataset):
    """Build a `SentenceTransformerTrainer` that optimises `m` with `ContrastiveLoss`.

    Args:
        m: The model to fine-tune; it is handed both to the loss and to the
            trainer.
        train_dataset: Dataset passed to the trainer as `train_dataset`.
        valid_dataset: Dataset passed to the trainer as `eval_dataset`. It was
            previously accepted but ignored. Pass `None` to train without an
            evaluation set.

    Returns:
        The configured trainer. Nothing is trained until `.train()` is called
        on it.
    """
    # NOTE(review): ContrastiveLoss is documented as pulling label-1 pairs
    # together and pushing label-0 pairs apart — confirm that matches the
    # label encoding used for these datasets.
    loss = ContrastiveLoss(m)

    trainer = SentenceTransformerTrainer(
        model=m,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        loss=loss,
    )

    return trainer

##### Training

Run this if using a trainer

In [None]:
# Assemble the contrastive-loss trainer for the model being fine-tuned, then
# start the training run.
trainer = trainer_cl(
    m=fine_model,
    train_dataset=train_dataset,
    valid_dataset=valid_dataset,
)
trainer.train()

## Evaluation

## Methods

In [None]:
from sentence_transformers.evaluation import BinaryClassificationEvaluator, EmbeddingSimilarityEvaluator, ParaphraseMiningEvaluator, InformationRetrievalEvaluator

def eval_full_inbuilt(mode, test_dataset):
    res = []
    res.append()