In [1]:
import pandas as pd
import numpy as np
import datasets
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
from sklearn.model_selection import RandomizedSearchCV
from tabulate import tabulate

from transformers import Trainer
from sklearn.metrics import classification_report
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
train_data_path = "./data/English dataset/train.jsonl"
test_data_path = "./data/English dataset/test.jsonl"

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\timna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\timna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\timna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\timna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def preprocess_text(text): # From the labs
	# Tokenize the text into words
	words = word_tokenize(text.lower())  # Convert text to lowercase

	# Remove punctuation
	table = str.maketrans('', '', string.punctuation)
	words = [word.translate(table) for word in words if word.isalpha()]

	# Remove stopwords
	stop_words = set(stopwords.words('english'))
	words = [word for word in words if word not in stop_words]

	# Lemmatization
	lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

	# Join the words back into a string
	preprocessed_text = ' '.join(lemmatized_words)
	return preprocessed_text

train_data = pd.DataFrame(datasets.load_dataset("json", data_files=train_data_path)["train"])
test_data = pd.DataFrame(datasets.load_dataset("json", data_files=test_data_path)["train"])

label_map = {"Contradiction": 1, "Entailment": 0, "NotMentioned": 0}
train_data["label"] = train_data["label"].map(label_map)
test_data["label"] = test_data["label"].map(label_map)

train_data["premise"] = train_data["premise"].map(preprocess_text)
train_data["hypothesis"] = train_data["hypothesis"].map(preprocess_text)

test_data["premise"] = test_data["premise"].map(preprocess_text)
test_data["hypothesis"] = test_data["hypothesis"].map(preprocess_text)

train_data = train_data.drop("doc_id", axis=1)
train_data = train_data.drop("key", axis=1)
test_data = test_data.drop("doc_id", axis=1)
test_data = test_data.drop("key", axis=1)

In [3]:
from sklearn.utils import resample

# Split by class
class_0 = train_data[train_data["label"] == 0]
class_1 = train_data[train_data["label"] == 1]

# Oversample class 1
class_1_oversampled = resample(
    class_1,
    replace=True,                       # sample with replacement
    n_samples=len(class_0),             # match majority class
)

# Combine and shuffle
train_data = pd.concat([class_0, class_1_oversampled]) \
                .sample(frac=1, random_state=42) \
                .reset_index(drop=True)

In [4]:
from sentence_transformers import InputExample
train_examples = []
for row in train_data.itertuples():
    train_examples.append(InputExample(
        texts=[str(row.premise), str(row.hypothesis)], 
        label=float(row.label) # Ensuring it's a float
    ))

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# 1. Load a pre-trained base model
model = SentenceTransformer('jinaai/jina-embeddings-v2-small-en')

# 2. Define your training data
# In a real scenario, you'd load the SNLI dataset from Hugging Face
# Label 1.0 = Contradiction (what we want to find)
# Label 0.0 = Not a contradiction

# 3. Create a DataLoader and a Loss function
# CosineSimilarityLoss is standard for mapping sentence pairs to a float value
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

# 4. Fine-tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=200
)
model.save('domen-model3')

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

Step,Training Loss
500,0.1138
1000,0.0576
1500,0.0475


In [6]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_data["premise"],
    sentences2=test_data["hypothesis"],
    scores=test_data["label"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)
#1epoc, no fuckery {'sts-dev_pearson_cosine': 0.7037046292102951, 'sts-dev_spearman_cosine': 0.4533442418855714} Model 2
#3epoc, balanced classes {'sts-dev_pearson_cosine': 0.8121980013884701, 'sts-dev_spearman_cosine': 0.5052373658606608} Model 3
print(dev_evaluator(model))

{'sts-dev_pearson_cosine': 0.7529591039273611, 'sts-dev_spearman_cosine': 0.503403597116724}


In [7]:
# 2. Extract the columns as lists for the model
premises = test_data['premise'].astype(str).tolist()
hypotheses = test_data['hypothesis'].astype(str).tolist()

# 3. Encode the sentences into vectors (embeddings)
# We use convert_to_tensor=True to do the math quickly on the GPU/CPU
premise_embeddings = model.encode(premises, convert_to_tensor=True)
hypothesis_embeddings = model.encode(hypotheses, convert_to_tensor=True)

# 4. Calculate Cosine Similarity row by row
# This uses the formula: 
# $$\text{cosine\_similarity}(A, B) = \frac{A \cdot B}{\|A\| \|B\|}$$
cosine_scores = torch.nn.functional.cosine_similarity(premise_embeddings, hypothesis_embeddings, dim=1)

# 5. Add the scores back to your Pandas table
test_data['model_score'] = cosine_scores.cpu().detach().numpy()

# Display the result
print(test_data[['premise', 'hypothesis', 'label', 'model_score']].head())

                                             premise  \
0                                                      
1  recipient shall immediately return redeliver t...   
2  right license whether expressed implied confid...   
3                                                      
4  purpose agreement confidential information mea...   

                                          hypothesis  label  model_score  
0  receiving party shall reverse engineer object ...      0    -0.042443  
1  receiving party shall destroy return confident...      0    -0.008818  
2  agreement shall grant receiving party right co...      0     0.145420  
3  receiving party shall disclose fact agreement ...      0    -0.099918  
4  confidential information shall include technic...      1     0.980617  


In [8]:
test_data['model_score'] = test_data['model_score'].map(abs)
test_data.sort_values(by=['premise', 'model_score'], ascending=[False, False])

Unnamed: 0,premise,hypothesis,label,model_score
989,xiii required disclosed pursuant governmental ...,receiving party shall disclose fact agreement ...,0,0.006897
1810,written confidential information delivered one...,receiving party may retain confidential inform...,1,0.935611
1804,written confidential information delivered one...,agreement shall grant receiving party right co...,0,0.390556
1625,without prior written consent company neither ...,receiving party shall solicit disclosing party...,0,0.172139
1754,without prior written consent company except m...,receiving party shall disclose fact agreement ...,0,0.103088
...,...,...,...,...
1308,,receiving party shall use confidential informa...,0,0.005733
1359,,receiving party shall use confidential informa...,0,0.005733
1461,,receiving party shall use confidential informa...,0,0.005733
1512,,receiving party shall use confidential informa...,0,0.005733
