In [181]:
import pandas as pd
import numpy as np
import datasets
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
from sklearn.model_selection import RandomizedSearchCV
from tabulate import tabulate

from transformers import Trainer
from sklearn.metrics import classification_report
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
train_data_path = "./data/English dataset/train.jsonl"
test_data_path = "./data/English dataset/test.jsonl"

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [182]:
def pretty_print_report_dict(report):
	report_df = pd.DataFrame(report).transpose()
	report_df = report_df.round(3)

	class_metrics = report_df.iloc[:-3, :].copy()

	summary_metrics = report_df.iloc[-3:, :].copy()
	summary_metrics = summary_metrics.drop(columns=['support'])

	print("CLASS PERFORMANCE")
	print(tabulate(class_metrics, headers='keys', tablefmt='heavy_outline', numalign="center"))
	print()
	print("GLOBAL AVERAGES")
	print(tabulate(summary_metrics, headers='keys', tablefmt='heavy_outline', numalign="center"))

# Data preprocessing

In [183]:
def preprocess_text(text): # From the labs
	# Tokenize the text into words
	words = word_tokenize(text.lower())  # Convert text to lowercase

	# Remove punctuation
	table = str.maketrans('', '', string.punctuation)
	words = [word.translate(table) for word in words if word.isalpha()]

	# Remove stopwords
	stop_words = set(stopwords.words('english'))
	words = [word for word in words if word not in stop_words]

	# Lemmatization
	lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

	# Join the words back into a string
	preprocessed_text = ' '.join(lemmatized_words)
	return preprocessed_text

(We load the dataset. We join togeder eintailment and not mentioned, so we can focus on predicting only if something is a contradiction)

In [184]:
train_data = pd.DataFrame(datasets.load_dataset("json", data_files=train_data_path)["train"])
test_data = pd.DataFrame(datasets.load_dataset("json", data_files=test_data_path)["train"])

label_map = {"Contradiction": 1, "Entailment": 0, "NotMentioned": 0}
train_data["label"] = train_data["label"].map(label_map)
test_data["label"] = test_data["label"].map(label_map)

train_data = train_data.drop("doc_id", axis=1)
train_data = train_data.drop("key", axis=1)
test_data = test_data.drop("doc_id", axis=1)
test_data = test_data.drop("key", axis=1)

(After we load the dataset, we inspect it for class inbalance)

In [185]:
train_data["label"].value_counts(normalize=True)

label
0    0.883048
1    0.116952
Name: proportion, dtype: float64

(We can see that most of the data isn't contradictions. The data is quite imbalanced)

In [186]:
longest_premise = max(train_data['premise'].apply(len).max(), test_data['premise'].apply(len).max())
longest_hypotises = max(train_data['hypothesis'].apply(len).max(), test_data['hypothesis'].apply(len).max())
longest_sentance = max(longest_premise, longest_hypotises)
print("Longest premise: ", longest_premise)
print("Longest hypothesis: ", longest_hypotises)

print("---------------------------------")
mean = np.mean(train_data['premise'].apply(len))
std = np.std(train_data['premise'].apply(len))

print("Mean premise length: ", mean)
print("+1 std: ", mean+std)
print("+2 std: ", mean+2*std)
print("+3 std: ", mean+3*std)

Longest premise:  3098
Longest hypothesis:  162
---------------------------------
Mean premise length:  296.27826449728826
+1 std:  651.2505192635402
+2 std:  1006.2227740297922
+3 std:  1361.1950287960442


(We inspect the lenght of the data. We do this to see if it would be beneficial only keeping smaller sizes of the data, so we can cleanly feed it into BERT model. We conclude that we would need to thin our data too much to be worth it)

# Traditional ML

In [187]:
train_data_preprocessed = train_data.copy()
test_data_preprocessed = test_data.copy()

train_data_preprocessed["premise"] = train_data_preprocessed["premise"].map(preprocess_text)
train_data_preprocessed["hypothesis"] = train_data_preprocessed["hypothesis"].map(preprocess_text)

test_data_preprocessed["premise"] = test_data_preprocessed["premise"].map(preprocess_text)
test_data_preprocessed["hypothesis"] = test_data_preprocessed["hypothesis"].map(preprocess_text)

vectorizer_premise = TfidfVectorizer()
vectorizer_hypothesis = TfidfVectorizer()

train_data_vectorised = train_data.copy()

X_premise= vectorizer_premise.fit_transform(train_data["premise"])
X_hypothesis = vectorizer_hypothesis.fit_transform(train_data["hypothesis"])
train_data_vectorised = hstack([X_premise, X_hypothesis])

Y_premise = vectorizer_premise.transform(test_data_preprocessed["premise"])
Y_hypothesis = vectorizer_hypothesis.transform(test_data_preprocessed["hypothesis"])
test_data_vectorised = hstack([Y_premise, Y_hypothesis])

## Logistic regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

grid_serch_dict = {
	"l1_ratio": [0, 0.5, 1],
	"C": [0.1, 0.5, 1.0, 2.0, 10, 50],
	"class_weight": [None, "balanced"],
	"solver": ["saga"]
}

logreg_model = RandomizedSearchCV(LogisticRegression(max_iter=5000), grid_serch_dict, n_iter=10, cv=3, scoring='f1')
logreg_model.fit(train_data_vectorised, train_data["label"])

predictions = logreg_model.predict(test_data_vectorised)

display_params = [[k, str(v)] for k, v in logreg_model.best_params_.items()]
print(tabulate(display_params, headers=["Hyperparameter", "Value"], tablefmt="heavy_outline"))



┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Hyperparameter   ┃ Value    ┃
┣━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━┫
┃ solver           ┃ saga     ┃
┃ l1_ratio         ┃ 1        ┃
┃ class_weight     ┃ balanced ┃
┃ C                ┃ 50       ┃
┗━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━┛


In [9]:
report_dict = classification_report(test_data["label"], predictions, zero_division=0, output_dict=True)
pretty_print_report_dict(report_dict)

CLASS PERFORMANCE
┏━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃    ┃  precision  ┃  recall  ┃  f1-score  ┃  support  ┃
┣━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━┫
┃ 0  ┃    0.926    ┃  0.997   ┃    0.96    ┃   1871    ┃
┃ 1  ┃    0.923    ┃  0.327   ┃   0.483    ┃    220    ┃
┗━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┻━━━━━━━━━━━┛

GLOBAL AVERAGES
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┓
┃              ┃  precision  ┃  recall  ┃  f1-score  ┃
┣━━━━━━━━━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━┫
┃ accuracy     ┃    0.926    ┃  0.926   ┃   0.926    ┃
┃ macro avg    ┃    0.925    ┃  0.662   ┃   0.722    ┃
┃ weighted avg ┃    0.926    ┃  0.926   ┃    0.91    ┃
┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┛


## Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

grid_serch_dict = {
	"n_estimators": [50, 100, 200],
	"max_depth": [None, 5, 10, 20],
	"min_samples_split": [2, 5, 10, 20, 50],
	"class_weight": [None, "balanced"]
}

rf_model = RandomizedSearchCV(RandomForestClassifier(random_state=67), grid_serch_dict, n_iter=15, cv=3, scoring='f1')
rf_model.fit(train_data_vectorised, train_data["label"])

predictions = rf_model.predict(test_data_vectorised)

display_params = [[k, str(v)] for k, v in rf_model.best_params_.items()]
print(tabulate(display_params, headers=["Hyperparameter", "Value"], tablefmt="heavy_outline"))

┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓
┃ Hyperparameter    ┃ Value   ┃
┣━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━┫
┃ n_estimators      ┃ 200     ┃
┃ min_samples_split ┃ 5       ┃
┃ max_depth         ┃ None    ┃
┃ class_weight      ┃ None    ┃
┗━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━┛


In [11]:
report_dict = classification_report(test_data["label"], predictions, zero_division=0, output_dict=True)
pretty_print_report_dict(report_dict)

CLASS PERFORMANCE
┏━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃    ┃  precision  ┃  recall  ┃  f1-score  ┃  support  ┃
┣━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━┫
┃ 0  ┃    0.923    ┃    1     ┃    0.96    ┃   1871    ┃
┃ 1  ┃      1      ┃  0.295   ┃   0.456    ┃    220    ┃
┗━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┻━━━━━━━━━━━┛

GLOBAL AVERAGES
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┓
┃              ┃  precision  ┃  recall  ┃  f1-score  ┃
┣━━━━━━━━━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━┫
┃ accuracy     ┃    0.926    ┃  0.926   ┃   0.926    ┃
┃ macro avg    ┃    0.962    ┃  0.648   ┃   0.708    ┃
┃ weighted avg ┃    0.932    ┃  0.926   ┃   0.907    ┃
┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┛


## SVC

In [188]:
from sklearn.svm import SVC

grid_serch_dict = {
	"C": [0.1, 0.5, 1.0, 2.0, 10, 50],
	"kernel": ["linear", "sigmoid", "rbf"],
	"class_weight": [None, "balanced"]
}

svm_model = RandomizedSearchCV(SVC(), grid_serch_dict, n_iter=10, cv=3, scoring='f1')
svm_model.fit(train_data_vectorised, train_data["label"])

predictions = svm_model.predict(test_data_vectorised)

display_params = [[k, str(v)] for k, v in svm_model.best_params_.items()]
print(tabulate(display_params, headers=["Hyperparameter", "Value"], tablefmt="heavy_outline"))

┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓
┃ Hyperparameter   ┃ Value   ┃
┣━━━━━━━━━━━━━━━━━━╋━━━━━━━━━┫
┃ kernel           ┃ rbf     ┃
┃ class_weight     ┃ None    ┃
┃ C                ┃ 10      ┃
┗━━━━━━━━━━━━━━━━━━┻━━━━━━━━━┛


In [189]:
report_dict = classification_report(test_data["label"], predictions, zero_division=0, output_dict=True)
pretty_print_report_dict(report_dict)

CLASS PERFORMANCE
┏━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃    ┃  precision  ┃  recall  ┃  f1-score  ┃  support  ┃
┣━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━┫
┃ 0  ┃    0.946    ┃  0.997   ┃   0.971    ┃   1871    ┃
┃ 1  ┃    0.95     ┃  0.518   ┃   0.671    ┃    220    ┃
┗━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┻━━━━━━━━━━━┛

GLOBAL AVERAGES
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┓
┃              ┃  precision  ┃  recall  ┃  f1-score  ┃
┣━━━━━━━━━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━┫
┃ accuracy     ┃    0.946    ┃  0.946   ┃   0.946    ┃
┃ macro avg    ┃    0.948    ┃  0.757   ┃   0.821    ┃
┃ weighted avg ┃    0.947    ┃  0.946   ┃   0.939    ┃
┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┛


# Transformer-Based Classifier

## Training

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

def preprocess_function(examples):
	inputs = tokenizer(examples["premise"], examples["hypothesis"], 
			max_length=1024, truncation="only_first", padding="max_length")

	global_attention_mask = [[0] * len(ids) for ids in inputs["input_ids"]]

	for mask in global_attention_mask:
		mask[0] = 1 
		
	inputs["global_attention_mask"] = global_attention_mask
	return inputs

model_name = "kiddothe2b/longformer-mini-1024"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train_dataset = Dataset.from_pandas(train_data)
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)

dataset_split = tokenized_train_dataset.train_test_split(test_size=0.1)
tokenized_train_dataset = dataset_split["train"]
tokenized_eval_dataset = dataset_split["test"]

test_dataset = Dataset.from_pandas(test_data)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 7191/7191 [00:09<00:00, 777.74 examples/s]
Map: 100%|██████████| 2091/2091 [00:02<00:00, 814.66 examples/s]


In [None]:
RUN = False

from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score
from torch import nn

class WeightedTrainer(Trainer):
	def __init__(self, *args, class_weights=None, **kwargs):
		super().__init__(*args, **kwargs)
		if class_weights is not None:
			self.class_weights = torch.tensor(class_weights, dtype=torch.float).to(self.args.device)
		else:
			self.class_weights = None

	def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
		labels = inputs.get("labels")
		outputs = model(**inputs)
		logits = outputs.get("logits")
		
		if self.class_weights is not None:
			loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
			loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
		else:
			# Fallback to default loss if no weights are provided
			loss = outputs.loss if isinstance(outputs, dict) else outputs[0]
			
		return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
	logits, labels = eval_pred
	predictions = np.argmax(logits, axis=-1)

	f1 = f1_score(labels, predictions, pos_label=1, average='binary')
	return {"f1_score_class_1": f1}

class_weights = [1.0, 9.0] # Weight class 0, weight class 1

training_args = TrainingArguments(
	output_dir="./artifacts",
	learning_rate=2e-4,
	per_device_train_batch_size=2,
	gradient_accumulation_steps=16,
	num_train_epochs=2,
	weight_decay=0.01,
	save_strategy="steps",
	save_steps=50,
	save_total_limit=3, 
	load_best_model_at_end=True,
	metric_for_best_model="f1_score_class_1", 
	eval_strategy="steps",
	eval_steps=50,
	greater_is_better=True,
	resume_from_checkpoint=False
)


path = "./trained_model_ex3_f1_class1_weighted"
if (not os.path.exists(path) and RUN):
	trainer = WeightedTrainer(
		model=model,
		args=training_args,
		train_dataset=tokenized_train_dataset,
		eval_dataset=tokenized_eval_dataset,
		compute_metrics=compute_metrics,
		class_weights=class_weights,
	)

	trainer.train(resume_from_checkpoint=True)

	tokenizer.save_pretrained(path)
	trainer.save_model(path)
else:
	print("Model not trained")

Model not trained


## Evaluation

In [None]:
path = "trained_model_ex3_precision_class1_v1"
if (os.path.exists(path)):
	model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=2)

	trainer = Trainer(model=model)
	predictions_procentages = trainer.predict(tokenized_test_dataset)[0]
	predictions = predictions_procentages.argmax(-1)
	report_dict = classification_report(test_data["label"], predictions, zero_division=0, output_dict=True)
	pretty_print_report_dict(report_dict)
else:
	print("Model not found")



CLASS PERFORMANCE
┏━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃    ┃  precision  ┃  recall  ┃  f1-score  ┃  support  ┃
┣━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━┫
┃ 0  ┃    0.981    ┃  0.972   ┃   0.976    ┃   1871    ┃
┃ 1  ┃    0.776    ┃  0.836   ┃   0.805    ┃    220    ┃
┗━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┻━━━━━━━━━━━┛

GLOBAL AVERAGES
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┓
┃              ┃  precision  ┃  recall  ┃  f1-score  ┃
┣━━━━━━━━━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━┫
┃ accuracy     ┃    0.957    ┃  0.957   ┃   0.957    ┃
┃ macro avg    ┃    0.878    ┃  0.904   ┃   0.891    ┃
┃ weighted avg ┃    0.959    ┃  0.957   ┃   0.958    ┃
┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┛


In [None]:
path = "trained_model_ex3_v2_macro_f1_v1"
if (os.path.exists(path)):
	model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=2)

	trainer = Trainer(model=model)
	predictions_procentages = trainer.predict(tokenized_test_dataset)[0]
	predictions = predictions_procentages.argmax(-1)
	report_dict = classification_report(test_data["label"], predictions, zero_division=0, output_dict=True)
	pretty_print_report_dict(report_dict)
else:
	print("Model not found")



CLASS PERFORMANCE
┏━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃    ┃  precision  ┃  recall  ┃  f1-score  ┃  support  ┃
┣━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━┫
┃ 0  ┃    0.935    ┃  0.975   ┃   0.955    ┃   1871    ┃
┃ 1  ┃    0.667    ┃  0.427   ┃   0.521    ┃    220    ┃
┗━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┻━━━━━━━━━━━┛

GLOBAL AVERAGES
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┓
┃              ┃  precision  ┃  recall  ┃  f1-score  ┃
┣━━━━━━━━━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━┫
┃ accuracy     ┃    0.917    ┃  0.917   ┃   0.917    ┃
┃ macro avg    ┃    0.801    ┃  0.701   ┃   0.738    ┃
┃ weighted avg ┃    0.907    ┃  0.917   ┃   0.909    ┃
┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┛


In [None]:
path = "trained_model_ex3_f1_class1_weighted"
if (os.path.exists(path)):
	model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=2)

	trainer = Trainer(model=model)
	predictions_procentages = trainer.predict(tokenized_test_dataset)[0]
	predictions = predictions_procentages.argmax(-1)
	report_dict = classification_report(test_data["label"], predictions, zero_division=0, output_dict=True)
	pretty_print_report_dict(report_dict)
else:
	print("Model not found")



CLASS PERFORMANCE
┏━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃    ┃  precision  ┃  recall  ┃  f1-score  ┃  support  ┃
┣━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━┫
┃ 0  ┃    0.981    ┃  0.975   ┃   0.978    ┃   1871    ┃
┃ 1  ┃    0.797    ┃  0.836   ┃   0.816    ┃    220    ┃
┗━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┻━━━━━━━━━━━┛

GLOBAL AVERAGES
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┓
┃              ┃  precision  ┃  recall  ┃  f1-score  ┃
┣━━━━━━━━━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━┫
┃ accuracy     ┃    0.96     ┃   0.96   ┃    0.96    ┃
┃ macro avg    ┃    0.889    ┃  0.906   ┃   0.897    ┃
┃ weighted avg ┃    0.961    ┃   0.96   ┃   0.961    ┃
┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┛


In [None]:
path = "trained_model_ex3_f1_class1_weighted_2_epoc"
if (os.path.exists(path)):
	model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=2)

	trainer = Trainer(model=model)
	predictions_procentages = trainer.predict(tokenized_test_dataset)[0]
	predictions = predictions_procentages.argmax(-1)
	report_dict = classification_report(test_data["label"], predictions, zero_division=0, output_dict=True)
	pretty_print_report_dict(report_dict)
else:
	print("Model not found")



CLASS PERFORMANCE
┏━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃    ┃  precision  ┃  recall  ┃  f1-score  ┃  support  ┃
┣━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━┫
┃ 0  ┃    0.982    ┃  0.974   ┃   0.978    ┃   1871    ┃
┃ 1  ┃    0.791    ┃  0.845   ┃   0.818    ┃    220    ┃
┗━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┻━━━━━━━━━━━┛

GLOBAL AVERAGES
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┓
┃              ┃  precision  ┃  recall  ┃  f1-score  ┃
┣━━━━━━━━━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━┫
┃ accuracy     ┃    0.96     ┃   0.96   ┃    0.96    ┃
┃ macro avg    ┃    0.887    ┃   0.91   ┃   0.898    ┃
┃ weighted avg ┃    0.962    ┃   0.96   ┃   0.961    ┃
┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┛


# Task 4

In [57]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import ContrastiveLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

# 3. Define the Hyperparameter Search Space
def search_space(trial):
    return {
        'learning_rate': trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        'weight_decay': trial.suggest_float("weight_decay", 0.01, 0.1),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.05, 0.2),
        'max_grad_norm': trial.suggest_float("max_grad_norm", 0.5, 1.0, log=True),
        'num_train_epochs': trial.suggest_int("num_train_epochs", 1, 10), # ! lower when using a slower model 
    }

# 4. Define the Model Initialization
def model_init(trial):
    return SentenceTransformer("models\jina-embeddings-v2-small-en", device="cuda" )

# 5. Define the Loss Initialization
def cl_loss_init(model):
    return ContrastiveLoss(model)
def mnlr_loss_init(model):
    return MultipleNegativesRankingLoss(model)

# 6. Define the Objective Function
def hpo_compute_objective(metrics):
    print(metrics)
    return metrics["eval_sts-dev_cosine_recall@10"]

In [42]:
from datasets import Dataset, load_dataset
import pandas as pd

def get_data(valid_split=0.1):
    train_data_path = "./data/English dataset/train.jsonl"
    test_data_path = "./data/English dataset/test.jsonl"

    train_data = pd.DataFrame(load_dataset("json", data_files=train_data_path)["train"])
    test_dataset = pd.DataFrame(load_dataset("json", data_files=test_data_path)["train"])

    label_map = {"Contradiction": 1, "Entailment": 0, "NotMentioned": 0}
    train_data["label"] = train_data["label"].map(label_map)
    test_dataset["label"] = test_dataset["label"].map(label_map)

    train_data = train_data.drop("doc_id", axis=1)
    train_data = train_data.drop("key", axis=1)
    test_dataset = test_dataset.drop("doc_id", axis=1)
    test_dataset = test_dataset.drop("key", axis=1)

    train_data["label"].value_counts(normalize=True)

    ds = Dataset.from_pandas(train_data)
    ds = ds.select_columns(["hypothesis", "premise", "label"])

    dss = ds.train_test_split(valid_split, seed=42)
    train_dataset = dss['train']
    valid_dataset = dss['test']
    test_dataset = Dataset.from_pandas(test_dataset)
    test_dataset = test_dataset.select_columns(["hypothesis", "premise", "label"])

    return train_dataset, valid_dataset, test_dataset

In [158]:
train_dataset, valid_dataset, test_dataset = get_data(valid_split=0.15)

In [10]:
def filter_neg(data, model, params):
    d = {'hypothesis':[], 'premise':[]}
    for k in range(len(data)):
        if data['label'][k] == 1:
            d['hypothesis'].append(data['hypothesis'][k])
            d['premise'].append(data['premise'][k])
    meow = Dataset.from_dict(d)
    return (meow, None)

In [156]:
import random
from datasets import Dataset
from datasets import Dataset
from sentence_transformers.data_collator import SentenceTransformerDataCollator
from torch._tensor import Tensor
from typing import Any

class MyDataCollator(SentenceTransformerDataCollator): # We need this so we can insert negatives to the batches
    universal_negatives = ['', '']
    negatives_per_batch = 4

    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Tensor]:
        k = list(features[0].keys())[1]
        for i, feature in enumerate(features):
            features[i][k] = [features[i][k]] + random.sample(
            self.universal_negatives, 
            min(self.negatives_per_batch, len(self.universal_negatives))
        )
        #print(features[i][k])
        batch = super().__call__(features)
        
        return batch

def get_data_col(data, model, params):
    h = len(set(data['hypothesis']))
    p = len(set(data['premise']))
    
    hmm = dict(zip(set(data['premise']), range(p)))
    d = {'hypothesis':[], 'premise':[]}
    for k in range(len(data)):
        if data['label'][k] == 1:
            d['hypothesis'].append(data['hypothesis'][k])
            d['premise'].append(data['premise'][k])
        else:
            hmm[data['premise'][k]] += 1
    universal_negatives = []
    for k, v in hmm.items():
        if v >= h: 
            universal_negatives.append(k) # if this premise is negative for all hypothesis
    
    data = Dataset.from_dict(d)
    data_collator = MyDataCollator(model.tokenize)
    data_collator.universal_negatives = universal_negatives
    data_collator.negatives_per_batch = params['negatives_per_batch']

    return data, data_collator


In [60]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/checkpoints",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1.2247359733257542e-05,
    seed=42,
    metric_for_best_model=f"eval_cosine_recall@10",
    load_best_model_at_end=True,
    weight_decay=0.09092585204374326,
    warmup_ratio=0.05503071687326718,
    batch_sampler=None,
    #batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=50,
    torch_empty_cache_steps = None,
    save_steps=50,
    save_total_limit=2,
    max_grad_norm= 0.8774817671930895,
    logging_steps=100,
    
)

In [172]:
from sentence_transformers.training_args import BatchSamplers
from copy import deepcopy

args_mnlr = deepcopy(args)
args_mnlr.num_train_epochs = 10
args_mnlr.batch_sampler = BatchSamplers.NO_DUPLICATES
args_mnlr2 = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/checkpoints",
    # Optional training parameters:
    num_train_epochs=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=4.890585631921125e-05,
    seed=42,
    metric_for_best_model=f"eval_cosine_recall@10",
    #load_best_model_at_end=True,
    weight_decay=0.010800797401617856,
    warmup_ratio=0.0765080638605733,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="no",
    eval_steps=100,
    torch_empty_cache_steps = None,
    max_grad_norm= 0.5013390304609416,
    logging_steps=100,
)

MNLR assumes that in each batch every hypothesis has only one correct premise. It treates every other premise as a wrong answer, and tries to distance the hypothesis from it. This isn't true in general, so we need to ensure that during training, there aren't many overlaping premises. It's still possible that multiple correct premises for a hypothesis make it in the batch but NO_DUPLICATES tries to at least minimize that. 

In [173]:
from sentence_transformers.losses import ContrastiveLoss, MultipleNegativesRankingLoss
# Kinda bad, and slow but simple
model_cl_kwargs = {"trained_model_path":"models/trained/model_cl", "loss": cl_loss_init, "args":args, "model_init":model_init}

# kinda bad, but fast, and simple
model_mnlr_kwargs = {
    "trained_model_path":"models/trained/model_mnlr", 
    "loss":mnlr_loss_init, 
    "args":args_mnlr, 
    "data_preprocesser": filter_neg
    }

# our SOTA model
model_mnlr2_kwargs = {
    "trained_model_path":"models/trained/model_mnlr2_test", 
    "loss":mnlr_loss_init, 
    "args":args_mnlr2, 
    "data_preprocesser": get_data_col, 
    "params": {"negatives_per_batch":3},
    "model_init":model_init
    }

In [82]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from collections import defaultdict

def get_ret_eval(test_dataset, name=''):
    corpus = dict(zip(test_dataset['premise'], test_dataset['premise']))
    queries = dict(zip(test_dataset['hypothesis'], test_dataset['hypothesis']))
    relevant_docs = defaultdict(list)

    for k in range(len(test_dataset)):
        if test_dataset['label'][k] > 0:
            relevant_docs[test_dataset['hypothesis'][k]].append(test_dataset['premise'][k])
    

    inf_ret_ev = InformationRetrievalEvaluator(
        queries= queries,
        corpus = corpus,
        relevant_docs = relevant_docs,
        #similarity_fn_names= ["cosine"],
        show_progress_bar=True,
        batch_size= 16,
        #main_score_function="Recall@10",
        name=name
    )

    return inf_ret_ev

In [165]:
from sentence_transformers.losses import ContrastiveLoss
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers import SentenceTransformer
from transformers import EarlyStoppingCallback


def get_trainer(
        train_dataset, 
        valid_dataset, 
        model_init=model_init,
        loss=mnlr_loss_init, 
        args=None, 
        evaluator=None, 
        data_preprocesser=None, 
        params=None,
        trained_model_path=None,
        early_stopping=False
    ):
    #base_model = SentenceTransformer(model_name)
    
    #loss = loss(base_model)
    data_collator = None
    if early_stopping:
        early_stopper = [EarlyStoppingCallback(
            early_stopping_patience=5,
            early_stopping_threshold=0.05
        )]
    else:
        early_stopper = None

    if data_preprocesser is not None:
        train_dataset, data_collator = data_preprocesser(train_dataset, model_init(None), params)

    if valid_dataset is not None and evaluator is None:
        evaluator = get_ret_eval(valid_dataset)

    trainer = SentenceTransformerTrainer(
        model_init = model_init,
        train_dataset = train_dataset,
        eval_dataset = valid_dataset,
        loss = loss,
        evaluator = evaluator,
        args = args,
        data_collator = data_collator,
        callbacks=early_stopper
    )

    return trainer

In [52]:
def hyper_search(train_dataset, valid_dataset, model_kwargs):
    dev_evaluator = get_ret_eval(valid_dataset, name="sts-dev")
    model_kwargs = deepcopy(model_kwargs)
    # 7. Define the training arguments
    hpo_args = SentenceTransformerTrainingArguments(
        # Required parameter:
        num_train_epochs=1.0,
        per_device_train_batch_size=16,
        seed=42,
        metric_for_best_model=f"eval_cosine_recall@10",
        output_dir="checkpoints",
        batch_sampler=BatchSamplers.NO_DUPLICATES, # Remove if searching for the Contrastive loss or triplet model
        # Optional tracking/debugging parameters:
        eval_strategy="no", # We don't need to evaluate/save during HPO
        save_strategy="no",
        logging_steps=40,
        run_name="hpo",  # Will be used in W&B if `wandb` is installed
        
    )
    model_kwargs['args'] = hpo_args
    trainer = get_trainer(train_dataset, valid_dataset, evaluator=dev_evaluator, **model_kwargs)

    best_trial = trainer.hyperparameter_search(
        hp_space=search_space,
        compute_objective=hpo_compute_objective,
        n_trials=25,
        direction="maximize",
        backend="optuna",

    )
    print(best_trial)

In [175]:
def train(train_dataset, valid_dataset, model_kwargs):
    trainer = get_trainer(train_dataset, valid_dataset,early_stopping=False, **model_kwargs)
    trainer.train()
    trainer.model.save_pretrained(model_kwargs['trained_model_path'])

In [112]:
TRAIN = True
HYPER_PARAMETER_SEARCH = False

In [176]:
if TRAIN:
    train(train_dataset, valid_dataset, model_mnlr2_kwargs)

Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

Step,Training Loss
100,1.7637
200,1.1623
300,0.5679
400,0.3826
500,0.4057
600,0.3388
700,0.3216
800,0.2741
900,0.2183
1000,0.1477


In [58]:
if HYPER_PARAMETER_SEARCH:
    hyper_search(train_dataset, valid_dataset, model_mnlr2_kwargs)
    

Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[I 2026-01-04 12:00:36,693] A new study created in memory with name: no-name-bfa4578b-9222-4053-aa49-01dc690d65bf
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', '

Step,Training Loss
40,1.8784


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.35s/it]


{'eval_loss': 0.458769291639328, 'eval_sts-dev_cosine_accuracy@1': 0.0, 'eval_sts-dev_cosine_accuracy@3': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@5': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.0, 'eval_sts-dev_cosine_precision@3': 0.07407407407407407, 'eval_sts-dev_cosine_precision@5': 0.08888888888888889, 'eval_sts-dev_cosine_precision@10': 0.06666666666666667, 'eval_sts-dev_cosine_recall@1': 0.0, 'eval_sts-dev_cosine_recall@3': 0.012051734273956496, 'eval_sts-dev_cosine_recall@5': 0.1453850676072898, 'eval_sts-dev_cosine_recall@10': 0.1618685840908063, 'eval_sts-dev_cosine_ndcg@10': 0.10374839921798337, 'eval_sts-dev_cosine_mrr@10': 0.13994708994708993, 'eval_sts-dev_cosine_map@100': 0.05429728015926467, 'eval_runtime': 42.2138, 'eval_samples_per_second': 17.056, 'eval_steps_per_second': 2.132}


[I 2026-01-04 12:01:53,100] Trial 0 finished with value: 0.1618685840908063 and parameters: {'learning_rate': 1.4059023675392549e-05, 'weight_decay': 0.08652978100312161, 'warmup_ratio': 0.056111534603025565, 'max_grad_norm': 0.7201799702327789, 'num_train_epochs': 1}. Best is trial 0 with value: 0.1618685840908063.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encod

Step,Training Loss
40,1.9245
80,1.7453


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.32s/it]


{'eval_loss': 0.4526253640651703, 'eval_sts-dev_cosine_accuracy@1': 0.0, 'eval_sts-dev_cosine_accuracy@3': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@5': 0.5555555555555556, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.0, 'eval_sts-dev_cosine_precision@3': 0.14814814814814814, 'eval_sts-dev_cosine_precision@5': 0.1111111111111111, 'eval_sts-dev_cosine_precision@10': 0.08888888888888888, 'eval_sts-dev_cosine_recall@1': 0.0, 'eval_sts-dev_cosine_recall@3': 0.04282096504318727, 'eval_sts-dev_cosine_recall@5': 0.15393207615429835, 'eval_sts-dev_cosine_recall@10': 0.17453081897526337, 'eval_sts-dev_cosine_ndcg@10': 0.1347407111633424, 'eval_sts-dev_cosine_mrr@10': 0.21296296296296294, 'eval_sts-dev_cosine_map@100': 0.06724181459323575, 'eval_runtime': 28.71, 'eval_samples_per_second': 25.078, 'eval_steps_per_second': 3.135}


[I 2026-01-04 12:03:30,629] Trial 1 finished with value: 0.17453081897526337 and parameters: {'learning_rate': 1.1307152640025565e-05, 'weight_decay': 0.03213871826104624, 'warmup_ratio': 0.18149397611707963, 'max_grad_norm': 0.886928100598736, 'num_train_epochs': 2}. Best is trial 1 with value: 0.17453081897526337.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encod

Step,Training Loss
40,1.8448
80,1.5276
120,1.4852


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:05<00:00,  5.26s/it]


{'eval_loss': 0.4388158619403839, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@5': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@10': 0.3333333333333333, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.18518518518518517, 'eval_sts-dev_cosine_precision@5': 0.13333333333333336, 'eval_sts-dev_cosine_precision@10': 0.13333333333333333, 'eval_sts-dev_cosine_recall@1': 0.012051734273956496, 'eval_sts-dev_cosine_recall@3': 0.02821869488536155, 'eval_sts-dev_cosine_recall@5': 0.03233392122281011, 'eval_sts-dev_cosine_recall@10': 0.06527834305612083, 'eval_sts-dev_cosine_ndcg@10': 0.1458495452599458, 'eval_sts-dev_cosine_mrr@10': 0.23809523809523808, 'eval_sts-dev_cosine_map@100': 0.09419719689678425, 'eval_runtime': 54.1976, 'eval_samples_per_second': 13.285, 'eval_steps_per_second': 1.661}


[I 2026-01-04 12:06:39,825] Trial 2 finished with value: 0.06527834305612083 and parameters: {'learning_rate': 3.305749542157708e-05, 'weight_decay': 0.03861318175267994, 'warmup_ratio': 0.09864380096271419, 'max_grad_norm': 0.8235775320157928, 'num_train_epochs': 3}. Best is trial 1 with value: 0.17453081897526337.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encod

Step,Training Loss
40,1.9546
80,1.6955
120,1.5537
160,1.3775
200,1.2174
240,1.0067
280,1.0656
320,0.7621
360,0.7276
400,0.7494


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:10<00:00, 10.55s/it]


{'eval_loss': 0.46335354447364807, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@10': 0.4444444444444444, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.2222222222222222, 'eval_sts-dev_cosine_precision@5': 0.17777777777777776, 'eval_sts-dev_cosine_precision@10': 0.15555555555555556, 'eval_sts-dev_cosine_recall@1': 0.012051734273956496, 'eval_sts-dev_cosine_recall@3': 0.05044091710758378, 'eval_sts-dev_cosine_recall@5': 0.16566725455614342, 'eval_sts-dev_cosine_recall@10': 0.19800117577895354, 'eval_sts-dev_cosine_ndcg@10': 0.21564108349937677, 'eval_sts-dev_cosine_mrr@10': 0.30000000000000004, 'eval_sts-dev_cosine_map@100': 0.17312402206760572, 'eval_runtime': 60.7544, 'eval_samples_per_second': 11.851, 'eval_steps_per_second': 1.481}


[I 2026-01-04 12:15:23,345] Trial 3 finished with value: 0.19800117577895354 and parameters: {'learning_rate': 2.4008763114912577e-05, 'weight_decay': 0.0639786658560054, 'warmup_ratio': 0.13520976901078668, 'max_grad_norm': 0.9083848011763486, 'num_train_epochs': 10}. Best is trial 3 with value: 0.19800117577895354.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enco

Step,Training Loss
40,1.9177
80,1.6289
120,1.517
160,1.3405
200,1.2503
240,1.0534


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.58s/it]


{'eval_loss': 0.4271654188632965, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@10': 0.4444444444444444, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.14814814814814814, 'eval_sts-dev_cosine_precision@5': 0.1111111111111111, 'eval_sts-dev_cosine_precision@10': 0.14444444444444446, 'eval_sts-dev_cosine_recall@1': 0.012051734273956496, 'eval_sts-dev_cosine_recall@3': 0.03838918283362728, 'eval_sts-dev_cosine_recall@5': 0.046325690770135215, 'eval_sts-dev_cosine_recall@10': 0.09132184687740244, 'eval_sts-dev_cosine_ndcg@10': 0.15565542226436846, 'eval_sts-dev_cosine_mrr@10': 0.27037037037037037, 'eval_sts-dev_cosine_map@100': 0.10618913599860787, 'eval_runtime': 26.3252, 'eval_samples_per_second': 27.35, 'eval_steps_per_second': 3.419}


[I 2026-01-04 12:18:29,231] Trial 4 finished with value: 0.09132184687740244 and parameters: {'learning_rate': 2.5231364409635767e-05, 'weight_decay': 0.02873923782471332, 'warmup_ratio': 0.17544082335077604, 'max_grad_norm': 0.7311139395028325, 'num_train_epochs': 5}. Best is trial 3 with value: 0.19800117577895354.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enco

Step,Training Loss
40,1.8275
80,1.4675
120,1.3852
160,1.0972
200,1.0243
240,0.7946


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.54s/it]


{'eval_loss': 0.4230436384677887, 'eval_sts-dev_cosine_accuracy@1': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@10': 0.4444444444444444, 'eval_sts-dev_cosine_precision@1': 0.3333333333333333, 'eval_sts-dev_cosine_precision@3': 0.2222222222222222, 'eval_sts-dev_cosine_precision@5': 0.24444444444444446, 'eval_sts-dev_cosine_precision@10': 0.1888888888888889, 'eval_sts-dev_cosine_recall@1': 0.020598742820965042, 'eval_sts-dev_cosine_recall@3': 0.036765703432370095, 'eval_sts-dev_cosine_recall@5': 0.07384796273685162, 'eval_sts-dev_cosine_recall@10': 0.12107809885587664, 'eval_sts-dev_cosine_ndcg@10': 0.21224070030521797, 'eval_sts-dev_cosine_mrr@10': 0.34444444444444444, 'eval_sts-dev_cosine_map@100': 0.15282343537855797, 'eval_runtime': 26.5233, 'eval_samples_per_second': 27.146, 'eval_steps_per_second': 3.393}


[I 2026-01-04 12:21:43,519] Trial 5 finished with value: 0.12107809885587664 and parameters: {'learning_rate': 3.881667226159113e-05, 'weight_decay': 0.05515720194350113, 'warmup_ratio': 0.0556556309422569, 'max_grad_norm': 0.5290798862185233, 'num_train_epochs': 5}. Best is trial 3 with value: 0.19800117577895354.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encode

Step,Training Loss
40,1.96
80,1.7421
120,1.6854
160,1.6183
200,1.532
240,1.405


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.53s/it]


{'eval_loss': 0.4461458921432495, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@5': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.1111111111111111, 'eval_sts-dev_cosine_precision@5': 0.1111111111111111, 'eval_sts-dev_cosine_precision@10': 0.13333333333333333, 'eval_sts-dev_cosine_recall@1': 0.012051734273956496, 'eval_sts-dev_cosine_recall@3': 0.016166960611405056, 'eval_sts-dev_cosine_recall@5': 0.024397413286302173, 'eval_sts-dev_cosine_recall@10': 0.18655994211549765, 'eval_sts-dev_cosine_ndcg@10': 0.1724591662635654, 'eval_sts-dev_cosine_mrr@10': 0.2669753086419753, 'eval_sts-dev_cosine_map@100': 0.08920091005029075, 'eval_runtime': 26.002, 'eval_samples_per_second': 27.69, 'eval_steps_per_second': 3.461}


[I 2026-01-04 12:24:55,704] Trial 6 finished with value: 0.18655994211549765 and parameters: {'learning_rate': 1.0620933794507456e-05, 'weight_decay': 0.03635588704482753, 'warmup_ratio': 0.12884989894318657, 'max_grad_norm': 0.7598787704170058, 'num_train_epochs': 5}. Best is trial 3 with value: 0.19800117577895354.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enco

Step,Training Loss
40,1.9502
80,1.6849
120,1.4971
160,1.2671
200,1.0679
240,0.8044
280,0.7969
320,0.6232
360,0.5354
400,0.6011


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:03<00:00,  3.58s/it]


{'eval_loss': 0.48427870869636536, 'eval_sts-dev_cosine_accuracy@1': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@3': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@5': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@10': 0.4444444444444444, 'eval_sts-dev_cosine_precision@1': 0.3333333333333333, 'eval_sts-dev_cosine_precision@3': 0.25925925925925924, 'eval_sts-dev_cosine_precision@5': 0.2222222222222222, 'eval_sts-dev_cosine_precision@10': 0.14444444444444446, 'eval_sts-dev_cosine_recall@1': 0.020598742820965042, 'eval_sts-dev_cosine_recall@3': 0.059598426265092944, 'eval_sts-dev_cosine_recall@5': 0.07576538687649798, 'eval_sts-dev_cosine_recall@10': 0.1100393433726767, 'eval_sts-dev_cosine_ndcg@10': 0.19206181648008988, 'eval_sts-dev_cosine_mrr@10': 0.3703703703703704, 'eval_sts-dev_cosine_map@100': 0.17846353900861278, 'eval_runtime': 27.2704, 'eval_samples_per_second': 26.402, 'eval_steps_per_second': 3.3}


[I 2026-01-04 12:30:43,679] Trial 7 finished with value: 0.1100393433726767 and parameters: {'learning_rate': 3.524440852268008e-05, 'weight_decay': 0.08347971655397256, 'warmup_ratio': 0.18411295981410541, 'max_grad_norm': 0.5454412406726281, 'num_train_epochs': 10}. Best is trial 3 with value: 0.19800117577895354.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encod

Step,Training Loss
40,1.8694
80,1.5346
120,1.3633
160,1.1653
200,0.9694
240,0.708
280,0.7318
320,0.5759


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:07<00:00,  7.13s/it]


{'eval_loss': 0.44954603910446167, 'eval_sts-dev_cosine_accuracy@1': 0.1111111111111111, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.1111111111111111, 'eval_sts-dev_cosine_precision@3': 0.18518518518518517, 'eval_sts-dev_cosine_precision@5': 0.20000000000000004, 'eval_sts-dev_cosine_precision@10': 0.17777777777777778, 'eval_sts-dev_cosine_recall@1': 0.004115226337448559, 'eval_sts-dev_cosine_recall@3': 0.03265047709492154, 'eval_sts-dev_cosine_recall@5': 0.16053904942793829, 'eval_sts-dev_cosine_recall@10': 0.23381268936824492, 'eval_sts-dev_cosine_ndcg@10': 0.2136758139143756, 'eval_sts-dev_cosine_mrr@10': 0.2444444444444444, 'eval_sts-dev_cosine_map@100': 0.16171511441995806, 'eval_runtime': 44.9039, 'eval_samples_per_second': 16.034, 'eval_steps_per_second': 2.004}


[I 2026-01-04 12:35:58,631] Trial 8 finished with value: 0.23381268936824492 and parameters: {'learning_rate': 4.481427220776346e-05, 'weight_decay': 0.018681887598305823, 'warmup_ratio': 0.10699351633723574, 'max_grad_norm': 0.5582517475197748, 'num_train_epochs': 7}. Best is trial 8 with value: 0.23381268936824492.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enco

Step,Training Loss
40,1.8827
80,1.5642
120,1.3938
160,1.1709
200,0.9138
240,0.6941
280,0.6909
320,0.5385
360,0.4965
400,0.61


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.23s/it]


{'eval_loss': 0.46136242151260376, 'eval_sts-dev_cosine_accuracy@1': 0.1111111111111111, 'eval_sts-dev_cosine_accuracy@3': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@5': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@10': 0.4444444444444444, 'eval_sts-dev_cosine_precision@1': 0.1111111111111111, 'eval_sts-dev_cosine_precision@3': 0.2592592592592593, 'eval_sts-dev_cosine_precision@5': 0.2222222222222222, 'eval_sts-dev_cosine_precision@10': 0.20000000000000004, 'eval_sts-dev_cosine_recall@1': 0.004115226337448559, 'eval_sts-dev_cosine_recall@3': 0.05898792565459232, 'eval_sts-dev_cosine_recall@5': 0.07576538687649798, 'eval_sts-dev_cosine_recall@10': 0.12901460679238458, 'eval_sts-dev_cosine_ndcg@10': 0.20837158948889034, 'eval_sts-dev_cosine_mrr@10': 0.25925925925925924, 'eval_sts-dev_cosine_map@100': 0.17943237040210175, 'eval_runtime': 31.076, 'eval_samples_per_second': 23.169, 'eval_steps_per_second': 2.896}


[I 2026-01-04 12:42:31,368] Trial 9 finished with value: 0.12901460679238458 and parameters: {'learning_rate': 4.44362668830459e-05, 'weight_decay': 0.07644873292094105, 'warmup_ratio': 0.08770045454444267, 'max_grad_norm': 0.7461189353702702, 'num_train_epochs': 10}. Best is trial 8 with value: 0.23381268936824492.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encod

Step,Training Loss
40,1.9752
80,1.7345
120,1.6378
160,1.514
200,1.4102
240,1.2203
280,1.3301
320,1.2191
360,1.204


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:12<00:00, 12.72s/it]


{'eval_loss': 0.423377126455307, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@10': 0.4444444444444444, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.18518518518518517, 'eval_sts-dev_cosine_precision@5': 0.17777777777777776, 'eval_sts-dev_cosine_precision@10': 0.15555555555555556, 'eval_sts-dev_cosine_recall@1': 0.012051734273956496, 'eval_sts-dev_cosine_recall@3': 0.042504409171075834, 'eval_sts-dev_cosine_recall@5': 0.0586713697824809, 'eval_sts-dev_cosine_recall@10': 0.0960475738253516, 'eval_sts-dev_cosine_ndcg@10': 0.16929536599952635, 'eval_sts-dev_cosine_mrr@10': 0.271604938271605, 'eval_sts-dev_cosine_map@100': 0.12717345594367999, 'eval_runtime': 69.5385, 'eval_samples_per_second': 10.354, 'eval_steps_per_second': 1.294}


[I 2026-01-04 12:49:09,289] Trial 10 finished with value: 0.0960475738253516 and parameters: {'learning_rate': 1.6009592922779148e-05, 'weight_decay': 0.013124712974743321, 'warmup_ratio': 0.1531321794002561, 'max_grad_norm': 0.6015599074781625, 'num_train_epochs': 8}. Best is trial 8 with value: 0.23381268936824492.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enco

Step,Training Loss
40,1.9427
80,1.6804
120,1.5584
160,1.4274
200,1.3035
240,1.0657
280,1.1603
320,1.0376
360,1.0081


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:12<00:00, 12.18s/it]


{'eval_loss': 0.41803425550460815, 'eval_sts-dev_cosine_accuracy@1': 0.1111111111111111, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.1111111111111111, 'eval_sts-dev_cosine_precision@3': 0.1111111111111111, 'eval_sts-dev_cosine_precision@5': 0.15555555555555556, 'eval_sts-dev_cosine_precision@10': 0.15555555555555556, 'eval_sts-dev_cosine_recall@1': 0.004115226337448559, 'eval_sts-dev_cosine_recall@3': 0.03427395649617872, 'eval_sts-dev_cosine_recall@5': 0.07648442092886537, 'eval_sts-dev_cosine_recall@10': 0.2211504544837878, 'eval_sts-dev_cosine_ndcg@10': 0.18746048675771756, 'eval_sts-dev_cosine_mrr@10': 0.2361111111111111, 'eval_sts-dev_cosine_map@100': 0.12564987518767798, 'eval_runtime': 63.8752, 'eval_samples_per_second': 11.272, 'eval_steps_per_second': 1.409}


[I 2026-01-04 12:55:39,072] Trial 11 finished with value: 0.2211504544837878 and parameters: {'learning_rate': 2.119476921266994e-05, 'weight_decay': 0.06241016459553604, 'warmup_ratio': 0.12387983002574698, 'max_grad_norm': 0.9612691535017999, 'num_train_epochs': 8}. Best is trial 8 with value: 0.23381268936824492.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encod

Step,Training Loss
40,1.9251
80,1.6706
120,1.5515
160,1.4494
200,1.3467
240,1.1645
280,1.2876
320,1.201


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:07<00:00,  7.74s/it]


{'eval_loss': 0.43358680605888367, 'eval_sts-dev_cosine_accuracy@1': 0.1111111111111111, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.1111111111111111, 'eval_sts-dev_cosine_precision@3': 0.14814814814814814, 'eval_sts-dev_cosine_precision@5': 0.1111111111111111, 'eval_sts-dev_cosine_precision@10': 0.16666666666666663, 'eval_sts-dev_cosine_recall@1': 0.007936507936507936, 'eval_sts-dev_cosine_recall@3': 0.12727807172251615, 'eval_sts-dev_cosine_recall@5': 0.13139329805996472, 'eval_sts-dev_cosine_recall@10': 0.22465518021073574, 'eval_sts-dev_cosine_ndcg@10': 0.20502840413067921, 'eval_sts-dev_cosine_mrr@10': 0.23809523809523808, 'eval_sts-dev_cosine_map@100': 0.15311991867035382, 'eval_runtime': 39.6906, 'eval_samples_per_second': 18.14, 'eval_steps_per_second': 2.268}


[I 2026-01-04 13:00:35,548] Trial 12 finished with value: 0.22465518021073574 and parameters: {'learning_rate': 1.8240995963110922e-05, 'weight_decay': 0.09874664134664762, 'warmup_ratio': 0.0999724069653248, 'max_grad_norm': 0.6325094738820446, 'num_train_epochs': 7}. Best is trial 8 with value: 0.23381268936824492.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enco

Step,Training Loss
40,1.9214
80,1.6604
120,1.5356
160,1.4252
200,1.3384
240,1.1477
280,1.294
320,1.1864


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:05<00:00,  5.68s/it]


{'eval_loss': 0.4368283152580261, 'eval_sts-dev_cosine_accuracy@1': 0.1111111111111111, 'eval_sts-dev_cosine_accuracy@3': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@5': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.1111111111111111, 'eval_sts-dev_cosine_precision@3': 0.1111111111111111, 'eval_sts-dev_cosine_precision@5': 0.1111111111111111, 'eval_sts-dev_cosine_precision@10': 0.16666666666666666, 'eval_sts-dev_cosine_recall@1': 0.004115226337448559, 'eval_sts-dev_cosine_recall@3': 0.016166960611405056, 'eval_sts-dev_cosine_recall@5': 0.03203997648442093, 'eval_sts-dev_cosine_recall@10': 0.21036946592502148, 'eval_sts-dev_cosine_ndcg@10': 0.18971109549742082, 'eval_sts-dev_cosine_mrr@10': 0.21494708994708991, 'eval_sts-dev_cosine_map@100': 0.13166637096223963, 'eval_runtime': 35.6228, 'eval_samples_per_second': 20.212, 'eval_steps_per_second': 2.526}


[I 2026-01-04 13:05:58,531] Trial 13 finished with value: 0.21036946592502148 and parameters: {'learning_rate': 1.8213748517391996e-05, 'weight_decay': 0.09911914992277315, 'warmup_ratio': 0.09416079657814064, 'max_grad_norm': 0.6185226223920901, 'num_train_epochs': 7}. Best is trial 8 with value: 0.23381268936824492.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enc

Step,Training Loss
40,1.8948
80,1.5932
120,1.4893
160,1.2585
200,1.1114
240,0.8676
280,0.9885
320,0.74


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]


{'eval_loss': 0.40545687079429626, 'eval_sts-dev_cosine_accuracy@1': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@10': 0.4444444444444444, 'eval_sts-dev_cosine_precision@1': 0.3333333333333333, 'eval_sts-dev_cosine_precision@3': 0.2222222222222222, 'eval_sts-dev_cosine_precision@5': 0.2, 'eval_sts-dev_cosine_precision@10': 0.17777777777777776, 'eval_sts-dev_cosine_recall@1': 0.03427395649617872, 'eval_sts-dev_cosine_recall@3': 0.05044091710758378, 'eval_sts-dev_cosine_recall@5': 0.17360376249265136, 'eval_sts-dev_cosine_recall@10': 0.2281599059376837, 'eval_sts-dev_cosine_ndcg@10': 0.25580872808689054, 'eval_sts-dev_cosine_mrr@10': 0.3611111111111111, 'eval_sts-dev_cosine_map@100': 0.19412605809565242, 'eval_runtime': 38.0251, 'eval_samples_per_second': 18.935, 'eval_steps_per_second': 2.367}


[I 2026-01-04 13:10:53,557] Trial 14 finished with value: 0.2281599059376837 and parameters: {'learning_rate': 2.957310669752699e-05, 'weight_decay': 0.017492853519975777, 'warmup_ratio': 0.10854236410882438, 'max_grad_norm': 0.6356174549758085, 'num_train_epochs': 7}. Best is trial 8 with value: 0.23381268936824492.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enco

Step,Training Loss
40,1.8351
80,1.4194
120,1.2543
160,0.9273
200,0.7116
240,0.5428
280,0.6228
320,0.5221


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.49s/it]


{'eval_loss': 0.4794270098209381, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@5': 0.5555555555555556, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.2592592592592593, 'eval_sts-dev_cosine_precision@5': 0.2666666666666667, 'eval_sts-dev_cosine_precision@10': 0.21111111111111114, 'eval_sts-dev_cosine_recall@1': 0.02633744855967078, 'eval_sts-dev_cosine_recall@3': 0.05898792565459232, 'eval_sts-dev_cosine_recall@5': 0.194813005924117, 'eval_sts-dev_cosine_recall@10': 0.27251842807398363, 'eval_sts-dev_cosine_ndcg@10': 0.28315354091008627, 'eval_sts-dev_cosine_mrr@10': 0.3611111111111111, 'eval_sts-dev_cosine_map@100': 0.20286949962284767, 'eval_runtime': 30.5611, 'eval_samples_per_second': 23.559, 'eval_steps_per_second': 2.945}


[I 2026-01-04 13:15:58,692] Trial 15 finished with value: 0.27251842807398363 and parameters: {'learning_rate': 4.890585631921125e-05, 'weight_decay': 0.010800797401617856, 'warmup_ratio': 0.0765080638605733, 'max_grad_norm': 0.5013390304609416, 'num_train_epochs': 7}. Best is trial 15 with value: 0.27251842807398363.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enc

Step,Training Loss
40,1.8364
80,1.4364
120,1.3012
160,0.996
200,0.7622
240,0.626
280,0.733


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:04<00:00,  4.94s/it]


{'eval_loss': 0.4111362397670746, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@5': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@10': 0.4444444444444444, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.2592592592592593, 'eval_sts-dev_cosine_precision@5': 0.2, 'eval_sts-dev_cosine_precision@10': 0.17777777777777776, 'eval_sts-dev_cosine_recall@1': 0.012662234884457106, 'eval_sts-dev_cosine_recall@3': 0.1484873151539818, 'eval_sts-dev_cosine_recall@5': 0.16114955003843892, 'eval_sts-dev_cosine_recall@10': 0.20646226201781756, 'eval_sts-dev_cosine_ndcg@10': 0.25246467392391536, 'eval_sts-dev_cosine_mrr@10': 0.31481481481481477, 'eval_sts-dev_cosine_map@100': 0.21046999000919103, 'eval_runtime': 29.3319, 'eval_samples_per_second': 24.547, 'eval_steps_per_second': 3.068}


[I 2026-01-04 13:19:52,214] Trial 16 finished with value: 0.20646226201781756 and parameters: {'learning_rate': 4.714612267183037e-05, 'weight_decay': 0.021105657644510604, 'warmup_ratio': 0.07411727571172634, 'max_grad_norm': 0.5003133766692207, 'num_train_epochs': 6}. Best is trial 15 with value: 0.27251842807398363.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'en

Step,Training Loss
40,1.8159
80,1.4229
120,1.2311
160,0.9695


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]


{'eval_loss': 0.41396859288215637, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@10': 0.4444444444444444, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.2222222222222222, 'eval_sts-dev_cosine_precision@5': 0.24444444444444446, 'eval_sts-dev_cosine_precision@10': 0.18888888888888888, 'eval_sts-dev_cosine_recall@1': 0.012051734273956496, 'eval_sts-dev_cosine_recall@3': 0.05044091710758378, 'eval_sts-dev_cosine_recall@5': 0.0830913942025053, 'eval_sts-dev_cosine_recall@10': 0.13475331253109032, 'eval_sts-dev_cosine_ndcg@10': 0.21436115424660712, 'eval_sts-dev_cosine_mrr@10': 0.30000000000000004, 'eval_sts-dev_cosine_map@100': 0.15402947916523368, 'eval_runtime': 37.6509, 'eval_samples_per_second': 19.123, 'eval_steps_per_second': 2.39}


[I 2026-01-04 13:23:36,548] Trial 17 finished with value: 0.13475331253109032 and parameters: {'learning_rate': 4.819804776100613e-05, 'weight_decay': 0.011193435549142865, 'warmup_ratio': 0.07430928210629331, 'max_grad_norm': 0.5473684287520074, 'num_train_epochs': 4}. Best is trial 15 with value: 0.27251842807398363.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'en

Step,Training Loss
40,1.8666
80,1.5203
120,1.421
160,1.162
200,0.9611
240,0.706
280,0.7708
320,0.5227
360,0.5027


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:06<00:00,  6.06s/it]


{'eval_loss': 0.445940226316452, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@5': 0.5555555555555556, 'eval_sts-dev_cosine_accuracy@10': 0.6666666666666666, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.25925925925925924, 'eval_sts-dev_cosine_precision@5': 0.22222222222222227, 'eval_sts-dev_cosine_precision@10': 0.2, 'eval_sts-dev_cosine_recall@1': 0.012051734273956496, 'eval_sts-dev_cosine_recall@3': 0.1478768145434812, 'eval_sts-dev_cosine_recall@5': 0.1821507710396599, 'eval_sts-dev_cosine_recall@10': 0.2458644236422014, 'eval_sts-dev_cosine_ndcg@10': 0.26340519645847693, 'eval_sts-dev_cosine_mrr@10': 0.3611111111111111, 'eval_sts-dev_cosine_map@100': 0.1985251904435118, 'eval_runtime': 39.3006, 'eval_samples_per_second': 18.32, 'eval_steps_per_second': 2.29}


[I 2026-01-04 13:29:34,398] Trial 18 finished with value: 0.2458644236422014 and parameters: {'learning_rate': 3.903330335485093e-05, 'weight_decay': 0.044324840264284585, 'warmup_ratio': 0.0737898083064729, 'max_grad_norm': 0.5814222614025382, 'num_train_epochs': 8}. Best is trial 15 with value: 0.27251842807398363.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enco

Step,Training Loss
40,1.8741
80,1.5361
120,1.3573
160,1.1054
200,0.898
240,0.6423
280,0.6649
320,0.554
360,0.4471
400,0.5676


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:18<00:00, 18.44s/it]


{'eval_loss': 0.53395676612854, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.2222222222222222, 'eval_sts-dev_cosine_precision@5': 0.17777777777777776, 'eval_sts-dev_cosine_precision@10': 0.18888888888888888, 'eval_sts-dev_cosine_recall@1': 0.012051734273956496, 'eval_sts-dev_cosine_recall@3': 0.05044091710758378, 'eval_sts-dev_cosine_recall@5': 0.0586713697824809, 'eval_sts-dev_cosine_recall@10': 0.23731741509519283, 'eval_sts-dev_cosine_ndcg@10': 0.23041291580965229, 'eval_sts-dev_cosine_mrr@10': 0.3027777777777778, 'eval_sts-dev_cosine_map@100': 0.1648766473559311, 'eval_runtime': 69.3921, 'eval_samples_per_second': 10.376, 'eval_steps_per_second': 1.297}


[I 2026-01-04 13:38:33,528] Trial 19 finished with value: 0.23731741509519283 and parameters: {'learning_rate': 3.793025413027487e-05, 'weight_decay': 0.04473207350417616, 'warmup_ratio': 0.07326011861969792, 'max_grad_norm': 0.6689413499652381, 'num_train_epochs': 9}. Best is trial 15 with value: 0.27251842807398363.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enc

Step,Training Loss
40,1.8749
80,1.5789
120,1.4468
160,1.2631
200,1.1027
240,0.8526
280,0.9096
320,0.698
360,0.731
400,0.6798


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:10<00:00, 10.17s/it]


{'eval_loss': 0.4347043037414551, 'eval_sts-dev_cosine_accuracy@1': 0.1111111111111111, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.1111111111111111, 'eval_sts-dev_cosine_precision@3': 0.18518518518518517, 'eval_sts-dev_cosine_precision@5': 0.17777777777777776, 'eval_sts-dev_cosine_precision@10': 0.2, 'eval_sts-dev_cosine_recall@1': 0.004115226337448559, 'eval_sts-dev_cosine_recall@3': 0.042504409171075834, 'eval_sts-dev_cosine_recall@5': 0.06249265138154028, 'eval_sts-dev_cosine_recall@10': 0.24968570524126082, 'eval_sts-dev_cosine_ndcg@10': 0.23623685948103837, 'eval_sts-dev_cosine_mrr@10': 0.25661375661375657, 'eval_sts-dev_cosine_map@100': 0.1683845085375392, 'eval_runtime': 65.8867, 'eval_samples_per_second': 10.928, 'eval_steps_per_second': 1.366}


[I 2026-01-04 13:47:05,819] Trial 20 finished with value: 0.24968570524126082 and parameters: {'learning_rate': 2.8068401086894654e-05, 'weight_decay': 0.04698765510395305, 'warmup_ratio': 0.05124379716477055, 'max_grad_norm': 0.5009236516133213, 'num_train_epochs': 9}. Best is trial 15 with value: 0.27251842807398363.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'en

Step,Training Loss
40,1.8693
80,1.57
120,1.4366
160,1.1751
200,1.0341
240,0.8156
280,0.872
320,0.6039
360,0.6011
400,0.6044


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:10<00:00, 10.64s/it]


{'eval_loss': 0.4603928029537201, 'eval_sts-dev_cosine_accuracy@1': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.3333333333333333, 'eval_sts-dev_cosine_precision@3': 0.18518518518518517, 'eval_sts-dev_cosine_precision@5': 0.2, 'eval_sts-dev_cosine_precision@10': 0.18888888888888888, 'eval_sts-dev_cosine_recall@1': 0.03427395649617872, 'eval_sts-dev_cosine_recall@3': 0.042504409171075834, 'eval_sts-dev_cosine_recall@5': 0.06721837832948944, 'eval_sts-dev_cosine_recall@10': 0.24113869669425225, 'eval_sts-dev_cosine_ndcg@10': 0.24934925457204096, 'eval_sts-dev_cosine_mrr@10': 0.3714285714285714, 'eval_sts-dev_cosine_map@100': 0.1783763748008684, 'eval_runtime': 62.1964, 'eval_samples_per_second': 11.576, 'eval_steps_per_second': 1.447}


[I 2026-01-04 13:54:26,937] Trial 21 finished with value: 0.24113869669425225 and parameters: {'learning_rate': 2.9172756085030926e-05, 'weight_decay': 0.04855095567449332, 'warmup_ratio': 0.05016085581775753, 'max_grad_norm': 0.5018701496819169, 'num_train_epochs': 9}. Best is trial 15 with value: 0.27251842807398363.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'en

Step,Training Loss
40,1.8778
80,1.5731
120,1.4231
160,1.3125
200,1.111
240,0.8623
280,0.9391
320,0.7147
360,0.7537


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:09<00:00,  9.46s/it]


{'eval_loss': 0.425436794757843, 'eval_sts-dev_cosine_accuracy@1': 0.1111111111111111, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@10': 0.6666666666666666, 'eval_sts-dev_cosine_precision@1': 0.1111111111111111, 'eval_sts-dev_cosine_precision@3': 0.18518518518518517, 'eval_sts-dev_cosine_precision@5': 0.17777777777777776, 'eval_sts-dev_cosine_precision@10': 0.17777777777777778, 'eval_sts-dev_cosine_recall@1': 0.004115226337448559, 'eval_sts-dev_cosine_recall@3': 0.046325690770135215, 'eval_sts-dev_cosine_recall@5': 0.06249265138154028, 'eval_sts-dev_cosine_recall@10': 0.21952697508253063, 'eval_sts-dev_cosine_ndcg@10': 0.20704628482999607, 'eval_sts-dev_cosine_mrr@10': 0.26432980599647266, 'eval_sts-dev_cosine_map@100': 0.15307262796557874, 'eval_runtime': 49.0083, 'eval_samples_per_second': 14.691, 'eval_steps_per_second': 1.836}


[I 2026-01-04 14:00:35,756] Trial 22 finished with value: 0.21952697508253063 and parameters: {'learning_rate': 3.0255348919761388e-05, 'weight_decay': 0.05441466150215371, 'warmup_ratio': 0.06980712881799675, 'max_grad_norm': 0.5817314341453028, 'num_train_epochs': 8}. Best is trial 15 with value: 0.27251842807398363.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'en

Step,Training Loss
40,1.8777
80,1.5764
120,1.3828
160,1.1651
200,0.9875
240,0.7435
280,0.7881
320,0.6682
360,0.538
400,0.5753


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:12<00:00, 12.34s/it]


{'eval_loss': 0.4586995542049408, 'eval_sts-dev_cosine_accuracy@1': 0.1111111111111111, 'eval_sts-dev_cosine_accuracy@3': 0.3333333333333333, 'eval_sts-dev_cosine_accuracy@5': 0.4444444444444444, 'eval_sts-dev_cosine_accuracy@10': 0.4444444444444444, 'eval_sts-dev_cosine_precision@1': 0.1111111111111111, 'eval_sts-dev_cosine_precision@3': 0.14814814814814814, 'eval_sts-dev_cosine_precision@5': 0.17777777777777776, 'eval_sts-dev_cosine_precision@10': 0.16666666666666666, 'eval_sts-dev_cosine_recall@1': 0.004115226337448559, 'eval_sts-dev_cosine_recall@3': 0.024713969158413604, 'eval_sts-dev_cosine_recall@5': 0.06753493420160088, 'eval_sts-dev_cosine_recall@10': 0.13095464206575316, 'eval_sts-dev_cosine_ndcg@10': 0.1689766081947906, 'eval_sts-dev_cosine_mrr@10': 0.25, 'eval_sts-dev_cosine_map@100': 0.13957577582698347, 'eval_runtime': 64.928, 'eval_samples_per_second': 11.089, 'eval_steps_per_second': 1.386}


[I 2026-01-04 14:08:25,720] Trial 23 finished with value: 0.13095464206575316 and parameters: {'learning_rate': 3.9933733637536604e-05, 'weight_decay': 0.0254051217551559, 'warmup_ratio': 0.08234597476389718, 'max_grad_norm': 0.5105163755160586, 'num_train_epochs': 9}. Best is trial 15 with value: 0.27251842807398363.
Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'enc

Step,Training Loss
40,1.8693
80,1.5836
120,1.4811
160,1.3228
200,1.25
240,1.0166
280,1.1464


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:06<00:00,  6.16s/it]


{'eval_loss': 0.41465243697166443, 'eval_sts-dev_cosine_accuracy@1': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@3': 0.2222222222222222, 'eval_sts-dev_cosine_accuracy@5': 0.5555555555555556, 'eval_sts-dev_cosine_accuracy@10': 0.5555555555555556, 'eval_sts-dev_cosine_precision@1': 0.2222222222222222, 'eval_sts-dev_cosine_precision@3': 0.14814814814814814, 'eval_sts-dev_cosine_precision@5': 0.2222222222222222, 'eval_sts-dev_cosine_precision@10': 0.2, 'eval_sts-dev_cosine_recall@1': 0.012051734273956496, 'eval_sts-dev_cosine_recall@3': 0.020282186948853614, 'eval_sts-dev_cosine_recall@5': 0.17832948944060054, 'eval_sts-dev_cosine_recall@10': 0.22836792836792835, 'eval_sts-dev_cosine_ndcg@10': 0.23898477030516066, 'eval_sts-dev_cosine_mrr@10': 0.30000000000000004, 'eval_sts-dev_cosine_map@100': 0.1512150030499169, 'eval_runtime': 35.8248, 'eval_samples_per_second': 20.098, 'eval_steps_per_second': 2.512}


[I 2026-01-04 14:12:27,142] Trial 24 finished with value: 0.22836792836792835 and parameters: {'learning_rate': 2.653783078613784e-05, 'weight_decay': 0.041641233385403445, 'warmup_ratio': 0.06315440369727221, 'max_grad_norm': 0.5784105499771178, 'num_train_epochs': 6}. Best is trial 15 with value: 0.27251842807398363.


BestRun(run_id='15', objective=0.27251842807398363, hyperparameters={'learning_rate': 4.890585631921125e-05, 'weight_decay': 0.010800797401617856, 'warmup_ratio': 0.0765080638605733, 'max_grad_norm': 0.5013390304609416, 'num_train_epochs': 7}, run_summary=None)


In [None]:
if TRAIN:
    train(train_dataset, valid_dataset, model_cl_kwargs)

In [None]:
if TRAIN:
    train(train_dataset, valid_dataset, model_mnlr_kwargs)

In [70]:
if TRAIN:
    train(train_dataset, valid_dataset, model_mnlr2_kwargs)

Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Some weights of BertModel were not initialized from the model checkpoint at models\jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
50,No log,0.346475,0.222222,0.333333,0.333333,0.444444,0.222222,0.148148,0.133333,0.111111,0.012052,0.024714,0.032944,0.071334,0.127283,0.271605,0.073633
100,1.627300,0.363814,0.111111,0.222222,0.222222,0.333333,0.111111,0.148148,0.133333,0.133333,0.004115,0.020282,0.032334,0.065278,0.135533,0.179012,0.103947
150,1.627300,0.349454,0.222222,0.444444,0.555556,0.555556,0.222222,0.259259,0.222222,0.2,0.012662,0.045313,0.079587,0.129625,0.218582,0.361111,0.164131
200,1.029500,0.366128,0.111111,0.333333,0.333333,0.444444,0.111111,0.148148,0.2,0.177778,0.004115,0.024714,0.057975,0.099466,0.177054,0.240741,0.156489
250,1.029500,0.401706,0.222222,0.333333,0.444444,0.444444,0.222222,0.222222,0.244444,0.2,0.012662,0.037376,0.071248,0.11595,0.214017,0.305556,0.178424
300,0.603000,0.428423,0.222222,0.444444,0.444444,0.444444,0.222222,0.296296,0.266667,0.188889,0.012662,0.05447,0.079184,0.111835,0.213354,0.333333,0.165686
350,0.603000,0.515953,0.333333,0.333333,0.333333,0.777778,0.333333,0.222222,0.222222,0.211111,0.020599,0.037376,0.06209,0.288355,0.268694,0.385317,0.18183
400,0.497200,0.452074,0.333333,0.333333,0.333333,0.666667,0.333333,0.222222,0.222222,0.211111,0.020599,0.037376,0.06209,0.343911,0.285478,0.375441,0.19736
450,0.497200,0.46971,0.333333,0.333333,0.333333,0.444444,0.333333,0.259259,0.222222,0.2,0.020599,0.045923,0.06209,0.129625,0.224893,0.344444,0.188016


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:06<00:00,  6.89s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:06<00:00,  6.87s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:15<00:00, 15.33s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:17<00:00, 17.87s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:19<00:00, 19.69s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:18<00:00, 18.12s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:15<00:00, 15.03s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:22<00:00, 22.77s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:17<00:00, 17.10s/it]


In [71]:
from sentence_transformers.util import cos_sim

def eval_retriever(test_dataset,model_kwargs=None):
    if model_kwargs is None:
        model = model_init()
    else:
        model = SentenceTransformer(model_kwargs['trained_model_path'])
    
    ev = get_ret_eval(test_dataset)

    metrics = ev(model)
    print(metrics)

    test_cases = [
        ("It is raining", "It is not raining", 1),  # Contradict - should be CLOSE
        ("It is raining", "The weather is wet", 0),  # Confirm - should be FAR
    ]

    for s1, s2, expected in test_cases:
        emb1 = model.encode(s1)
        emb2 = model.encode(s2)
        sim = cos_sim(emb1, emb2).item()
        print(f"'{s1}' vs '{s2}'")
        print(f"  Similarity: {sim:.3f} (Expected: {'HIGH' if expected==1 else 'LOW'})")
        print()

In [231]:
from sentence_transformers import util
import numpy as np
from numpy import hstack

class OurClassifier:
    def __init__(self, retriever, final_classifier, corpus):
        self.retriever = retriever
        self.final_classifier = final_classifier
        self.corpus = corpus
        self.embeded_corpus = self.retriever.encode(corpus, convert_to_tensor=True)
        self.k = min(len(corpus), 10)

    def predict(self, x):
        embedded_x = self.retriever.encode(x, convert_to_tensor=True)
        top_k = util.semantic_search(embedded_x, self.embeded_corpus, top_k=self.k, score_function=self.retriever.similarity)
        predictions = []
        vectorizer_premise = TfidfVectorizer()
        vectorizer_hypothesis = TfidfVectorizer()
        for query_id, query in enumerate(x):
            prediction = 0  # Default to 0 if no match found
            for res in top_k[query_id]:
                corpus_id = res['corpus_id']
                score = res['score']
                
                # Get premise embedding and move to CPU, convert to numpy
                premise_embedding = self.embeded_corpus[corpus_id].cpu().numpy()
                
                # Get current query embedding, move to CPU, convert to numpy
                query_embedding = embedded_x[query_id].cpu().numpy()
                
                # Stack the embeddings
                combined_features = hstack([query, self.corpus[corpus_id]])
                
                # Make prediction
                prediction = self.final_classifier.predict([combined_features])[0]
                
                if prediction == 1:
                    break
                train_data_preprocessed = train_data.copy()
                test_data_preprocessed = test_data.copy()

                train_data_vectorised = train_data.copy()

                X_premise= vectorizer_premise.fit_transform(query)
                X_hypothesis = vectorizer_hypothesis.fit_transform(self.corpus[corpus_id])
                train_data_vectorised = hstack([X_premise, X_hypothesis])

                Y_premise = vectorizer_premise.transform(test_data_preprocessed["premise"])
                Y_hypothesis = vectorizer_hypothesis.transform(test_data_preprocessed["hypothesis"])
                test_data_vectorised = hstack([Y_premise, Y_hypothesis])
            predictions.append(prediction)
        
        return np.array(predictions)

In [232]:
from sentence_transformers import SentenceTransformer

retr = SentenceTransformer(model_mnlr2_kwargs['trained_model_path'])
m = OurClassifier(retr, svm_model, test_dataset['premise'])

In [233]:
print("predicting")
predictions = m.predict(np.array(list(set(test_dataset['hypothesis']))))
print("predicted")
h = set(test_dataset['hypothesis'])
dlabels = dict(zip(h, [0]*len(h)))

for i in range(len(test_dataset)):
    if test_dataset[i]['label'] == 1:
        dlabels[test_dataset[i]['X_hypothesis']] = 1

labels = np.array([v for k,v in dlabels.items()])
print(labels)
report_dict = classification_report(labels, predictions, zero_division=0, output_dict=True)
pretty_print_report_dict(report_dict)

predicting


ValueError: could not convert string to float: np.str_('Receiving Party may acquire information similar to Confidential Information from a third party.')

In [177]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

model = SentenceTransformer(model_mnlr2_kwargs['trained_model_path'])
em_sim_ev = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset['hypothesis'],
    sentences2=test_dataset['premise'],
    scores=test_dataset['label'],
    main_similarity=SimilarityFunction.COSINE,
    show_progress_bar=True,
)

results = em_sim_ev(model)
print(results)

Batches:   0%|          | 0/131 [00:00<?, ?it/s]

Batches:   0%|          | 0/131 [00:00<?, ?it/s]

{'pearson_cosine': 0.3273217698261732, 'spearman_cosine': 0.3149717892857355}


In [78]:
model = SentenceTransformer(model_mnlr2_kwargs['trained_model_path'])
ev = get_ret_eval(valid_dataset)
res = ev(model)
for k,v in res.items():
    print(k,v)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:39<00:00, 39.93s/it]

cosine_accuracy@1 0.3333333333333333
cosine_accuracy@3 0.3333333333333333
cosine_accuracy@5 0.3333333333333333
cosine_accuracy@10 0.6666666666666666
cosine_precision@1 0.3333333333333333
cosine_precision@3 0.2222222222222222
cosine_precision@5 0.2222222222222222
cosine_precision@10 0.2111111111111111
cosine_recall@1 0.020598742820965042
cosine_recall@3 0.03737620404287071
cosine_recall@5 0.06209017320128431
cosine_recall@10 0.3439108216885995
cosine_ndcg@10 0.285478223191996
cosine_mrr@10 0.37544091710758376
cosine_map@100 0.19736013904280394





In [79]:
for k,v in res.items():
    print(k,v)

cosine_accuracy@1 0.3333333333333333
cosine_accuracy@3 0.3333333333333333
cosine_accuracy@5 0.3333333333333333
cosine_accuracy@10 0.6666666666666666
cosine_precision@1 0.3333333333333333
cosine_precision@3 0.2222222222222222
cosine_precision@5 0.2222222222222222
cosine_precision@10 0.2111111111111111
cosine_recall@1 0.020598742820965042
cosine_recall@3 0.03737620404287071
cosine_recall@5 0.06209017320128431
cosine_recall@10 0.3439108216885995
cosine_ndcg@10 0.285478223191996
cosine_mrr@10 0.37544091710758376
cosine_map@100 0.19736013904280394


In [76]:
for k,v in res.items():
    print(k,v)

cosine_accuracy@1 0.1
cosine_accuracy@3 0.2
cosine_accuracy@5 0.5
cosine_accuracy@10 0.5
cosine_precision@1 0.1
cosine_precision@3 0.13333333333333333
cosine_precision@5 0.22000000000000003
cosine_precision@10 0.22000000000000003
cosine_recall@1 0.0011363636363636363
cosine_recall@3 0.006742424242424242
cosine_recall@5 0.02645959595959596
cosine_recall@10 0.053585858585858584
cosine_ndcg@10 0.19891295583084478
cosine_mrr@10 0.2033333333333333
cosine_map@100 0.14136472872345446
