In [1]:
import pandas as pd
import numpy as np
import datasets
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
from sklearn.model_selection import RandomizedSearchCV
from tabulate import tabulate

# Fetch the NLTK resources this notebook relies on (tokenizer data,
# stop-word lists and WordNet); already-present packages are reported as up to date.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Locations of the JSON Lines splits.
train_data_path = "./data/English dataset/train.jsonl"
test_data_path = "./data/English dataset/test.jsonl"

# Single lemmatizer instance shared by preprocess_text.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):  # From the labs
    """Normalise a text: lowercase, tokenize, filter, lemmatize, re-join.

    Steps, in order:
      1. lowercase the text and split it with ``word_tokenize``;
      2. keep only purely alphabetic tokens (this already discards every
         token containing punctuation or digits, so no separate
         punctuation-stripping pass is needed);
      3. drop English stop words;
      4. lemmatize each remaining token with the module-level ``lemmatizer``.

    Args:
        text: the string to clean.

    Returns:
        The surviving lemmas joined by single spaces (empty string when
        nothing survives).
    """
    # Build the stop-word set once and reuse it on later calls instead of
    # re-reading the corpus list for every text.
    stop_words = getattr(preprocess_text, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._stop_words = stop_words

    tokens = word_tokenize(text.lower())
    kept = [tok for tok in tokens if tok.isalpha() and tok not in stop_words]

    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)

# Binary target: "Contradiction" is the positive class (1); the other two
# labels are merged into class 0.
label_map = {"Contradiction": 1, "Entailment": 0, "NotMentioned": 0}

# --- training split ---
# Each file is loaded on its own, and the loaded records are read from the
# "train" key of the result (for the test file as well).
train_data = pd.DataFrame(datasets.load_dataset("json", data_files=train_data_path)["train"])
train_data["label"] = train_data["label"].map(label_map)
# Identifier columns are not used as model inputs.
train_data = train_data.drop(["doc_id", "key"], axis=1)

# --- test split ---
test_data = pd.DataFrame(datasets.load_dataset("json", data_files=test_data_path)["train"])
test_data["label"] = test_data["label"].map(label_map)
test_data = test_data.drop(["doc_id", "key"], axis=1)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\timna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\timna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\timna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Checkpoint to start from: a locally saved model directory.
# Alternative model name kept for reference: "kiddothe2b/longformer-mini-1024"
model_name = "./trained_model_ex3_f1_class1_weighted"

# Tokenizer and 2-label sequence classifier are both restored from model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [3]:
def preprocess_function(examples):
    """Tokenize premise/hypothesis pairs and add a global attention mask.

    Intended for ``Dataset.map(..., batched=True)``: ``examples["premise"]``
    and ``examples["hypothesis"]`` are passed to the module-level
    ``tokenizer`` as text pairs.

    Every pair is padded/truncated to 1024 tokens; only the first segment
    (the premise) is truncated, so the hypothesis is kept whole.

    Args:
        examples: batch mapping with "premise" and "hypothesis" entries.

    Returns:
        The tokenizer output with an extra "global_attention_mask" entry:
        one list per example, 1 at position 0 and 0 everywhere else.
        No "labels" entry is added here.
        NOTE(review): training presumably relies on the dataset's existing
        "label" column being carried through to the Trainer — confirm.
    """
    inputs = tokenizer(
        examples["premise"],
        examples["hypothesis"],
        max_length=1024,
        truncation="only_first",
        padding="max_length",
    )

    # 0 = local attention, 1 = global attention. Only the first token of
    # each sequence is given global attention.
    inputs["global_attention_mask"] = [
        [1 if position == 0 else 0 for position in range(len(ids))]
        for ids in inputs["input_ids"]
    ]
    return inputs

In [4]:
# Wrap the pandas frames as Dataset objects.
dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(test_data)

# Tokenize both splits batch-wise with preprocess_function.
# Note that the evaluation dataset is built from test_data.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 7191/7191 [00:10<00:00, 706.79 examples/s]
Map: 100%|██████████| 2091/2091 [00:02<00:00, 699.78 examples/s]


In [17]:
import numpy as np
from sklearn.metrics import precision_score

def compute_metrics(eval_pred):
    """Compute the precision of class 1 for a Trainer evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        ``{"precision_class_1": <float>}``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # zero_division=0: when nothing is predicted as class 1 the precision is
    # undefined; report 0.0 without emitting a warning on every evaluation.
    precision_c1 = precision_score(
        labels,
        predicted_classes,
        pos_label=1,
        average='binary',
        zero_division=0,
    )

    return {"precision_class_1": precision_c1}

In [None]:
from transformers import TrainingArguments, Trainer

# Training run that names class-1 precision as its model-selection metric.
training_args = TrainingArguments(
    output_dir="./artifacts",
    # Optimisation settings.
    learning_rate=2e-4,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    # Checkpointing: save every 50 steps, keep at most 3 checkpoints.
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    # NOTE(review): no evaluation dataset or evaluation schedule is set in
    # this cell, so this metric is presumably never computed — confirm.
    metric_for_best_model="precision_class_1",
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    compute_metrics=compute_metrics,
)

# Train without resuming from any checkpoint.
trainer.train(resume_from_checkpoint=False)

# Persist tokenizer and model side by side in the same directory.
tokenizer.save_pretrained("./trained_model_ex3")
trainer.save_model("./trained_model_ex3")

Step,Training Loss


In [None]:
# Abandoned experiment ("super bad" results): plain Trainer with unweighted loss and lr=2e-4.
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score
import torch

def compute_metrics(eval_pred):
    """Compute the F1 score of class 1 for a Trainer evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        ``{"f1_score_class_1": <float>}``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # zero_division=0: if class 1 is never predicted the score is undefined;
    # report 0.0 without emitting a warning on every evaluation.
    f1_score_class_1 = f1_score(
        labels,
        predicted_classes,
        pos_label=1,
        average='binary',
        zero_division=0,
    )

    return {"f1_score_class_1": f1_score_class_1}

# Training run that keeps the checkpoint with the best class-1 F1.
training_args = TrainingArguments(
    output_dir="./artifacts",
    # Optimisation settings.
    learning_rate=2e-4,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    # Evaluate and checkpoint on the same 100-step schedule.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    # Best-checkpoint selection: higher F1 on class 1 wins.
    load_best_model_at_end=True,
    metric_for_best_model="f1_score_class_1",
    greater_is_better=True,
)

# NOTE(review): tokenized_eval_dataset is derived from test_data, so the
# checkpoint selection above is scored on the test split.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

# Train without resuming from any checkpoint.
trainer.train(resume_from_checkpoint=False)

# Persist tokenizer and model side by side in the same directory.
tokenizer.save_pretrained("./trained_model_ex3")
trainer.save_model("./trained_model_ex3")



Step,Training Loss,Validation Loss,Precision Class 1
100,No log,0.336579,0.0
200,No log,0.336391,0.0




In [13]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score
import torch
from torch import nn

# --- 1. Define the Weighted Trainer ---
class WeightedTrainer(Trainer):
    """Trainer whose loss can weight the classes unequally.

    Args:
        class_weights: optional sequence of per-class weights. When given,
            it is stored as a float tensor on ``self.args.device`` and used
            as the ``weight`` of a cross-entropy loss. When omitted, the
            loss reported by the model itself is used.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = None
        if class_weights is not None:
            weight_tensor = torch.tensor(class_weights, dtype=torch.float)
            # Keep the weights on the same device the training arguments select.
            self.class_weights = weight_tensor.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        """Return the (optionally class-weighted) loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch mapping passed to the model as keyword arguments;
                its "labels" entry supplies the targets.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted for signature compatibility; not
                used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        if self.class_weights is None:
            # No weights configured: use the loss computed by the model.
            loss = outputs.loss if isinstance(outputs, dict) else outputs[0]
        else:
            n_classes = self.model.config.num_labels
            criterion = nn.CrossEntropyLoss(weight=self.class_weights)
            loss = criterion(logits.view(-1, n_classes), targets.view(-1))

        if return_outputs:
            return loss, outputs
        return loss

# --- 2. Metrics (argmax decision; no probability threshold is applied) ---
def compute_metrics(eval_pred):
    """Compute the F1 score of class 1 for a Trainer evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        ``{"f1_score_class_1": <float>}``.
    """
    logits, labels = eval_pred
    # Pick the highest-scoring class per example.
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0: if class 1 is never predicted the score is undefined;
    # report 0.0 without emitting a warning on every evaluation.
    f1 = f1_score(labels, predictions, pos_label=1, average='binary', zero_division=0)
    return {"f1_score_class_1": f1}

# --- 3. Configuration ---
import os

from transformers.trainer_utils import get_last_checkpoint

# The classes are imbalanced at roughly 9:1, so class 1 is weighted 9x
# relative to class 0 in the loss.
class_weights = [1.0, 9.0]

training_args = TrainingArguments(
    output_dir="./artifacts",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same 50-step schedule.
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    # Keep the checkpoint with the highest class-1 F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1_score_class_1",
    greater_is_better=True,
)

# --- 4. Execution ---
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
)

# Resume from the newest checkpoint in output_dir when one exists; otherwise
# start a fresh run. Passing True unconditionally fails on a clean output_dir.
# NOTE(review): "./artifacts" is also the output_dir of the other training
# cells, so a resumed checkpoint may come from a different configuration.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

# Save results
tokenizer.save_pretrained("./trained_model_ex3")
trainer.save_model("./trained_model_ex3")



Step,Training Loss,Validation Loss,F1 Score Class 1
250,No log,0.285482,0.817582
300,No log,0.313472,0.814815
350,No log,0.233095,0.803456
400,No log,0.21081,0.809322
450,No log,0.296697,0.811456




In [15]:
from transformers import Trainer
from sklearn.metrics import classification_report
import os

# Reload the saved 2-label classifier from disk for evaluation.
model = AutoModelForSequenceClassification.from_pretrained(
    "trained_model_ex3",
    num_labels=2,
)

def pretty_print_report_dict(report):
    """Print a classification-report dict as two bordered tables.

    The report is turned into a table with one row per entry, rounded to
    three decimals. Its last three rows are treated as summary rows and
    printed under "GLOBAL AVERAGES" without the support column; every row
    before them is printed under "CLASS PERFORMANCE".

    Args:
        report: mapping of row name to metrics, e.g. the ``output_dict=True``
            form of a classification report.

    Returns:
        None. The tables are written to standard output.
    """
    table = pd.DataFrame(report).transpose().round(3)

    per_class = table.iloc[:-3].copy()
    averages = table.iloc[-3:].drop(columns=['support'])

    # Shared layout for both tables.
    layout = dict(headers='keys', tablefmt='heavy_outline', numalign="center")

    print("CLASS PERFORMANCE")
    print(tabulate(per_class, **layout))
    print()
    print("GLOBAL AVERAGES")
    print(tabulate(averages, **layout))

# A Trainer with default arguments is enough for running predictions.
trainer = Trainer(model=model)

# First field of the predict() result: the model's per-example class scores.
predictions_procentages = trainer.predict(tokenized_eval_dataset).predictions
# Predicted class = index of the highest score.
predictions = predictions_procentages.argmax(axis=-1)

# Possible variant (not applied): softmax the scores and predict class 1
# whenever its probability reaches a chosen threshold (e.g. 0.4) instead of
# taking the argmax.

report_dict = classification_report(
    y_true=test_data["label"],
    y_pred=predictions,
    output_dict=True,
    zero_division=0,
)
pretty_print_report_dict(report_dict)



CLASS PERFORMANCE
┏━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃    ┃  precision  ┃  recall  ┃  f1-score  ┃  support  ┃
┣━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━┫
┃ 0  ┃    0.982    ┃  0.974   ┃   0.978    ┃   1871    ┃
┃ 1  ┃    0.791    ┃  0.845   ┃   0.818    ┃    220    ┃
┗━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┻━━━━━━━━━━━┛

GLOBAL AVERAGES
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┓
┃              ┃  precision  ┃  recall  ┃  f1-score  ┃
┣━━━━━━━━━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━━━┫
┃ accuracy     ┃    0.96     ┃   0.96   ┃    0.96    ┃
┃ macro avg    ┃    0.887    ┃   0.91   ┃   0.898    ┃
┃ weighted avg ┃    0.962    ┃   0.96   ┃   0.961    ┃
┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━━━┛
