In [1]:
import torch
torch.cuda.empty_cache()
import os, inspect, ast
import numpy as np
import pandas as pd
from scipy.sparse import hstack

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    log_loss
)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

from evaluation import *
from data_utils import *

In [2]:
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
    BitsAndBytesConfig,
)
from datasets import Dataset as HFDataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

  from .autonotebook import tqdm as notebook_tqdm
2025-11-27 00:17:50.222298: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-27 00:17:50.222374: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-27 00:17:50.223455: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-27 00:17:50.230776: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load and prepare data
train_df, test_df, y, class_names = load_and_prepare_data()

# Get train/val/test splits for GEMMA
X_train_gemma, X_val_gemma, X_test_gemma, y_train_gemma, y_val_gemma = prepare_text_pipeline_gemma(
    train_df, test_df, y
)

# Ensure positional indexing
y_train_gemma = np.asarray(y_train_gemma, dtype=int)
y_val_gemma   = np.asarray(y_val_gemma, dtype=int)

In [4]:
MODEL_NAME = "google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
max_len = 512

def tokenize_examples(texts, labels=None):
    encodings = tokenizer(
        texts,
        truncation=True,
        max_length=max_len,
        padding=False
    )
    if labels is not None:
        encodings["labels"] = labels
    return encodings

train_enc = tokenize_examples(X_train_gemma, y_train_gemma)
val_enc   = tokenize_examples(X_val_gemma,   y_val_gemma)
test_enc  = tokenize_examples(X_test_gemma)

train_ds = HFDataset.from_dict(train_enc)
val_ds   = HFDataset.from_dict(val_enc)
test_ds  = HFDataset.from_dict(test_enc)

In [5]:
num_labels = len(class_names)
id2label = {i: lbl for i, lbl in enumerate(class_names)}
label2id = {lbl: i for i, lbl in enumerate(class_names)}

USE_8BIT = True

if USE_8BIT:
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
else:
    bnb_config = BitsAndBytesConfig(load_in_4bit=True)

# Load quantized model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="single_label_classification",
    quantization_config=bnb_config,
    device_map="cuda:0",
)

# Prepare for k-bit training + add LoRA adapters
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Prefer faster attention kernels when available.
try:
    model.config.attn_implementation = "flash_attention_2"
except Exception:
    try:
        model.config.attn_implementation = "sdpa"
    except Exception:
        pass

# Reduce or disable gradient checkpointing (saves recomputation, speeds up training)
USE_GRADIENT_CHECKPOINTING = False
if USE_GRADIENT_CHECKPOINTING:
    model.gradient_checkpointing_enable()

# Allow TF32 matmul (Ampere+ GPUs) for extra speed with negligible precision impact.
torch.backends.cuda.matmul.allow_tf32 = True
if hasattr(torch, "set_float32_matmul_precision"):
    torch.set_float32_matmul_precision("high")

set_seed(42)

Fetching 2 files: 100%|██████████| 2/2 [00:06<00:00,  3.49s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.88s/it]
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,604,352 || all params: 2,615,953,152 || trainable%: 0.0613


In [6]:
train_args = TrainingArguments(
    output_dir="runs/gemma_cls",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    bf16=True,
    tf32=True,

    # Data loading: use workers and keep padding to multiples of 8
    dataloader_num_workers=4,        # increase if storage is fast enough
    
    report_to="none",
    logging_steps=50,
)

data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [7]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork i

Epoch,Training Loss,Validation Loss
1,2.0709,2.019975
2,1.9076,1.999136


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=11496, training_loss=2.100958713327088, metrics={'train_runtime': 11699.8412, 'train_samples_per_second': 7.86, 'train_steps_per_second': 0.983, 'total_flos': 5.722950588748923e+17, 'train_loss': 2.100958713327088, 'epoch': 2.0})

In [8]:
# Predictions (Gemma)
with torch.no_grad():
    pred_logits = trainer.predict(val_ds).predictions
y_proba_val_gemma = torch.softmax(torch.tensor(pred_logits), dim=1).cpu().numpy()
y_pred_val_gemma  = y_proba_val_gemma.argmax(axis=1)

# Test predictions + submission
with torch.no_grad():
    test_logits = trainer.predict(test_ds).predictions
y_proba_test_gemma = torch.softmax(torch.tensor(test_logits), dim=1).cpu().numpy()
y_pred_test_gemma  = y_proba_test_gemma.argmax(axis=1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [9]:
print("\n================ GEMMA EVALUATION ================\n")
# Metrics
_ = eval_metrics(y_val_gemma, y_pred_val_gemma)
eval_classification_report(y_val_gemma, y_pred_val_gemma, class_names)

# ROC-AUC
_ = eval_roc_auc(y_val_gemma, y_proba_val_gemma)

# Log-loss
_ = eval_log_loss(y_val_gemma, y_proba_val_gemma)
_ = eval_log_loss_per_class(y_val_gemma, y_proba_val_gemma)



*** GLOBAL METRICS ***
Accuracy (Global)      : 0.5007
Precision (Macro Avg)  : 0.5008
Recall (Macro Avg)     : 0.4952
F1-Score (Macro Avg)   : 0.4918

*** PER-CLASS EVALUATION ***
Class                Precision    Recall  F1-Score   Support
------------------------------------------------------------
winner_model_a            0.50      0.60      0.54      4013
winner_model_b            0.51      0.53      0.52      3931
winner_tie                0.50      0.35      0.41      3552
------------------------------------------------------------
Macro Avg                 0.50      0.50      0.49     34488
Weighted Avg              0.50      0.50      0.49     34488

*** ROC-AUC EVALUATION ***
ROC-AUC (OvR) : 0.6806

*** LOG-LOSS EVALUATION ***
Log-loss      : 0.9996

*** LOG-LOSS PER CLASS ***
Class 0: 0.9417  (n=4013)
Class 1: 0.9356  (n=3931)
Class 2: 1.1357  (n=3552)


In [10]:
# Confusion Matrix + Plot
cm_gemma = eval_confusion_matrix(y_val_gemma, y_pred_val_gemma, n_classes=y_proba_val_gemma.shape[1])
plot_confusion_matrix(cm_gemma, class_names, title="Confusion Matrix — Gemma", save_path="results/confusion_matrix/confusion_matrix_gemma.png")


Confusion Matrix (rows=true, cols=pred):
 [[2427  960  626]
 [1222 2082  627]
 [1245 1060 1247]]
Saved plot to: images/confusion_matrix/confusion_matrix_gemma.png


In [11]:
# ROC Curves
plot_roc_curves(y_val_gemma, y_proba_val_gemma, class_names, title_prefix="Gemma ROC", save_path="results/roc/roc_gemma.png")

Saved plot to: images/roc/roc_gemma.png


In [12]:
save_roc_to_csv(y_val_gemma, y_proba_val_gemma, "Gemma", fold_idx=1)

Saved ROC data for class 0 (AUC=0.6973) → results/roc/Gemma_fold1_class0.csv
Saved ROC data for class 1 (AUC=0.6904) → results/roc/Gemma_fold1_class1.csv
Saved ROC data for class 2 (AUC=0.6539) → results/roc/Gemma_fold1_class2.csv


In [13]:
submission_lr = build_submission(
    test_df=test_df,
    y_pred_test=y_pred_test_gemma,
    y_proba_test=y_proba_test_gemma,
    model_name="gemma"
)


Saved: results/submission/submission_gemma.csv
