# The Leaky Train Method
### This notebook is for reference only and does NOT the correspond to the final submission in the Kaggle Competition!
### To run this notebook, move this to the master directory of the project, otherwise it may NOT run correctly!

In [2]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
base_model = 'roberta-base'

dataset = load_dataset('ag_news', split='train')
dataset = dataset.add_column("index", list(range(len(dataset))))
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [4]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [5]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label) 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

## Put weights on training samples that are close to test samples w.r.t. t-SNE

In [17]:
import numpy as np
sample_weights = np.ones(len(train_dataset))  # default weight = 1.0

# Increase weight for close samples
boosted_weight = 5
train_indices_array = np.array(train_dataset["index"])

# Get boolean mask for which rows should be boosted
K = 3
neighbor_indices = np.load("neighbor_indices.npy")
close_train_indices = np.unique(neighbor_indices[:,0:K].flatten())
boost_mask = np.isin(train_indices_array, close_train_indices)
sample_weights[boost_mask] = boosted_weight
sample_weights = sample_weights/sample_weights.mean()

# Add to dataset
train_dataset = train_dataset.add_column("weight", sample_weights.tolist())


In [19]:
# PEFT Config
peft_config = LoraConfig(
    r=2,
    lora_alpha=4,
    lora_dropout=0.05,
    bias = 'none',
    target_modules = ['query'],
    task_type="SEQ_CLS",
)

In [20]:
peft_model = get_peft_model(model, peft_config)
# peft_model

In [21]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 630,532 || all params: 125,279,240 || trainable%: 0.5033


## Training Setup

In [22]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [25]:
from transformers import EarlyStoppingCallback
# Setup Training args
output_dir = "results6"
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    eval_steps=1000,
    save_strategy='steps',
    save_steps=1000,
    logging_steps=1000,
    learning_rate=1e-4,
    num_train_epochs=5,
    use_cpu=False,
    dataloader_num_workers=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    optim="adamw_torch",
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={'use_reentrant':True},
    load_best_model_at_end=True
)

def get_trainer(model):
      return  Trainer(
          model=model,
          args=training_args,
          compute_metrics=compute_metrics,
          train_dataset=train_dataset,
          eval_dataset=eval_dataset,
          data_collator=data_collator,
        #   callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
      )

# Define Trainer with Weighted Loss 

In [27]:
from transformers import Trainer
import torch.nn.functional as F

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        weights = inputs.pop("weight", None)  # shape = (batch_size,)
        
        outputs = model(**inputs)
        logits = outputs.logits

        # Cross entropy loss per sample
        loss = F.cross_entropy(logits, labels, reduction="none")  # shape = (batch_size,)

        if weights is not None:
            weights = weights.to(loss.device)
            loss = (loss * weights).mean()
        else:
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss


### Start Training

In [28]:
weighted_trainer =  WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

result = weighted_trainer.train()

Step,Training Loss,Validation Loss,Accuracy
1000,0.5338,0.312408,0.898438
2000,0.3151,0.292992,0.909375
3000,0.2791,0.294866,0.910937
4000,0.2676,0.277367,0.9125
5000,0.2649,0.270529,0.904687
6000,0.2443,0.266764,0.917188
7000,0.2498,0.26611,0.920312
8000,0.2479,0.252327,0.9125
9000,0.2385,0.252246,0.914062
10000,0.2256,0.250519,0.920312


## Evaluate Finetuned Model


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [None]:
def classify(model, tokenizer, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    output = model(**inputs)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [None]:
print(classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ..."))
print(classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."))


 Class: 1, Label: Sports, Text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...
Sports

 Class: 2, Label: Business, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlinand of ultra-cynics, are seeing green again.
Business


### Run Inference on eval_dataset

In [31]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        model_keys = {'input_ids', 'attention_mask', 'labels'}
        batch = {k: v.to(device) for k, v in batch.items() if k in model_keys}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [32]:
# Check evaluation accuracy
tempacc, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

100%|██████████| 80/80 [00:03<00:00, 26.13it/s]


Evaluation Metric: {'accuracy': 0.925}


### Run Inference on unlabelled dataset

In [4]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# LOAD trained PEFT model instead
from peft import PeftConfig, PeftModel
peft_model_dir = "results4/checkpoint-19000"
peft_cfg_load = PeftConfig.from_pretrained(peft_model_dir)
peft_model = PeftModel.from_pretrained(model, peft_model_dir)
peft_model.to('cuda')
peft_model.eval();

In [33]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map: 100%|██████████| 8000/8000 [00:02<00:00, 2696.32 examples/s]


Dataset({
    features: ['text'],
    num_rows: 8000
})

In [None]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
output_dir = 'results6'
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 1000/1000 [00:29<00:00, 33.71it/s]

Inference complete. Predictions saved to inference_output.csv



