In [21]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [22]:
# -----------------------------
# 1. Import libraries
# -----------------------------
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType, AdaLoraConfig
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import pickle

# -----------------------------
# 2. Use GPU if available
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# -----------------------------
# 3. Load and preprocess AGNEWS dataset
# -----------------------------
dataset = load_dataset("ag_news")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

Using device: cuda


In [23]:
# Check the number of records in the train and test dataset
print(f"Train Samples: {tokenized_dataset['train'].shape[0]}")
print(f"Test Samples: {tokenized_dataset['test'].shape[0]}")

# Print out the first sample
print("\nFirst Sample:")
print("-------------")
for feature in ['text', 'labels']: #, 'input_ids', 'attention_mask']:
    print(f"{feature}: {tokenized_dataset['train'][feature][0]}")

# tokenized_dataset['train']['labels'].unique()

Train Samples: 120000
Test Samples: 7600

First Sample:
-------------
text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
labels: 2


model = AutoModelForSequenceClassification.from_pretrained("roberta-base")

for name, module in model.named_modules():
    if any(k in name for k in ["query", "key", "value", "dense", "proj"]):
        print(name)

In [24]:
# -----------------------------
# 4. Load RoBERTa model with LoRA adapters
# -----------------------------
# model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4)

'''
# Lora with variable r
# Define per-layer LoRA configs
layer_configs = {
    (0, 5): 1,
    (6, 9): 10,
    (10, 11): 14,
}

# Apply LoRA layer by layer
for (start, end), r in layer_configs.items():
    lora_config = LoraConfig(
        r=r,
        lora_alpha=r * 3,  # alpha proportional to r
        target_modules=["query", "key", "value", "dense"],
        layers_to_transform=list(range(start, end + 1)),
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.SEQ_CLS,
    )
'''

'''
# Regular LoRa
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    # lora_type="LoHA",  # or "AdaLoRA"
    target_modules=["query", "value", "key", "dense"],
    # target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
    layers_to_transform=list(range(4, 12)),  
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)
'''


# # AdaLoRA 

# train_dataset = tokenized_dataset['train']

# total_steps = (len(train_dataset) // training_args.per_device_train_batch_size) * training_args.num_train_epochs
# total_steps = total_steps // training_args.gradient_accumulation_steps

# ada_config = AdaLoraConfig(
#     init_r=12,              # initial rank
#     target_r=4,             # final rank after adaptation
#     tinit=200,              # warmup steps before adaptation starts
#     tfinal=1000,            # total steps to decay rank
#     total_step=total_steps,  # ✅ Required!
#     deltaT=10,              # update rank every deltaT steps
#     beta1=0.85,             # regularization hyperparameters
#     beta2=0.95,
#     lora_alpha=32,
#     lora_dropout=0.1,
#     target_modules=["query", "key", "value"],
#     bias="none",
#     task_type=TaskType.SEQ_CLS,
#     layers_to_transform = list(range(6, 12))  # or even [9, 10, 11]
# )


# # model = get_peft_model(model, lora_config)
# model = get_peft_model(model, ada_config)
# model.to(device)
# model.print_trainable_parameters()

'\n# Regular LoRa\nlora_config = LoraConfig(\n    r=8,\n    lora_alpha=32,\n    # lora_type="LoHA",  # or "AdaLoRA"\n    target_modules=["query", "value", "key", "dense"],\n    # target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],\n    layers_to_transform=list(range(4, 12)),  \n    lora_dropout=0.1,\n    bias="none",\n    task_type=TaskType.SEQ_CLS\n)\n'

In [25]:
# -----------------------------
# 5. Define training arguments
# -----------------------------
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    # save_total_limit=3,                 # 👈 (Optional) Keep only the last 3 saved models
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=8,
    weight_decay=0.05,
    # logging_dir="./logs",
    report_to="none",
    fp16 = True,
    seed = 42,  
    # gradient_accumulation_steps=2,
    load_best_model_at_end=True,        # 👈 (Optional) Load best checkpoint automatically
    metric_for_best_model="accuracy",    # 👈 Must be paired with `load_best_model_at_end`
    # logging_steps=50,
    # max_grad_norm=1.0
)


model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4)


# AdaLoRA 

train_dataset = tokenized_dataset['train']

total_steps = (len(train_dataset) // training_args.per_device_train_batch_size) * training_args.num_train_epochs
total_steps = total_steps // training_args.gradient_accumulation_steps

ada_config = AdaLoraConfig(
    init_r=12,              # initial rank
    target_r=4,             # final rank after adaptation
    tinit=200,              # warmup steps before adaptation starts
    tfinal=1000,            # total steps to decay rank
    total_step=total_steps,  # ✅ Required!
    deltaT=10,              # update rank every deltaT steps
    beta1=0.85,             # regularization hyperparameters
    beta2=0.95,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "key", "value"],
    bias="none",
    task_type=TaskType.SEQ_CLS,
    layers_to_transform = list(range(6, 12))  # or even [9, 10, 11]
)

model = get_peft_model(model, ada_config)
model.to(device)
model.print_trainable_parameters()

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 925,660 || all params: 125,574,386 || trainable%: 0.7371


In [26]:
# -----------------------------
# 6. Train the model
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2819,0.2629,0.912763
2,0.2474,0.225329,0.923026
3,0.2362,0.216248,0.927763
4,0.2228,0.20766,0.929605
5,0.2118,0.20562,0.932105
6,0.2008,0.20353,0.933421
7,0.212,0.201846,0.934211
8,0.2005,0.201792,0.933026


TrainOutput(global_step=30000, training_loss=0.32535063044230145, metrics={'train_runtime': 2089.6016, 'train_samples_per_second': 459.418, 'train_steps_per_second': 14.357, 'total_flos': 1.2766054219776e+17, 'train_loss': 0.32535063044230145, 'epoch': 8.0})

In [27]:
# -----------------------------
# 7. Evaluate the model
# -----------------------------
eval_results = trainer.evaluate()
print("Final Evaluation Accuracy:", eval_results["eval_accuracy"])

Final Evaluation Accuracy: 0.9342105263157895


In [28]:
# -----------------------------
# 8. Check trainable parameter count
# -----------------------------
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params}")

Trainable parameters: 925660


In [29]:
# Save the model and tokenizer
model.save_pretrained("./final_model")  # Saves LoRA adapters + config
tokenizer.save_pretrained("./final_model")

('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/vocab.json',
 './final_model/merges.txt',
 './final_model/added_tokens.json',
 './final_model/tokenizer.json')

In [30]:
from datasets import Dataset
from torch.utils.data import DataLoader

# Load dataset object
with open("./kaggle/input/test_unlabelled.pkl", "rb") as f:
    test_dataset = pickle.load(f)

# Convert to HuggingFace Dataset (already is, but this helps formatting)
test_dataset = Dataset.from_dict({"text": test_dataset["text"]})

# Tokenize function
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# Apply tokenizer
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Create PyTorch DataLoader for batching
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=64)

# Prediction loop
model.eval()
all_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_predictions.extend(preds.cpu().numpy())


Map: 100%|██████████| 8000/8000 [00:01<00:00, 7628.59 examples/s]


In [31]:
# -----------------------------
# 10. Save predictions to CSV
# -----------------------------
df = pd.DataFrame({
    "ID": list(range(len(all_predictions))),   # ID ✅
    "label": all_predictions
})
df.to_csv("submission.csv", index=False)
print("✅ Batched predictions complete. Saved to submission.csv.")

✅ Batched predictions complete. Saved to submission.csv.
