In [5]:
!pip install transformers datasets evaluate accelerate peft bitsandbytes nvidia-ml-py3 torchinfo -q

In [6]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from torchinfo import summary
from datetime import datetime
import matplotlib.pyplot as plt
from random import sample

In [4]:
# Load tokenizer and dataset
base_model = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(base_model)
dataset = load_dataset('ag_news', split='train')

def preprocess(examples):
    return tokenizer(examples['text'], truncation=True, padding=True)

tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

In [7]:
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
id2label = {i: label for i, label in enumerate(class_names)}
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")
print(f"the id-label mapping: {id2label}")

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']
the id-label mapping: {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}


In [8]:
# Train/Validation split
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
np.random.seed(42)
indices = np.random.permutation(len(split_datasets['train']))[:60000]
train_dataset = split_datasets['train'].select(indices)
eval_dataset = split_datasets['test']
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [9]:
print(len(train_dataset), len(eval_dataset))

60000 640


In [10]:
sample_indices = sample(range(len(train_dataset)), 5)

# Extract and decode for display
rows = []
for idx in sample_indices:
    input_ids = train_dataset[idx]['input_ids']
    label = train_dataset[idx]['labels']
    text = tokenizer.decode(input_ids, skip_special_tokens=True)
    rows.append({"ID": idx, "Label": label, "Text": text[:200] + "..."})

# Create DataFrame and print
df = pd.DataFrame(rows)
print(df.to_markdown(index=False))

|    ID |   Label | Text                                                                                                                                                                                                        |
|------:|--------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 19428 |       1 | NASCAR driver T. Labonte to drive limited schedule in Nextel Cup &lt;b&gt;...&lt;/b&gt; Labonte, a native of Corpus Christi, Texas, announced Tuesday that he will compete in at least 10 races with Hen... |
| 42338 |       2 | Bank of America quiet regarding local layoffs While they have been quick to say how much customers will enjoy doing business with Bank of America now that it has taken over Fleet branches, bank offici... |
| 11022 |       1 | Pilkadaris Takes Shanghai Open Lead (AP) AP - Terry Pilkadaris of Australia 

In [11]:
# Load model
model = RobertaForSequenceClassification.from_pretrained(base_model, num_labels=num_labels, id2label=id2label)

peft_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,
    bias='none',
    target_modules=['query', 'key', 'value', 'dense'],
    task_type="SEQ_CLS"
)

peft_model = get_peft_model(model, peft_config)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Count trainable parameters
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
peft_model.print_trainable_parameters()
# assert trainable_params <= 1_000_000, "Too many trainable parameters! Must be <= 1 million."

trainable params: 1,263,364 || all params: 125,918,216 || trainable%: 1.0033


In [13]:
# Training arguments
training_args = TrainingArguments(
    output_dir="results",
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    use_cpu=False,
    logging_steps=100,
    learning_rate=2e-5,
    dataloader_num_workers=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=None,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)



In [14]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

In [15]:
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Train
train_result = trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33multimatekevin1[0m ([33multimatekevin1-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Accuracy
200,1.1074,0.534155,0.85625
400,0.3223,0.326256,0.898438
600,0.2995,0.322789,0.9




In [1]:
training_history = trainer.state.log_history
steps, losses, eval_accs = [], [], []
for entry in training_history:
    if 'loss' in entry:
        steps.append(entry['step'])
        losses.append(entry['loss'])
    if 'eval_accuracy' in entry:
        eval_accs.append((entry['step'], entry['eval_accuracy']))

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(steps, losses, label='Training Loss')
plt.xlabel("Step")
plt.ylabel("Loss")
plt.title("Training Loss Over Steps")
plt.grid(True)
plt.legend()

plt.subplot(1, 2, 2)
if eval_accs:
    eval_steps, eval_values = zip(*eval_accs)
    plt.plot(eval_steps, eval_values, label='Validation Accuracy', color='orange')
    plt.xlabel("Step")
    plt.ylabel("Accuracy")
    plt.title("Validation Accuracy Over Steps")
    plt.grid(True)
    plt.legend()

plt.tight_layout()
plt.show()

NameError: name 'trainer' is not defined

In [None]:
# Evaluate
eval_results = trainer.evaluate()
print("\nEvaluation Results:", eval_results)




Evaluation Results: {'eval_loss': 0.3092099726200104, 'eval_accuracy': 0.90625, 'eval_runtime': 12.3008, 'eval_samples_per_second': 52.029, 'eval_steps_per_second': 0.813, 'epoch': 0.8957133717210493}


In [None]:
def classify(model, tokenizer, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    output = model(**inputs)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [None]:
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")


 Class: 1, Label: Sports, Text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...

 Class: 2, Label: Business, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlingand of ultra-cynics, are seeing green again.


'Business'

In [None]:
# Inference on test set
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [None]:
def get_predictions(model, dataset):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, collate_fn=data_collator)

    all_preds = []
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        preds = outputs.logits.argmax(dim=-1)
        all_preds.extend(preds.cpu().tolist())

    return all_preds

In [None]:
time = datetime.now().strftime("%d%m%Y%H%M")
predictions = get_predictions(peft_model, test_dataset)
df_output = pd.DataFrame({"ID": range(len(predictions)), "Label": predictions})
df_output.to_csv(f"inference_output_{time}.csv", index=False)
print(f"Predictions saved to inference_output_{time}.csv")

Predictions saved to inference_output_090420250021.csv
