[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ZGObhOKJCQhJJZFakc-v2ykj-hXm7K2o?usp=sharing)


# Fine-tuning RoBERTa with Masked Language Modeling on the ICIS Commodity Dataset using Hugging Face Transformers and Datasets


In [None]:
!pip install -U transformers datasets huggingface_hub tensorboard==2.18.0
!sudo apt-get install git-lfs --yes

In [25]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaForMaskedLM,
    RobertaTokenizerFast,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from huggingface_hub import HfFolder, notebook_login
from google.colab import userdata

In [26]:
# Authenticate this session with the Hugging Face Hub; renders an interactive login widget.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
# Base checkpoint to start from (a previously fine-tuned LM also works here).
model_id = "FacebookAI/roberta-base"

# Hub dataset that provides the training text.
dataset_id = "Netizine/icis"

# Hub model repo that receives the fine-tuned weights (same name as the dataset repo).
saved_model_id = "Netizine/icis"

In [28]:
# Fetch the "train" split of the Hub dataset named by `dataset_id`.
train_dataset = load_dataset(path=dataset_id, split="train")

In [30]:
# Init tokenizer & model from the base checkpoint
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)
# Prefer the GPU, but fall back to CPU so `.to(device)` does not crash on a
# runtime without CUDA.
device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model     = RobertaForMaskedLM.from_pretrained(model_id).to(device)

def tokenize_fn(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences are truncated to at most 512 tokens, and the tokenizer is asked
    to also return the special-tokens mask alongside its usual outputs.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=512,
        truncation=True,
        return_special_tokens_mask=True,
    )
    return encoded

# 3) Tokenize the corpus in batches across 8 worker processes, dropping the raw
#    "text" column, then shuffle the result with a fixed seed (42).
tokenized = (
    train_dataset
    .map(tokenize_fn, remove_columns=["text"], num_proc=8, batched=True)
    .shuffle(seed=42)
)

# 4) Trainer setup
# Collator for masked language modeling, configured with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)
training_args = TrainingArguments(
    output_dir="checkpoints",
    per_device_train_batch_size=32,
    gradient_checkpointing=True,
    # Mixed precision is only enabled when a CUDA device is actually present.
    fp16=torch.cuda.is_available(),
    num_train_epochs=3,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    dataloader_num_workers=4,
    logging_steps=100,
    save_steps=1000,
    # A checkpoint is written every 1000 steps; without a cap they all stay on
    # disk for the whole run. Keep only the two most recent ones.
    save_total_limit=2,
    report_to="none",
    push_to_hub=True,
    hub_model_id=saved_model_id,
    hub_private_repo=True,
    # Token is read from the Colab secret named HF_TOKEN.
    hub_token=userdata.get('HF_TOKEN'),
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
)

Map (num_proc=8):   0%|          | 0/1352658 [00:00<?, ? examples/s]

In [None]:
# Fine-tune the model
# Announce the start, then run the training loop configured on `trainer`;
# the training loss is logged every `logging_steps` (100) steps.
print("▶️ Starting training…")
trainer.train()

▶️ Starting training…


Step,Training Loss
100,2.2436
200,2.1533
300,2.0737
400,2.0444
500,2.0497
600,2.0547
700,2.0073
800,1.9991
900,1.9939
1000,1.9479


In [None]:
# Save the tokenizer into the Trainer's output directory so it sits next to the
# model files that `push_to_hub` uploads. Saving to `saved_model_id` instead
# would create a local "Netizine/icis" folder that is not part of that upload
# and that would shadow the Hub repo id when loading by name afterwards.
tokenizer.save_pretrained(training_args.output_dir)
# Create the model card
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub(commit_message="Final RoBERTa MLM on ICIS")

In [None]:
# Test our fine-tuned MLM with a fill-mask pipeline

from transformers import pipeline

# Load the model + tokenizer by their Hub repo id
fill_mask = pipeline(
    "fill-mask",
    model=saved_model_id,
    tokenizer=saved_model_id,
    # First GPU when available, otherwise CPU (-1)
    device=0 if torch.cuda.is_available() else -1,
)

# Craft a sentence with the tokenizer's own mask token. RoBERTa uses "<mask>",
# not BERT's "[MASK]"; the pipeline raises an error if the model's mask token
# is missing from the input.
mask_token = fill_mask.tokenizer.mask_token
test_sentence = (
    f"The glycerine market in Europe will continue to see {mask_token} demand "
    "next year, and imports will provide supply stability."
)

# Run the mask-filling
results = fill_mask(test_sentence)

# Display the returned predictions with their scores
for res in results:
    print(f"{res['sequence']}  (score: {res['score']:.4f})")