[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ZGObhOKJCQhJJZFakc-v2ykj-hXm7K2o?usp=sharing)


# Fine-tuning RoBERTa for Commodity Classification with Hugging Face Transformers and ICIS Datasets Library


In [2]:
# Install / upgrade the Hugging Face libraries imported by this notebook.
!pip install -U transformers datasets huggingface_hub
# Install git-lfs through apt; --yes answers the confirmation prompt automatically.
!sudo apt-get install git-lfs --yes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [3]:
import os, multiprocessing
import torch
from datasets import load_dataset
from transformers import (
    RobertaForMaskedLM,
    RobertaTokenizerFast,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from huggingface_hub import HfFolder, notebook_login
from google.colab import userdata, drive

In [3]:
# Log in to the Hugging Face Hub from inside the notebook (renders an interactive widget).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# ---- Hub identifiers and local paths ----
model_id = "FacebookAI/roberta-base"  # pretrained checkpoint to start from
dataset_id = "Netizine/icis"          # dataset to load
repo_id = "Netizine/icis"             # RE-CREATE THIS REPO EMPTY BEFORE YOU RUN
output_dir = "output/icis"

# ---- Persistent storage: mount Google Drive ----
drive.mount("/content/drive")

# ---- Parallelism derived from the number of CPUs on this machine ----
num_cpus = multiprocessing.cpu_count()
num_proc = max(num_cpus - 2, 1)       # always at least one process
num_workers = max(num_cpus // 2, 1)   # always at least one worker

Mounted at /content/drive


In [5]:
# Fetch the "train" split of the dataset identified by `dataset_id`
train_ds = load_dataset(path=dataset_id, split="train")

corpus.txt:   0%|          | 0.00/174M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1352658 [00:00<?, ? examples/s]

In [6]:
# Init tokenizer & model
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)
# Prefer the GPU when one is attached, otherwise fall back to the CPU so that
# `.to(device)` does not raise on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForMaskedLM.from_pretrained(model_id).to(device)

def tokenize_fn(examples):
    """Tokenize the "text" field of a batch with the module-level `tokenizer`.

    Args:
        examples: Mapping that holds the raw strings under the "text" key.

    Returns:
        The tokenizer's output for those strings, truncated to at most 256
        tokens and including the special-tokens mask.
    """
    encode_options = {
        "truncation": True,
        "max_length": 256,  # or compute your 90th pct length
        "return_special_tokens_mask": True,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize the whole split in batches across `num_proc` processes, dropping
# the raw "text" column from the result.
tokenized = train_ds.map(
    tokenize_fn,
    remove_columns=["text"],
    num_proc=num_proc,
    batched=True,
)
# Shuffle with a fixed seed.
tokenized = tokenized.shuffle(seed=42)

# Trainer setup
# The collator applies masked-language-model masking with a 15% probability.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)
training_args = TrainingArguments(
    # Checkpoints are written to the mounted Drive folder.
    output_dir="/content/drive/MyDrive/icis-checkpoints",
    per_device_train_batch_size=32,
    gradient_checkpointing=True,
    # Only request fp16 mixed precision when CUDA is available; asking for it
    # on a CPU-only runtime is rejected.
    fp16=torch.cuda.is_available(),
    num_train_epochs=3,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,

    # Log the training loss every 10,000 steps.
    logging_strategy="steps",
    logging_steps=10000,

    # Checkpoint every 25,000 steps, keeping at most 5 checkpoints.
    save_strategy="steps",
    save_steps=25000,
    save_total_limit=5,

    # Hub upload settings; the token is read from the HF_TOKEN environment
    # variable (None when it is unset).
    push_to_hub=True,
    hub_strategy="end",
    hub_model_id=repo_id,
    hub_token=os.getenv("HF_TOKEN"),

    dataloader_num_workers=num_workers,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Map (num_proc=10):   0%|          | 0/1352658 [00:00<?, ? examples/s]

In [7]:
# Fine-tune the model (auto-resume if you re-run with: resume_from_checkpoint=True)
print("▶️ Starting training…")
trainer.train()

# Save the tokenizer into the Trainer's output directory, next to the model
# files, so it is part of the folder that gets pushed. Saving it under the
# repo name instead would create a local directory called like the Hub repo,
# which is not uploaded and can shadow the Hub ID in later `from_pretrained`
# / `pipeline` calls made from the same working directory.
tokenizer.save_pretrained(trainer.args.output_dir)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub(commit_message="Pushing final RoBERTa MLM")

print("✅ Done! The fine-tuned model is saved on Hugging Face Hub.")

▶️ Starting training…


Step,Training Loss
10000,1.7566
20000,1.5527
30000,1.4705
40000,1.409
50000,1.3627
60000,1.3217
70000,1.2846
80000,1.2547
90000,1.2245
100000,1.1955


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

✅ Done! The fine-tuned model is saved on Hugging Face Hub.


In [1]:
# Clear the storage
# NOTE: these commands expect Google Drive to be mounted at /content/drive;
# list the Drive root first to see what is there.
!ls "/content/drive/MyDrive"


# Replace `icis-checkpoints` with whatever folder you used
# WARNING: `rm -rf` deletes the folder and everything inside it without asking.
!rm -rf "/content/drive/MyDrive/icis-checkpoints"

# List the Drive root again to check the result.
!ls "/content/drive/MyDrive"


ls: cannot access '/content/drive/MyDrive': No such file or directory
ls: cannot access '/content/drive/MyDrive': No such file or directory


In [9]:
# Test our fine-tuned MLM with a fill-mask pipeline

# Imported here so this cell can run on its own in a fresh kernel.
import torch
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="Netizine/icis",
    tokenizer="Netizine/icis",
    # First GPU when CUDA is available, otherwise -1 (CPU) so the cell also
    # runs on a runtime without a GPU.
    device=0 if torch.cuda.is_available() else -1,
)

# Craft a sentence with the special mask token
test_sentence = (
    "The glycerine market in Europe will continue to see weak demand "
    "next year, and imports will provide supply stability. Benzene <mask> should see good demand."
)

# Run the mask-filling, asking explicitly for the 5 best candidates
results = fill_mask(test_sentence, top_k=5)

# Display the top 5 predictions
for res in results:
    print(f"{res['sequence']}  (score: {res['score']:.4f})")

Device set to use cuda:0


The glycerine market in Europe will continue to see weak demand next year, and imports will provide supply stability. Benzene derivatives should see good demand.  (score: 0.3432)
The glycerine market in Europe will continue to see weak demand next year, and imports will provide supply stability. Benzene markets should see good demand.  (score: 0.0679)
The glycerine market in Europe will continue to see weak demand next year, and imports will provide supply stability. Benzene producers should see good demand.  (score: 0.0633)
The glycerine market in Europe will continue to see weak demand next year, and imports will provide supply stability. Benzene market should see good demand.  (score: 0.0483)
The glycerine market in Europe will continue to see weak demand next year, and imports will provide supply stability. Benzene derivative should see good demand.  (score: 0.0328)
