[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ZGObhOKJCQhJJZFakc-v2ykj-hXm7K2o?usp=sharing)


# Fine-tuning RoBERTa for Commodity Classification with Hugging Face Transformers and ICIS Datasets Library


In [1]:
!pip install -U transformers datasets huggingface_hub tensorboard==2.18.0
!sudo apt-get install git-lfs --yes

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [4]:
import os, multiprocessing
import torch
from datasets import load_dataset
from transformers import (
    RobertaForMaskedLM,
    RobertaTokenizerFast,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from huggingface_hub import HfFolder, notebook_login
from google.colab import userdata, drive

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# IDs & hardware params
model_id   = "FacebookAI/roberta-base"
dataset_id    = "Netizine/icis"
repo_id       = "Netizine/icis"     # RE-CREATE THIS REPO EMPTY BEFORE YOU RUN
output_dir    = "output/icis"
# Mount Drive for persistent storage
drive.mount("/content/drive")
# Dynamic CPU counts
num_cpus    = multiprocessing.cpu_count()
num_proc    = max(1, num_cpus - 2)
num_workers = max(1, num_cpus // 2)

Mounted at /content/drive


In [7]:
# Load dataset
train_ds = load_dataset(dataset_id, split="train")

corpus.txt:   0%|          | 0.00/174M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1352658 [00:00<?, ? examples/s]

In [8]:
# Init tokenizer & model
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)
device    = torch.device("cuda")
model     = RobertaForMaskedLM.from_pretrained(model_id).to(device)

def tokenize_fn(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=256,     # or compute your 90th pct length
        return_special_tokens_mask=True
    )

# Tokenize
tokenized = train_ds.map(
    tokenize_fn,
    batched=True,
    num_proc=num_proc,
    remove_columns=["text"]
).shuffle(seed=42)

# Trainer setup
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/icis-checkpoints",
    per_device_train_batch_size=32,
    gradient_checkpointing=True,
    fp16=True,
    num_train_epochs=3,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,

    logging_strategy="steps",
    logging_steps=10000,

    save_strategy="steps",
    save_steps=25000,
    save_total_limit=5,

    push_to_hub=True,
    hub_strategy="end",
    hub_model_id=repo_id,
    hub_token=os.getenv("HF_TOKEN"),

    dataloader_num_workers=num_workers,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Map (num_proc=10):   0%|          | 0/1352658 [00:00<?, ? examples/s]

In [10]:
# Fine-tune the model (auto-resume if you re-run with: resume_from_checkpoint=True)
print("▶️ Starting training…")
trainer.train()

# Save our tokenizer and create model card
tokenizer.save_pretrained(repo_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub(commit_message="Pushing final RoBERTa MLM")

# Fine-tune the model
print("✅ Done! The fine-tuned model is saved on Hugging Face Hub.")

▶️ Starting training…


Step,Training Loss
5000,1.8562
10000,1.6583
15000,1.5819
20000,1.5248


Step,Training Loss
5000,1.8562
10000,1.6583
15000,1.5819
20000,1.5248
25000,1.4892
30000,1.4514
35000,1.422
40000,1.3991
45000,1.3771
50000,1.3512


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

✅ Done! The fine-tuned model is saved on Hugging Face Hub.


In [14]:
# Clear the storage
!ls "/content/drive/MyDrive"


# Replace `icis-checkpoints` with whatever folder you used
!rm -rf "/content/drive/MyDrive/icis-checkpoints"

!ls "/content/drive/MyDrive"


'Bridge Buildings Individuals Survey.gform'   Outlook
'Colab Notebooks'			     'SCM Coding Tests v1009.gdoc'
'Fill With LinkedIn contacts.gsheet'	     'Untitled spreadsheet.gsheet'
'Getting started.pdf'
'Bridge Buildings Individuals Survey.gform'   Outlook
'Colab Notebooks'			     'SCM Coding Tests v1009.gdoc'
'Fill With LinkedIn contacts.gsheet'	     'Untitled spreadsheet.gsheet'
'Getting started.pdf'


In [16]:
# Test our fine-tuned MLM with a fill-mask pipeline

from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=repo_id,
    tokenizer=repo_id,
    device=0  # assumes a single GPU; remove for CPU
)

# Craft a sentence with the special mask token
test_sentence = (
    "The glycerine market in Europe will continue to see [MASK] demand "
    "next year, and imports will provide supply stability."
)

# Run the mask-filling
results = fill_mask(test_sentence)

# Display the top 5 predictions
for res in results:
    print(f"{res['sequence']}  (score: {res['score']:.4f})")

ValueError: Unrecognized model in Netizine/icis. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, bitnet, blenderbot, blenderbot-small, blip, blip-2, blip_2_qformer, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, csm, ctrl, cvt, d_fine, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v3, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, gemma3, gemma3_text, git, glm, glm4, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granite_speech, granitemoe, granitemoehybrid, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hgnet_v2, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, internvl, internvl_vision, jamba, janus, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llama4, llama4_text, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mistral3, mixtral, mlcd, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phi4_multimodal, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prompt_depth_anything, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_omni, qwen2_5_vl, qwen2_5_vl_text, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, qwen2_vl_text, qwen3, qwen3_moe, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, sam_hq, sam_hq_vision_model, sam_vision_model, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, shieldgemma2, siglip, siglip2, siglip_vision_model, smolvlm, smolvlm_vision, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesfm, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth