[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ZGObhOKJCQhJJZFakc-v2ykj-hXm7K2o?usp=sharing)


# Fine-tuning RoBERTa for Commodity Classification on the ICIS Dataset with Hugging Face Transformers and Datasets


In [None]:
!pip install -U transformers datasets huggingface_hub tensorboard==2.18.0
!sudo apt-get install git-lfs --yes

In [5]:
import torch
from datasets import load_dataset
from transformers import (
    AutoConfig,
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
)
from huggingface_hub import HfFolder, notebook_login

In [6]:
# Log in to the Hugging Face Hub; this renders an interactive login widget in
# the notebook. The training configuration later in this notebook pushes to the
# Hub (push_to_hub=True), so this step has to run first.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
# Identifiers shared by the rest of the notebook:
#   model_id   - checkpoint the tokenizer, config and model are loaded from
#   dataset_id - dataset passed to load_dataset()
#   repo_id    - used as the local output directory, the log directory prefix,
#                the Hub model repo to push to, and the model loaded by the
#                inference pipeline at the end
model_id   = "facebook/roberta-base"   # base checkpoint; swap in your own fine-tuned LM if you prefer
dataset_id = "Netizine/icis"           # dataset repo on the Hub
repo_id    = "Netizine/icis"           # target model repo on the Hub (also the local output dir)

In [None]:
# Load the dataset from the Hub
dataset = load_dataset(dataset_id)
train_dataset = dataset["train"]

# Split the Hub "test" split into two disjoint shards:
# shard 0 is kept as the held-out test set, shard 1 is used for validation.
# (The train split is used as-is.)
test_dataset = dataset["test"].shard(num_shards=2, index=0)
val_dataset = dataset["test"].shard(num_shards=2, index=1)

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize the "text" column of a batch.

    Sequences are truncated to at most 256 tokens and padded to the longest
    sequence in the batch.
    """
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=256)


# batch_size=len(split) feeds each split to `tokenize` as a single batch, so
# padding=True pads every example of a split to one common length.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

# Expose only the model inputs and the label, as torch tensors
model_columns = ["input_ids", "attention_mask", "label"]
train_dataset.set_format("torch", columns=model_columns)
val_dataset.set_format("torch", columns=model_columns)
test_dataset.set_format("torch", columns=model_columns)

# Extract the number of classes and their names
num_labels = dataset["train"].features["label"].num_classes
class_names = dataset["train"].features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create the id <-> label mappings.
# id2label lets the pipeline output class names directly without mapping the
# labels later; label2id is its inverse and must be set as well, otherwise the
# config keeps the default "LABEL_n" entries and the two mappings disagree.
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in id2label.items()}

# Update the model's configuration with both mappings
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label, "label2id": label2id})

In [None]:
# Model: RoBERTa with a sequence-classification head, configured with the
# label mapping prepared above.
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: the (predictions, label_ids) pair the Trainer passes in.

    Returns:
        dict with a single "accuracy" entry (fraction of correct predictions).
    """
    import numpy as np  # local import: numpy is not imported at the top of the notebook

    logits, labels = eval_pred
    if isinstance(logits, tuple):  # some models return extra outputs next to the logits
        logits = logits[0]
    predicted_ids = np.argmax(logits, axis=-1)
    return {"accuracy": float((predicted_ids == labels).mean())}


# TrainingArguments
training_args = TrainingArguments(
    output_dir=repo_id,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # `evaluation_strategy` was renamed to `eval_strategy`; recent transformers
    # releases (which the `-U` install pulls in) no longer accept the old name.
    eval_strategy="epoch",
    logging_dir=f"{repo_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # Must match the evaluation strategy for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_id,
    # hub_token is intentionally not passed: it defaults to the token cached by
    # notebook_login(), which avoids the deprecated HfFolder helper and keeps
    # the token out of the arguments object.
)

# Trainer: trains on the train split and evaluates on the validation shard
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model. With the TrainingArguments configured above, evaluation
# and checkpoint saving are set to run once per epoch, and checkpoints are
# pushed to the Hub on every save (hub_strategy="every_save").
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (the validation shard).
# NOTE(review): test_dataset is prepared earlier but is not evaluated in any
# visible cell - consider trainer.evaluate(eval_dataset=test_dataset) as well.
trainer.evaluate()

In [None]:
# Save the tokenizer into the local output directory (same path as the
# Trainer's output_dir), so it sits next to the model files.
tokenizer.save_pretrained(repo_id)
# Generate the model card from the Trainer's state
trainer.create_model_card()
# Push the results to the Hub
trainer.push_to_hub()

In [None]:
# TEST MODEL
# Smoke-test the model by loading it through a text-classification pipeline
# and classifying one example sentence.

from transformers import pipeline

classifier = pipeline("text-classification", model=repo_id)

text = "Some good ICIS question based on 1 of the sentenes eg The glycerine market in Europe will continue to see relatively constrained demand next year, and imports will continue to provide supply stability in the region. is a sentence added"
predictions = classifier(text)

# The pipeline returns a list of results; take the label of the first one.
top_prediction = predictions[0]
predicted_label = top_prediction["label"]
print(f"Predicted label: {predicted_label}")