In [1]:
import torch
import numpy as np
import pandas as pd
from datasets import load_from_disk
from transformers import DataCollatorWithPadding

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)


# Checkpoint used throughout the notebook: the distilled variant of RoBERTa.
language_model_name = "distilroberta-base"

# --- Training arguments ---
# Number of examples per device in every training batch.
batch_size = 32

# Optimizer settings: learning rate and weight-decay strength.
learning_rate, weight_decay = 1e-4, 0.001

# Number of passes over the training split.
epochs = 1

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Seed the random number generators up front so that the run is repeatable.
set_seed(42)

2025-09-13 20:52:13.284731: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In this notebook we train **RoBERTa** (more precisely its distilled variant, DistilRoBERTa) as a classifier. As training material we use the description of each item (i.e. the English description in the item's Wikidata entry). At the end we will use the trained model to extract sentence embeddings. These, together with the properties and statistical embeddings, will be the input to a final `MLP` that performs the classification task.

### Data preprocessing

In [2]:
# Load the dataset saved on disk; its 'train' and 'validation' splits are used below.
dataset = load_from_disk("../datasets/train_and_val")

Here we are only interested in the description of the item. This will be the text that we are going to use to train the encoder. For this reason we remove the other attributes.

In [3]:
# Attributes that are not used in the rest of this notebook.
unused_columns = ['item', 'name', 'type', 'category', 'subcategory']
dataset = dataset.remove_columns(unused_columns)

Now we need to map the labels to integer values. To do so we can use the `map` function.

In [4]:
# Integer class id assigned to each of the three string labels.
labels = {
    label_name: class_id
    for class_id, label_name in enumerate(
        ('cultural agnostic', 'cultural representative', 'cultural exclusive')
    )
}

In [5]:
def map_labels(data):
    """Replace the string label of one example with its integer class id.

    The id is looked up in the module-level ``labels`` dictionary. The example
    is modified in place and also returned, so the function can be passed to
    ``Dataset.map``.

    Raises:
        KeyError: if the example's label is not a key of ``labels``.
    """
    class_id = labels[data['label']]
    data['label'] = class_id
    return data

# Encode the labels of both splits as integers (train first, then validation).
train_dataset, val_dataset = (
    dataset[split_name].map(map_labels) for split_name in ('train', 'validation')
)

In [6]:
import evaluate

# Because the model also returns its hidden states (output_hidden_states=True),
# eval_pred.predictions is a tuple rather than a single array. It has the form
# ((300, 3), (7)): the first element holds the logits, with shape (300, 3), and
# the second element is a 7-item tuple related to the hidden states.

# Metric objects are loaded once and reused: compute_metrics runs on every
# evaluation, and calling evaluate.load each time repeats the lookup needlessly.
_metric_cache = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _metric_cache:
        _metric_cache[name] = evaluate.load(name)
    return _metric_cache[name]


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` and ``label_ids``.
            ``predictions`` is either the logits array itself or a tuple/list
            whose first element is the logits (the case when the model also
            returns its hidden states).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw_predictions = eval_pred.predictions
    # Only unpack real tuples/lists: indexing a bare logits array with [0]
    # would silently select its first row and yield meaningless metrics.
    if isinstance(raw_predictions, (tuple, list)):
        logits = raw_predictions[0]
    else:
        logits = raw_predictions
    # Named `references` so the module-level `labels` mapping is not shadowed.
    references = eval_pred.label_ids

    predicted_classes = np.argmax(logits, axis=-1)
    accuracy_score = _get_metric('accuracy').compute(
        predictions=predicted_classes, references=references
    )["accuracy"]
    f1_score = _get_metric('f1').compute(
        predictions=predicted_classes, references=references, average='macro'
    )['f1']

    return {"accuracy": accuracy_score, "f1": f1_score}

In [7]:
# Sequence classifier built on top of the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    language_model_name,
    num_labels=3,                  # number of the classes
    output_hidden_states=True,     # ← we'll need this later to extract features embeddings
    output_attentions=False,
    ignore_mismatched_sizes=True,
)

# Pretrained tokenizer of the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(language_model_name)

# Collator built from that tokenizer; it is handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    """Tokenize the ``description`` field of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but deliberately
    NOT padded here. Padding inside a batched ``map`` would stretch every
    example to the longest one in its map batch. The ``DataCollatorWithPadding``
    given to the Trainer already pads each training/evaluation batch to its own
    longest sequence, so padding here only adds wasted pad tokens.

    Args:
        examples: mapping with a ``"description"`` entry (a string or a list
            of strings when called through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the descriptions.
    """
    return tokenizer(examples["description"], truncation=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Tokenize the training split; batched=True passes groups of examples to tokenize_function at once.
tokenized_training_dataset = train_dataset.map(tokenize_function, batched=True)

In [9]:
# Tokenize the validation split in the same way as the training split.
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

Let's define the training arguments.

In [10]:
# Hyper-parameters come from the configuration cell at the top of the notebook.
training_args = TrainingArguments(
    # -- outputs and logging --
    output_dir="training_dir",                     # output directory [Mandatory]
    logging_dir="cultural_classification_logs",    # use it later to get the training curves
    logging_steps=30,
    report_to="none",
    save_strategy="no",
    # -- optimisation schedule --
    num_train_epochs=epochs,                       # total number of training epochs
    per_device_train_batch_size=batch_size,        # batch size per device during training
    learning_rate=learning_rate,                   # learning rate
    weight_decay=weight_decay,                     # strength of weight decay
    warmup_steps=20,                               # number of warmup steps for learning rate scheduler
)

In [11]:
# Assemble the Trainer from the model, the training arguments, the tokenized
# splits, the padding collator and the metric function defined above.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_training_dataset,
   eval_dataset=tokenized_val_dataset,
   # `tokenizer=` is deprecated in recent transformers releases (it triggers a
   # FutureWarning pointing at this call); `processing_class=` is its replacement.
   # NOTE(review): requires a transformers version that already accepts
   # `processing_class` — confirm against the pinned version.
   processing_class=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

  trainer = Trainer(


Let's train the model.

In [12]:
# Run the fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss
30,0.9706
60,0.7474
90,0.6864
120,0.6389
150,0.6201
180,0.6117


TrainOutput(global_step=196, training_loss=0.6976619253353197, metrics={'train_runtime': 649.3975, 'train_samples_per_second': 9.626, 'train_steps_per_second': 0.302, 'total_flos': 113212486975140.0, 'train_loss': 0.6976619253353197, 'epoch': 1.0})

In [13]:
# Evaluate on the validation split given to the Trainer; compute_metrics supplies the accuracy and F1 entries.
trainer.evaluate()

Using the latest cached version of the module from /home/gjergj/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sun Apr 20 14:12:12 2025) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/gjergj/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Sun Apr 20 14:12:14 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.


{'eval_loss': 0.6637540459632874,
 'eval_accuracy': 0.68,
 'eval_f1': 0.6586038883732849,
 'eval_runtime': 11.1945,
 'eval_samples_per_second': 26.799,
 'eval_steps_per_second': 3.395,
 'epoch': 1.0}

In [14]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
model_dir = "../models/RoBERTa"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

('../models/RoBERTa/tokenizer_config.json',
 '../models/RoBERTa/special_tokens_map.json',
 '../models/RoBERTa/vocab.json',
 '../models/RoBERTa/merges.txt',
 '../models/RoBERTa/added_tokens.json',
 '../models/RoBERTa/tokenizer.json')