Fine-tuning roberta-base-go_emotions on the dair-ai/emotion dataset

In [14]:
import inspect
import os

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline
)

# --- GLOBAL CONFIGURATION ---
MODEL_CHECKPOINT = "SamLowe/roberta-base-go_emotions"
DATASET_NAME = "dair-ai/emotion"
SAMPLE_SIZE = 16000  # You can lower this number (e.g. 500) for a quick test on CPU
EVAL_SAMPLE_SIZE = 2000
NUM_TRAIN_EPOCHS = 3
SEED = 42  # One seed shared by the dataset shuffles and the torch RNG below

# Seed torch's global RNG before anything draws from it, so that random
# initialisation done later in the notebook starts from a fixed state.
torch.manual_seed(SEED)

# --- DEVICE SETUP ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Integer form of the device: 0 when CUDA is available, -1 otherwise.
DEVICE_ID = 0 if DEVICE.type == "cuda" else -1
print(f"Primary Device: {DEVICE}")

# --- A. DATA LOADING AND PREPARATION ---
print("Loading dataset...")
raw_datasets = load_dataset(DATASET_NAME)

# 1. Prepare Subsets
# Clamp the requested sizes to what each split actually holds, so raising
# SAMPLE_SIZE / EVAL_SAMPLE_SIZE past the split length uses the whole split
# instead of failing on an out-of-range index.
train_split = raw_datasets["train"]
eval_split = raw_datasets["validation"]
train_subset = train_split.shuffle(seed=SEED).select(
    range(min(SAMPLE_SIZE, len(train_split)))
)
eval_subset = eval_split.shuffle(seed=SEED).select(
    range(min(EVAL_SAMPLE_SIZE, len(eval_split)))
)

# Label metadata is read from the dataset's own "label" feature and mirrored
# into both lookup directions.
label_names = train_split.features["label"].names
NUM_LABELS = len(label_names)
id2label = {i: name for i, name in enumerate(label_names)}
label2id = {name: i for i, name in enumerate(label_names)}

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` with the module-level tokenizer.

    Truncation is enabled; no padding is requested here.

    Args:
        examples: Mapping with a "text" entry (called in batched mode by
            `Dataset.map` elsewhere in this notebook).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# 2. Tokenize Data
print("Tokenizing data...")


def _tokenize_and_format(subset):
    """Tokenize one split and shape it for training.

    Applies `tokenize_function` in batched mode, drops the raw "text" column,
    renames "label" to "labels", and switches the output format to torch.
    """
    prepared = subset.map(tokenize_function, batched=True)
    # 3. Final Formatting
    prepared = prepared.remove_columns(["text"]).rename_column("label", "labels")
    prepared.set_format("torch")
    return prepared


tokenized_train_data = _tokenize_and_format(train_subset)
tokenized_eval_data = _tokenize_and_format(eval_subset)

print(f"Train samples available: {len(tokenized_train_data)}")

# --- B. MODEL SETUP ---
# The label count and mappings come from the dataset, not the checkpoint;
# ignore_mismatched_sizes lets the load proceed when shapes differ.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    problem_type="single_label_classification",
)
# Collator built from the same tokenizer; handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric objects are cached after their first load so that repeated
# evaluation passes do not reload them each time.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it at most once."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A pair `(logits, labels)`. The predicted class is the
            argmax of `logits` along the last axis.

    Returns:
        dict with keys "accuracy" and "f1_weighted".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1_score = _get_metric("f1").compute(
        predictions=predictions, references=labels, average="weighted"
    )
    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )
    return {"accuracy": accuracy["accuracy"], "f1_weighted": f1_score["f1"]}

# --- C. TRAINING (IN MEMORY ONLY) ---
# Note: output_dir is mandatory for Hugging Face; it will create a temporary
# folder that is empty or holds only minimal logs, but thanks to
# save_strategy="no" it will not fill your disk with heavy model files.
training_args = TrainingArguments(
    output_dir="./tmp_trainer_logs",
    save_strategy="no",  # IMPORTANT: do NOT save checkpoints to disk
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    weight_decay=0.01,
    eval_strategy="epoch",
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
    logging_dir='./logs',
    report_to="none"
)

# Recent transformers releases accept the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever name the installed Trainer's signature exposes.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_eval_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

print("\n\nStarting Fine-Tuning (In-Memory)...")
trainer.train()
print("\nTraining Complete. Model is updated in RAM.")
# The save commands (save_model) have been removed here.

# --- D. INFERENCE SETUP (DIRECT MEMORY ACCESS) ---
print("Setting up inference pipeline from memory...")

# Use the 'model' object that has just been trained directly.
emotion_classifier = pipeline(
    "text-classification",
    model=model,          # Uses the model loaded in RAM
    tokenizer=tokenizer,  # Uses the tokenizer loaded in RAM
    device=DEVICE_ID,
    top_k=1
)

# --- E. CLASSIFICATION LOOP ---
def run_classification_loop(classifier):
    """Interactively classify the emotion of text typed by the user.

    Prompts repeatedly until the user types 'quit' or 'exit' (matched
    case-insensitively, ignoring surrounding whitespace), the input stream
    ends, or the user interrupts. Blank input is skipped. Any other error is
    printed and ends the loop.

    Args:
        classifier: Callable taking a string and returning a nested sequence
            whose ``[0][0]`` element is a mapping with 'label' and 'score'.
    """
    print("\n--- Emotion Classification Tool (In-Memory) ---")
    print("Type 'quit' or 'exit' to stop.")

    while True:
        try:
            # Normalise once so that both the blank check and the quit check
            # see the same text ("  quit " must exit, not be classified).
            text = input("\nEnter text to analyze: ").strip()

            if not text:
                continue

            if text.lower() in ('quit', 'exit'):
                print("Exiting.")
                break

            classification_result = classifier(text)
            top_result = classification_result[0][0]
            detected_emotion = top_result['label']
            confidence_score = top_result['score']

            print(f" -> Result: **{detected_emotion.upper()}** (Confidence: {confidence_score:.4f})")

        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C is a normal way to leave, not an error.
            print("\nExiting.")
            break
        except Exception as e:
            print(f"\nError: {e}")
            break

# START
# Launch the interactive loop only when the classifier object was created.
classifier_is_ready = 'emotion_classifier' in locals()
if classifier_is_ready:
    run_classification_loop(emotion_classifier)

Primary Device: cuda
Loading dataset...
Tokenizing data...


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([28]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train samples available: 16000


  trainer = Trainer(




Starting Fine-Tuning (In-Memory)...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.2902,0.235278,0.9165,0.917358
2,0.1857,0.165829,0.9315,0.932134
3,0.1144,0.166664,0.934,0.934688


Device set to use cuda:0



Training Complete. Model is updated in RAM.
Setting up inference pipeline from memory...

--- Emotion Classification Tool (In-Memory) ---
Type 'quit' or 'exit' to stop.

Enter text to analyze: my sister is not doing okay , i guess she is ill
 -> Result: **JOY** (Confidence: 0.8875)

Enter text to analyze: she is sick
 -> Result: **SADNESS** (Confidence: 0.7546)

Enter text to analyze: quit
Exiting.


In [None]:
# The model path defined in your code is 'final_roberta_emotion_model'
MODEL_DIR = "final_roberta_emotion_model"
ZIP_FILE_NAME = "emotion_model.zip"

# Use the zip command to compress the folder
!zip -r {ZIP_FILE_NAME} {MODEL_DIR}

  adding: final_roberta_emotion_model/ (stored 0%)
  adding: final_roberta_emotion_model/tokenizer_config.json (deflated 74%)
  adding: final_roberta_emotion_model/training_args.bin (deflated 53%)
  adding: final_roberta_emotion_model/config.json (deflated 53%)
  adding: final_roberta_emotion_model/vocab.json (deflated 59%)
  adding: final_roberta_emotion_model/model.safetensors (deflated 9%)
  adding: final_roberta_emotion_model/tokenizer.json (deflated 82%)
  adding: final_roberta_emotion_model/merges.txt (deflated 53%)
  adding: final_roberta_emotion_model/special_tokens_map.json (deflated 85%)


In [None]:
# `google.colab` is only importable inside a Colab runtime.
from google.colab import files

# Send the archive to the browser as a download. ZIP_FILE_NAME is defined in
# the zip cell, which must have been run first.
files.download(ZIP_FILE_NAME)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the next
# cell can copy files into it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil

# This will copy the entire model directory into a new folder in your Drive's root.
# You can change 'My Drive/Colab_Models' to any path you prefer.
DRIVE_PATH = '/content/drive/My Drive/Colab_Models/final_roberta_emotion_model'
LOCAL_MODEL_PATH = 'final_roberta_emotion_model'

# dirs_exist_ok=True lets this cell be re-run: without it copytree raises
# FileExistsError as soon as the destination folder already exists.
shutil.copytree(LOCAL_MODEL_PATH, DRIVE_PATH, dirs_exist_ok=True)
print(f"Model copied to Google Drive at: {DRIVE_PATH}")

Model copied to Google Drive at: /content/drive/My Drive/Colab_Models/final_roberta_emotion_model
