In [None]:
import pandas as pd
import numpy as np
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset

# --- Data ---------------------------------------------------------------
# Read the processed CSV into a DataFrame, wrap it as a `datasets.Dataset`
# and split it 80/20 into train/test with a fixed seed (42).
df = pd.read_csv('data/processed/colbert_humor.csv')
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)
print(dataset)

# --- Tokenizer / model --------------------------------------------------
# Both are loaded from the same checkpoint name; the classifier is
# configured with two labels.
model_name = "distilbert/distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
# CUDA device when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def preprocess_function(examples):
    """Tokenize the ``'text'`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry, as supplied by
            ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for those texts, produced with truncation
        enabled and ``padding='max_length'`` at ``max_length=128``.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer(examples['text'], **tokenizer_options)

# Tokenize every split in batches. The raw 'text' column is left in place;
# only helper columns that actually exist in the train split are dropped,
# so remove_columns is never asked to remove a missing column.
encoded_dataset = dataset.map(preprocess_function, batched=True)
cols_to_try = ['__index_level_0__']
cols_present = [
    name
    for name in cols_to_try
    if name in encoded_dataset['train'].column_names
]
if len(cols_present) > 0:
    encoded_dataset = encoded_dataset.remove_columns(cols_present)

# Trainer expects the target column to be called 'labels'. If it is missing,
# derive it from 'humor' (preferred) or 'label', whichever is present.
if 'labels' not in encoded_dataset['train'].column_names:
    label_candidate = next(
        (
            name
            for name in ('humor', 'label')
            if name in encoded_dataset['train'].column_names
        ),
        None,
    )
    if label_candidate is None:
        raise ValueError(f"No label column found. Available columns: {encoded_dataset['train'].column_names}")

    def _add_labels(batch):
        """Add a 'labels' entry holding the source label column as ints.

        With batched=True the batch values are lists. Integer labels are
        used so the model applies CrossEntropyLoss rather than
        BCEWithLogits.
        """
        batch['labels'] = list(map(int, batch[label_candidate]))
        return batch

    encoded_dataset = encoded_dataset.map(_add_labels, batched=True)

    # Drop the original label column once it has been copied.
    if label_candidate in encoded_dataset['train'].column_names:
        encoded_dataset = encoded_dataset.remove_columns([label_candidate])

# Ensure the 'labels' feature is stored as int64. This stays best-effort
# (the import or the cast may fail depending on the installed `datasets`
# version), but a failure is reported as a warning instead of being
# silently discarded, so a later dtype problem can be traced back to it.
try:
    from datasets import Value
    encoded_dataset = encoded_dataset.cast_column('labels', Value('int64'))
except Exception as cast_error:
    import warnings
    warnings.warn(
        f"Could not cast 'labels' to int64 ({cast_error!r}); "
        "continuing with the labels as they are.",
        RuntimeWarning,
    )

# Return torch tensors and expose only the columns listed here.
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Fine-tuning hyperparameters. Checkpoints go to ./results_multilingual and
# at most one is kept (save_total_limit=1).
training_args = TrainingArguments(
    output_dir='./results_multilingual',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    # fp16 mixed precision needs a GPU: requesting it unconditionally makes
    # TrainingArguments raise on CPU-only machines, so enable it only when
    # CUDA is available.
    fp16=torch.cuda.is_available(),
)

def compute_metrics(p):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, or a tuple
            whose first element is taken to be the per-class scores) and
            ``label_ids`` (the true labels).

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. The last three are computed with
        ``average='binary'``.
    """
    scores = p.predictions
    # predictions may arrive as a tuple when the model returns more than the
    # logits; the first element is assumed to be the logits.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = np.argmax(scores, axis=-1)

    # zero_division=0 keeps the value sklearn already reports for undefined
    # metrics (0.0) but without emitting a warning on every evaluation in
    # which the model predicts no positive examples.
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids,
        preds,
        average='binary',
        zero_division=0,
    )
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

# Assemble the Trainer: the metric callback, the 'test' split for
# evaluation, the 'train' split for optimisation, the hyperparameters and
# the model to fine-tune.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=encoded_dataset['test'],
    train_dataset=encoded_dataset['train'],
    args=training_args,
    model=model,
)


In [None]:
# Fine-tune, then score the evaluation split. The metrics are bound to a
# name and printed explicitly: evaluate() is not the last expression of this
# cell, so its return value would otherwise be discarded.
trainer.train()
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the tokenizer next to the model so the output directory holds
# everything needed to reload the classifier.
model.save_pretrained('./humor_model_multilingual')
tokenizer.save_pretrained('./humor_model_multilingual')