In [None]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
import os

import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizer,
)

# Set WANDB_DISABLED so the Weights & Biases integration stays off for this run.
os.environ.update({"WANDB_DISABLED": "true"})

def compute_metrics(pred):
    """Score one evaluation pass for the Hugging Face ``Trainer``.

    Args:
        pred: Anything that unpacks into ``(logits, labels)``. The predicted
            class is the argmax over the last axis of ``logits``.

    Returns:
        dict: ``accuracy`` plus ``f1``, ``precision`` and ``recall``, the latter
        three computed with ``average='weighted'``.
    """
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 yields the same 0.0 score scikit-learn would fall back to
    # for a class with no predicted/true samples, but without emitting an
    # UndefinedMetricWarning on every evaluation.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, **weighted),
        'precision': precision_score(labels, predictions, **weighted),
        'recall': recall_score(labels, predictions, **weighted),
    }

# NOTE(review): the recorded run of this notebook stopped at this call with
# DatasetNotFoundError for this ID. Point DATASET_NAME at a dataset you can
# access that has "train" and "test" splits and the `lang`, `text` and
# `sentiment` columns the rest of the notebook reads.
DATASET_NAME = "tweets_hate_speech_multilingual"
dataset = load_dataset(DATASET_NAME)

langs = ["en", "de", "fr"]


def _is_target_language(example):
    """Return True when the row's ``lang`` value is one of ``langs``."""
    return example["lang"] in langs


# Keep only the train/test splits, each restricted to the target languages.
# DatasetDict is provided by the `datasets` package.
dataset = DatasetDict({
    split: dataset[split].filter(_is_target_language)
    for split in ("train", "test")
})

def map_labels(example):
    """Store an integer class id under ``"labels"`` for one example.

    The id is derived from ``example["sentiment"]``: negative -> 0,
    neutral -> 1, positive -> 2. The example is updated in place and returned;
    any other sentiment value raises ``KeyError``.
    """
    sentiment_ids = {
        name: idx for idx, name in enumerate(("negative", "neutral", "positive"))
    }
    sentiment = example["sentiment"]
    example["labels"] = sentiment_ids[sentiment]
    return example

# Add the integer `labels` column to every split.
dataset = dataset.map(function=map_labels)

# The tokenizer and the 3-label classification model are both loaded from the
# same checkpoint name.
model_name = "xlm-roberta-base"
tokenizer, model = (
    XLMRobertaTokenizer.from_pretrained(model_name),
    XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=3),
)

def preprocess_data(examples):
    """Tokenize the ``"text"`` entries of a batch of examples.

    Args:
        examples: Batch mapping whose ``"text"`` value holds the raw strings.

    Returns:
        The tokenizer output for those texts, truncated to at most 128 tokens
        and left unpadded.
    """
    # No padding here: with a batched `map`, padding=True would stretch every
    # row to the longest sequence of the whole map batch. The
    # DataCollatorWithPadding handed to the Trainer pads each training batch
    # to its own longest sequence instead.
    return tokenizer(examples["text"], truncation=True, max_length=128)

# Collator built around the same tokenizer that encodes the text.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Batched map: preprocess_data is called with many rows at a time.
tokenized_dataset = dataset.map(function=preprocess_data, batched=True)

# Evaluate and checkpoint once per epoch; the two strategies must match for
# load_best_model_at_end to pick the checkpoint with the best accuracy.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Explicitly switch off all logging integrations instead of relying only on
    # the WANDB_DISABLED environment variable.
    report_to="none",
)

# Gather everything the Trainer needs, then build it in one call.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset["train"],
    "eval_dataset": tokenized_dataset["test"],
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Fine-tune the model.
trainer.train()

# Score the model on the "test" split and print the metrics.
results = trainer.evaluate()
print("Evaluation Results:", results)

# Quick check of the fine-tuned model on one sentence per language.
new_texts = [
    "I love this product!",  # English
    "Dieses Produkt ist schrecklich.",  # German
    "C'est un produit moyen.",  # French
]
tokenized_texts = tokenizer(new_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
# The Trainer may have moved the model to an accelerator; the inputs have to
# live on the same device as the model's weights.
tokenized_texts = tokenized_texts.to(model.device)

model.eval()  # inference mode: disables dropout
with torch.no_grad():  # no gradients needed for prediction
    predictions = model(**tokenized_texts).logits
# .cpu() is required before .numpy() when the logits sit on a GPU.
predicted_labels = np.argmax(predictions.detach().cpu().numpy(), axis=1)

label_map = {0: "negative", 1: "neutral", 2: "positive"}
predicted_sentiments = [label_map[int(label)] for label in predicted_labels]
print("Predicted Sentiments:", predicted_sentiments)


DatasetNotFoundError: Dataset 'tweets_hate_speech_multilingual' doesn't exist on the Hub or cannot be accessed.