In [2]:
import pandas as pd
from pathlib import Path
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Emotion label names. Presumably the same label set (and order) as the
# pretrained checkpoint loaded below -- TODO confirm against its config.
labels = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion",
    "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment",
    "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism",
    "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral",
]

# Lookup tables for the model config: index -> name, and the inverse name -> index.
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}

# Checkpoint identifier handed to from_pretrained for both the tokenizer and the model
model_n = "bhadresh-savani/bert-base-go-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_n)

# The classification head is configured with this notebook's label tables
model = AutoModelForSequenceClassification.from_pretrained(
    model_n,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(labels),
)

# Load the CSV
csv_path = Path.home() / "PROJECT" / "figures" / "Augmentationdedonnees" / "emotions.csv"
df = pd.read_csv(csv_path)

# Convert label names to integer ids.
# The check is "not numeric" rather than "dtype == object": pandas can load
# text columns with a dedicated string dtype, in which case an object-only
# check would skip the conversion and leave raw strings as labels.
if not pd.api.types.is_numeric_dtype(df["label"]):
    mapped_labels = df["label"].map(label2id)

    # Series.map yields NaN for every value that is not a key of label2id
    # (typos, unexpected casing, empty cells). Fail here with the offending
    # values instead of letting NaN labels reach the training loop.
    unknown_mask = mapped_labels.isna()
    if unknown_mask.any():
        unknown_values = sorted(df.loc[unknown_mask, "label"].astype(str).unique())
        raise ValueError(
            f"{int(unknown_mask.sum())} row(s) in {csv_path} have a label that is not in "
            f"label2id: {unknown_values}"
        )

    # All values mapped, so the column can safely be stored as integers
    df["label"] = mapped_labels.astype("int64")

dataset = Dataset.from_pandas(df)

# Tokenization function
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``truncation=True``, ``padding="max_length"``
    and ``max_length=128``; whatever it returns is passed back unchanged.
    """
    tokenizer_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize the whole dataset in batches
dataset = dataset.map(preprocess_function, batched=True)

# Split the dataset: 20% is held out for evaluation, with a fixed seed
split = dataset.train_test_split(seed=42, test_size=0.2)
train_dataset, eval_dataset = split["train"], split["test"]

# Directory passed to the trainer as its output location
output_dir = Path.home().joinpath("PROJECT", "src")
output_dir.mkdir(exist_ok=True, parents=True)

# Training arguments
training_args = TrainingArguments(
    output_dir=str(output_dir),
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save on the same schedule, then reload the best
    # checkpoint as ranked by the "accuracy" metric
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Metric function handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, label_ids)``. ``logits`` is reduced with
            ``argmax`` over its last axis to obtain the predicted class ids;
            if it is a tuple, only its first element is used.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    # Local names deliberately avoid `labels`, which would shadow the
    # module-level list of label names.
    logits, true_ids = eval_pred

    # Some models return several outputs; assume the logits come first
    # (the usual Hugging Face convention -- confirm for custom models).
    if isinstance(logits, tuple):
        logits = logits[0]

    predicted_ids = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(true_ids, predicted_ids)
    # zero_division=0 keeps the same numeric result as the default but stops
    # scikit-learn from emitting a warning whenever a class present in the
    # references is never predicted (common on a small evaluation set).
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_ids, predicted_ids, average="weighted", zero_division=0
    )

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Build the trainer from the model, arguments, datasets and metric function
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run the training
trainer.train()

# Save the model and the tokenizer side by side in the output directory
trainer.save_model(str(output_dir))
tokenizer.save_pretrained(str(output_dir))

Map: 100%|██████████| 30/30 [00:00<00:00, 2964.31 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.821842,0.5,0.3,0.5,0.357143
2,No log,0.553364,0.666667,0.5,0.666667,0.555556
3,No log,0.427266,1.0,1.0,1.0,1.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


('C:\\Users\\AyaKu\\PROJECT\\src\\tokenizer_config.json',
 'C:\\Users\\AyaKu\\PROJECT\\src\\special_tokens_map.json',
 'C:\\Users\\AyaKu\\PROJECT\\src\\vocab.txt',
 'C:\\Users\\AyaKu\\PROJECT\\src\\added_tokens.json',
 'C:\\Users\\AyaKu\\PROJECT\\src\\tokenizer.json')