In [5]:
import pandas as pd

#Utilisation d'un Modèle Pré-Entrraîné
#Le fine-tuning consiste à prendre un modèle de langage pré-entraîné sur un vaste corpus de données générales et à l'ajuster 
# (fine-tuner) sur un ensemble de données spécifique à votre tâche. Pour la classification de texte, vous pouvez utiliser 
# des modèles comme BERT, DistilBERT, ou RoBERTa.

#2. Avantages du Fine-Tuning
#Connaissances Générales : Le modèle pré-entraîné a déjà appris une grande quantité de connaissances générales sur le langage, 
# ce qui lui permet de mieux comprendre les nuances du texte.
#Adaptation Spécifique : En fine-tunant le modèle sur vos données spécifiques, vous pouvez l'adapter pour qu'il fasse des 
# prédictions précises pour votre tâche particulière.

data = {
    "note": [
        "Le serveur est en panne, tout est bloqué.",
        "L'application est lente à charger.",
        "Une fonctionnalité mineure ne fonctionne pas.",
        "La base de données est corrompue.",
        "La page de connexion prend du temps à s'afficher.",
        "Le bouton de téléchargement ne fonctionne pas.",
        "Le réseau est complètement hors service.",
        "Il y a un léger décalage dans l'affichage.",
        "Le système redémarre de manière aléatoire.",
        "Certains utilisateurs ne peuvent pas se connecter.",
        "Les emails ne sont pas envoyés correctement.",
        "Le site web est inaccessible pour certains utilisateurs.",
        "Les notifications push ne fonctionnent pas.",
        "Les rapports ne sont pas générés comme prévu.",
        "Le paiement en ligne ne fonctionne pas.",
        "Le temps de réponse du serveur est très lent.",
        "Des erreurs 500 apparaissent fréquemment.",
        "L'interface utilisateur est boguée.",
        "Des fichiers sont manquants dans la base de données.",
        "Les utilisateurs sont déconnectés automatiquement.",
        "Le système de sauvegarde ne fonctionne pas.",
        "Le processus de login est très lent.",
        "Des erreurs de validation des données.",
        "Les mises à jour ne sont pas appliquées correctement.",
        "La recherche dans l'application ne fonctionne pas.",
        "Le tableau de bord ne s'affiche pas.",
        "Les utilisateurs ne reçoivent pas leurs confirmations par email.",
        "Le chargement des images est très lent.",
        "Les permissions d'accès sont incorrectes.",
        "Le service client ne reçoit pas les messages."
    ],
    "label": [
        "critique", "moyen", "bas", "critique", "moyen", "bas", "critique", "bas",
        "critique", "moyen", "moyen", "critique", "bas", "moyen", "critique", "moyen",
        "critique", "bas", "critique", "critique", "moyen", "moyen", "moyen", "bas",
        "moyen", "moyen", "moyen", "moyen", "moyen", "moyen"
    ]
}

# Créer un DataFrame
df = pd.DataFrame(data)

# Sauvegarder en CSV
file_path = "notes_pannes.csv"
df.to_csv(file_path, index=False)
file_path

'notes_pannes.csv'

In [4]:
import pandas as pd 

df = pd.read_csv('notes_pannes.csv')
df, df.info(), df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   note    30 non-null     object
 1   label   30 non-null     object
dtypes: object(2)
memory usage: 608.0+ bytes


(                                                 note     label
 0           Le serveur est en panne, tout est bloqué.  critique
 1                  L'application est lente à charger.     moyen
 2       Une fonctionnalité mineure ne fonctionne pas.       bas
 3                   La base de données est corrompue.  critique
 4   La page de connexion prend du temps à s'afficher.     moyen
 5      Le bouton de téléchargement ne fonctionne pas.       bas
 6            Le réseau est complètement hors service.  critique
 7          Il y a un léger décalage dans l'affichage.       bas
 8          Le système redémarre de manière aléatoire.  critique
 9   Certains utilisateurs ne peuvent pas se connec...     moyen
 10       Les emails ne sont pas envoyés correctement.     moyen
 11  Le site web est inaccessible pour certains uti...  critique
 12        Les notifications push ne fonctionnent pas.       bas
 13      Les rapports ne sont pas générés comme prévu.     moyen
 14            Le paiemen

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Charger les données
data = pd.read_csv("notes_pannes.csv")

# Mapper les labels en entiers
label_mapping = {"critique": 0, "moyen": 1, "bas": 2}
data['label'] = data['label'].map(label_mapping)

# Diviser les données en ensembles d'entraînement et de test
train_texts, val_texts, train_labels, val_labels = train_test_split(data['note'].tolist(), data['label'].tolist(), test_size=0.2)

In [15]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset

# Charger le tokenizer et le modèle
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Préparer les datasets
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

train_dataset = Dataset.from_dict({"input_ids": train_encodings['input_ids'], "attention_mask": train_encodings['attention_mask'], "labels": train_labels})
val_dataset = Dataset.from_dict({"input_ids": val_encodings['input_ids'], "attention_mask": val_encodings['attention_mask'], "labels": val_labels})

# Configurer les arguments de l'entraînement
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=32,
    weight_decay=0.01,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Créer un Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Entraîner le modèle
trainer.train()

# Sauvegarder le modèle fine-tuné
model.save_pretrained("./fine_tuned_model_classif")
tokenizer.save_pretrained("./fine_tuned_model_classif")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 2/64 [00:00<00:28,  2.17it/s]
  3%|▎         | 2/64 [00:00<00:28,  2.17it/s]

{'eval_loss': 1.1187443733215332, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.851, 'eval_steps_per_second': 23.809, 'epoch': 1.0}


  6%|▋         | 4/64 [00:01<00:28,  2.13it/s]
  6%|▋         | 4/64 [00:01<00:28,  2.13it/s]

{'eval_loss': 1.0964492559432983, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.852, 'eval_steps_per_second': 23.809, 'epoch': 2.0}


  9%|▉         | 6/64 [00:02<00:27,  2.07it/s]
  9%|▉         | 6/64 [00:02<00:27,  2.07it/s]

{'eval_loss': 1.0885125398635864, 'eval_runtime': 0.043, 'eval_samples_per_second': 139.533, 'eval_steps_per_second': 23.255, 'epoch': 3.0}


 12%|█▎        | 8/64 [00:03<00:26,  2.08it/s]
 12%|█▎        | 8/64 [00:03<00:26,  2.08it/s]

{'eval_loss': 1.0766834020614624, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.859, 'eval_steps_per_second': 23.81, 'epoch': 4.0}


 16%|█▌        | 10/64 [00:04<00:25,  2.09it/s]
 16%|█▌        | 10/64 [00:04<00:25,  2.09it/s]

{'eval_loss': 1.0737932920455933, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.859, 'eval_steps_per_second': 23.81, 'epoch': 5.0}


 19%|█▉        | 12/64 [00:05<00:25,  2.07it/s]
 19%|█▉        | 12/64 [00:05<00:25,  2.07it/s]

{'eval_loss': 1.0563627481460571, 'eval_runtime': 0.041, 'eval_samples_per_second': 146.341, 'eval_steps_per_second': 24.39, 'epoch': 6.0}


 22%|██▏       | 14/64 [00:06<00:24,  2.02it/s]
 22%|██▏       | 14/64 [00:06<00:24,  2.02it/s]

{'eval_loss': 1.0487877130508423, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.853, 'eval_steps_per_second': 23.809, 'epoch': 7.0}


 25%|██▌       | 16/64 [00:07<00:23,  2.06it/s]
 25%|██▌       | 16/64 [00:07<00:23,  2.06it/s]

{'eval_loss': 1.048904538154602, 'eval_runtime': 0.041, 'eval_samples_per_second': 146.341, 'eval_steps_per_second': 24.39, 'epoch': 8.0}


 28%|██▊       | 18/64 [00:08<00:22,  2.06it/s]
 28%|██▊       | 18/64 [00:08<00:22,  2.06it/s]

{'eval_loss': 1.0415440797805786, 'eval_runtime': 0.047, 'eval_samples_per_second': 127.658, 'eval_steps_per_second': 21.276, 'epoch': 9.0}


 31%|███▏      | 20/64 [00:09<00:21,  2.04it/s]
 31%|███▏      | 20/64 [00:09<00:21,  2.04it/s]

{'eval_loss': 1.0393277406692505, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.856, 'eval_steps_per_second': 23.809, 'epoch': 10.0}


 34%|███▍      | 22/64 [00:10<00:20,  2.03it/s]
 34%|███▍      | 22/64 [00:10<00:20,  2.03it/s]

{'eval_loss': 1.0213762521743774, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.857, 'eval_steps_per_second': 23.809, 'epoch': 11.0}


 38%|███▊      | 24/64 [00:11<00:19,  2.02it/s]
 38%|███▊      | 24/64 [00:12<00:19,  2.02it/s]

{'eval_loss': 1.0028547048568726, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.856, 'eval_steps_per_second': 23.809, 'epoch': 12.0}


 41%|████      | 26/64 [00:13<00:19,  2.00it/s]
 41%|████      | 26/64 [00:13<00:19,  2.00it/s]

{'eval_loss': 0.9656118750572205, 'eval_runtime': 0.041, 'eval_samples_per_second': 146.339, 'eval_steps_per_second': 24.39, 'epoch': 13.0}


 44%|████▍     | 28/64 [00:13<00:17,  2.04it/s]
 44%|████▍     | 28/64 [00:14<00:17,  2.04it/s]

{'eval_loss': 0.9272081255912781, 'eval_runtime': 0.041, 'eval_samples_per_second': 146.341, 'eval_steps_per_second': 24.39, 'epoch': 14.0}


 47%|████▋     | 30/64 [00:14<00:16,  2.04it/s]
 47%|████▋     | 30/64 [00:15<00:16,  2.04it/s]

{'eval_loss': 0.9222326874732971, 'eval_runtime': 0.041, 'eval_samples_per_second': 146.341, 'eval_steps_per_second': 24.39, 'epoch': 15.0}


 50%|█████     | 32/64 [00:15<00:15,  2.06it/s]
 50%|█████     | 32/64 [00:16<00:15,  2.06it/s]

{'eval_loss': 0.9378530383110046, 'eval_runtime': 0.041, 'eval_samples_per_second': 146.339, 'eval_steps_per_second': 24.39, 'epoch': 16.0}


 53%|█████▎    | 34/64 [00:16<00:14,  2.06it/s]
 53%|█████▎    | 34/64 [00:17<00:14,  2.06it/s]

{'eval_loss': 0.9421439170837402, 'eval_runtime': 0.041, 'eval_samples_per_second': 146.343, 'eval_steps_per_second': 24.39, 'epoch': 17.0}


 56%|█████▋    | 36/64 [00:17<00:13,  2.07it/s]
 56%|█████▋    | 36/64 [00:18<00:13,  2.07it/s]

{'eval_loss': 0.9385148882865906, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.856, 'eval_steps_per_second': 23.809, 'epoch': 18.0}


 59%|█████▉    | 38/64 [00:18<00:12,  2.07it/s]
 59%|█████▉    | 38/64 [00:19<00:12,  2.07it/s]

{'eval_loss': 0.9231727719306946, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.852, 'eval_steps_per_second': 23.809, 'epoch': 19.0}


 62%|██████▎   | 40/64 [00:19<00:11,  2.08it/s]
 62%|██████▎   | 40/64 [00:19<00:11,  2.08it/s]

{'eval_loss': 0.916694700717926, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.854, 'eval_steps_per_second': 23.809, 'epoch': 20.0}


 66%|██████▌   | 42/64 [00:20<00:10,  2.04it/s]
 66%|██████▌   | 42/64 [00:21<00:10,  2.04it/s]

{'eval_loss': 0.9073235392570496, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.857, 'eval_steps_per_second': 23.809, 'epoch': 21.0}


 69%|██████▉   | 44/64 [00:22<00:09,  2.00it/s]
 69%|██████▉   | 44/64 [00:22<00:09,  2.00it/s]

{'eval_loss': 0.9066097140312195, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.855, 'eval_steps_per_second': 23.809, 'epoch': 22.0}


 72%|███████▏  | 46/64 [00:23<00:08,  2.02it/s]
 72%|███████▏  | 46/64 [00:23<00:08,  2.02it/s]

{'eval_loss': 0.9069351553916931, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.856, 'eval_steps_per_second': 23.809, 'epoch': 23.0}


 75%|███████▌  | 48/64 [00:24<00:07,  2.05it/s]
 75%|███████▌  | 48/64 [00:24<00:07,  2.05it/s]

{'eval_loss': 0.9286949038505554, 'eval_runtime': 0.043, 'eval_samples_per_second': 139.535, 'eval_steps_per_second': 23.256, 'epoch': 24.0}


 78%|███████▊  | 50/64 [00:24<00:06,  2.07it/s]
 78%|███████▊  | 50/64 [00:25<00:06,  2.07it/s]

{'eval_loss': 0.9523162841796875, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.858, 'eval_steps_per_second': 23.81, 'epoch': 25.0}


 81%|████████▏ | 52/64 [00:25<00:05,  2.07it/s]
 81%|████████▏ | 52/64 [00:26<00:05,  2.07it/s]

{'eval_loss': 0.964904248714447, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.853, 'eval_steps_per_second': 23.809, 'epoch': 26.0}


 84%|████████▍ | 54/64 [00:26<00:04,  2.09it/s]
 84%|████████▍ | 54/64 [00:27<00:04,  2.09it/s]

{'eval_loss': 0.9730550646781921, 'eval_runtime': 0.082, 'eval_samples_per_second': 73.171, 'eval_steps_per_second': 12.195, 'epoch': 27.0}


 88%|████████▊ | 56/64 [00:28<00:03,  2.01it/s]
 88%|████████▊ | 56/64 [00:28<00:03,  2.01it/s]

{'eval_loss': 0.9777374267578125, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.856, 'eval_steps_per_second': 23.809, 'epoch': 28.0}


 91%|█████████ | 58/64 [00:29<00:02,  2.05it/s]
 91%|█████████ | 58/64 [00:29<00:02,  2.05it/s]

{'eval_loss': 0.9767656326293945, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.861, 'eval_steps_per_second': 23.81, 'epoch': 29.0}


 94%|█████████▍| 60/64 [00:30<00:01,  2.04it/s]
 94%|█████████▍| 60/64 [00:30<00:01,  2.04it/s]

{'eval_loss': 0.9808658957481384, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.859, 'eval_steps_per_second': 23.81, 'epoch': 30.0}


 97%|█████████▋| 62/64 [00:31<00:00,  2.06it/s]
 97%|█████████▋| 62/64 [00:31<00:00,  2.06it/s]

{'eval_loss': 0.9793005585670471, 'eval_runtime': 0.043, 'eval_samples_per_second': 139.534, 'eval_steps_per_second': 23.256, 'epoch': 31.0}


100%|██████████| 64/64 [00:32<00:00,  2.03it/s]
100%|██████████| 64/64 [00:32<00:00,  1.99it/s]


{'eval_loss': 0.9777347445487976, 'eval_runtime': 0.042, 'eval_samples_per_second': 142.855, 'eval_steps_per_second': 23.809, 'epoch': 32.0}
{'train_runtime': 32.093, 'train_samples_per_second': 23.93, 'train_steps_per_second': 1.994, 'train_loss': 0.6566612720489502, 'epoch': 32.0}


('./fine_tuned_model_classif\\tokenizer_config.json',
 './fine_tuned_model_classif\\special_tokens_map.json',
 './fine_tuned_model_classif\\vocab.txt',
 './fine_tuned_model_classif\\added_tokens.json')

In [16]:
# Évaluer le modèle
results = trainer.evaluate()
print(results)

100%|██████████| 1/1 [00:00<00:00, 200.00it/s]

{'eval_loss': 0.9777347445487976, 'eval_runtime': 0.04, 'eval_samples_per_second': 150.005, 'eval_steps_per_second': 25.001, 'epoch': 32.0}





In [19]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Charger le modèle fine-tuné
model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_model_classif")
tokenizer = DistilBertTokenizer.from_pretrained("./fine_tuned_model_classif")

# Fonction pour classifier une nouvelle note
def classify_note(note):
    inputs = tokenizer(note, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probabilities).item()
    return predicted_class

# Exemple d'utilisation
new_note = "il y a une baisse."
predicted_class = classify_note(new_note)
class_mapping = {0: "critique", 1: "moyen", 2: "bas"}
print(f"La note est classifiée comme : {class_mapping[predicted_class]}")

La note est classifiée comme : bas


In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline
#Ce modèle est une version affinée du modèle DistilBERT pour classer les commentaires toxiques.
#https://huggingface.co/martin-ha/toxic-comment-model?text=I+like+you.+I+love+you

model_path = "martin-ha/toxic-comment-model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer)
result = pipeline('assole.') 
print(result)

[{'label': 'toxic', 'score': 0.9288287162780762}]
