## Execice 2 : Utilisation de transformers pour les deux datasets

### 2.1 :  Utilisation d'un transformer prêt à l'emploi

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import pipeline

# Charger le dataset
df = pd.read_csv("data_fake_news_en.csv")

# Nettoyer et préparer les données
df = df.dropna(subset=['text', 'label'])  # Supprimer les lignes avec valeurs manquantes
df = df[(df['text'].str.len() > 20) & (df['text'].str.len() < 512)]  # Filtrer par longueur
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Charger le pipeline Transformers
classifier = pipeline("text-classification", model="distilbert-base-uncased", device=0)

# Prédictions sur les textes de test
predictions_pipeline = classifier(test_texts.tolist(), truncation=True, max_length=512)

# Convertir les prédictions en étiquettes binaires
y_pred_transformer = [1 if pred['label'] == 'LABEL_1' else 0 for pred in predictions_pipeline]

# Évaluer les performances
print("Transformers Pipeline:(ANGLAIS)\n", classification_report(test_labels, y_pred_transformer))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Transformers Pipeline:
               precision    recall  f1-score   support

           0       0.65      0.08      0.13       346
           1       0.58      0.97      0.73       459

    accuracy                           0.59       805
   macro avg       0.62      0.52      0.43       805
weighted avg       0.61      0.59      0.47       805



In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import pipeline

# Charger le dataset en français
df = pd.read_csv("data_fake_news_fr.csv")

# Nettoyer et préparer les données
df = df.dropna(subset=['text', 'fake'])  # Supprimer les lignes avec des valeurs manquantes
df = df[(df['text'].str.len() > 20) & (df['text'].str.len() < 512)]  # Filtrer les longueurs

# Diviser les données en ensembles d'entraînement et de test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['fake'], test_size=0.2, random_state=42
)

# Charger le pipeline Transformers pour le français
classifier = pipeline("text-classification", model="camembert-base", device=0)

# Prédictions sur les textes de test
predictions_pipeline = classifier(test_texts.tolist(), truncation=True, max_length=512)

# Convertir les prédictions en étiquettes binaires
y_pred_transformer = [1 if pred['label'] == 'LABEL_1' else 0 for pred in predictions_pipeline]

# Évaluer les performances
print("Transformers Pipeline (Français):\n", classification_report(test_labels, y_pred_transformer))


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Transformers Pipeline (Français):
               precision    recall  f1-score   support

           0       0.12      1.00      0.22         1
           1       1.00      0.61      0.76        18

    accuracy                           0.63        19
   macro avg       0.56      0.81      0.49        19
weighted avg       0.95      0.63      0.73        19



### 2.2 Finetuning d'un modèle de transformers

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Charger et limiter le dataset
df = pd.read_csv("data_fake_news_en.csv")
df = df.dropna(subset=['text', 'label'])
df = df[(df['text'].str.len() > 20) & (df['text'].str.len() < 512)]
df['label'] = df['label'].apply(int)

# Limiter à 1000 exemples pour l'entraînement et 200 pour le test
train_df = df.sample(n=1000, random_state=42)
test_df = df.sample(n=200, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Charger le tokenizer et le modèle
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenizer les données
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(["title", "text"])
test_dataset = test_dataset.remove_columns(["title", "text"])

train_dataset = train_dataset.with_format("torch")
test_dataset = test_dataset.with_format("torch")

# Réduire les époques et ajuster les paramètres
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,  # Réduit à 1 époque
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Fonction de calcul des métriques
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Initialiser Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Entraîner le modèle
trainer.train()

# Évaluer le modèle
results = trainer.evaluate()
print("Résultats d'évaluation :")
print(results)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss


Résultats d'évaluation :
{'eval_loss': 0.14361491799354553, 'eval_accuracy': 0.995, 'eval_precision': 1.0, 'eval_recall': 0.9910714285714286, 'eval_f1': 0.9955156950672646, 'eval_runtime': 15.5656, 'eval_samples_per_second': 12.849, 'eval_steps_per_second': 0.45, 'epoch': 1.0}
