# Instructions

Ce notebook ne doit contenir que votre script servant à utiliser vos modèles **entrainés** en les testant sur le dataset *fake-test.csv*. Nous devons pouvoir l'exécuter en cliquant sur *Exécution -> Tout exécuter*.

Nous utiliserons ce script pour évaluer votre modèle sur nos propres données d'évaluation par la suite.


# Installation des librairies

In [1]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

# Importation des librairies

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
import torch
from sklearn.metrics import accuracy_score, classification_report
import os

# Désactivation de wandb

In [3]:
os.environ["WANDB_DISABLED"] = "true"

# Unzip du modèle et on le place dans my_model ici c'est camembert-base.zip mais il faut changer par le nom du fichier si on veut tester un autre fichier

ici j'ai mis bert-base-french-europeana-cased.zip qui est le meilleur modèle que j'ai entraîner

In [4]:
# camembert-base.zip
# bert-base-french-europeana-cased.zip
# roberta-fake-news-classification.zip
# xlm-roberta-base.zip


# Créer le répertoire cible
!mkdir -p my_model
!rm -rf my_model/*

# Décompresser le fichier ZIP
!unzip bert-base-french-europeana-cased.zip -d my_model

Archive:  bert-base-french-europeana-cased.zip
  inflating: my_model/vocab.txt      
  inflating: my_model/tokenizer.json  
  inflating: my_model/tokenizer_config.json  
  inflating: my_model/model.safetensors  
  inflating: my_model/training_args.bin  
  inflating: my_model/special_tokens_map.json  
  inflating: my_model/config.json    


# Chargement du modèle et du tokenizer

In [5]:
model_path = './my_model'

# Charger le modèle et tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Passage en mode évaluation
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Chargement du dataset de test


In [6]:
# Charger le dataset de test
test_file_path = 'fake_test.csv'
dataset_test = load_dataset("csv", data_files=test_file_path, split='train')

Generating train split: 0 examples [00:00, ? examples/s]

# Tokenisation des données

In [7]:
# Tokenisation des données
def preprocess_function(examples):
    return tokenizer(examples['data'], truncation=True, padding=True, max_length=512)

# Appliquer la tokenisation au dataset
tokenized_test = dataset_test.map(preprocess_function, batched=True)

Map:   0%|          | 0/486 [00:00<?, ? examples/s]

# Prediction et affichage de l'accuracy

In [8]:
from transformers import Trainer
from sklearn.metrics import accuracy_score

# Créer un Trainer avec le modèle et le dataset de test
trainer = Trainer(model=model, tokenizer=tokenizer, eval_dataset=tokenized_test)

# Faire des prédictions
predictions = trainer.predict(tokenized_test)
pred_labels = predictions.predictions.argmax(axis=-1)
true_labels = predictions.label_ids

# Résultats
print(f"Accuracy: {accuracy_score(true_labels, pred_labels)}")
print(classification_report(true_labels, pred_labels))

  trainer = Trainer(model=model, tokenizer=tokenizer, eval_dataset=tokenized_test)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Accuracy: 0.9711934156378601
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       272
           1       0.99      0.94      0.97       214

    accuracy                           0.97       486
   macro avg       0.97      0.97      0.97       486
weighted avg       0.97      0.97      0.97       486

