In [1]:
# Install dependencies into the environment of the running kernel.
# `%pip` is used instead of `!pip` so the install targets this kernel, and the
# extras specifier is quoted so the shell cannot treat `[torch]` as a glob.
# NOTE(review): versions are not pinned; pin them once a known-good set is established.
%pip install -U transformers datasets "transformers[torch]" accelerate




In [26]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer , TrainerCallback
from sklearn.model_selection import train_test_split
import numpy as np

def get_data(data_path):
  """Read the CSV dataset at ``data_path`` and return it as a DataFrame.

  ``data_path`` is forwarded unchanged to ``pd.read_csv``; the parsed frame
  is returned without further processing.
  """
  return pd.read_csv(data_path)


def tokenize(examples, tokenizer):
    """Apply ``tokenizer`` to the ``'question'`` entry of ``examples``.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``max_length=512``; whatever it returns is passed straight back.
    """
    questions = examples['question']
    encoding_options = {
        'padding': "max_length",
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(questions, **encoding_options)


def preprocess(df:pd.DataFrame, test_size=0.2, seed=42):
  """Tokenize the questions in ``df`` and split them into train and test sets.

  Args:
    df: frame with a ``question`` column (read by ``tokenize``) and a
      ``label`` column that can be cast to ``int``. The frame is not modified.
    test_size: forwarded to ``Dataset.train_test_split``; the default of 0.2
      is the value that used to be hard-coded.
    seed: seed forwarded to ``Dataset.train_test_split`` so the same rows end
      up in the same split on every run. Pass ``None`` for an unseeded split.

  Returns:
    A ``(train_dataset, test_dataset)`` tuple of Hugging Face datasets whose
    ``input_ids``, ``attention_mask`` and ``label`` columns are exposed in
    torch format.
  """
  # Cast the labels on a copy: assigning into `df` would silently change the
  # caller's DataFrame and make re-running the cell depend on earlier runs.
  labelled = df.assign(label=df['label'].astype(int))

  tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

  # Convert the DataFrame into a Hugging Face Dataset and tokenize it in batches
  dataset = Dataset.from_pandas(labelled)
  dataset = dataset.map(tokenize, batched=True, fn_kwargs={'tokenizer': tokenizer})
  dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

  # Split the dataset; index the result by name instead of relying on the
  # ordering of the returned mapping's values.
  splits = dataset.train_test_split(test_size=test_size, seed=seed)

  return splits['train'], splits['test']


def prepare_pretrained_model():
  """Load ``distilbert-base-uncased`` for 2-label classification and freeze its body.

  Every parameter whose name contains neither ``'pre_classifier'`` nor
  ``'classifier'`` has ``requires_grad`` set to ``False``; the remaining
  parameters are left as loaded. The model is returned.
  """
  model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

  head_markers = ('pre_classifier', 'classifier')
  for param_name, tensor in model.named_parameters():
    belongs_to_head = any(marker in param_name for marker in head_markers)
    if not belongs_to_head:
      tensor.requires_grad = False

  return model



# Callback that keeps the metrics produced by the most recent evaluation
class ComputeMetricsCallback(TrainerCallback):
    """Trainer callback that records the metrics of the latest evaluation.

    After each ``on_evaluate`` call, ``last_metrics`` holds a shallow copy of
    the metrics dict that was passed in (an empty dict when none was given).
    """

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Store a copy of ``metrics`` on ``self.last_metrics``.

        ``metrics`` defaults to ``None``; that case is handled explicitly
        because calling ``.items()`` on it would raise ``AttributeError``.
        Returns ``None``.
        """
        # Copy so later changes to the caller's dict do not alter the snapshot.
        self.last_metrics = dict(metrics) if metrics else {}

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class is the argmax of ``logits`` over the last axis; the
    returned dict has a single ``'accuracy'`` entry holding the mean of the
    element-wise comparison between predictions and ``labels``.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    hits = predicted_classes == labels
    return {'accuracy': hits.mean()}


def train_model(train_dataset , test_dataset , model , num_train_epochs=10,
                warmup_steps=500, output_dir='./results'):
  """Fine-tune ``model`` with the Hugging Face ``Trainer`` and report eval metrics.

  Evaluation and checkpointing both happen once per epoch, and the checkpoint
  with the best accuracy is reloaded at the end of training.

  Args:
    train_dataset: dataset used for training.
    test_dataset: dataset used for the per-epoch and final evaluation.
    model: model instance handed to ``Trainer``.
    num_train_epochs: number of training epochs.
    warmup_steps: learning-rate warm-up steps. The default keeps the previous
      hard-coded value of 500. NOTE(review): the run recorded in this notebook
      writes a checkpoint every 10 steps (about 200 steps for 20 epochs), so a
      500-step warm-up never completes there; consider a smaller value.
    output_dir: directory that receives the checkpoints. Re-using a directory
      from an earlier run produces the "already exists and is non-empty"
      warnings seen in the recorded output.

  Returns:
    The metrics dict returned by ``trainer.evaluate()``.
  """
  import inspect

  # Newer transformers releases call this argument `eval_strategy`, older ones
  # only know `evaluation_strategy`. Use whichever the installed version accepts
  # so the cell works right after an unpinned `pip install -U`.
  accepted_args = inspect.signature(TrainingArguments.__init__).parameters
  if 'eval_strategy' in accepted_args:
    eval_strategy_key = 'eval_strategy'
  else:
    eval_strategy_key = 'evaluation_strategy'

  # Training arguments
  training_args = TrainingArguments(
      output_dir=output_dir,
      num_train_epochs=num_train_epochs,
      per_device_train_batch_size=8,
      per_device_eval_batch_size=16,
      warmup_steps=warmup_steps,
      weight_decay=0.01,
      logging_dir='./logs',
      save_strategy="epoch",
      load_best_model_at_end=True,
      metric_for_best_model="accuracy",
      **{eval_strategy_key: "epoch"},
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
      compute_metrics=compute_metrics,
      callbacks=[ComputeMetricsCallback()]
  )

  # Train the model
  trainer.train()

  # Evaluate the model. `eval_loss` is measured on `test_dataset`, so it is
  # reported as a validation loss rather than a training loss.
  evaluation_results = trainer.evaluate()
  print(f"Validation Loss: {evaluation_results['eval_loss']:.4f}")
  print(f"Accuracy: {evaluation_results['eval_accuracy']:.4f}")

  return evaluation_results


In [27]:
# Load the raw data from the CSV file
data = get_data('question_classif.csv')

# Tokenize the questions and split them into train / test sets
train, test = preprocess(df=data)

# Load the pretrained model
model = prepare_pretrained_model()

# Train the model for 20 epochs
train_model(train_dataset=train, test_dataset=test, model=model, num_train_epochs=20)

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.696688,0.5
2,No log,0.69445,0.5
3,No log,0.690645,0.5
4,No log,0.685271,0.5
5,No log,0.678769,0.5
6,No log,0.670539,0.5
7,No log,0.660979,0.6
8,No log,0.650242,0.7
9,No log,0.638313,0.85
10,No log,0.625384,0.95


Checkpoint destination directory ./results/checkpoint-10 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-20 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-30 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-40 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-50 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-60 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-70 already exists and is non-empty. Saving will proceed but saved results may be i

Training Loss: 0.5587
Accuracy: 1.0000
