## Lecture CSV

In [29]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import classification_report, confusion_matrix

In [12]:
data_train = pd.read_csv("../datas/train_clean.csv")
data_train.head()

Unnamed: 0,keyword,location,text,labels
0,,,Our Deeds are the Reason of this #earthquake M...,1
1,,,Forest fire near La Ronge Sask. Canada,1
2,,,All residents asked to 'shelter in place' are ...,1
3,,,"13,000 people receive #wildfires evacuation or...",1
4,,,Just got sent this photo from Ruby #Alaska as ...,1


---

## distilbert-base-uncased

In [13]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")

In [14]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding=True)

Split train / val

In [15]:
from datasets import Dataset

# Conversion pandas -> Hugging Face Dataset
hf_dataset = Dataset.from_pandas(data_train)

# Split 80% train, 20% test (ou val)
split_dataset = hf_dataset.train_test_split(test_size=0.2)

# Accès aux sous-datasets
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

# Appliquer la fonction de tokenization avec batching
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 6090/6090 [00:00<00:00, 12864.97 examples/s]
Map: 100%|██████████| 1523/1523 [00:00<00:00, 22401.98 examples/s]


In [16]:
from transformers import DataCollatorWithPadding

# Auto Padding : toutes les séquences sont de la même longueur
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# On vérifie la taille des séquences
samples = tokenized_train_dataset[:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[82, 82, 82, 82, 82, 82, 82, 82]

---

## Fine Tuning

In [18]:
from transformers import TrainingArguments

# paramètres d'entrainement par défaut
training_args = TrainingArguments("test-trainer", report_to="none")

In [19]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)

---

## Train

In [20]:
trainer.train()



Step,Training Loss
500,0.5021
1000,0.3707
1500,0.3052
2000,0.1886




TrainOutput(global_step=2286, training_loss=0.3200492191398133, metrics={'train_runtime': 1798.9544, 'train_samples_per_second': 10.156, 'train_steps_per_second': 1.271, 'total_flos': 392921072247840.0, 'train_loss': 0.3200492191398133, 'epoch': 3.0})

---

## Predictions

In [21]:
predictions = trainer.predict(tokenized_val_dataset)



In [23]:
preds = np.argmax(predictions.predictions, axis=-1)

---

## Classification report

In [28]:
print(classification_report(tokenized_val_dataset["labels"], preds))

              precision    recall  f1-score   support

           0       0.84      0.85      0.84       872
           1       0.79      0.78      0.79       651

    accuracy                           0.82      1523
   macro avg       0.82      0.82      0.82      1523
weighted avg       0.82      0.82      0.82      1523



---

## Confusion Matrix

In [30]:
# produce the confusion matrix for your predictions, what comments can you make ?
mat = confusion_matrix(tokenized_val_dataset["labels"], preds)

labels = data_train["labels"].unique()
df_mat = pd.DataFrame(mat, index=labels, columns=labels)

fig = px.imshow(df_mat, text_auto=True, color_continuous_scale=px.colors.sequential.Aggrnyl,
    labels=dict(x="Prédiction", y="Réalité", color="Nombre"))
fig.update_coloraxes(showscale=False)
fig.show()