## 1. Lecture CSV

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import classification_report, confusion_matrix
import tiktoken
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_spam = pd.read_csv("datas/spam.csv", encoding="iso-8859-1")
df_spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


---

## 2. Préprocessing

#### Analyse des 3 colonnes `Unnamed`

In [3]:
df_spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
mask_unnamed_2 = (df_spam["Unnamed: 2"].notnull())
df_spam[mask_unnamed_2].head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
95,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
444,ham,\HEY HEY WERETHE MONKEESPEOPLE SAY WE MONKEYAR...,HOWU DOIN? FOUNDURSELF A JOBYET SAUSAGE?LOVE ...,,
671,spam,SMS. ac sun0819 posts HELLO:\You seem cool,"wanted to say hi. HI!!!\"" Stop? Send STOP to ...",,
710,ham,Height of Confidence: All the Aeronautics prof...,"this wont even start........ Datz confidence..""",,


In [5]:
mask_unnamed_3 = (df_spam["Unnamed: 3"].notnull())
df_spam[mask_unnamed_3].head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
95,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
899,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
1038,ham,"Edison has rightly said, \A fool can ask more ...",GN,GE,"GNT:-)"""
2170,ham,\CAN I PLEASE COME UP NOW IMIN TOWN.DONTMATTER...,JUST REALLYNEED 2DOCD.PLEASE DONTPLEASE DONTIG...,"U NO THECD ISV.IMPORTANT TOME 4 2MORO\""""",


In [6]:
mask_unnamed_4 = (df_spam["Unnamed: 4"].notnull())
df_spam[mask_unnamed_4].head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
1038,ham,"Edison has rightly said, \A fool can ask more ...",GN,GE,"GNT:-)"""
2255,ham,I just lov this line: \Hurt me with the truth,I don't mind,i wil tolerat.bcs ur my someone..... But,"Never comfort me with a lie\"" gud ni8 and swe..."
3525,ham,\HEY BABE! FAR 2 SPUN-OUT 2 SPK AT DA MO... DE...,HAD A COOL NYTHO,TX 4 FONIN HON,"CALL 2MWEN IM BK FRMCLOUD 9! J X\"""""
4668,ham,"When I was born, GOD said, \Oh No! Another IDI...",GOD said,"\""OH No! COMPETITION\"". Who knew","one day these two will become FREINDS FOREVER!"""


Je vais concaténer ces colonnes avec la colonne qui contient le message

In [7]:
df_spam["v2"] = (df_spam["v2"].fillna("") + df_spam["Unnamed: 2"].fillna("") + df_spam["Unnamed: 3"].fillna("") + df_spam["Unnamed: 4"].fillna(""))
to_drop = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
df_spam = df_spam.drop(columns=to_drop)
df_spam.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Colonnes label et message

Je renomme les colonnes : `v1` --> `label_text`, `v2` --> `message`

In [8]:
df_spam.columns = ["label_text", "message"]
df_spam.head()

Unnamed: 0,label_text,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


J'ajoute une colonne `label` qui encode le label ham / spam : `ham` --> `0`, `spam` --> `1`

In [9]:
df_spam["label"] = [0 if label == "ham" else 1 for label in df_spam["label_text"]]
df_spam.head()

Unnamed: 0,label_text,message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Distribution des ham et des spam

In [10]:
px.histogram(df_spam, x="label_text")

On voit que dans notre dataset, il y a une très mauvaise répartition des labels : `spam`est beaucoup moins représenté

---

## 3. Tokenization

Je tokenize les messages avec le tokenizer "cl100k_base" (basé sur le byte pair coding)

In [11]:
tokenizer = tiktoken.get_encoding("cl100k_base")

def encode_texts(texts):
    return [tokenizer.encode(text) for text in texts]

tokens = encode_texts(df_spam["message"])

In [12]:
tokens[0][:10]

[11087, 3156, 16422, 647, 1486, 11, 14599, 497, 16528, 1193]

In [13]:
tokens[1][:10]

[11839, 45555, 1131, 622, 10979, 289, 333, 577, 389, 72]

Les modèles de NLP exigent souvent des séquences (liste de tokens) de tailles uniformes.

Calcul de la taille moyenne des séquences

In [14]:
seq_lens = [len(seq) for seq in tokens]
np.mean(seq_lens)

np.float64(22.68449389806174)

Distribution de la taille des séquences

In [15]:
px.histogram(seq_lens,nbins=30)

Taille moyenne des séquences : 22 tokens. Nous allons garder des séquences à 30 tokens.

In [16]:
def pad_sequences(sequences, max_length=30):
    return [seq[:max_length] + [0] * (max_length - len(seq)) for seq in sequences]

tokens = pad_sequences(tokens)

---

## 4. Dataset and split datas

Création du Dataset, des DataLoader et split des messages : train et validation. (80% - 20%)

In [17]:
# Class ATTDataset
class ATTDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

df_dataset = ATTDataset(tokens, df_spam["label"])

# Split dataset into training (80%) and validation (20%)
train_size = int(0.8 * len(df_dataset))
val_size = len(df_dataset) - train_size
train_dataset, val_dataset = random_split(df_dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [18]:
text, label = next(iter(train_loader))
print(label)
print(text)

tensor([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])
tensor([[ 2181,   596,  1717,   234, 20644,  1193,   400,  6860, 85009,  1131,
           127,   234,  2357,  2800,   682, 85009,   400,  5245,   520,  3325,
          1131, 23956,   374,  1717,   234, 20644,  3430,   220,    19,  1717],
        [ 1128,  5735,   856, 24886,   311,  1461,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [   38,   664, 17767,   983, 25237,   617,   264,  6555,  1938,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [22186,    77,   956,   342,   617,   538,  4216, 16986,   323,  8617,
         13434,   956,   387,  4560,   311, 16603,   520,   22

---

## 5. First prediction model (Classification)

#### Définition du modèle

1 couche embedding (transforme les tokens en vecteurs)

1 couche pooling (réduit les outputs)

1 couche Linear

Activation Sigmoid car nous sommes sur un problème de classification

In [19]:
vocab_size = tokenizer.n_vocab

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text):
        embedded = self.embedding(text)
        pooled = self.pooling(embedded.permute(0, 2, 1)).squeeze(2)
        return torch.sigmoid(self.fc(pooled))

model = TextClassifier(vocab_size=vocab_size,
                      embed_dim=16,
                      num_class=1)

In [20]:
from torchinfo import summary

print(model)

# Print model summary
summary(model, input_data=text)

TextClassifier(
  (embedding): Embedding(100277, 16, padding_idx=0)
  (pooling): AdaptiveAvgPool1d(output_size=1)
  (fc): Linear(in_features=16, out_features=1, bias=True)
)


Layer (type:depth-idx)                   Output Shape              Param #
TextClassifier                           [32, 1]                   --
├─Embedding: 1-1                         [32, 30, 16]              1,604,432
├─AdaptiveAvgPool1d: 1-2                 [32, 16, 1]               --
├─Linear: 1-3                            [32, 1]                   17
Total params: 1,604,449
Trainable params: 1,604,449
Non-trainable params: 0
Total mult-adds (M): 51.34
Input size (MB): 0.01
Forward/backward pass size (MB): 0.12
Params size (MB): 6.42
Estimated Total Size (MB): 6.55

#### Entrainement

Fonction de coût : Binary Cross Entropy pour la Classification

Optimiser : Adam

On entraine le modèle sur 20 epochs

In [21]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, train_loader, val_loader, criterion, optimizer, epochs=100):

    # Dictionary to store training & validation loss and accuracy over epochs
    history = {'loss': [], 'val_loss': [], 'accuracy': [], 'val_accuracy': []}

    for epoch in range(epochs):  # Loop over the number of epochs
        model.train()  # Set model to training mode
        total_loss, correct = 0, 0  # Initialize total loss and correct predictions

        # Training loop
        for inputs, labels in train_loader:
            optimizer.zero_grad()  # Reset gradients before each batch
            outputs = model(inputs).squeeze()  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()  # Backpropagation (compute gradients)
            optimizer.step()  # Update model parameters

            total_loss += loss.item()  # Accumulate batch loss
            correct += ((outputs > 0.5) == labels).sum().item()  # Count correct predictions

        # Compute average loss and accuracy for training
        train_loss = total_loss / len(train_loader)
        train_acc = correct / len(train_loader.dataset)

        # Validation phase (without gradient computation)
        model.eval()  # Set model to evaluation mode
        val_loss, val_correct = 0, 0
        with torch.no_grad():  # No need to compute gradients during validation
            for inputs, labels in val_loader:
                outputs = model(inputs).squeeze()  # Forward pass
                loss = criterion(outputs, labels)  # Compute loss
                val_loss += loss.item()  # Accumulate validation loss
                val_correct += ((outputs > 0.5) == labels).sum().item()  # Count correct predictions

        # Compute average loss and accuracy for validation
        val_loss /= len(val_loader)
        val_acc = val_correct / len(val_loader.dataset)

        # Store metrics in history dictionary
        history['loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['accuracy'].append(train_acc)
        history['val_accuracy'].append(val_acc)

        # Print training progress
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return history  # Return training history

history = train(model,
                train_loader=train_loader,
                val_loader=val_loader,
                criterion=criterion,
                optimizer=optimizer,
                epochs=20)

Epoch [1/20], Loss: 0.5966, Acc: 0.8661, Val Loss: 0.5620, Val Acc: 0.8511
Epoch [2/20], Loss: 0.5133, Acc: 0.8703, Val Loss: 0.4739, Val Acc: 0.8520
Epoch [3/20], Loss: 0.4162, Acc: 0.8788, Val Loss: 0.3820, Val Acc: 0.8744
Epoch [4/20], Loss: 0.3261, Acc: 0.9067, Val Loss: 0.3069, Val Acc: 0.9058
Epoch [5/20], Loss: 0.2560, Acc: 0.9376, Val Loss: 0.2494, Val Acc: 0.9265
Epoch [6/20], Loss: 0.2036, Acc: 0.9558, Val Loss: 0.2066, Val Acc: 0.9417
Epoch [7/20], Loss: 0.1656, Acc: 0.9686, Val Loss: 0.1756, Val Acc: 0.9561
Epoch [8/20], Loss: 0.1385, Acc: 0.9755, Val Loss: 0.1523, Val Acc: 0.9614
Epoch [9/20], Loss: 0.1164, Acc: 0.9809, Val Loss: 0.1347, Val Acc: 0.9695
Epoch [10/20], Loss: 0.1005, Acc: 0.9838, Val Loss: 0.1216, Val Acc: 0.9722
Epoch [11/20], Loss: 0.0878, Acc: 0.9859, Val Loss: 0.1112, Val Acc: 0.9767
Epoch [12/20], Loss: 0.0776, Acc: 0.9872, Val Loss: 0.1025, Val Acc: 0.9794
Epoch [13/20], Loss: 0.0689, Acc: 0.9883, Val Loss: 0.0955, Val Acc: 0.9803
Epoch [14/20], Loss: 

Sauvegarde du modèle

In [22]:
checkpoint_path = "models/first_model.pth"
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "history": history,
}, checkpoint_path)

#### Analyse des résultats

Visualisation de la fonction de coût et de l'accuracy

In [23]:
from plotly import graph_objects as go
color_chart = ["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]

fig = go.Figure(data=[
                      go.Scatter(
                          y=history["loss"],
                          name="Training loss",
                          mode="lines",
                          marker=dict(
                              color=color_chart[0]
                          )),
                      go.Scatter(
                          y=history["val_loss"],
                          name="Validation loss",
                          mode="lines",
                          marker=dict(
                              color=color_chart[1]
                          ))
])
fig.update_layout(
    title='Training and val loss across epochs',
    xaxis_title='epochs',
    yaxis_title='Cross Entropy'
)
fig.show()

In [24]:
from plotly import graph_objects as go
color_chart = ["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]

fig = go.Figure(data=[
                      go.Scatter(
                          y=history["accuracy"],
                          name="Training Accuracy",
                          mode="lines",
                          marker=dict(
                              color=color_chart[0]
                          )),
                      go.Scatter(
                          y=history["val_accuracy"],
                          name="Validation Accuracy",
                          mode="lines",
                          marker=dict(
                              color=color_chart[1]
                          ))
])
fig.update_layout(
    title='Training and val Accuracy across epochs',
    xaxis_title='epochs',
    yaxis_title='Cross Entropy'
)
fig.show()

In [25]:
final_history = {key: valeur[-1] for key, valeur in history.items()}
print(final_history)

{'loss': 0.033850722081427066, 'val_loss': 0.06972750989454134, 'accuracy': 0.9952883105227731, 'val_accuracy': 0.9829596412556054}


A première vue, le modèle classifie plutôt bien les spams et hams.

Sur le set de train la loss est de 0.04 et l'accuracy de 0.99.

Sur le set de validation la loss est de 0.07 et l'accuracy de 0.98.

#### Analyse des erreurs : là où le modèle s'est trompé

In [36]:
# Function to evaluate the model and get worst predictions
def evaluate_worst_predictions(model, dataloader, tokenizer):
    # Set model to evaluation mode to disable dropout and batch normalization
    model.eval()

    # Lists to store all predictions, labels, errors, and inputs for analysis
    list_predictions = []
    list_labels = []
    list_errors = []
    list_inputs = []

    # No gradients needed during evaluation for efficiency
    with torch.no_grad():
        for batch in dataloader:
            # Extract inputs and labels from the batch
            inputs, labels = batch
            outputs = model(inputs) # Forward pass: Get model predictions

            # Convert outputs to predicted class for classification problems
            # TODO
            # A LA PLACE ;
            # threshold = 0.5 # threshold may vary
            # cassification = int(probability >= threshold)
            preds = torch.argmax(outputs, dim=1)
            errors = (preds != labels).float()  # Misclassified observations
            
            # Store predictions, labels, errors, and raw inputs for further analysis
            list_predictions.extend(preds.cpu().numpy())
            list_labels.extend(int(x) for x in labels.cpu().numpy())
            list_errors.extend(errors.cpu().numpy())
            list_inputs.extend(inputs.cpu().numpy())

    # Convert stored results into a Pandas DataFrame for easy analysis
    # Decode tokenized text back into human-readable text
    df_results = pd.DataFrame({
        "True_Label": list_labels,
        "Predicted": list_predictions,
        "Error": list_errors,
        "Inputs": list_inputs,
        "Text" : [tokenizer.decode(input) for input in list_inputs]
    })

    # Sort the DataFrame by highest error to identify the worst predictions
    df_results_sorted = df_results.sort_values(by="Error", ascending=False)

    # Return the sorted DataFrame containing worst predictions
    return df_results_sorted

# Evaluate worst predictions on validation set
worst_predictions_val = evaluate_worst_predictions(model, val_loader, tokenizer)

# Evaluate worst predictions on training set
worst_predictions_train = evaluate_worst_predictions(model, train_loader, tokenizer)

In [33]:
worst_predictions_train["Predicted"].value_counts()

Predicted
0    4457
Name: count, dtype: int64

Analyse des pires prédictions sur le set d'entrainement

In [48]:
worst_predictions_train.head(10)

Unnamed: 0,True_Label,Predicted,Error,Inputs,Text
4436,1,0,1.0,"[33, 6312, 12481, 43076, 369, 220, 3965, 79, 1...",Boltblue tones for 150p Reply POLY# or MONO# e...
7,1,0,1.0,"[33, 3093, 3838, 59488, 0, 13149, 308, 2457, 1...",Bored housewives! Chat n date now! 0871750.77....
15,1,0,1.0,"[43069, 6130, 11, 499, 1253, 1457, 3802, 701, ...","Orange customer, you may now claim your FREE C..."
14,1,0,1.0,"[7530, 386, 1354, 612, 7238, 80204, 612, 220, ...",Double Mins & Double Txt & 1/2 price Linerenta...
4422,1,0,1.0,"[11180, 6749, 28653, 549, 11, 602, 1120, 2751,...","FreeMsg Hey U, i just got 1 of these video/pic..."
4421,1,0,1.0,"[29923, 279, 5652, 8519, 83263, 30, 220, 11711...",Want the latest Video handset? 750 anytime any...
4420,1,0,1.0,"[2675, 617, 459, 3062, 6130, 2532, 17480, 13, ...",You have an important customer service announc...
4418,1,0,1.0,"[2378, 911, 67, 92050, 22725, 6130, 11, 499, 1...","UpgrdCentre Orange customer, you may now claim..."
25,1,0,1.0,"[40, 1097, 4106, 308, 38307, 323, 10032, 358, ...",I am hot n horny and willing I live local to y...
4452,1,0,1.0,"[24185, 264, 1060, 8312, 315, 62437, 220, 19, ...",WIN a year supply of CDs 4 a store of ur choic...


classification report

In [49]:
print(classification_report(worst_predictions_train["True_Label"], worst_predictions_train["Predicted"]))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93      3864
           1       0.00      0.00      0.00       593

    accuracy                           0.87      4457
   macro avg       0.43      0.50      0.46      4457
weighted avg       0.75      0.87      0.81      4457




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Matrice de confusion

In [53]:
mat = confusion_matrix(worst_predictions_train["True_Label"], worst_predictions_train["Predicted"])

labels = df_spam["label_text"].unique()
df_mat = pd.DataFrame(mat, index=labels, columns=labels)

px.imshow(df_mat, text_auto=True)

Analyse des pires prédictions sur le set de validation

In [54]:
worst_predictions_val.head(10)

Unnamed: 0,True_Label,Predicted,Error,Inputs,Text
16,1,0,1.0,"[14331, 0, 578, 1193, 2035, 304, 6424, 311, 34...",YES! The only place in town to meet exciting a...
1109,1,0,1.0,"[52938, 0, 11361, 311, 264, 1695, 4333, 549, 6...",Congratulations! Thanks to a good friend U hav...
1112,1,0,1.0,"[8140, 6883, 502, 6505, 4731, 2532, 374, 1457,...",Our brand new mobile music service is now live...
1113,1,0,1.0,"[1539, 38, 1863, 0, 1226, 527, 4560, 311, 3729...",URGENT! We are trying to contact U. Todays dra...
10,1,0,1.0,"[83676, 13716, 7199, 279, 4033, 29950, 29490, ...",Marvel Mobile Play the official Ultimate Spide...
1069,1,0,1.0,"[11180, 4441, 304, 220, 17, 264, 17496, 1391, ...",Free entry in 2 a weekly comp for a chance to ...
1072,1,0,1.0,"[36286, 6082, 5742, 25, 220, 25665, 20617, 19,...",Promotion Number: 8714714 - UR awarded a City ...
42,1,0,1.0,"[33246, 1193, 0, 24805, 701, 6505, 220, 806, 7...",December only! Had your mobile 11mths+? You ar...
1087,1,0,1.0,"[21221, 2137, 220, 23103, 1758, 28, 3870, 2238...",83039 62735=å£450 UK Break AccommodationVouche...
1089,1,0,1.0,"[5618, 1650, 1057, 6130, 2532, 18740, 389, 220...",Please call our customer service representativ...


classification report

In [55]:
print(classification_report(worst_predictions_val["True_Label"], worst_predictions_val["Predicted"]))

              precision    recall  f1-score   support

           0       0.86      1.00      0.93       961
           1       0.00      0.00      0.00       154

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.74      0.86      0.80      1115




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Matrice de confusion

In [57]:
mat = confusion_matrix(worst_predictions_val["True_Label"], worst_predictions_val["Predicted"])

labels = df_spam["label_text"].unique()
df_mat = pd.DataFrame(mat, index=labels, columns=labels)

px.imshow(df_mat, text_auto=True)

Conclusion :

Le modèle n'arrive pas du tout à identifier les spams.

Comme vu dés le départ, le Label 1 (Spam) est moins bien représenté dans le dataset, il est donc plus difficile à prédire.

Nous allons essayer de nous appuyer sur des modèles pré-existant, plus sophiqtiqués, et basés sur plus d'observations pour voir si la classification ham / spam s'améliore.

---

## 6. MODEL ZERO-SHOT CLASSIFICATION

In [58]:
classifier = pipeline("zero-shot-classification")
topics = df_spam["label_text"].unique() # ham et spam

pred = [classifier(x, topics) for x in df_spam["message"]]

No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [59]:
df_zero_shot = pd.DataFrame(pred)

# Extraction du label et du score ayant la valeur maximale
df_zero_shot["label"] = df_zero_shot.apply(lambda row: row["labels"][row["scores"].index(max(row["scores"]))], axis=1)
df_zero_shot["score"] = df_zero_shot.apply(lambda row: max(row["scores"]), axis=1)

# On garde uniquement les colonnes pertinentes
df_zero_shot = df_zero_shot[["sequence", "label", "score"]]

# Aperçu
df_zero_shot.head()

Unnamed: 0,sequence,label,score
0,"Go until jurong point, crazy.. Available only ...",ham,0.658293
1,Ok lar... Joking wif u oni...,spam,0.522032
2,Free entry in 2 a wkly comp to win FA Cup fina...,ham,0.665671
3,U dun say so early hor... U c already then say...,ham,0.503476
4,"Nah I don't think he goes to usf, he lives aro...",ham,0.553272


Classification report

In [60]:
print(classification_report(df_spam["label_text"], df_zero_shot["label"]))

              precision    recall  f1-score   support

         ham       0.87      0.74      0.80      4825
        spam       0.14      0.28      0.19       747

    accuracy                           0.68      5572
   macro avg       0.51      0.51      0.49      5572
weighted avg       0.77      0.68      0.72      5572



Matrice de confusion

In [None]:
mat = confusion_matrix(df_spam["label_text"], df_zero_shot["label"])

labels = df_spam["label_text"].unique()
df_mat = pd.DataFrame(mat, index=labels, columns=labels)

px.imshow(df_mat, text_auto=True)

Les spams sont mieux identifiés mais il y a encore beaucoup d'erreurs

---

## 7. Modèle spécifique entrainé pour reconnaitre les spams mshenoda/roberta-spam

In [62]:
pipeline = pipeline("text-classification", model="mshenoda/roberta-spam")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development

Device set to use cpu


In [None]:
from transformers import AutoTokenizer

# Load model directly
checkpoint = "mshenoda/roberta-spam"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [64]:
def tokenize_function(example):
    return tokenizer(example["message"], truncation=True, padding=True)

In [None]:
from datasets import Dataset

# Conversion pandas -> Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df_spam)

# Split 80% train, 20% test (ou val)
split_dataset = hf_dataset.train_test_split(test_size=0.2)

# Accès aux sous-datasets
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

# Appliquer la fonction de tokenization avec batching
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 4457/4457 [00:00<00:00, 8916.28 examples/s]
Map: 100%|██████████| 1115/1115 [00:00<00:00, 10771.97 examples/s]


In [None]:
from transformers import DataCollatorWithPadding

# Auto Padding : toutes les séquences sont de la même longueur
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# On vérifie la taille des séquences
samples = tokenized_train_dataset[:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[218, 218, 218, 218, 218, 218, 218, 218]

In [None]:
from transformers import TrainingArguments

# paramètres d'entrainement par défaut
training_args = TrainingArguments("test-trainer", report_to="none")

In [None]:
from transformers import AutoModelForSequenceClassification

# Création du modèle
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [70]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [71]:
# Entrainement
trainer.train()

Step,Training Loss
500,0.081
1000,0.0327
1500,0.0085


TrainOutput(global_step=1674, training_loss=0.03693101821264771, metrics={'train_runtime': 19094.2272, 'train_samples_per_second': 0.7, 'train_steps_per_second': 0.088, 'total_flos': 1740191339600820.0, 'train_loss': 0.03693101821264771, 'epoch': 3.0})

In [72]:
# Prédictions
predictions = trainer.predict(tokenized_val_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)

(1115, 2) (1115,)


In [73]:
preds = np.argmax(predictions.predictions, axis=-1)
preds

array([0, 1, 0, ..., 0, 0, 1], shape=(1115,))

Classification report

In [74]:
print(classification_report(tokenized_val_dataset["label"], preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       968
           1       1.00      0.97      0.99       147

    accuracy                           1.00      1115
   macro avg       1.00      0.99      0.99      1115
weighted avg       1.00      1.00      1.00      1115



Matrice de confusion

In [75]:
# produce the confusion matrix for your predictions, what comments can you make ?
mat = confusion_matrix(tokenized_val_dataset["label"], preds)

labels = df_spam["label_text"].unique()
df_mat = pd.DataFrame(mat, index=labels, columns=labels)

px.imshow(df_mat, text_auto=True)

Avec ce modèle nous avons de très bon résultats de classification HAM et SPAM