## 1. Lecture CSV

In [276]:
import numpy as np
import pandas as pd
import plotly.express as px
from plotly import graph_objects as go
from sklearn.metrics import classification_report, confusion_matrix
import tiktoken
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchinfo import summary

In [277]:
df_spam = pd.read_csv("../datas/spam_clean.csv", encoding="iso-8859-1")
df_spam.head()

Unnamed: 0,label_text,message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


---

## 2. Tokenization

Je tokenize les messages avec le tokenizer "cl100k_base" (basé sur le byte pair coding)

In [278]:
tokenizer = tiktoken.get_encoding("cl100k_base")

def encode_texts(texts):
    return [tokenizer.encode(text) for text in texts]

tokens = encode_texts(df_spam["message"])

In [279]:
tokens[0][:10]

[11087, 3156, 16422, 647, 1486, 11, 14599, 497, 16528, 1193]

In [280]:
tokens[1][:10]

[11839, 45555, 1131, 622, 10979, 289, 333, 577, 389, 72]

Les modèles de NLP exigent souvent des séquences (liste de tokens) de tailles uniformes.

Calcul de la taille moyenne des séquences

In [281]:
seq_lens = [len(seq) for seq in tokens]
np.mean(seq_lens)

np.float64(22.893933955491743)

Distribution de la taille des séquences

In [282]:
px.histogram(seq_lens,nbins=30)

Taille moyenne des séquences : 22 tokens. Nous allons garder des séquences à 30 tokens.

In [283]:
def pad_sequences(sequences, max_length=30):
    return [seq[:max_length] + [0] * (max_length - len(seq)) for seq in sequences]

tokens = pad_sequences(tokens)

---

## 3. Dataset and split datas

Création du Dataset, des DataLoader et split des messages : train et validation. (80% - 20%)

In [284]:
# Class ATTDataset
class ATTDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

df_dataset = ATTDataset(tokens, df_spam["label"])

# Split dataset into training (80%) and validation (20%)
train_size = int(0.8 * len(df_dataset))
val_size = len(df_dataset) - train_size
train_dataset, val_dataset = random_split(df_dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [285]:
text, label = next(iter(train_loader))
print(label)
print(text)

tensor([0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])
tensor([[ 9642,    11, 40688,    13,  8886,   499,  2133,   311,  1304,   757,
          4647,   276,    30,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [47873,   285,   297,   708,  3221, 30125,  1131,   220,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [ 8161,   956, 10894,   889, 25241,   499,   323,   889,   596,   879,
          3424,   499,   527,  2564,  1628,   387,   856,  1695,  8334,  2744,
          5354,   353,  6519, 85999, 21735,     9,     0,     0,     0,     0],
        [ 1539,    38,  1863,     0,  1226,   527,  4560,   311,  3729,   549,
            13, 59683,   954,  4128,  5039,   430,   499,   61

---

## 4. First prediction model (Classification)

#### Définition du modèle

1 couche embedding (transforme les tokens en vecteurs)

1 couche pooling (réduit les outputs)

1 couche Linear

Activation Sigmoid car nous sommes sur un problème de classification

In [286]:
vocab_size = tokenizer.n_vocab

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text):
        embedded = self.embedding(text)
        pooled = self.pooling(embedded.permute(0, 2, 1)).squeeze(2)
        return torch.sigmoid(self.fc(pooled))

model = TextClassifier(vocab_size=vocab_size,
                      embed_dim=16,
                      num_class=1)

In [287]:

print(model)

# Print model summary
summary(model, input_data=text)

TextClassifier(
  (embedding): Embedding(100277, 16, padding_idx=0)
  (pooling): AdaptiveAvgPool1d(output_size=1)
  (fc): Linear(in_features=16, out_features=1, bias=True)
)


Layer (type:depth-idx)                   Output Shape              Param #
TextClassifier                           [32, 1]                   --
├─Embedding: 1-1                         [32, 30, 16]              1,604,432
├─AdaptiveAvgPool1d: 1-2                 [32, 16, 1]               --
├─Linear: 1-3                            [32, 1]                   17
Total params: 1,604,449
Trainable params: 1,604,449
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 51.34
Input size (MB): 0.01
Forward/backward pass size (MB): 0.12
Params size (MB): 6.42
Estimated Total Size (MB): 6.55

#### Entrainement

Fonction de coût : Binary Cross Entropy pour la Classification

Optimiser : Adam

On entraine le modèle sur 20 epochs

In [288]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, train_loader, val_loader, criterion, optimizer, epochs=100):

    # Dictionary to store training & validation loss and accuracy over epochs
    history = {"loss": [], "val_loss": [], "accuracy": [], "val_accuracy": []}

    for epoch in range(epochs):  # Loop over the number of epochs
        model.train()  # Set model to training mode
        total_loss, correct = 0, 0  # Initialize total loss and correct predictions

        # Training loop
        for inputs, labels in train_loader:
            optimizer.zero_grad()  # Reset gradients before each batch
            outputs = model(inputs).squeeze()  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()  # Backpropagation (compute gradients)
            optimizer.step()  # Update model parameters

            total_loss += loss.item()  # Accumulate batch loss
            correct += ((outputs > 0.5) == labels).sum().item()  # Count correct predictions

        # Compute average loss and accuracy for training
        train_loss = total_loss / len(train_loader)
        train_acc = correct / len(train_loader.dataset)

        # Validation phase (without gradient computation)
        model.eval()  # Set model to evaluation mode
        val_loss, val_correct = 0, 0
        with torch.no_grad():  # No need to compute gradients during validation
            for inputs, labels in val_loader:
                outputs = model(inputs).squeeze()  # Forward pass
                loss = criterion(outputs, labels)  # Compute loss
                val_loss += loss.item()  # Accumulate validation loss
                val_correct += ((outputs > 0.5) == labels).sum().item()  # Count correct predictions

        # Compute average loss and accuracy for validation
        val_loss /= len(val_loader)
        val_acc = val_correct / len(val_loader.dataset)

        # Store metrics in history dictionary
        history["loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["accuracy"].append(train_acc)
        history["val_accuracy"].append(val_acc)

        # Print training progress
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return history  # Return training history

history = train(model,
                train_loader=train_loader,
                val_loader=val_loader,
                criterion=criterion,
                optimizer=optimizer,
                epochs=20)

Epoch [1/20], Loss: 0.5921, Acc: 0.8627, Val Loss: 0.5493, Val Acc: 0.8762
Epoch [2/20], Loss: 0.5114, Acc: 0.8649, Val Loss: 0.4584, Val Acc: 0.8834
Epoch [3/20], Loss: 0.4166, Acc: 0.8818, Val Loss: 0.3638, Val Acc: 0.9049
Epoch [4/20], Loss: 0.3292, Acc: 0.9069, Val Loss: 0.2882, Val Acc: 0.9345
Epoch [5/20], Loss: 0.2600, Acc: 0.9316, Val Loss: 0.2322, Val Acc: 0.9498
Epoch [6/20], Loss: 0.2082, Acc: 0.9542, Val Loss: 0.1915, Val Acc: 0.9578
Epoch [7/20], Loss: 0.1704, Acc: 0.9679, Val Loss: 0.1625, Val Acc: 0.9695
Epoch [8/20], Loss: 0.1425, Acc: 0.9746, Val Loss: 0.1411, Val Acc: 0.9731
Epoch [9/20], Loss: 0.1221, Acc: 0.9798, Val Loss: 0.1250, Val Acc: 0.9749
Epoch [10/20], Loss: 0.1052, Acc: 0.9834, Val Loss: 0.1125, Val Acc: 0.9785
Epoch [11/20], Loss: 0.0922, Acc: 0.9863, Val Loss: 0.1025, Val Acc: 0.9821
Epoch [12/20], Loss: 0.0819, Acc: 0.9877, Val Loss: 0.0945, Val Acc: 0.9830
Epoch [13/20], Loss: 0.0729, Acc: 0.9881, Val Loss: 0.0878, Val Acc: 0.9830
Epoch [14/20], Loss: 

#### Sauvegarde du modèle

In [289]:
checkpoint_path = "../models/AT_T_DeepLearning__Model.pth"
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "history": history,
}, checkpoint_path)

#### Analyse des résultats

Visualisation de la fonction de coût et de l'accuracy

In [290]:
color_chart = ["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]

fig = go.Figure(data=[
                      go.Scatter(
                          y=history["loss"],
                          name="Training loss",
                          mode="lines",
                          marker=dict(
                              color=color_chart[0]
                          )),
                      go.Scatter(
                          y=history["val_loss"],
                          name="Validation loss",
                          mode="lines",
                          marker=dict(
                              color=color_chart[1]
                          ))
])
fig.update_layout(
    title="Training and val loss across epochs",
    xaxis_title="epochs",
    yaxis_title="Cross Entropy"
)
fig.show()

In [291]:
color_chart = ["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]

fig = go.Figure(data=[
                      go.Scatter(
                          y=history["accuracy"],
                          name="Training Accuracy",
                          mode="lines",
                          marker=dict(
                              color=color_chart[0]
                          )),
                      go.Scatter(
                          y=history["val_accuracy"],
                          name="Validation Accuracy",
                          mode="lines",
                          marker=dict(
                              color=color_chart[1]
                          ))
])
fig.update_layout(
    title="Training and val Accuracy across epochs",
    xaxis_title="epochs",
    yaxis_title="Cross Entropy"
)
fig.show()

In [292]:
final_history = {key: valeur[-1] for key, valeur in history.items()}
print(final_history)

{'loss': 0.0363638885452279, 'val_loss': 0.06175712182053498, 'accuracy': 0.9946152120260264, 'val_accuracy': 0.9901345291479821}


A première vue, le modèle classe plutôt bien les spams et hams.

Sur le set de train la loss est de 0.03 et l'accuracy de 0.99.

Sur le set de validation la loss est de 0.06 et l'accuracy de 0.99.

#### Analyse des erreurs : là où le modèle s'est trompé

In [293]:
# Function to evaluate the model and get worst predictions
def evaluate_worst_predictions(model, dataloader, tokenizer):
    # Set model to evaluation mode to disable dropout and batch normalization
    model.eval()

    # Lists to store all predictions, labels, errors, and inputs for analysis
    list_predictions = []
    list_labels = []
    list_errors = []
    list_inputs = []

    # No gradients needed during evaluation for efficiency
    with torch.no_grad():
        for batch in dataloader:
            # Extract inputs and labels from the batch
            inputs, labels = batch
            outputs = model(inputs) # Forward pass: Get model predictions

            # Convert outputs to predicted class for classification problems
            #preds = torch.argmax(outputs, dim=1)
            preds = (outputs >= 0.5).int().squeeze()
            errors = (preds != labels).float()  # Misclassified observations
            
            # Store predictions, labels, errors, and raw inputs for further analysis
            list_predictions.extend(preds.cpu().numpy())
            list_labels.extend(int(x) for x in labels.cpu().numpy())
            list_errors.extend(errors.cpu().numpy())
            list_inputs.extend(inputs.cpu().numpy())

    # Convert stored results into a Pandas DataFrame for easy analysis
    # Decode tokenized text back into human-readable text
    df_results = pd.DataFrame({
        "True_Label": list_labels,
        "Predicted": list_predictions,
        "Error": list_errors,
        "Inputs": list_inputs,
        "Text" : [tokenizer.decode(input) for input in list_inputs]
    })

    # Sort the DataFrame by highest error to identify the worst predictions
    df_results_sorted = df_results.sort_values(by="Error", ascending=False)

    # Return the sorted DataFrame containing worst predictions
    return df_results_sorted

# Evaluate worst predictions on validation set
worst_predictions_val = evaluate_worst_predictions(model, val_loader, tokenizer)

# Evaluate worst predictions on training set
worst_predictions_train = evaluate_worst_predictions(model, train_loader, tokenizer)

In [294]:
worst_predictions_train["Predicted"].value_counts()

Predicted
0    3870
1     587
Name: count, dtype: int64

Analyse des pires prédictions sur le set d'entrainement

In [295]:
worst_predictions_train.tail(10)

Unnamed: 0,True_Label,Predicted,Error,Inputs,Text
1482,0,0,0.0,"[32, 2478, 1274, 527, 520, 279, 1847, 11, 358,...","A few people are at the game, I'm at the mall ..."
1490,0,0,0.0,"[278, 1315, 13, 11361, 369, 279, 9650, 13, 234...",alright. Thanks for the advice. Enjoy your nig...
1489,0,0,0.0,"[19701, 54788, 6574, 358, 3358, 1650, 3010, 0,...","Sorry,in meeting I'll call later!!!!!!!!!!!!!!..."
1488,0,0,0.0,"[45, 83, 3686, 523, 1609, 12407, 497, 23796, 6...",Nt yet chikku..simple habba..hw abt u?!!!!!!!!...
1487,0,0,0.0,"[12389, 499, 3940, 304, 1940, 9188, 0, 0, 0, 0...",Have you started in skye!!!!!!!!!!!!!!!!!!!!!!!!
1486,1,1,0.0,"[33562, 1148, 433, 5097, 220, 17, 1935, 961, 3...",Got what it takes 2 take part in the WRC Rally...
1485,0,0,0.0,"[40, 2846, 1695, 13, 12522, 499, 9879, 311, 70...",I'm good. Have you registered to vote?!!!!!!!!...
1484,1,1,0.0,"[1991, 279, 4033, 5301, 3910, 4064, 10062, 102...",Get the official ENGLAND poly ringtone or colo...
1483,0,0,0.0,"[7979, 2288, 0, 12522, 264, 17104, 3814, 12599...",Me too! Have a lovely night xxx!!!!!!!!!!!!!!!...
1491,1,1,0.0,"[6219, 502, 220, 17, 6469, 612, 15890, 282, 77...",Am new 2 club & dont fink we met yet Will B gr...


classification report

In [296]:
print(classification_report(worst_predictions_train["True_Label"], worst_predictions_train["Predicted"]))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3848
           1       1.00      0.96      0.98       609

    accuracy                           0.99      4457
   macro avg       1.00      0.98      0.99      4457
weighted avg       0.99      0.99      0.99      4457



Matrice de confusion

In [297]:
mat = confusion_matrix(worst_predictions_train["True_Label"], worst_predictions_train["Predicted"])

labels = df_spam["label_text"].unique()
df_mat = pd.DataFrame(mat, index=labels, columns=labels)

px.imshow(df_mat, text_auto=True)

Analyse des pires prédictions sur le set de validation

In [298]:
worst_predictions_val.tail(10)

Unnamed: 0,True_Label,Predicted,Error,Inputs,Text
365,0,0,0.0,"[40, 1101, 270, 74, 2288, 5043, 1131, 1630, 88...",I also thk too fast... Xy suggest one not me. ...
373,0,0,0.0,"[13347, 35389, 499, 1781, 922, 2489, 30, 0, 0,...",Hi.what you think about match?!!!!!!!!!!!!!!!!...
372,0,0,0.0,"[39, 49986, 497, 666, 74, 2771, 2751, 892, 311...","Hmmm.. Thk sure got time to hop ard... Ya, can..."
371,0,0,0.0,"[33947, 502, 1667, 62684, 0, 0, 0, 0, 0, 0, 0,...",Happy new years melody!!!!!!!!!!!!!!!!!!!!!!!!!!
370,1,1,0.0,"[3915, 1828, 2305, 636, 81226, 220, 1135, 4, 4...",From next month get upto 50% More Calls 4 Ur s...
369,1,1,0.0,"[52938, 15334, 6, 4592, 43549, 13, 1472, 2351,...",Congratulations YOU'VE Won. You're a Winner in...
368,1,1,0.0,"[36152, 43438, 523, 1609, 13, 2175, 220, 1041,...",Rock yr chik. Get 100's of filthy films &XXX p...
367,1,1,0.0,"[8538, 263, 606, 499, 1440, 374, 4560, 311, 37...",Someonone you know is trying to contact you vi...
366,1,1,0.0,"[44891, 6781, 58200, 1507, 87588, 5744, 6005, ...",FREE UNLIMITED HARDCORE PORN direct 2 your mob...
374,0,0,0.0,"[6219, 10307, 3838, 46433, 126, 231, 19321, 12...",Am watching house ÂÃÃ very entertaining ÂÃ...


classification report

In [299]:
print(classification_report(worst_predictions_val["True_Label"], worst_predictions_val["Predicted"]))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       977
           1       0.98      0.94      0.96       138

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



Matrice de confusion

In [300]:
mat = confusion_matrix(worst_predictions_val["True_Label"], worst_predictions_val["Predicted"])

labels = df_spam["label_text"].unique()
df_mat = pd.DataFrame(mat, index=labels, columns=labels)

px.imshow(df_mat, text_auto=True)

---

## Conclusion

Le modèle arrive plutôt bien à identifier les spams / hams.

Toutefois, sur le set de validation : Le recall pour les spams est de 0.94, ce qui signifie que 6 % des spams ne sont pas détectés.

Plusieurs facteurs peuvent expliquer cela :
- Le Label 1 (Spam) est moins bien représenté dans le dataset, il est donc plus difficile à prédire.
- Notre dataset comporte peu de données au départ (environ 5500)

Afin d'améliorer la détection de spams, nous allons nous appuyer sur des modèles pré-existants, plus sophiqtiqués et entrainés sur des jeux de données plus importants afin de voir si la classification ham / spam s'améliore.