In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the mail dataset; later cells read its `Category` and `Message` columns.
df = pd.read_csv('mail_data.csv')

Comprendre l'ensemble des données.

In [None]:
# Only the last expression of a cell is displayed, so each inspection is
# printed explicitly; the outputs recorded from a previous run are kept as comments.

# Number of missing values per column.
print(df.isnull().sum())
# Category    0
# Message     0
# dtype: int64

# Dataset dimensions as (rows, columns).
print(df.shape)   # (5572, 2)

# Number of rows per class.
print(df['Category'].value_counts())
# ham     4825
# spam     747
# Name: Category, dtype: int64

Il n'y a pas de valeur manquante.

Le sous-échantillonnage réduit le nombre de lignes de la catégorie majoritaire (`ham`) pour le rendre égal à celui de la catégorie minoritaire (`spam`). Le sur-échantillonnage fait l'inverse : il augmente le nombre de lignes de la catégorie minoritaire.

⇒ Sous-échantillonner les données.

In [None]:
# Split the rows by class.
ham_messages = df[df['Category'] == 'ham']
spam_messages = df[df['Category'] == 'spam']

# Draw exactly as many ham rows as there are spam rows. Passing n= states the
# target count directly instead of deriving it from a floating-point fraction.
undersampled_ham = ham_messages.sample(n=len(spam_messages), random_state=1)

# Both classes must now have the same number of rows (747 in the recorded run).
assert len(undersampled_ham) == len(spam_messages)

# Stack the sampled ham rows on top of all spam rows (original index labels are kept).
new_df = pd.concat([undersampled_ham, spam_messages])
print(new_df)
# Recorded output:
#      Category                                            Message
# 2535      ham                      Ok enjoy . R u there in home.
# 1213      ham  Yo, the game almost over? Want to go to walmar...
# 522       ham                         Shall i come to get pickle
# 5398      ham  Hi. Hope you had a good day. Have a better night.
# 700       ham           K..u also dont msg or reply to his msg..
# ...       ...                                                ...
# 5537     spam  Want explicit SEX in 30 secs? Ring 02073162414...
# 5540     spam  ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
# 5547     spam  Had your contract mobile 11 Mnths? Latest Moto...
# 5566     spam  REMINDER FROM O2: To get 2.50 pounds free call...
# 5567     spam  This is the 2nd time we have tried 2 contact u...
# [1494 rows x 2 columns]

Remplacer `ham` par 0 et `spam` par 1.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers; LabelEncoder numbers the sorted classes from 0.
le = LabelEncoder()
new_df['Category'] = le.fit_transform(new_df['Category'])

# Guard the mapping the rest of the notebook relies on: ham -> 0, spam -> 1.
# Only checked on the first run, while the column still holds the string labels.
if le.classes_.dtype == object:
    assert list(le.classes_) == ['ham', 'spam'], le.classes_

print(new_df.head())
# Recorded output:
#       Category                                            Message
# 2535         0                      Ok enjoy . R u there in home.
# 1213         0  Yo, the game almost over? Want to go to walmar...
# 522          0                         Shall i come to get pickle
# 5398         0  Hi. Hope you had a good day. Have a better night.
# 700          0           K..u also dont msg or reply to his msg..

Après la concaténation, les lignes sont ordonnées par classe (tous les `ham` d'abord, puis tous les `spam`) : il faut donc mélanger le jeu de données.

In [None]:
# Shuffle every row (frac=1) with a fixed seed so the two classes are interleaved,
# then replace the original row labels with a fresh 0..n-1 index.
balanced_df = new_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview the first rows; kept as the last expression so the cell displays it.
balanced_df.head(10)
# Recorded output:
#    Category                                            Message
# 0         1  URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD...
# 1         1  Panasonic & BluetoothHdset FREE. Nokia FREE. M...
# 2         1  Do you want a new Video handset? 750 any time ...
# 3         1  Hi if ur lookin 4 saucy daytime fun wiv busty ...
# 4         1  09066362231 URGENT! Your mobile No 07xxxxxxxxx...
# 5         0                    Jus ans me lar. U'll noe later.
# 6         0  Need a coffee run tomo?Can't believe it's that...
# 7         0  Sorry . I will be able to get to you. See you ...
# 8         0                            Also andros ice etc etc
# 9         0  Hello, hello, hi lou sorry it took so long 2 r...

Pour en savoir plus à propos de la méthode d'échantillonnage, voir la documentation de `DataFrame.sample`. Diviser ensuite l'ensemble des données en *features* (`X`) et *labels* (`y`).

In [None]:
# Features are the raw message texts; labels are the encoded categories.
X, y = balanced_df['Message'], balanced_df['Category']

# Hold out 25 % of the rows for evaluation, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Shapes of the resulting splits.
(X_train.shape, X_test.shape)   # ((1120,), (374,))
(y_train.shape, y_test.shape)   # ((1120,), (374,))

C'est du texte, il faut le convertir en valeurs numériques (p. ex. avec `TfidfVectorizer`).

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus used only to illustrate what TfidfVectorizer produces.
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
]

# Demo-specific names are used so this illustration does not overwrite the
# notebook's `X` (the message texts) or `vectorizer` variables.
demo_vectorizer = TfidfVectorizer()
demo_tfidf = demo_vectorizer.fit_transform(corpus)

# Vocabulary learned from the toy corpus (one matrix column per term).
print(demo_vectorizer.get_feature_names_out())
# Recorded output:
# array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'], dtype=object)

# Dense view of the TF-IDF matrix: one row per document, one column per term.
# Kept as the last expression so the cell displays it.
demo_tfidf.toarray()
# Recorded output:
# array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
#         0.        , 0.38408524, 0.        , 0.38408524],
#        [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
#         0.53864762, 0.28108867, 0.        , 0.28108867],
#        [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
#         0.        , 0.26710379, 0.51184851, 0.26710379],
#        [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
#         0.        , 0.38408524, 0.        , 0.38408524]])

`TfidfVectorizer` renvoie une matrice creuse (*sparse*) ; elle est convertie en tableau dense avec `toarray()` pour en avoir un aperçu.

In [None]:
# Vectorizer for the real messages.
vectorizer = TfidfVectorizer(
    min_df=1,              # keep every term found in at least one document
    stop_words='english',  # drop common English words
    lowercase=True,        # lowercase the text before tokenizing
)

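`min_df=1` (valeur par défaut) conserve tout mot présent dans au moins un document — pour ignorer les mots n'apparaissant que dans un seul document, il faudrait `min_df=2` ; `stop_words='english'` ignore les mots courants en anglais ; enfin, `lowercase=True` met tout le texte en minuscules.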

In [None]:
# Learn the vocabulary and IDF weights from the training messages only, and encode them.
new_X_train = vectorizer.fit_transform(X_train)
# Encode the test messages with the already-fitted vectorizer (it is not refitted on test data).
new_X_test = vectorizer.transform(X_test)

Le vectorizer est ajusté sur l'ensemble d'entraînement, puis appliqué tel quel à l'ensemble de test. Utiliser ensuite le modèle de régression logistique.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic-regression classifier on the TF-IDF training matrix
# (fit() returns the estimator itself, so it can be chained).
lr = LogisticRegression().fit(new_X_train, y_train)

# Predict the held-out messages and measure the share of correct predictions.
lr_prediction = lr.predict(new_X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=lr_prediction)
print(accuracy)   # 0.9491978609625669

Ce modèle classe correctement près de 95 % des messages de l'ensemble de test (exactitude ≈ 0,949).