In [96]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import sys
sys.path.append("../src")  # dis à Python où chercher les fichiers

from model import train_nb, train_logreg, train_svm



In [97]:


# Lecture du fichier brut
df = pd.read_csv("../data/SMSSpamCollection", sep='\t', header=None, names=["label", "message"])

# Vérification
print(df.head())
print(df['label'].value_counts())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
label
ham     4825
spam     747
Name: count, dtype: int64


## 1. Nettoyer les messages texte

In [98]:


def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

# Appliquer à tous les messages
df['clean_message'] = df['message'].apply(clean_text)

# Vérification
df[['message', 'clean_message']].head()


Unnamed: 0,message,clean_message
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives arou...


## 2. Encodage des labels (spam, ham)


In [99]:


le = LabelEncoder()
y = le.fit_transform(df['label'])  # ham = 0, spam = 1


## 3. Vectorisation des messages (TF-IDF)

In [100]:


vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['clean_message'])


 ## 4. Séparer en données d'entraînement et test

In [101]:


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [102]:
model1 = train_nb(X_train, y_train)


In [103]:
y_pred = model1.predict(X_test)


In [104]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧱 Matrice de confusion:\n", confusion_matrix(y_test, y_pred))


✅ Accuracy: 0.9748878923766816

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.81      0.90       149

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115


🧱 Matrice de confusion:
 [[966   0]
 [ 28 121]]


In [105]:
model2 = train_logreg(X_train, y_train)

In [106]:
y_pred = model2.predict(X_test)

In [107]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧱 Matrice de confusion:\n", confusion_matrix(y_test, y_pred))


✅ Accuracy: 0.9730941704035875

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.80      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115


🧱 Matrice de confusion:
 [[966   0]
 [ 30 119]]


In [108]:
model3 = train_svm(X_train, y_train)


In [109]:
y_pred = model3.predict(X_test)


In [110]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧱 Matrice de confusion:\n", confusion_matrix(y_test, y_pred))


✅ Accuracy: 0.9802690582959641

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.86      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115


🧱 Matrice de confusion:
 [[965   1]
 [ 21 128]]


In [111]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd

def compare_models(X_train, X_test, y_train, y_test):
    from model import train_nb, train_logreg, train_svm

    models = {
        "Naive Bayes": train_nb(X_train, y_train),
        "Logistic Regression": train_logreg(X_train, y_train),
        "SVM": train_svm(X_train, y_train)
    }

    results = []

    for name, model in models.items():
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        results.append({
            "Modèle": name,
            "Accuracy": round(acc, 3),
            "F1-score": round(f1, 3),
            "Precision": round(precision, 3),
            "Recall": round(recall, 3)
        })

    # Affichage sous forme de tableau
    df_results = pd.DataFrame(results)
    return df_results


In [112]:
df_scores = compare_models(X_train, X_test, y_train, y_test)
print(df_scores)


                Modèle  Accuracy  F1-score  Precision  Recall
0          Naive Bayes     0.975     0.896      1.000   0.812
1  Logistic Regression     0.973     0.888      1.000   0.799
2                  SVM     0.980     0.921      0.992   0.859
