In [None]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import pickle

In [2]:
# Share of the rows held out for evaluation; several values were tried and 0.2 gave good results.
TEST_SIZE = 0.2

# CSV file with the labelled messages, resolved relative to the working directory.
DATASET_PATH = "./spam.csv"

In [3]:
# Read the raw CSV; the file is decoded as ISO-8859-1 rather than the pandas default.
df = pd.read_csv(DATASET_PATH, encoding="ISO-8859-1")

# Quick sanity check: first rows, then (rows, columns).
for preview in (df.head(), df.shape):
    print(preview)

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
(5572, 2)


In [4]:
# Class balance of the raw data.
print("Label distribution :")
print(df["label"].value_counts())

# Show the first two rows of each class as raw arrays.
for heading, label_name in (
    ("Example of 'spam' message :", "spam"),
    ("Example of 'ham' message:", "ham"),
):
    print(heading)
    print(df.loc[df["label"] == label_name].head(2).values)

Label distribution :
label
ham     4825
spam     747
Name: count, dtype: int64
Example of 'spam' message :
[['spam'
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
 ['spam'
  "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Ã¯Â¿Â½1.50 to rcv"]]
Example of 'ham' message:
[['ham'
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']
 ['ham' 'Ok lar... Joking wif u oni...']]


In [5]:
##### Remove duplicates and null values #####
# Rows with a missing label or message are dropped first, then rows whose
# message text repeats an earlier row are dropped (first occurrence is kept).
# Removing nulls here keeps the string-only operations of the cleaning cell
# (re.sub, str.translate) from failing on a NaN entry.

print(f"Before: {len(df)}")

df = df.dropna(subset=["label", "message"]).drop_duplicates(subset=["message"])

print(f"After: {len(df)}")

Before: 5572
After: 5163


In [None]:
def _strip_noise(lowered):
    """Remove URLs, e-mail addresses, digits and ASCII punctuation from an
    already lower-cased message, then collapse runs of whitespace."""
    # Applied in this order: URLs, e-mail-like tokens, digit runs.
    for noise_pattern in (r"http\S+|www\S+|https\S+", r"\S+@\S+", r"\d+"):
        lowered = re.sub(noise_pattern, "", lowered)
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    return " ".join(without_punctuation.split())


# Normalised text used as input of the TF-IDF vectorizer; the raw "message"
# column is kept untouched for the hand-crafted features.
df["message_clean"] = df["message"].str.lower().apply(_strip_noise)

first_raw = df["message"].iloc[0]
first_clean = df["message_clean"].iloc[0]
print(f"Before: {first_raw}")
print(f"After: {first_clean}")

Before: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
After: go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat


In [7]:
def _count_uppercase(message):
    """Return how many characters of `message` satisfy str.isupper()."""
    return sum(map(str.isupper, message))


def _count_special(message, wanted="!?$â‚¬Â£%"):
    """Return how many characters of `message` belong to `wanted`."""
    # NOTE(review): the character set looks mis-encoded (probably meant as "!?$" plus
    # euro/pound signs and "%") - confirm against the original notebook file. It must
    # stay identical to the set used at prediction time.
    return sum(ch in wanted for ch in message)


# All features below are computed on the raw (uncleaned) message text.
raw_messages = df["message"]

df["message_length"] = raw_messages.map(len)
df["word_count"] = raw_messages.map(lambda message: len(message.split()))
df["avg_word_length"] = df["message_length"] / df["word_count"]

df["caps_count"] = raw_messages.map(_count_uppercase)
df["caps_ratio"] = df["caps_count"] / df["message_length"]

df["special_chars"] = raw_messages.map(_count_special)

print("features added :")
summary_columns = ["message_length", "word_count", "caps_ratio", "special_chars"]
print(df[summary_columns].describe())

features added :
       message_length   word_count   caps_ratio  special_chars
count     5163.000000  5163.000000  5163.000000    5163.000000
mean        79.119698    15.328298     0.063176       0.532636
std         58.316615    11.068738     0.110715       0.958379
min          2.000000     1.000000     0.000000       0.000000
25%         36.000000     7.000000     0.025000       0.000000
50%         60.000000    12.000000     0.035971       0.000000
75%        117.000000    22.000000     0.055556       1.000000
max        910.000000   171.000000     1.000000      13.000000


In [None]:
# Vocabulary capped at 5000 terms (retained after trying several sizes); unigrams and
# bigrams, terms must appear in at least 2 messages and in at most 95% of them.
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the train/test
# split done in a later cell, so held-out messages influence the vocabulary and IDF
# weights - consider fitting on the training rows only.
tfidf_settings = dict(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    stop_words="english",
)
vectorizer = TfidfVectorizer(**tfidf_settings)

X_tfidf = vectorizer.fit_transform(df["message_clean"])
print(f"Shape TF-IDF: {X_tfidf.shape}")

Shape TF-IDF: (5163, 5000)


In [None]:
# Hand-crafted numeric columns appended to the right of the TF-IDF columns; the order
# of this list fixes the order of the trailing columns of X_combined.
numerical_features = ["message_length", "word_count", "avg_word_length", "caps_ratio", "special_chars"]

X_numerical = df[numerical_features].to_numpy()

# NOTE(review): like the vectorizer, the scaler is fitted on all rows before the
# train/test split performed in a later cell - consider fitting on training rows only.
scaler = StandardScaler()
X_numerical_scaled = scaler.fit_transform(X_numerical)

# Wrap the scaled block as a sparse matrix so it can be stacked next to X_tfidf.
X_numerical_sparse = csr_matrix(X_numerical_scaled)
X_combined = hstack((X_tfidf, X_numerical_sparse))

print(f"Shape finale: {X_combined.shape}")

Shape finale: (5163, 5005)


In [None]:
# Encode the string labels as integers; `le` is kept to map predictions back to names.
le = LabelEncoder()
y = le.fit_transform(df["label"])

class_names = le.classes_
class_counts = np.bincount(y)
print(f"Classes: {class_names}")
print(f"Distribution: {class_counts}")

Classes: ['ham' 'spam']
Distribution: [4516  647]


In [None]:
# Stratified hold-out split with a fixed seed so the class ratio is kept in both parts.
split_options = dict(test_size=TEST_SIZE, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Train: {n_train}, Test: {n_test}")

# Encoded classes present in the training labels, with their counts.
classes, counts = np.unique(y_train, return_counts=True)

print("Classes :", classes)

Train: 4130, Test: 1033
Classes : [0 1]


In [None]:
# Baseline: logistic regression with default regularisation, evaluated on the hold-out set.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr, target_names=le.classes_)

print("=== Logistic Regression ===")
print(f"Accuracy: {lr_accuracy:.4f}")
print(lr_report)

=== Logistic Regression ===
Accuracy: 0.9535
              precision    recall  f1-score   support

         ham       0.96      0.99      0.97       904
        spam       0.91      0.70      0.79       129

    accuracy                           0.95      1033
   macro avg       0.93      0.84      0.88      1033
weighted avg       0.95      0.95      0.95      1033



In [None]:
# Gradient-boosted tree baseline trained and evaluated on the same split as the
# logistic regression above.
# `use_label_encoder` is deliberately not passed: the installed XGBoost reports it as
# an unused parameter, and y_train is already integer-encoded by `le`.
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric="logloss",
)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

print("=== XGBoost ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(classification_report(y_test, y_pred_xgb, target_names=le.classes_))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== XGBoost ===
Accuracy: 0.9710
              precision    recall  f1-score   support

         ham       0.98      0.99      0.98       904
        spam       0.93      0.83      0.88       129

    accuracy                           0.97      1033
   macro avg       0.95      0.91      0.93      1033
weighted avg       0.97      0.97      0.97      1033



In [None]:
# Hyper-parameter grid for the logistic regression: regularisation strength and penalty
# type, with the liblinear solver for every combination.
param_grid = {
    "C": [0.1, 0.5, 1.0, 5.0],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"],
}

# 5-fold cross-validated search on the training split, ranked by F1 and run on all cores.
base_estimator = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(
    base_estimator,
    param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(f"Best params: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Estimator refitted with the best parameters; used by all later cells.
best_model = grid_search.best_estimator_

Best params: {'C': 5.0, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV score: 0.8819




In [None]:
# Hold-out evaluation of the tuned model: probability of class index 1 and hard labels.
y_proba_final = best_model.predict_proba(X_test)[:, 1]
y_pred_final = best_model.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred_final)
final_auc = roc_auc_score(y_test, y_proba_final)
print(f"Accuracy: {final_accuracy:.4f}")
print(f"ROC-AUC: {final_auc:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_final))

print("Classification Report:")
print(classification_report(y_test, y_pred_final, target_names=le.classes_))

Accuracy: 0.9758
ROC-AUC: 0.9810

Confusion Matrix:
[[899   5]
 [ 20 109]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       904
        spam       0.96      0.84      0.90       129

    accuracy                           0.98      1033
   macro avg       0.97      0.92      0.94      1033
weighted avg       0.98      0.98      0.98      1033



In [None]:
# Curve data for the tuned model on the hold-out set.
fpr, tpr, _ = roc_curve(y_test, y_proba_final)
auc_score = roc_auc_score(y_test, y_proba_final)
precision, recall, _ = precision_recall_curve(y_test, y_proba_final)

fig = make_subplots(rows=1, cols=2, subplot_titles=("ROC Curve", "Precision-Recall Curve"))

# One entry per panel: (column, x values, y values, legend name, x title, y title).
panel_specs = (
    (1, fpr, tpr, f"ROC (AUC = {auc_score:.3f})", "False Positive Rate", "True Positive Rate"),
    (2, recall, precision, "Precision-Recall", "Recall", "Precision"),
)
for panel_col, x_values, y_values, trace_name, x_title, y_title in panel_specs:
    fig.add_trace(
        go.Scatter(x=x_values, y=y_values, name=trace_name, mode="lines"),
        row=1,
        col=panel_col,
    )
    fig.update_xaxes(title_text=x_title, row=1, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=1, col=panel_col)

fig.update_layout(height=500, width=1000, title_text="Model Evaluation Metrics", showlegend=True)

fig.show()

In [17]:
# Column names of X_combined: TF-IDF terms first, then the hand-crafted numeric features.
feature_names = [*vectorizer.get_feature_names_out().tolist(), *numerical_features]

# Coefficients of the tuned logistic regression, ranked from most negative to most positive.
coefs = best_model.coef_[0]
ranking = np.argsort(coefs)
top_positive = ranking[-10:]
top_negative = ranking[:10]

# Largest positive weights first for spam, most negative weights first for ham.
for heading, indices in (
    ("Top 10 features SPAM:", top_positive[::-1]),
    ("Top 10 features HAM:", top_negative),
):
    print(heading)
    for idx in indices:
        print(f"  {feature_names[idx]}: {coefs[idx]:.4f}")

Top 10 features SPAM:
  txt: 6.9519
  text: 4.8550
  claim: 4.7225
  reply: 4.6577
  mobile: 4.2933
  stop: 3.8811
  prize: 3.8467
  message: 3.5431
  service: 3.5060
  message_length: 3.3587
Top 10 features HAM:
  ltgt: -7.0816
  happy: -4.4348
  word_count: -2.5748
  amp: -2.3485
  Â½Ã¯: -2.3334
  think: -2.1673
  went: -1.9955
  ill: -1.9831
  im: -1.9137
  come: -1.8177


In [None]:
# Persist the fitted objects used by predict_spam: model, vectorizer, scaler, label encoder.
os.makedirs("models", exist_ok=True)

artifacts = {
    "models/model.pkl": best_model,
    "models/vectorizer.pkl": vectorizer,
    "models/scaler.pkl": scaler,
    "models/label_encoder.pkl": le,
}
for target_path, fitted_object in artifacts.items():
    with open(target_path, "wb") as f:
        pickle.dump(fitted_object, f)

print("OK")

OK


In [27]:
def predict_spam(text: str) -> tuple[str, float]:
    """Classify one raw message with the fitted notebook objects.

    The text is cleaned and vectorised with `vectorizer`, the numeric features are
    computed on the raw text and scaled with `scaler`, and both parts are stacked
    before being scored by `best_model`.

    Args:
        text: Raw message, before any cleaning.

    Returns:
        A pair (label, confidence): the label decoded through `le`, and the highest
        class probability multiplied by 100.
    """
    # Text branch: lower-case, strip URLs / e-mail-like tokens / digits / punctuation,
    # then collapse whitespace before the TF-IDF transform.
    cleaned = text.lower()
    for noise_pattern in (r"http\S+|www\S+|https\S+", r"\S+@\S+", r"\d+"):
        cleaned = re.sub(noise_pattern, "", cleaned)
    cleaned = cleaned.translate(str.maketrans("", "", string.punctuation))
    cleaned = " ".join(cleaned.split())
    tfidf_row = vectorizer.transform([cleaned])

    # Numeric branch: computed on the raw text; ratios fall back to 0 when the
    # denominator is 0 (empty or whitespace-only input).
    n_chars = len(text)
    n_words = len(text.split())
    mean_word_length = 0 if n_words <= 0 else n_chars / n_words
    n_upper = sum(1 for ch in text if ch.isupper())
    upper_ratio = 0 if n_chars <= 0 else n_upper / n_chars
    # NOTE(review): character set looks mis-encoded; keep it identical to the one used
    # when the training features were built.
    n_special = sum(1 for ch in text if ch in "!?$â‚¬Â£%")

    # Same column order as the numeric features the scaler was fitted on.
    numeric_row = scaler.transform([[n_chars, n_words, mean_word_length, upper_ratio, n_special]])

    features = hstack([tfidf_row, csr_matrix(numeric_row)])

    predicted_class = best_model.predict(features)[0]
    class_probabilities = best_model.predict_proba(features)[0]

    return le.inverse_transform([predicted_class])[0], max(class_probabilities) * 100


# Hand-written messages used as a smoke test of predict_spam.
test_messages = [
    "FREE!!! You have won a $1000 Walmart gift card! Click here to claim NOW!!!",
    "Hey, are we still meeting for coffee tomorrow at 3pm?",
    "URGENT: Your account will be suspended. Verify your details immediately.",
    "Thanks for your help with the project yesterday. Really appreciated it!",
    "Congratulations! You've been selected for a FREE iPhone 15! Call 0800-123-456",
    "Can you send me the meeting notes when you get a chance?",
    "I'm a Saudi prince. Would you like to give me some money so I can give you double back for free? It's not a scam trust me!",
]

# Marker printed for spam; any other label gets the fallback marker.
spam_marker = {"spam": "ðŸš«"}

for msg in test_messages:
    label, confidence = predict_spam(msg)
    emoji = spam_marker.get(label, "âœ…")
    preview = msg[:50]
    print(f"{emoji} [{label:4}] ({confidence:.1f}%) {preview}...")

ðŸš« [spam] (91.8%) FREE!!! You have won a $1000 Walmart gift card! Cl...
âœ… [ham ] (98.9%) Hey, are we still meeting for coffee tomorrow at 3...
âœ… [ham ] (70.1%) URGENT: Your account will be suspended. Verify you...
âœ… [ham ] (93.8%) Thanks for your help with the project yesterday. R...
ðŸš« [spam] (60.5%) Congratulations! You've been selected for a FREE i...
âœ… [ham ] (96.4%) Can you send me the meeting notes when you get a c...
âœ… [ham ] (97.7%) I'm a Saudi prince. Would you like to give me some...


In [None]:
# # Old preprocessing code - do not use
# def old_preprocess(text):
#     text = text.lower()
#     text = re.sub(r'[^a-z\s]', '', text)
#     return text

# # SVM attempt - too slow, abandoned
# from sklearn.svm import SVC
# svm_model = SVC(kernel='rbf', probability=True)
# svm_model.fit(X_train, y_train)

# # Test with CountVectorizer instead of TF-IDF
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(max_features=5000)
# X_cv = cv.fit_transform(df['text_clean'])