# Spam Dataset Classification

In [2]:
import importlib
import functions
importlib.reload(functions)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import torch 
import torch.nn as nn
import torch.optim as optim
from functions import scaled_tensorize_data, datasets_and_loaders, train_model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
import pandas as pd
from pathlib import Path

data_path = Path("DATASET") / "spambase.data"
df = pd.read_csv(data_path, header=None)
df.columns = [f"col_{i}" for i in range(df.shape[1])]
display(df.head())

print(df.shape)      # (n_lignes, n_colonnes)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_48,col_49,col_50,col_51,col_52,col_53,col_54,col_55,col_56,col_57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


(4601, 58)


In [6]:
# total de NaN dans tout le DataFrame
total_nan = int(df.isna().sum().sum())
print("Total NaN:", total_nan)

# NaN par colonne (ordonné)
nan_by_col = df.isna().sum().sort_values(ascending=False)
display(nan_by_col.head(10))  # top 10 colonnes avec le plus de NaN





Total NaN: 0


col_0     0
col_43    0
col_31    0
col_32    0
col_33    0
col_34    0
col_35    0
col_36    0
col_37    0
col_38    0
dtype: int64

In [7]:
# --- 1) pipe_var minimaliste (comme demandé) ---
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import make_scorer, f1_score, accuracy_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1 = make_scorer(f1_score)

def pipe_var(X_tr, y_tr, X_te, y_te, thrs, X, y):
    pipe = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("var", VarianceThreshold(threshold=thrs)),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000))
    ])
    pipe.fit(X_tr, y_tr)
    print("Score test:", pipe.score(X_te, y_te))

    # IMPORTANT: on passe l'ESTIMATEUR (pipe), pas la fonction pipe_var
    scores = cross_val_score(pipe, X, y, cv=cv, scoring=f1, n_jobs=-1)
    print("VarianceThreshold F1:", scores.mean(), "+/-", scores.std())
    return pipe


# --- 2) Corrélation: regrouper + shuffler un groupe ---
import numpy as np
import pandas as pd

def corr_groups(X, threshold=0.9):
    """
    Groupes de colonnes corrélées (|corr| >= threshold) via composantes connexes.
    Retourne: list[list[str]]
    """
    C = X.corr().abs()
    cols = list(X.columns)
    seen, groups = set(), []
    for c in cols:
        if c in seen:
            continue
        grp, stack = [], [c]
        seen.add(c)
        while stack:
            u = stack.pop()
            grp.append(u)
            neigh = C.index[(C[u] >= threshold) & (C.index != u)]
            for v in neigh:
                if v not in seen:
                    seen.add(v)
                    stack.append(v)
        groups.append(sorted(grp))
    return groups

def shuffle_group(X, group, random_state=42):
    """
    Retourne une copie de X où TOUTES les colonnes du groupe sont shufflées
    avec la MÊME permutation (on casse le lien de ce groupe avec y).
    """
    Xp = X.copy()
    rng = np.random.default_rng(random_state)
    perm = rng.permutation(len(Xp))
    for c in group:
        Xp[c] = Xp[c].to_numpy()[perm]
    return Xp


# helper pour évaluer l’impact moyen d’un groupe sur accuracy et F1
def impact_par_groupe(pipe, X_te, y_te, groups, n_repeats=10, seed=0):
    base_acc = accuracy_score(y_te, pipe.predict(X_te))
    if hasattr(pipe, "predict_proba"):
        p = pipe.predict_proba(X_te)[:, 1]
        base_f1 = f1_score(y_te, (p >= 0.5).astype(int))
    else:
        base_f1 = f1_score(y_te, pipe.predict(X_te))

    rng = np.random.default_rng(seed)
    res = []
    for g in groups:
        acc_drops, f1_drops = [], []
        for _ in range(n_repeats):
            X_te_shuf = shuffle_group(X_te, g, random_state=int(rng.integers(0, 1e9)))
            if hasattr(pipe, "predict_proba"):
                p = pipe.predict_proba(X_te_shuf)[:, 1]
                yhat = (p >= 0.5).astype(int)
            else:
                yhat = pipe.predict(X_te_shuf)
            acc_drops.append(base_acc - accuracy_score(y_te, yhat))
            f1_drops.append(base_f1 - f1_score(y_te, yhat))
        res.append({"group": tuple(g), "size": len(g),
                    "acc_drop": float(np.mean(acc_drops)),
                    "f1_drop": float(np.mean(f1_drops))})
    df_imp = pd.DataFrame(res).sort_values(["f1_drop","acc_drop"], ascending=False).reset_index(drop=True)
    print(f"Baseline  Acc={base_acc:.3f}  F1={base_f1:.3f}")
    return df_imp


target = df.columns[-1]
X = df.drop(columns=[target])
y = df[target]

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 1) Groupes corrélés
groups = corr_groups(X_tr, threshold=0.9)
print("Aperçu groupes:", groups[:5])

# 2) Entraînement + CV
pipe = pipe_var(X_tr, y_tr, X_te, y_te, 1e-5, X, y)

# 3) Impact par groupe (moyenne sur quelques shuffles)
impacts = impact_par_groupe(pipe, X_te, y_te, groups, n_repeats=10)
print(impacts.head(10))





# 4) Test rapide d’un groupe particulier 

g0 = groups[2]
X_te_shuf = shuffle_group(X_te, g0, random_state=0)
print("Score test (groupe shufflé):", pipe.score(X_te_shuf, y_te))


#On garde les groupes les plus important
K = 8 
keep_groups = impacts.head(K)["group"].tolist()
keep_features = sorted({f for g in keep_groups for f in g})
print("Nb features gardées:", len(keep_features))

# Refit sur le sous-ensemble
pipe_sel = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000))
])
pipe_sel.fit(X_tr[keep_features], y_tr)

# Évalue sur test
print("Score test (sélection):", pipe_sel.score(X_te[keep_features], y_te))

# CV sur tout X,y avec seulement les features gardées
scores_sel = cross_val_score(
    pipe_sel, X[keep_features], y, cv=cv, scoring=f1, n_jobs=-1
)
print("F1 CV (sélection):", scores_sel.mean(), "+/-", scores_sel.std())






Aperçu groupes: [['col_0'], ['col_1'], ['col_2'], ['col_3'], ['col_4']]
Score test: 0.9294245385450597
VarianceThreshold F1: 0.9023395347207346 +/- 0.009077532224183586
Baseline  Acc=0.929  F1=0.909
       group  size  acc_drop   f1_drop
0  (col_24,)     1  0.037134  0.051113
1  (col_15,)     1  0.027904  0.037768
2  (col_26,)     1  0.021824  0.031663
3  (col_45,)     1  0.020521  0.027882
4  (col_52,)     1  0.022910  0.027189
5   (col_6,)     1  0.020955  0.026447
6  (col_55,)     1  0.017481  0.021252
7  (col_25,)     1  0.012704  0.018922
8  (col_22,)     1  0.014984  0.017570
9  (col_28,)     1  0.008903  0.012802
Score test (groupe shufflé): 0.9283387622149837
Nb features gardées: 8
Score test (sélection): 0.8914223669923995
F1 CV (sélection): 0.8576121095324428 +/- 0.007335637180195787


### LDA classification

In [8]:
# classification
class_means,class_cov = functions.LDA_classifier_train_cov(X_tr, y_tr,2)
predicted_labels = functions.LDA_classifier_predict_cov(X_te,class_means,class_cov,2)

print("class' means :", class_means)
print("class' covariance :", class_cov)

InvalidIndexError: (2940     True
1303    False
3468     True
3181     True
794     False
        ...  
1861     True
2366     True
330     False
536     False
3114     True
Name: col_57, Length: 3680, dtype: bool, slice(None, None, None))

In [None]:
# train and test accuracies
training_accuracy, test_accuracy = functions.train_test_accuracy_cov(X_tr,y_tr,X_te,y_te, class_means, class_cov, LDA_classifier_predict_cov)

print("training_accuracy :", training_accuracy)
print("test_accuracy :", test_accuracy)

In [None]:
# plot the boundaries
grid_size = 10000

x = np.linspace(-100,100,grid_size)
y = np.linspace(-100,100,grid_size)
functions.plot_decision_boundary(x,y,X,Y,class_means,LDA_classifier_predict_cov)

### QDA classification

In [None]:
# using QDA classifier
class_means,cov = functions.QDA_classifier_train(X_tr, y_tr,2)
predicted_labels = functions.QDA_classifier_predict(X_te,class_means,cov,2)

print("class' means : {class_means}")
print("class' covariance : {cov}")


In [None]:
# train and test accuracies
training_accuracy, test_accuracy = functions.train_test_accuracy_cov(X_tr,y_tr,X_te,y_te, class_means, cov, QDA_classifier_predict)

print("training_accuracy :", training_accuracy)
print("test_accuracy :", test_accuracy)

In [None]:
# plot the boundaries
grid_size = 10000

x = np.linspace(-100,100,grid_size)
y = np.linspace(-100,100,grid_size)
functions.plot_decision_boundary_cov(x,y,X,labels,class_means,QDA_classifier_predict)