# Spam Dataset Classification

In [None]:
import pandas as pd
from pathlib import Path
import importlib
import functions
importlib.reload(functions)

data_path = Path("DATASET") / "spambase.data"
df = pd.read_csv(data_path, header=None)
df.columns = [f"col_{i}" for i in range(df.shape[1])]
display(df.head())

print(df.shape)      # (n_lignes, n_colonnes)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_48,col_49,col_50,col_51,col_52,col_53,col_54,col_55,col_56,col_57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


(4601, 58)


In [None]:
# Total NaNs in dataframe
total_nan = int(df.isna().sum().sum())
print("Total NaN:", total_nan)

# NaN per column (ordered)
nan_by_col = df.isna().sum().sort_values(ascending=False)
display(nan_by_col.head(10))  # top 10 columns with the most NaN

Total NaN: 0


col_0    0
col_1    0
col_2    0
col_3    0
col_4    0
col_5    0
col_6    0
col_7    0
col_8    0
col_9    0
dtype: int64

In [None]:
from sklearn.metrics import make_scorer, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1 = make_scorer(f1_score)

target = df.columns[-1]
X = df.drop(columns=[target])
y = df[target]

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Group with corelated features
groups = functions.corr_groups(X_tr, threshold=0.9)
print("Aperçu groupes:", groups[:5])

# Training with Variance Threshhold and Logisitic regression classifier
pipe = functions.pipe_with_variance_thresh(X_tr, y_tr, X_te, y_te, 1e-5, X, y)

# Impact per group (average over several shuffles)
impacts = functions.impact_per_group(pipe, X_te, y_te, groups, n_repeats=10)
print(impacts.head(10))


# We keep the K most important groups (based on impact) and retrain the model with only the features from these groups
K = 8 
keep_groups = impacts.head(K)["group"].tolist()
keep_features = sorted({f for g in keep_groups for f in g})
print("Nb features gardées:", len(keep_features))

# Refit sur le sous-ensemble
pipe_sel = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000))
])
pipe_sel.fit(X_tr[keep_features], y_tr)

# Évalue sur test
print("Score test (sélection):", pipe_sel.score(X_te[keep_features], y_te))

# CV sur tout X,y avec seulement les features gardées
scores_sel = cross_val_score(
    pipe_sel, X[keep_features], y, cv=cv, scoring=f1, n_jobs=-1
)
print("F1 CV (sélection):", scores_sel.mean(), "+/-", scores_sel.std())


Aperçu groupes: [['col_0'], ['col_1'], ['col_2'], ['col_3'], ['col_4']]
Score test: 0.9294245385450597
       group  size      drop
0  (col_24,)     1  0.054210
1  (col_15,)     1  0.038523
2  (col_26,)     1  0.033973
3  (col_45,)     1  0.027469
4   (col_6,)     1  0.024128
5  (col_52,)     1  0.022553
6  (col_55,)     1  0.019749
7  (col_25,)     1  0.017189
8  (col_22,)     1  0.016597
9  (col_20,)     1  0.013129
Nb features gardées: 8
Score test: 0.8914223669923995


### LDA classification

In [8]:
# classification
class_means,class_cov = functions.LDA_classifier_train_cov(X_tr, y_tr,2)
predicted_labels = functions.LDA_classifier_predict_cov(X_te,class_means,class_cov,2)
pipe_with_K = functions.pipe_with_variance_thresh(X_tr[keep_features],y_tr,X_te[keep_features],y_te,1e-5, X, y)



In [None]:
# train and test accuracies
training_accuracy, test_accuracy = functions.train_test_accuracy_cov(X_tr,y_tr,X_te,y_te, class_means, class_cov, LDA_classifier_predict_cov)

print("training_accuracy :", training_accuracy)
print("test_accuracy :", test_accuracy)

### QDA classification

In [None]:
# using QDA classifier
class_means,cov = functions.QDA_classifier_train(X_tr, y_tr,2)
predicted_labels = functions.QDA_classifier_predict(X_te,class_means,cov,2)


In [None]:
# train and test accuracies
training_accuracy, test_accuracy = functions.train_test_accuracy_cov(X_tr,y_tr,X_te,y_te, class_means, cov, QDA_classifier_predict)

print("training_accuracy :", training_accuracy)
print("test_accuracy :", test_accuracy)

### Logistic Regression

In [None]:
pipeline = Pipeline([
        ('median', SimpleImputer()),
        ('scaler', StandardScaler()),
        ('classifier',LogisticRegression())])

pipeline.fit(X_train, Y_train)

print("Accuracy:", pipeline.score(X_test,Y_test ))

### Random Forest

In [None]:
# training of the model and prediction
Y_pred = functions.RF_pred(X_train, Y_train, X_test)

# accuracy
functions.RF_accuracy(Y_test, Y_pred)