## Assignment 2
Petter Eriksson , Oscar Andersson

## 1. Load and Preprocess Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from scipy.stats import friedmanchisquare
import time

# Load and preprocess dataset
data_file = "spambase.data"
df = pd.read_csv(data_file, header=None)

# Features and target split
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Classifiers
classifiers = { "Log Reg": LogisticRegression(max_iter=500),
               "SVM": SVC(),
               "Random Forest": RandomForestClassifier()
}




## 2. Stratified ten-fold cross-validation tests

In [2]:
# Metrics placeholders
results = {"Algorithm": [], "Fold": [], "Training Time (s)": [], "Accuracy": [], "F-measure": []}

# Stratified 10-Fold Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for algo_name, clf in classifiers.items():
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        # Split data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Train and time the classifier
        start_time = time.time()
        clf.fit(X_train, y_train)
        train_time = time.time() - start_time

        # Predictions
        y_pred = clf.predict(X_test)

        # Calculate metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Store results
        results["Algorithm"].append(algo_name)
        results["Fold"].append(fold)
        results["Training Time (s)"].append(train_time)
        results["Accuracy"].append(acc)
        results["F-measure"].append(f1)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

In [3]:
print(results_df)

        Algorithm  Fold  Training Time (s)  Accuracy  F-measure
0         Log Reg     1           0.024499  0.915401   0.915188
1         Log Reg     2           0.014995  0.921739   0.921415
2         Log Reg     3           0.016999  0.928261   0.928004
3         Log Reg     4           0.013000  0.932609   0.932507
4         Log Reg     5           0.015001  0.908696   0.908782
5         Log Reg     6           0.016007  0.936957   0.936926
6         Log Reg     7           0.015000  0.934783   0.934507
7         Log Reg     8           0.014501  0.926087   0.925598
8         Log Reg     9           0.016000  0.939130   0.938489
9         Log Reg    10           0.014997  0.910870   0.910441
10            SVM     1           0.269497  0.919740   0.919450
11            SVM     2           0.265001  0.928261   0.928004
12            SVM     3           0.264499  0.939130   0.939010
13            SVM     4           0.261503  0.921739   0.921496
14            SVM     5           0.2644

## 3. Friedman Test