## Assignment 2
Petter Eriksson, Oscar Andersson

## 1. Load and Preprocess Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from scipy.stats import friedmanchisquare
import time

# Load the dataset; the file has no header row, so columns are numbered 0..n-1
data_file = "spambase.data"
df = pd.read_csv(data_file, header=None)

# Features are every column except the last; the last column is the target label
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Standardize features to zero mean / unit variance.
# NOTE(review): the scaler is fit on the full dataset before cross-validation, so
# each test fold contributes to the scaling statistics. Consider fitting the scaler
# inside each training fold (e.g. via a Pipeline) if strict separation is required.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Classifiers under comparison.
# Random Forest is stochastic; seeding it makes accuracies, ranks and the Friedman
# statistic reproducible across "Restart & Run All".
classifiers = {
    "Log Reg": LogisticRegression(max_iter=500),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
}




## 2. Stratified ten-fold cross-validation tests

In [2]:
# Column-oriented store for the per-fold measurements; every (algorithm, fold)
# pair appends exactly one value to each list.
results = {"Algorithm": [], "Fold": [], "Training Time (s)": [], "Accuracy": [], "F-measure": []}

# Stratified 10-fold cross-validation with a fixed shuffle seed
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for algo_name, clf in classifiers.items():
    # Folds are numbered from 1 so the "Fold" column reads naturally in the tables
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        # Split data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Train and time the classifier.
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short intervals; time.time() is wall-clock time and can be
        # too coarse for fits that take only a few milliseconds.
        start_time = time.perf_counter()
        clf.fit(X_train, y_train)
        train_time = time.perf_counter() - start_time

        # Predictions on the held-out fold
        y_pred = clf.predict(X_test)

        # Calculate metrics (F-measure is the support-weighted average over classes)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Store results
        results["Algorithm"].append(algo_name)
        results["Fold"].append(fold)
        results["Training Time (s)"].append(train_time)
        results["Accuracy"].append(acc)
        results["F-measure"].append(f1)

# Convert results to DataFrame (one row per algorithm/fold pair)
results_df = pd.DataFrame(results)

## 3. Ranking Folds

In [6]:
# Rank the algorithms within each fold by accuracy: rank 1 goes to the highest
# accuracy, and tied accuracies share the average of the ranks they span.
ranked_results = [
    fold_rows.assign(
        Rank=fold_rows["Accuracy"].rank(ascending=False, method="average")
    )
    for _, fold_rows in results_df.groupby("Fold")
]

# Stitch the per-fold frames back together with a fresh 0..n-1 index
ranked_results_df = pd.concat(ranked_results, ignore_index=True)

# Total rank per algorithm across all folds
rank_sums = ranked_results_df.groupby("Algorithm")["Rank"].sum()


In [9]:
# Show the summed ranks per algorithm. Rank 1 is given to the highest accuracy in
# a fold, so a smaller sum means the algorithm ranked better across the folds.
print(rank_sums)

Algorithm
Log Reg          28.0
Random Forest    10.0
SVM              22.0
Name: Rank, dtype: float64


## 4. Friedman Test

In [8]:
# Experiment dimensions: n = number of folds (blocks), k = number of classifiers
n_folds = results_df["Fold"].nunique(dropna=False)
n_classifiers = results_df["Algorithm"].nunique(dropna=False)

# Friedman statistic from the rank sums R_j:
#   chi2_F = 12 / (n * k * (k + 1)) * sum_j R_j^2  -  3 * n * (k + 1)
friedman_scale = 12 / (n_folds * n_classifiers * (n_classifiers + 1))
friedman_offset = 3 * n_folds * (n_classifiers + 1)
chi_square_f = friedman_scale * (rank_sums ** 2).sum() - friedman_offset

# Degrees of freedom (k - 1).
# NOTE(review): this rebinds `df`, which the first cell uses for the loaded
# DataFrame; re-run that cell before using `df` as a DataFrame again.
df = n_classifiers - 1

# Report the ranked table, the statistic and its degrees of freedom
print(f"Ranked Results DataFrame:\n{ranked_results_df}")
print(f"\nFriedman Test Statistic: {chi_square_f:.3f}")
print(f"Degrees of Freedom: {df}")

Ranked Results DataFrame:
        Algorithm  Fold  Training Time (s)  Accuracy  F-measure  Rank
0         Log Reg     1           0.017000  0.915401   0.915188   3.0
1             SVM     1           0.257499  0.919740   0.919450   2.0
2   Random Forest     1           0.721498  0.954447   0.954332   1.0
3         Log Reg     2           0.013000  0.921739   0.921415   3.0
4             SVM     2           0.260003  0.928261   0.928004   2.0
5   Random Forest     2           0.715001  0.956522   0.956389   1.0
6         Log Reg     3           0.013000  0.928261   0.928004   3.0
7             SVM     3           0.264490  0.939130   0.939010   2.0
8   Random Forest     3           0.688003  0.963043   0.963026   1.0
9         Log Reg     4           0.014000  0.932609   0.932507   2.0
10            SVM     4           0.258999  0.921739   0.921496   3.0
11  Random Forest     4           0.702000  0.963043   0.962988   1.0
12        Log Reg     5           0.014500  0.908696   0.908782 