## Assignment 2
Petter Eriksson , Oscar Andersson

## 1. Load and Preprocess Dataset

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from scipy.stats import friedmanchisquare
import time

# Load and preprocess dataset
data_file = "spambase.data"
df = pd.read_csv(data_file, header=None)

# Features and target split
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Classifiers
classifiers = { "Log Reg": LogisticRegression(max_iter=500),
               "SVM": SVC(),
               "Random Forest": RandomForestClassifier()
}




## 2. Stratified ten-fold cross-validation tests

In [3]:
# Metrics placeholders
results = {"Algorithm": [], "Fold": [], "Training Time (s)": [], "Accuracy": [], "F-measure": []}

# Stratified 10-Fold Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for algo_name, clf in classifiers.items():
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        # Split data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Train and time the classifier
        start_time = time.time()
        clf.fit(X_train, y_train)
        train_time = time.time() - start_time

        # Predictions
        y_pred = clf.predict(X_test)

        # Calculate metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Store results
        results["Algorithm"].append(algo_name)
        results["Fold"].append(fold)
        results["Training Time (s)"].append(train_time)
        results["Accuracy"].append(acc)
        results["F-measure"].append(f1)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

In [9]:
print(results_df)

        Algorithm  Fold  Training Time (s)  Accuracy  F-measure
0         Log Reg     1           0.020999  0.915401   0.915188
1         Log Reg     2           0.013003  0.921739   0.921415
2         Log Reg     3           0.014000  0.928261   0.928004
3         Log Reg     4           0.013999  0.932609   0.932507
4         Log Reg     5           0.015001  0.908696   0.908782
5         Log Reg     6           0.014999  0.936957   0.936926
6         Log Reg     7           0.017002  0.934783   0.934507
7         Log Reg     8           0.014998  0.926087   0.925598
8         Log Reg     9           0.015005  0.939130   0.938489
9         Log Reg    10           0.014501  0.910870   0.910441
10            SVM     1           0.263499  0.919740   0.919450
11            SVM     2           0.265498  0.928261   0.928004
12            SVM     3           0.267501  0.939130   0.939010
13            SVM     4           0.260000  0.921739   0.921496
14            SVM     5           0.2705

In [17]:
# Reshape to 12.4 example
pivot_table = results_df.pivot(index="Fold", columns="Algorithm", values="Accuracy")

# Calculate averages and standard deviations for each algorithm
pivot_table.loc["avg"] = pivot_table.mean()
pivot_table.loc["stdev"] = pivot_table.std()

# Rename index
pivot_table.index.name = "Folds"
pivot_table.reset_index(inplace=True)

# Rename the columns for clarity (optional)
pivot_table.columns.name = None

# Display the final table
print(pivot_table)

    Folds   Log Reg  Random Forest       SVM
0       1  0.915401       0.954447  0.919740
1       2  0.921739       0.954348  0.928261
2       3  0.928261       0.960870  0.939130
3       4  0.932609       0.956522  0.921739
4       5  0.908696       0.947826  0.939130
5       6  0.936957       0.960870  0.947826
6       7  0.934783       0.945652  0.926087
7       8  0.926087       0.960870  0.936957
8       9  0.939130       0.952174  0.945652
9      10  0.910870       0.952174  0.934783
10    avg  0.925453       0.954575  0.933930
11  stdev  0.010365       0.005093  0.009155


## 3. Ranking Folds

In [31]:
# Add ranking to the results DataFrame
ranked_results = []

# Group by fold and calculate ranks for accuracy
for fold, group in results_df.groupby("Fold"):
    group = group.copy()
    group["Rank"] = group["Accuracy"].rank(ascending=False, method="average")  # Rank algorithms within each fold
    group["Accuracy (Rank)"] = group.apply(lambda x: f"{x['Accuracy']:.4f} ({int(x['Rank'])})", axis=1)
    ranked_results.append(group)

# Combine all folds back into a single DataFrame
ranked_results_df = pd.concat(ranked_results, ignore_index=True)

# Pivot to get the desired format
final_table = ranked_results_df.pivot(index="Fold", columns="Algorithm", values="Accuracy (Rank)")

# Calculate average ranks
rank_sums = ranked_results_df.groupby("Algorithm")["Rank"].sum()
avg_ranks = rank_sums / len(results_df["Fold"].unique())

# Compute rank sums for each algorithm
rank_sums = ranked_results_df.groupby("Algorithm")["Rank"].sum()

# Add average ranks as the last row
final_table.loc["avg rank"] = [
    f"{avg_ranks[algo]:.1f}" for algo in final_table.columns
]



In [19]:
print(rank_sums, avg_ranks)

Algorithm
Log Reg          28.0
Random Forest    10.0
SVM              22.0
Name: Rank, dtype: float64 Algorithm
Log Reg          2.8
Random Forest    1.0
SVM              2.2
Name: Rank, dtype: float64


## 4. Friedman Test

In [32]:
# Number of folds and classifiers
n_folds = len(results_df["Fold"].unique())
n_classifiers = len(results_df["Algorithm"].unique())

# Compute Friedman Test Statistic
chi_square_f = (12 / (n_folds * n_classifiers * (n_classifiers + 1))) * np.sum(rank_sums**2) - 3 * n_folds * (n_classifiers + 1)

# Degrees of freedom
df = n_classifiers - 1

# Output
print(final_table)
print(f"\nFriedman Test Statistic: {chi_square_f:.3f}")
print(f"Degrees of Freedom: {df}")

Algorithm     Log Reg Random Forest         SVM
Fold                                           
1          0.9154 (3)    0.9544 (1)  0.9197 (2)
2          0.9217 (3)    0.9543 (1)  0.9283 (2)
3          0.9283 (3)    0.9609 (1)  0.9391 (2)
4          0.9326 (2)    0.9565 (1)  0.9217 (3)
5          0.9087 (3)    0.9478 (1)  0.9391 (2)
6          0.9370 (3)    0.9609 (1)  0.9478 (2)
7          0.9348 (2)    0.9457 (1)  0.9261 (3)
8          0.9261 (3)    0.9609 (1)  0.9370 (2)
9          0.9391 (3)    0.9522 (1)  0.9457 (2)
10         0.9109 (3)    0.9522 (1)  0.9348 (2)
avg rank          2.8           1.0         2.2

Friedman Test Statistic: 16.800
Degrees of Freedom: 2
