## Assignment 2

Petter Eriksson, Oscar Andersson


## 1. Load and Preprocess Dataset


In [135]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from IPython.display import HTML
import time

# Load the dataset. The file is read without a header row, so columns are
# addressed purely by position.
data_file = "spambase.data"
df = pd.read_csv(data_file, header=None)

# Features and target split: the last column is the label, everything before
# it is a feature.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Standardize features.
# NOTE(review): the scaler is fitted on the full dataset before the folds are
# created, so test-fold statistics influence the training data. Consider
# fitting the scaler inside each fold (e.g. via a Pipeline) if strict
# separation is required.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Classifiers. The random forest is seeded so that a fresh
# "Restart & Run All" reproduces the same scores; the seed matches the one
# used for the stratified fold split.
classifiers = {
    "Log Reg": LogisticRegression(max_iter=500),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
}

## 2. Stratified ten-fold cross-validation tests


In [136]:
# Metrics placeholders: a "Fold" column plus one list per classifier. The keys
# are derived from `classifiers` so the tables cannot drift out of sync with
# the models that are actually evaluated.
results_time = {"Fold": [], **{name: [] for name in classifiers}}
results_accuracy = {"Fold": [], **{name: [] for name in classifiers}}
results_fmeasure = {"Fold": [], **{name: [] for name in classifiers}}

# Stratified 10-Fold Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Materialise the splits once so every classifier is evaluated on exactly the
# same folds and the fold numbers can be recorded up front.
folds = list(skf.split(X, y))
for results in (results_time, results_accuracy, results_fmeasure):
    results["Fold"] = list(range(1, len(folds) + 1))

for algo_name, clf in classifiers.items():
    for fold, (train_idx, test_idx) in enumerate(folds, start=1):
        # Split data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Train and time the classifier. perf_counter() is a monotonic,
        # high-resolution clock meant for measuring short intervals, unlike
        # the wall clock returned by time.time().
        start_time = time.perf_counter()
        clf.fit(X_train, y_train)
        train_time = time.perf_counter() - start_time

        # Predictions
        y_pred = clf.predict(X_test)

        # Calculate metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")

        # Store results
        results_time[algo_name].append(train_time)
        results_accuracy[algo_name].append(acc)
        results_fmeasure[algo_name].append(f1)


def append_avg_std(df):
    """Return a copy of ``df`` with an "avg" row and a "stdev" row appended.

    The statistics are computed column-wise over every column except
    "Fold", which is treated as a row label. Keeping the label column out
    of the computation means non-numeric fold labels cannot break
    ``mean``/``std``.

    Parameters
    ----------
    df : pd.DataFrame
        One row per fold; a "Fold" label column plus numeric score columns.

    Returns
    -------
    pd.DataFrame
        The original rows followed by two rows whose "Fold" values are
        "avg" and "stdev". The index is renumbered from 0 and ``df`` itself
        is left unmodified.
    """
    scores = df.drop(columns="Fold", errors="ignore")
    summary = pd.DataFrame([scores.mean(), scores.std()])
    summary["Fold"] = ["avg", "stdev"]
    # concat aligns on column names, so the column order of ``df`` is kept.
    return pd.concat([df, summary], ignore_index=True)


# Convert results to DataFrames. "Fold" is cast to object so the column can
# hold both the fold numbers and the "avg"/"stdev" labels that
# append_avg_std() adds. The conversion is chained instead of looping with a
# variable named `df`, which would rebind the notebook-level `df` that holds
# the raw dataset.
time_df = pd.DataFrame(results_time).astype({"Fold": object})
accuracy_df = pd.DataFrame(results_accuracy).astype({"Fold": object})
fmeasure_df = pd.DataFrame(results_fmeasure).astype({"Fold": object})

# Append average and standard deviation to the DataFrames
time_df = append_avg_std(time_df)
accuracy_df = append_avg_std(accuracy_df)
fmeasure_df = append_avg_std(fmeasure_df)

In [137]:
# Show floats with four decimals in pandas output.
pd.set_option("display.float_format", "{:.4f}".format)

# One heading followed by one index-free HTML table per metric.
summary_tables = (
    ("Training Time (s)", time_df),
    ("Accuracy", accuracy_df),
    ("F-measure", fmeasure_df),
)
for heading, table in summary_tables:
    display(HTML(f"<h2>{heading}</h2>"))
    display(HTML(table.to_html(index=False)))

Fold,Log Reg,SVM,Random Forest
1,0.0167,0.1737,0.5504
2,0.0097,0.163,0.4786
3,0.0096,0.1836,0.4709
4,0.0093,0.1913,0.4786
5,0.0099,0.1682,0.5284
6,0.0111,0.1634,0.4965
7,0.0102,0.1677,0.4735
8,0.0098,0.1896,0.4907
9,0.0104,0.162,0.4853
10,0.0103,0.1647,0.5087


Fold,Log Reg,SVM,Random Forest
1,0.9154,0.9197,0.9523
2,0.9217,0.9283,0.9543
3,0.9283,0.9391,0.9652
4,0.9326,0.9217,0.9609
5,0.9087,0.9391,0.9435
6,0.937,0.9478,0.963
7,0.9348,0.9261,0.9543
8,0.9261,0.937,0.9543
9,0.9391,0.9457,0.9478
10,0.9109,0.9348,0.9478


Fold,Log Reg,SVM,Random Forest
1,0.9152,0.9195,0.9522
2,0.9214,0.928,0.9541
3,0.928,0.939,0.9652
4,0.9325,0.9215,0.9607
5,0.9088,0.939,0.9435
6,0.9369,0.9477,0.9629
7,0.9345,0.9258,0.9542
8,0.9256,0.9367,0.9543
9,0.9385,0.9453,0.9477
10,0.9104,0.9344,0.9475


## 3. Ranking Folds


In [212]:
def get_ranks(df: pd.DataFrame, reverse=False, n_summary_rows: int = 2):
    """Rank the classifiers within every fold and build a printable table.

    Parameters
    ----------
    df : pd.DataFrame
        One row per fold with a "Fold" label column and one numeric score
        column per classifier, followed by ``n_summary_rows`` trailing
        summary rows that are excluded from the ranking.
    reverse : bool, default False
        Forwarded to ``DataFrame.rank`` as ``ascending``. With False the
        largest value in a fold gets rank 1; with True the smallest value
        gets rank 1.
    n_summary_rows : int, default 2
        Number of trailing rows to strip before ranking.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(ranks, table)``. ``ranks`` holds the per-fold ranks, one column
        per classifier, as floats: tied scores share their average rank
        (e.g. 1.5), which would be lost if the ranks were cast to int.
        ``table`` has the same columns as ``df`` with every score rendered
        as ``"score (rank)"`` and a final "avg rank" row.

    Raises
    ------
    ValueError
        If ``n_summary_rows`` is negative.
    """
    if n_summary_rows < 0:
        raise ValueError("n_summary_rows must be non-negative")

    # Strip the trailing summary rows. The explicit end index also handles
    # n_summary_rows == 0, where ``iloc[:-0]`` would return nothing.
    n_folds = max(len(df) - n_summary_rows, 0)
    ranks_df = df.iloc[:n_folds].reset_index(drop=True)
    score_cols = [col for col in ranks_df.columns if col != "Fold"]
    scores = ranks_df[score_cols].astype(float)

    # rank(axis=1) already works row by row, so no per-fold grouping is
    # needed. Ties receive the average of the ranks they span.
    ranked_results = scores.rank(axis=1, ascending=reverse)

    # Calculate average rank per classifier
    avg_ranks = ranked_results.mean()

    # Format the scores to include the ranks in parentheses; ":g" prints
    # whole ranks as "1" and tied ranks as "1.5".
    merged_results = ranks_df.copy()
    for col in score_cols:
        merged_results[col] = [
            f"{score:.4f} ({rank:g})" for score, rank in zip(scores[col], ranked_results[col])
        ]

    # Add average ranks as the last row
    avg_row = {"Fold": "avg rank", **{col: f"{avg_ranks[col]:.1f}" for col in score_cols}}
    merged_results = pd.concat(
        [merged_results, pd.DataFrame([avg_row], columns=ranks_df.columns)],
        ignore_index=True,
    )
    return ranked_results, merged_results


# Training time: reverse=True ranks in ascending order, so the fastest
# classifier in a fold gets rank 1.
time_ranks, time_result = get_ranks(df=time_df, reverse=True)

# Accuracy and F-measure: reverse=False ranks in descending order, so the
# highest score in a fold gets rank 1.
accuracy_ranks, accuracy_result = get_ranks(df=accuracy_df, reverse=False)
fmeasure_ranks, fmeasure_result = get_ranks(df=fmeasure_df, reverse=False)

In [213]:
# One heading followed by one index-free HTML table per ranked metric.
ranked_tables = (
    ("Training Time (s)", time_result),
    ("Accuracy", accuracy_result),
    ("F-measure", fmeasure_result),
)
for heading, table in ranked_tables:
    display(HTML(f"<h2>{heading}</h2>"))
    display(HTML(table.to_html(index=False)))

Fold,Log Reg,SVM,Random Forest
1,0.0167 (1),0.1737 (2),0.5504 (3)
2,0.0097 (1),0.1630 (2),0.4786 (3)
3,0.0096 (1),0.1836 (2),0.4709 (3)
4,0.0093 (1),0.1913 (2),0.4786 (3)
5,0.0099 (1),0.1682 (2),0.5284 (3)
6,0.0111 (1),0.1634 (2),0.4965 (3)
7,0.0102 (1),0.1677 (2),0.4735 (3)
8,0.0098 (1),0.1896 (2),0.4907 (3)
9,0.0104 (1),0.1620 (2),0.4853 (3)
10,0.0103 (1),0.1647 (2),0.5087 (3)


Fold,Log Reg,SVM,Random Forest
1,0.9154 (3),0.9197 (2),0.9523 (1)
2,0.9217 (3),0.9283 (2),0.9543 (1)
3,0.9283 (3),0.9391 (2),0.9652 (1)
4,0.9326 (2),0.9217 (3),0.9609 (1)
5,0.9087 (3),0.9391 (2),0.9435 (1)
6,0.9370 (3),0.9478 (2),0.9630 (1)
7,0.9348 (2),0.9261 (3),0.9543 (1)
8,0.9261 (3),0.9370 (2),0.9543 (1)
9,0.9391 (3),0.9457 (2),0.9478 (1)
10,0.9109 (3),0.9348 (2),0.9478 (1)


Fold,Log Reg,SVM,Random Forest
1,0.9152 (3),0.9195 (2),0.9522 (1)
2,0.9214 (3),0.9280 (2),0.9541 (1)
3,0.9280 (3),0.9390 (2),0.9652 (1)
4,0.9325 (2),0.9215 (3),0.9607 (1)
5,0.9088 (3),0.9390 (2),0.9435 (1)
6,0.9369 (3),0.9477 (2),0.9629 (1)
7,0.9345 (2),0.9258 (3),0.9542 (1)
8,0.9256 (3),0.9367 (2),0.9543 (1)
9,0.9385 (3),0.9453 (2),0.9477 (1)
10,0.9104 (3),0.9344 (2),0.9475 (1)


## 4. Friedman Test


In [220]:
def compute_friedman_test(ranked_results, n_folds=None, n_classifiers=None):
    """Compute the Friedman chi-square statistic from a table of ranks.

    The statistic is

        12 / (N * k * (k + 1)) * sum_j(R_j ** 2) - 3 * N * (k + 1)

    where ``R_j`` is the sum of the ranks in column ``j``, ``N`` is
    ``n_folds`` and ``k`` is ``n_classifiers``. No correction for tied
    ranks is applied.

    Parameters
    ----------
    ranked_results : pd.DataFrame or 2-D array-like
        Ranks with one row per fold and one column per classifier.
    n_folds : int, optional
        Number of folds. Defaults to the number of rows.
    n_classifiers : int, optional
        Number of classifiers. Defaults to the number of columns.

    Returns
    -------
    float
        The statistic rounded to one decimal place.

    Raises
    ------
    ValueError
        If ``ranked_results`` is not two-dimensional or a dimension is not
        positive.
    """
    ranks = np.asarray(ranked_results, dtype=float)
    if ranks.ndim != 2:
        raise ValueError("ranked_results must be two-dimensional (folds x classifiers)")

    # Fall back to the table's own shape when the caller gives no sizes.
    if n_folds is None:
        n_folds = ranks.shape[0]
    if n_classifiers is None:
        n_classifiers = ranks.shape[1]
    if n_folds <= 0 or n_classifiers <= 0:
        raise ValueError("n_folds and n_classifiers must be positive")

    # Compute rank sums for each algorithm (column-wise)
    rank_sums = ranks.sum(axis=0)

    # Compute Friedman Test Statistic
    chi_square_f = (12 / (n_folds * n_classifiers * (n_classifiers + 1))) * np.sum(rank_sums**2) - 3 * n_folds * (
        n_classifiers + 1
    )

    return round(chi_square_f, 1)

In [221]:
# Friedman statistic per metric. The fold and classifier counts are read from
# each rank table (rows x columns) instead of being hard-coded, and every
# message names the metric it actually reports.
statistic_time = compute_friedman_test(time_ranks, *time_ranks.shape)
print("Friedman statistic for time taken:", statistic_time)
statistic_accuracy = compute_friedman_test(accuracy_ranks, *accuracy_ranks.shape)
print("Friedman statistic for accuracy:", statistic_accuracy)
statistic_fmeasure = compute_friedman_test(fmeasure_ranks, *fmeasure_ranks.shape)
print("Friedman statistic for F-measure:", statistic_fmeasure)

Friedman statistic for time taken: 20.0
Friedman statistic for time taken: 16.8
Friedman statistic for time taken: 16.8
