## Assignment 2

Petter Eriksson , Oscar Andersson


## 1. Load and Preprocess Dataset


In [272]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from IPython.display import HTML
import time

# Load the dataset; the file has no header row, so columns are numbered 0..n-1
data_file = "spambase.data"
df = pd.read_csv(data_file, header=None)

# Features and target split: every column but the last is a feature, the last is the label
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Classifiers to compare. The random forest is seeded so that repeated runs of the
# notebook reproduce the same accuracy / F-measure tables.
classifiers = {
    "Log Reg": LogisticRegression(max_iter=500),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
}

## 2. Stratified ten-fold cross-validation tests


In [273]:
# Stratified 10-Fold Cross-Validation (fixed seed, so every classifier sees the same folds)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Metrics placeholders: the fold numbers plus one (initially empty) score list per classifier
fold_labels = list(range(1, skf.get_n_splits() + 1))
results_time = {"Fold": list(fold_labels), **{name: [] for name in classifiers}}
results_accuracy = {"Fold": list(fold_labels), **{name: [] for name in classifiers}}
results_fmeasure = {"Fold": list(fold_labels), **{name: [] for name in classifiers}}

for algo_name, clf in classifiers.items():
    for train_idx, test_idx in skf.split(X, y):
        # Re-fit the scaler on the training fold only, so statistics of the held-out
        # fold never influence preprocessing. Standardization is invariant to a prior
        # per-feature affine rescaling, so this is correct even when X has already been
        # standardized on the full dataset.
        fold_scaler = StandardScaler().fit(X[train_idx])
        X_train = fold_scaler.transform(X[train_idx])
        X_test = fold_scaler.transform(X[test_idx])
        y_train, y_test = y[train_idx], y[test_idx]

        # Train and time the classifier; perf_counter is a monotonic, high-resolution
        # clock intended for measuring short durations
        start_time = time.perf_counter()
        clf.fit(X_train, y_train)
        train_time = time.perf_counter() - start_time

        # Predictions
        y_pred = clf.predict(X_test)

        # Store results
        results_time[algo_name].append(train_time)
        results_accuracy[algo_name].append(accuracy_score(y_test, y_pred))
        results_fmeasure[algo_name].append(f1_score(y_test, y_pred, average="weighted"))


def append_avg_std(df):
    """Return a copy of ``df`` with two summary rows appended.

    Parameters
    ----------
    df : pd.DataFrame
        Per-fold results with a "Fold" label column and one numeric column per
        classifier.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` followed by a row labelled "avg" (column means) and a row
        labelled "stdev" (sample standard deviations, pandas default ``ddof=1``),
        with a fresh 0..n-1 index.
    """
    # Compute the statistics on the metric columns only: the "Fold" column is a label,
    # and writing a string into a numeric Series afterwards would force a dtype upcast.
    metrics = df.drop(columns="Fold")
    summary = pd.DataFrame(
        [
            {"Fold": "avg", **metrics.mean().to_dict()},
            {"Fold": "stdev", **metrics.std().to_dict()},
        ],
        columns=df.columns,
    )
    return pd.concat([df, summary], ignore_index=True)


# Convert results to DataFrames. The "Fold" column is cast to object so it can later
# hold the string labels of the summary rows next to the integer fold numbers.
# (No loop variable named `df` here: that name is the raw dataset loaded above.)
time_df = pd.DataFrame(results_time).astype({"Fold": object})
accuracy_df = pd.DataFrame(results_accuracy).astype({"Fold": object})
fmeasure_df = pd.DataFrame(results_fmeasure).astype({"Fold": object})

# Append average and standard deviation to the DataFrames
time_df = append_avg_std(time_df)
accuracy_df = append_avg_std(accuracy_df)
fmeasure_df = append_avg_std(fmeasure_df)

In [274]:
# Show every float with four decimals in the rendered tables
pd.options.display.float_format = "{:.4f}".format

# Render each metric table under its own heading, without the index column
for heading, table in (
    ("<h2>Training Time (s)</h2>", time_df),
    ("<h2>Accuracy</h2>", accuracy_df),
    ("<h2>F-measure</h2>", fmeasure_df),
):
    display(HTML(heading))
    display(HTML(table.to_html(index=False)))

Fold,Log Reg,SVM,Random Forest
1,0.0189,0.2266,0.6273
2,0.0123,0.2244,0.6384
3,0.0134,0.2551,0.6856
4,0.0129,0.2281,0.6448
5,0.0133,0.2282,0.6298
6,0.014,0.2258,0.6249
7,0.0132,0.2618,0.6344
8,0.0131,0.2482,0.6306
9,0.0135,0.2253,0.6264
10,0.0133,0.2265,0.6204


Fold,Log Reg,SVM,Random Forest
1,0.9154,0.9197,0.9479
2,0.9217,0.9283,0.9587
3,0.9283,0.9391,0.9609
4,0.9326,0.9217,0.9652
5,0.9087,0.9391,0.9413
6,0.937,0.9478,0.963
7,0.9348,0.9261,0.9522
8,0.9261,0.937,0.9522
9,0.9391,0.9457,0.9478
10,0.9109,0.9348,0.9435


Fold,Log Reg,SVM,Random Forest
1,0.9152,0.9195,0.9479
2,0.9214,0.928,0.9586
3,0.928,0.939,0.9608
4,0.9325,0.9215,0.9651
5,0.9088,0.939,0.9412
6,0.9369,0.9477,0.9629
7,0.9345,0.9258,0.9521
8,0.9256,0.9367,0.9521
9,0.9385,0.9453,0.9476
10,0.9104,0.9344,0.9432


## 3. Ranking Folds


In [275]:
def get_ranks(df: pd.DataFrame, reverse=False):
    """Rank the classifiers within each fold and build a "score (rank)" table.

    Parameters
    ----------
    df : pd.DataFrame
        Result table with a "Fold" column and one score column per classifier. The
        last two rows are dropped before ranking (they are expected to be the
        "avg" / "stdev" summary rows).
    reverse : bool, default False
        Passed to ``DataFrame.rank`` as ``ascending``. With the default ``False`` the
        highest score in a fold gets rank 1 (accuracy, F-measure); with ``True`` the
        lowest value gets rank 1 (training time).

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(ranks, table)``. ``ranks`` holds one row per fold and one column per
        classifier; tied scores share the average of their ranks (e.g. 1.5), as the
        Friedman test requires. ``table`` has a "Data set" column, each score
        formatted as ``"0.1234 (rank)"``, and a final "avg rank" row.
    """
    fold_rows = df.iloc[:-2].reset_index(drop=True)
    algo_cols = [col for col in fold_rows.columns if col != "Fold"]
    # The score columns may be object dtype if summary rows were appended; force floats
    scores = fold_rows[algo_cols].astype(float)

    # Rank across classifiers (axis=1) for all folds at once. Ranks stay floats so a
    # tie keeps its average rank instead of being truncated to an integer.
    ranked_results = scores.rank(axis=1, method="average", ascending=reverse)

    # Calculate average rank per classifier
    avg_ranks = ranked_results.mean()

    # Format the scores to include the ranks in parentheses ("g" prints 1.0 as "1")
    merged_results = fold_rows.copy()
    for col in algo_cols:
        merged_results[col] = [
            f"{score:.4f} ({rank:g})" for score, rank in zip(scores[col], ranked_results[col])
        ]

    # Add average ranks as the last row, keyed by column name so the position of the
    # "Fold" column does not matter
    avg_row = pd.DataFrame(
        [{"Fold": "avg rank", **{col: f"{avg_ranks[col]:.1f}" for col in algo_cols}}],
        columns=fold_rows.columns,
    )
    merged_results = pd.concat([merged_results, avg_row])

    return ranked_results, merged_results.rename(columns={"Fold": "Data set"})


# Training time is ranked ascending (reverse=True); accuracy and F-measure use the default
time_ranks, time_result = get_ranks(df=time_df, reverse=True)
accuracy_ranks, accuracy_result = get_ranks(df=accuracy_df, reverse=False)
fmeasure_ranks, fmeasure_result = get_ranks(df=fmeasure_df, reverse=False)

In [276]:
# Render each ranked table under its own heading, without the index column
for heading, table in (
    ("<h2>Training Time (s)</h2>", time_result),
    ("<h2>Accuracy</h2>", accuracy_result),
    ("<h2>F-measure</h2>", fmeasure_result),
):
    display(HTML(heading))
    display(HTML(table.to_html(index=False)))

Data set,Log Reg,SVM,Random Forest
1,0.0189 (1),0.2266 (2),0.6273 (3)
2,0.0123 (1),0.2244 (2),0.6384 (3)
3,0.0134 (1),0.2551 (2),0.6856 (3)
4,0.0129 (1),0.2281 (2),0.6448 (3)
5,0.0133 (1),0.2282 (2),0.6298 (3)
6,0.0140 (1),0.2258 (2),0.6249 (3)
7,0.0132 (1),0.2618 (2),0.6344 (3)
8,0.0131 (1),0.2482 (2),0.6306 (3)
9,0.0135 (1),0.2253 (2),0.6264 (3)
10,0.0133 (1),0.2265 (2),0.6204 (3)


Data set,Log Reg,SVM,Random Forest
1,0.9154 (3),0.9197 (2),0.9479 (1)
2,0.9217 (3),0.9283 (2),0.9587 (1)
3,0.9283 (3),0.9391 (2),0.9609 (1)
4,0.9326 (2),0.9217 (3),0.9652 (1)
5,0.9087 (3),0.9391 (2),0.9413 (1)
6,0.9370 (3),0.9478 (2),0.9630 (1)
7,0.9348 (2),0.9261 (3),0.9522 (1)
8,0.9261 (3),0.9370 (2),0.9522 (1)
9,0.9391 (3),0.9457 (2),0.9478 (1)
10,0.9109 (3),0.9348 (2),0.9435 (1)


Data set,Log Reg,SVM,Random Forest
1,0.9152 (3),0.9195 (2),0.9479 (1)
2,0.9214 (3),0.9280 (2),0.9586 (1)
3,0.9280 (3),0.9390 (2),0.9608 (1)
4,0.9325 (2),0.9215 (3),0.9651 (1)
5,0.9088 (3),0.9390 (2),0.9412 (1)
6,0.9369 (3),0.9477 (2),0.9629 (1)
7,0.9345 (2),0.9258 (3),0.9521 (1)
8,0.9256 (3),0.9367 (2),0.9521 (1)
9,0.9385 (3),0.9453 (2),0.9476 (1)
10,0.9104 (3),0.9344 (2),0.9432 (1)


## 4. Friedman Test


In [277]:
def compute_friedman_test(ranked_results, n_datasets, n_classifiers):
    """Compute the Friedman chi-square statistic from a table of per-fold ranks.

    Uses 12 / (N * k * (k + 1)) * sum_j(R_j ** 2) - 3 * N * (k + 1), where R_j is the
    column sum of ``ranked_results`` for classifier j, N is ``n_datasets`` and k is
    ``n_classifiers``.

    Parameters
    ----------
    ranked_results : pd.DataFrame
        One row per data set (fold), one column of ranks per classifier.
    n_datasets : int
        Number of data sets N.
    n_classifiers : int
        Number of classifiers k.

    Returns
    -------
    float
        The statistic rounded to one decimal place.
    """
    k_plus_one = n_classifiers + 1

    # Sum of squared per-classifier rank totals
    squared_total = np.sum(ranked_results.sum() ** 2)

    scale = 12 / (n_datasets * n_classifiers * k_plus_one)
    offset = 3 * n_datasets * k_plus_one
    statistic = scale * squared_total - offset

    return round(statistic, 1)

In [278]:
# Each rank table has one row per fold and one column per classifier, so its shape
# supplies (n_datasets, n_classifiers) directly instead of hard-coded counts.
statistic_time = compute_friedman_test(time_ranks, *time_ranks.shape)
print("Friedman statistic for time taken:", statistic_time)
statistic_accuracy = compute_friedman_test(accuracy_ranks, *accuracy_ranks.shape)
print("Friedman statistic for accuracy:", statistic_accuracy)
statistic_fmeasure = compute_friedman_test(fmeasure_ranks, *fmeasure_ranks.shape)
print("Friedman statistic for F-measure:", statistic_fmeasure)

Friedman statistic for time taken: 20.0
Friedman statistic for accuracy: 16.8
Friedman statistic for F-measure: 16.8


The critical value for alpha=0.05 and k=3 is 7.8. All three statistics exceed this value, so we reject the null hypothesis that the algorithms perform equally for training time, accuracy, and F-measure.


## 5. Nemenyi test


In [279]:
# Nemenyi critical distance: CD = q_alpha * sqrt(k * (k + 1) / (6 * N))
q_alpha = 2.343  # NOTE(review): presumably the Nemenyi critical value for k=3 at alpha=0.05 — confirm against the table used
n_classifiers = 3  # k
n_folds = 10  # N
critical_distance = q_alpha * ((n_classifiers * (n_classifiers + 1)) / (6 * n_folds)) ** 0.5
print("Critical distance:", round(critical_distance, 4))

# The last row of each result table is the "avg rank" row; keep only the classifier columns
time_differences = time_result[-1:].drop(columns="Data set")
accuracy_differences = accuracy_result[-1:].drop(columns="Data set")
fmeasure_differences = fmeasure_result[-1:].drop(columns="Data set")


def compare_columns(df, threshold=None):
    """Print each pair of columns whose first-row values differ by more than a threshold.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose first row holds one value per column; each value must be
        convertible with ``float()`` (numeric strings are accepted).
    threshold : float, optional
        Minimum absolute difference to report. When omitted, the notebook-level
        ``critical_distance`` is used, so existing one-argument calls are unchanged.

    Returns
    -------
    None
        One line is printed per pair whose difference exceeds the threshold; pairs
        at or below it are skipped silently.
    """
    if threshold is None:
        # Fall back to the global, which must be defined before this call
        threshold = critical_distance

    columns = df.columns
    values = [float(value) for value in df.iloc[0]]

    # Visit every unordered pair of columns exactly once
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            diff = abs(values[i] - values[j])
            if diff > threshold:
                print(f"Difference between {columns[i]} and {columns[j]}: {round(diff, 2)}")


# Report, per metric, the classifier pairs whose average ranks differ significantly
for heading, avg_rank_row in (
    ("\nTime differences exceeding critical distance:", time_differences),
    ("\nAccuracy differences exceeding critical distance:", accuracy_differences),
    ("\nF-measure differences exceeding critical distance:", fmeasure_differences),
):
    print(heading)
    compare_columns(avg_rank_row)

Critial distance: 1.0478

Time differences exceeding critical distance:
Difference between Log Reg and Random Forest: 2.0

Accuracy differences exceeding critical distance:
Difference between Log Reg and Random Forest: 1.8
Difference between SVM and Random Forest: 1.2

F-measure differences exceeding critical distance:
Difference between Log Reg and Random Forest: 1.8
Difference between SVM and Random Forest: 1.2
