In [16]:
# Authors: Maria DaRocha (300399718), William Huang (300653623)
# AIML427: Group Project (A3)

# APACHE SPARK POST-PROCESSING CODE:
# (Paired t-Test / Wilcoxon / Data Visualisation)
#

# Training Results

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from scipy.stats import ttest_rel, wilcoxon
import warnings
warnings.filterwarnings("ignore")

# Read the raw results log into memory, one list entry per line
# (line endings are kept, exactly as the file object yields them).
with open("AllResultsFinal") as f:
    lines = list(f)

# Function to extract a metric from seed blocks
def extract_seed_metric(metric_name, source_lines=None):
    """Collect every value of one metric from the seed-delimited results log.

    The log is scanned top to bottom. A line that starts with ``SEED`` and
    contains ``SEED <digits>`` opens a new block: the digits become the current
    seed, and the model is ``"LR"`` when that line contains ``LOGISTIC`` and
    ``"DT"`` otherwise. A line that starts with ``<metric_name>:`` contributes
    one record tagged with the current seed and model.

    Parameters
    ----------
    metric_name : str
        Metric label exactly as it appears at the start of a line, without
        the trailing colon (e.g. ``"train_accuracy"``).
    source_lines : iterable of str, optional
        Lines to parse. When omitted, the notebook-level ``lines`` list is
        used, which keeps existing one-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per parsed value with columns ``seed``, ``model``, ``metric``
        and ``value``. The columns are present even when nothing matched, so
        callers can filter on ``model`` without hitting a ``KeyError``.
    """
    if source_lines is None:
        source_lines = lines

    columns = ["seed", "model", "metric", "value"]
    prefix = metric_name + ":"
    records = []
    model = None
    seed = None

    for line in source_lines:
        if line.startswith("SEED"):
            match = re.search(r"SEED (\d+)", line)
            if match:
                seed = int(match.group(1))
                model = "LR" if "LOGISTIC" in line else "DT"
        elif line.startswith(prefix):
            try:
                val = float(line.split(":")[1].strip())
            except (IndexError, ValueError):
                # Metric line without a parseable number: skip it.
                continue
            records.append({"seed": seed, "model": model, "metric": metric_name, "value": val})

    # Passing `columns` fixes the schema for the empty case as well.
    return pd.DataFrame(records, columns=columns)

# Training metrics to pull from the results log. Each metric is parsed into
# its own frame, then all frames are stacked into one long-format table.
metrics = [
    "train_accuracy",
    "train_precision",
    "train_recall",
    "train_f1",
    "train_roc_auc",
]
frames = list(map(extract_seed_metric, metrics))
df = pd.concat(frames, ignore_index=True)

# One figure per metric: a box plot for the spread per model, with the
# individual per-seed results drawn on top as jittered points.
# NOTE(review): newer seaborn releases reportedly deprecate `palette` without
# `hue`; confirm against the installed version before upgrading.
sns.set(style="whitegrid", context="notebook", font_scale=1.1)
for metric in metrics:
    pretty_name = metric.replace("_", " ").title()
    df_metric = df[df.metric == metric]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df_metric, x="model", y="value", palette="pastel", ax=ax)
    sns.stripplot(
        data=df_metric,
        x="model",
        y="value",
        color="black",
        size=5,
        jitter=0.15,
        alpha=0.7,
        ax=ax,
    )

    ax.set_title(f"Comparison of {pretty_name} Across Models")
    ax.set_ylabel(pretty_name)
    ax.set_xlabel("Model")
    ax.set_ylim(0.8, 1.05)

    fig.tight_layout()
    fig.savefig(f"plot_{metric}.png")
    # Close each figure so the loop does not accumulate open figures.
    plt.close(fig)


# --- Mean Metrics ---

# Average every metric per model: rows are models, columns are metrics.
summary = df.groupby(["model", "metric"])["value"].mean().unstack()
categories = list(summary.columns)
labels = list(summary.index)
num_vars = len(categories)

# One spoke per metric, evenly spaced around the circle. The first angle is
# repeated at the end so each model's outline closes on itself.
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles = angles + angles[:1]

# Radar chart on a polar axis: one outlined, lightly filled polygon per model.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, polar=True)

for label in labels:
    values = summary.loc[label].tolist()
    values = values + values[:1]  # close the polygon, matching `angles`
    ax.plot(angles, values, label=label)
    ax.fill(angles, values, alpha=0.1)

ax.set_title("Average Training Metric Comparison (Radar Chart)", size=15)
ax.set_xticks(angles[:-1])  # skip the duplicated closing angle
ax.set_xticklabels([m.replace("_", " ").title() for m in categories])
ax.set_yticks([0.85, 0.9, 0.95, 1.0])
ax.set_ylim(0.85, 1.0)
ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
fig.tight_layout()
fig.savefig("radar_train_metrics.png")
plt.close(fig)

# Perform and display significance tests (DT vs LR, paired by seed)
for metric in metrics:
    # Reuse the values already parsed into `df` instead of re-scanning the
    # whole results log once per metric.
    vals = df[df["metric"] == metric]
    dt_vals = vals.loc[vals["model"] == "DT", ["seed", "value"]]
    lr_vals = vals.loc[vals["model"] == "LR", ["seed", "value"]]

    # Both tests below pair observations by position, so align the two models
    # explicitly on seed. `validate` raises if a seed occurs more than once
    # for a model, because the pairing would then be ambiguous.
    paired = dt_vals.merge(
        lr_vals, on="seed", suffixes=("_dt", "_lr"), validate="one_to_one"
    ).sort_values("seed")
    dt_scores = paired["value_dt"]
    lr_scores = paired["value_lr"]
    unpaired = len(dt_vals) + len(lr_vals) - 2 * len(paired)

    print(f"\n{metric.upper()}")
    if unpaired:
        print(f"  Note: {unpaired} result(s) had no same-seed counterpart and were excluded")
    if len(paired) < 2:
        print("  Fewer than two seed-matched DT/LR pairs; significance tests skipped")
        continue

    # Paired t-Test
    t_stat, p_val = ttest_rel(dt_scores, lr_scores)
    # Wilcoxon signed-rank test (non-parametric alternative).
    #  Authors' note: the normality assumption is not considered violated;
    #  this test is included only for assignment completeness.
    w_stat, p_val_w = wilcoxon(dt_scores, lr_scores)

    print(f"  DT Mean: {dt_scores.mean():.4f}, Std: {dt_scores.std():.4f}")
    print(f"  LR Mean: {lr_scores.mean():.4f}, Std: {lr_scores.std():.4f}")
    print(f"  t-statistic: {t_stat:.4f}, p-value: {p_val:.6f}")
    print(f"  Wilcoxon p-value:     {p_val_w:.4f}")
    # The significance verdict is based on the paired t-test at alpha = 0.05.
    print("  Statistically significant?" + (" YES" if p_val < 0.05 else " NO"))

print("\nPlots saved as plot_<metric>.png")



TRAIN_ACCURACY
  DT Mean: 0.9523, Std: 0.0007
  LR Mean: 0.9301, Std: 0.0010
  t-statistic: 63.9096, p-value: 0.000000
  Wilcoxon p-value:     0.0020
  Statistically significant? YES

TRAIN_PRECISION
  DT Mean: 0.9524, Std: 0.0007
  LR Mean: 0.9307, Std: 0.0010
  t-statistic: 62.7084, p-value: 0.000000
  Wilcoxon p-value:     0.0020
  Statistically significant? YES

TRAIN_RECALL
  DT Mean: 0.9523, Std: 0.0007
  LR Mean: 0.9301, Std: 0.0010
  t-statistic: 63.9096, p-value: 0.000000
  Wilcoxon p-value:     0.0020
  Statistically significant? YES

TRAIN_F1
  DT Mean: 0.9523, Std: 0.0007
  LR Mean: 0.9301, Std: 0.0010
  t-statistic: 63.8642, p-value: 0.000000
  Wilcoxon p-value:     0.0020
  Statistically significant? YES

TRAIN_ROC_AUC
  DT Mean: 0.9521, Std: 0.0020
  LR Mean: 0.9806, Std: 0.0003
  t-statistic: -49.8908, p-value: 0.000000
  Wilcoxon p-value:     0.0020
  Statistically significant? YES

Plots saved as plot_<metric>.png


# Testing Results

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from scipy.stats import ttest_rel, wilcoxon
import warnings
warnings.filterwarnings("ignore")

# Read the raw results log into memory, one list entry per line
# (line endings are kept, exactly as the file object yields them).
with open("AllResultsFinal") as f:
    lines = list(f)

# Function to extract a metric from seed blocks
def extract_seed_metric(metric_name, source_lines=None):
    """Parse all occurrences of a single metric out of the results log.

    Lines are read in order. A line beginning with ``SEED`` that matches
    ``SEED <digits>`` starts a new block: the digits are stored as the current
    seed, and the current model becomes ``"LR"`` if the line contains
    ``LOGISTIC``, else ``"DT"``. Every later line beginning with
    ``<metric_name>:`` yields one record labelled with that seed and model.

    Parameters
    ----------
    metric_name : str
        Metric label as written at the start of a log line, without the colon
        (e.g. ``"test_accuracy"``).
    source_lines : iterable of str, optional
        Lines to parse. Defaults to the notebook-level ``lines`` list, so
        existing one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``seed``, ``model``, ``metric``, ``value``; one row per parsed
        value. The columns exist even if no line matched, so filtering on
        ``model`` never raises ``KeyError``.
    """
    if source_lines is None:
        source_lines = lines

    columns = ["seed", "model", "metric", "value"]
    prefix = metric_name + ":"
    records = []
    model = None
    seed = None

    for line in source_lines:
        if line.startswith("SEED"):
            match = re.search(r"SEED (\d+)", line)
            if match:
                seed = int(match.group(1))
                model = "LR" if "LOGISTIC" in line else "DT"
        elif line.startswith(prefix):
            try:
                val = float(line.split(":")[1].strip())
            except (IndexError, ValueError):
                # Metric line without a parseable number: skip it.
                continue
            records.append({"seed": seed, "model": model, "metric": metric_name, "value": val})

    # Passing `columns` fixes the schema for the empty case as well.
    return pd.DataFrame(records, columns=columns)

# Testing metrics to pull from the results log. Each metric is parsed into
# its own frame, then all frames are stacked into one long-format table.
# This rebinds `metrics` and `df`, replacing whatever an earlier cell left.
metrics = [
    "test_accuracy",
    "test_precision",
    "test_recall",
    "test_f1",
    "test_roc_auc",
]
frames = list(map(extract_seed_metric, metrics))
df = pd.concat(frames, ignore_index=True)

# One figure per metric: a box plot for the spread per model, with the
# individual per-seed results drawn on top as jittered points.
# NOTE(review): newer seaborn releases reportedly deprecate `palette` without
# `hue`; confirm against the installed version before upgrading.
sns.set(style="whitegrid", context="notebook", font_scale=1.1)
for metric in metrics:
    pretty_name = metric.replace("_", " ").title()
    df_metric = df[df.metric == metric]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df_metric, x="model", y="value", palette="pastel", ax=ax)
    sns.stripplot(
        data=df_metric,
        x="model",
        y="value",
        color="black",
        size=5,
        jitter=0.15,
        alpha=0.7,
        ax=ax,
    )

    ax.set_title(f"Comparison of {pretty_name} Across Models")
    ax.set_ylabel(pretty_name)
    ax.set_xlabel("Model")
    ax.set_ylim(0.8, 1.05)

    fig.tight_layout()
    fig.savefig(f"plot_{metric}.png")
    # Close each figure so the loop does not accumulate open figures.
    plt.close(fig)


# --- Mean Metrics ---

# Average every metric per model: rows are models, columns are metrics.
summary = df.groupby(["model", "metric"])["value"].mean().unstack()
categories = list(summary.columns)
labels = list(summary.index)
num_vars = len(categories)

# One spoke per metric, evenly spaced around the circle. The first angle is
# repeated at the end so each model's outline closes on itself.
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles = angles + angles[:1]

# Radar chart on a polar axis: one outlined, lightly filled polygon per model.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, polar=True)

for label in labels:
    values = summary.loc[label].tolist()
    values = values + values[:1]  # close the polygon, matching `angles`
    ax.plot(angles, values, label=label)
    ax.fill(angles, values, alpha=0.1)

ax.set_title("Average Testing Metric Comparison (Radar Chart)", size=15)
ax.set_xticks(angles[:-1])  # skip the duplicated closing angle
ax.set_xticklabels([m.replace("_", " ").title() for m in categories])
ax.set_yticks([0.85, 0.9, 0.95, 1.0])
ax.set_ylim(0.85, 1.0)
ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
fig.tight_layout()
fig.savefig("radar_test_metrics.png")
plt.close(fig)

# Perform and display significance tests (DT vs LR, paired by seed)
for metric in metrics:
    # Reuse the values already parsed into `df` instead of re-scanning the
    # whole results log once per metric.
    vals = df[df["metric"] == metric]
    dt_vals = vals.loc[vals["model"] == "DT", ["seed", "value"]]
    lr_vals = vals.loc[vals["model"] == "LR", ["seed", "value"]]

    # Both tests below pair observations by position, so align the two models
    # explicitly on seed. `validate` raises if a seed occurs more than once
    # for a model, because the pairing would then be ambiguous.
    paired = dt_vals.merge(
        lr_vals, on="seed", suffixes=("_dt", "_lr"), validate="one_to_one"
    ).sort_values("seed")
    dt_scores = paired["value_dt"]
    lr_scores = paired["value_lr"]
    unpaired = len(dt_vals) + len(lr_vals) - 2 * len(paired)

    print(f"\n{metric.upper()}")
    if unpaired:
        print(f"  Note: {unpaired} result(s) had no same-seed counterpart and were excluded")
    if len(paired) < 2:
        print("  Fewer than two seed-matched DT/LR pairs; significance tests skipped")
        continue

    # Paired t-Test
    t_stat, p_val = ttest_rel(dt_scores, lr_scores)
    # Wilcoxon signed-rank test (non-parametric alternative).
    #  Authors' note: the normality assumption is not considered violated;
    #  this test is included only for assignment completeness.
    w_stat, p_val_w = wilcoxon(dt_scores, lr_scores)

    print(f"  DT Mean: {dt_scores.mean():.4f}, Std: {dt_scores.std():.4f}")
    print(f"  LR Mean: {lr_scores.mean():.4f}, Std: {lr_scores.std():.4f}")
    print(f"  t-statistic: {t_stat:.4f}, p-value: {p_val:.6f}")
    print(f"  Wilcoxon p-value:     {p_val_w:.4f}")
    # The significance verdict is based on the paired t-test at alpha = 0.05.
    print("  Statistically significant?" + (" YES" if p_val < 0.05 else " NO"))

print("\nPlots saved as plot_<metric>.png")



TEST_ACCURACY
  DT Mean: 0.9519, Std: 0.0015
  LR Mean: 0.9297, Std: 0.0016
  t-statistic: 33.9416, p-value: 0.000000
  Wilcoxon p-value:     0.0020
  Statistically significant? YES

TEST_PRECISION
  DT Mean: 0.9520, Std: 0.0015
  LR Mean: 0.9304, Std: 0.0015
  t-statistic: 32.9664, p-value: 0.000000
  Wilcoxon p-value:     0.0020
  Statistically significant? YES

TEST_RECALL
  DT Mean: 0.9519, Std: 0.0015
  LR Mean: 0.9297, Std: 0.0016
  t-statistic: 33.9416, p-value: 0.000000
  Wilcoxon p-value:     0.0020
  Statistically significant? YES

TEST_F1
  DT Mean: 0.9519, Std: 0.0015
  LR Mean: 0.9297, Std: 0.0016
  t-statistic: 33.8797, p-value: 0.000000
  Wilcoxon p-value:     0.0020
  Statistically significant? YES

TEST_ROC_AUC
  DT Mean: 0.9518, Std: 0.0015
  LR Mean: 0.9806, Std: 0.0008
  t-statistic: -45.3142, p-value: 0.000000
  Wilcoxon p-value:     0.0020
  Statistically significant? YES

Plots saved as plot_<metric>.png


In [15]:
from scipy.stats import zscore

# Standardise each metric's values to z-scores, pooling both models within a
# metric. This works on whatever `df` / `metrics` the previously executed cell
# left in the kernel; the recorded output below shows the TEST metrics. Per
# the authors, only the test set is shown for brevity.
value_by_metric = df.groupby("metric")["value"]
df["z_score"] = value_by_metric.transform(zscore)

# One violin plot per metric showing the z-score distribution for each model.
sns.set(style="whitegrid", context="notebook", font_scale=1.1)
for metric in metrics:
    pretty_name = metric.replace("_", " ").title()
    df_metric = df[df.metric == metric]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(
        data=df_metric,
        x="model",
        y="z_score",
        inner="point",
        palette="muted",
        ax=ax,
    )
    ax.set_title(f"Z-score Normalized Distribution of {pretty_name}")
    ax.set_ylabel("Z-score")
    ax.set_xlabel("Model")
    fig.tight_layout()
    fig.savefig(f"zscore_{metric}.png")
    plt.close(fig)

# Mean z-score of each model within every metric (metrics as rows).
mean_z = df.groupby(["metric", "model"])["z_score"].mean().unstack()
print("\nMean Z-scores by Metric and Model:")
print(mean_z.round(4))

# Mean z-score of each model across all metrics combined.
overall_mean_z = df.groupby("model")["z_score"].mean()
print("\nOverall Mean Z-score per Model:")
print(overall_mean_z.round(4))

print("\nPlots saved as zscore_<metric>.png")


Mean Z-scores by Metric and Model:
model               DT      LR
metric                        
test_accuracy   0.9913 -0.9913
test_f1         0.9913 -0.9913
test_precision  0.9914 -0.9914
test_recall     0.9913 -0.9913
test_roc_auc   -0.9968  0.9968

Overall Mean Z-score per Model:
model
DT    0.5937
LR   -0.5937
Name: z_score, dtype: float64

Plots saved as zscore_<metric>.png
