In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
from scipy.stats import ttest_rel

In [3]:
# 1) Datafile
path = r"C:\Users\rashe\OneDrive\Desktop\MMC\1st semester\02452 Machine Learning\ML_project_2\Datasæt_Gallstone.xlsx"
df = pd.read_excel(path)
df.head()

Unnamed: 0,Gallstone Status,Age,Gender,Height,Weight,Vitamin D,C-Reactive Protein (CRP),Total Body Fat Ratio (TBFR) (%)
0,0,50,0,185,92.8,33.0,0.0,19.2
1,0,47,0,176,94.5,25.0,0.0,32.8
2,0,61,0,171,91.1,30.2,0.0,27.3
3,0,41,0,168,67.7,35.4,0.0,15.8
4,0,42,0,178,89.6,40.6,0.0,20.0


In [4]:
# 1. Features (inputvariabler)
X = df[["Gender", "Age", "Height", "Weight", "Vitamin D",
        "Total Body Fat Ratio (TBFR) (%)"]].values


# 2. Target (outputvariabel)
y = df["C-Reactive Protein (CRP)"].values

print(y[:10])


[0.   0.   0.   0.   0.   0.   0.   0.11 1.57 0.  ]


## Part B

### Cross-validation parameters

In [None]:
from sklearn.model_selection import KFold

K1, K2 = 10, 10  # outer/inner folds
outer_cv = KFold(n_splits=K1, shuffle=True, random_state=42)

ridge_lambdas = np.logspace(-2, 6, 13)   # [1e-2 ... 1e6]
ann_hidden    = [1, 3, 5, 10, 20]        # antal neuroner i hidden layer

### Nested Cross-Validation (Algorithm 6)

In [8]:
rows = []
fold_id = 0

for train_idx, test_idx in outer_cv.split(X):
    fold_id += 1
    X_par, X_test = X[train_idx], X[test_idx]
    y_par, y_test = y[train_idx], y[test_idx]

    # ----- Baseline -----
    baseline_pred = np.full_like(y_test, y_par.mean(), dtype=float)
    Etest_baseline = mean_squared_error(y_test, baseline_pred)

    # ----- Ridge -----
    inner_cv = KFold(n_splits=K2, shuffle=True, random_state=fold_id)
    best_lambda, best_inner_mse_ridge = None, np.inf

    for lam in ridge_lambdas:
        pipe_ridge = Pipeline([
            ("scaler", StandardScaler()),
            ("ridge", Ridge(alpha=lam, random_state=42))
        ])
        inner_mses = []
        for tr_idx, val_idx in inner_cv.split(X_par):
            X_tr, X_val = X_par[tr_idx], X_par[val_idx]
            y_tr, y_val = y_par[tr_idx], y_par[val_idx]
            model = clone(pipe_ridge).fit(X_tr, y_tr)
            inner_mses.append(mean_squared_error(y_val, model.predict(X_val)))
        mean_val = np.mean(inner_mses)
        if mean_val < best_inner_mse_ridge:
            best_inner_mse_ridge, best_lambda = mean_val, lam

    ridge_best = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=best_lambda, random_state=42))
    ]).fit(X_par, y_par)
    Etest_ridge = mean_squared_error(y_test, ridge_best.predict(X_test))

    # ----- ANN -----
    best_h, best_inner_mse_ann = None, np.inf
    for h in ann_hidden:
        pipe_ann = Pipeline([
            ("scaler", StandardScaler()),
            ("ann", MLPRegressor(hidden_layer_sizes=(h,),
                                 activation="relu", solver="adam",
                                 max_iter=2000, early_stopping=True,
                                 n_iter_no_change=20, random_state=42))
        ])
        inner_mses = []
        for tr_idx, val_idx in inner_cv.split(X_par):
            X_tr, X_val = X_par[tr_idx], X_par[val_idx]
            y_tr, y_val = y_par[tr_idx], y_par[val_idx]
            model = clone(pipe_ann).fit(X_tr, y_tr)
            inner_mses.append(mean_squared_error(y_val, model.predict(X_val)))
        mean_val = np.mean(inner_mses)
        if mean_val < best_inner_mse_ann:
            best_inner_mse_ann, best_h = mean_val, h

    ann_best = Pipeline([
        ("scaler", StandardScaler()),
        ("ann", MLPRegressor(hidden_layer_sizes=(best_h,),
                             activation="relu", solver="adam",
                             max_iter=2000, early_stopping=True,
                             n_iter_no_change=20, random_state=42))
    ]).fit(X_par, y_par)
    Etest_ann = mean_squared_error(y_test, ann_best.predict(X_test))

    rows.append({
        "i": fold_id,
        "h*": best_h,
        "E_test(ANN)": Etest_ann,
        "lambda*": best_lambda,
        "E_test(Ridge)": Etest_ridge,
        "E_test(Baseline)": Etest_baseline
    })

table_df = pd.DataFrame(rows)
table_df.round(4)


Unnamed: 0,i,h*,E_test(ANN),lambda*,E_test(Ridge),E_test(Baseline)
0,1,10,6.3218,2154.4347,7.6553,7.6784
1,2,10,12.0008,2154.4347,11.9514,12.458
2,3,10,2.3532,2154.4347,3.4941,3.6367
3,4,20,15.0367,2154.4347,14.3188,14.631
4,5,10,37.6724,2154.4347,40.302,40.5224
5,6,10,5.8193,2154.4347,3.4497,3.6214
6,7,10,4.4311,2154.4347,3.554,3.6721
7,8,10,74.3511,464.1589,76.8704,76.5464
8,9,10,5.0788,464.1589,3.4357,3.2956
9,10,3,91.5514,464.1589,88.5766,86.6047


In [11]:
table_df["lambda*"]

0    2154.434690
1    2154.434690
2    2154.434690
3    2154.434690
4    2154.434690
5    2154.434690
6    2154.434690
7     464.158883
8     464.158883
9     464.158883
Name: lambda*, dtype: float64

In [9]:
display(table_df.round(4))

summary = pd.DataFrame({
    "metric": ["mean"],
    "E_test(ANN)": [table_df["E_test(ANN)"].mean()],
    "E_test(Ridge)": [table_df["E_test(Ridge)"].mean()],
    "E_test(Baseline)": [table_df["E_test(Baseline)"].mean()]

})
display(summary.round(4))

Unnamed: 0,i,h*,E_test(ANN),lambda*,E_test(Ridge),E_test(Baseline)
0,1,10,6.3218,2154.4347,7.6553,7.6784
1,2,10,12.0008,2154.4347,11.9514,12.458
2,3,10,2.3532,2154.4347,3.4941,3.6367
3,4,20,15.0367,2154.4347,14.3188,14.631
4,5,10,37.6724,2154.4347,40.302,40.5224
5,6,10,5.8193,2154.4347,3.4497,3.6214
6,7,10,4.4311,2154.4347,3.554,3.6721
7,8,10,74.3511,464.1589,76.8704,76.5464
8,9,10,5.0788,464.1589,3.4357,3.2956
9,10,3,91.5514,464.1589,88.5766,86.6047


Unnamed: 0,metric,E_test(ANN),E_test(Ridge),E_test(Baseline)
0,mean,25.4617,25.3608,25.2667


### Part b3: Statistical Evaluation

In [10]:
# Hent testfejl fra Table 1 (en værdi per outer fold)
ann_errors   = table_df["E_test(ANN)"].values
ridge_errors = table_df["E_test(Ridge)"].values
base_errors  = table_df["E_test(Baseline)"].values


# Funktion til paired t-test + CI
def paired_ttest_with_ci(x, y, alpha=0.05):
    """
    Returnerer t-værdi, p-værdi og 95% CI for forskellen x - y
    """
    diff = x - y
    n = len(diff)
    mean_diff = np.mean(diff)
    std_diff = np.std(diff, ddof=1)
    se = std_diff / np.sqrt(n)
    
    # t-værdi for (n-1) frihedsgrader
    t_stat, p_val = ttest_rel(x, y)
    ci = 1.96 * se  # 95% CI
    return t_stat, p_val, mean_diff, (mean_diff - ci, mean_diff + ci)


# De tre sammenligninger
tests = {
    "ANN vs Ridge": paired_ttest_with_ci(ann_errors, ridge_errors),
    "ANN vs Baseline": paired_ttest_with_ci(ann_errors, base_errors),
    "Ridge vs Baseline": paired_ttest_with_ci(ridge_errors, base_errors)
}

print("Paired t-tests (setup I – 11.3.4)")
print("-------------------------------------------------------------")
print("Comparison\t\t t-stat\t\t p-value\t  Mean Δ\t\t 95% CI")
print("-------------------------------------------------------------")

for name, (t_stat, p_val, mean_diff, ci) in tests.items():
    print(f"{name:17s}  {t_stat:6.3f}\t {p_val:8.4f}\t {mean_diff:8.4f}\t [{ci[0]:.4f}, {ci[1]:.4f}]")


# Konklusion
print("\nInterpretation:")
for name, (t_stat, p_val, mean_diff, ci) in tests.items():
    if p_val < 0.05:
        better = "first" if mean_diff < 0 else "second"
        print(f" → {name}: Significant difference (p={p_val:.4f}), the {better} model performs better.")
    else:
        print(f" → {name}: No significant difference (p={p_val:.4f}).")


Paired t-tests (setup I – 11.3.4)
-------------------------------------------------------------
Comparison		 t-stat		 p-value	  Mean Δ		 95% CI
-------------------------------------------------------------
ANN vs Ridge        0.162	   0.8745	   0.1009	 [-1.1160, 1.3177]
ANN vs Baseline     0.263	   0.7985	   0.1950	 [-1.2583, 1.6482]
Ridge vs Baseline   0.426	   0.6801	   0.0941	 [-0.3388, 0.5270]

Interpretation:
 → ANN vs Ridge: No significant difference (p=0.8745).
 → ANN vs Baseline: No significant difference (p=0.7985).
 → Ridge vs Baseline: No significant difference (p=0.6801).
