In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from scipy.stats import ttest_rel, t


In [2]:
calories = pd.read_csv('../Dataset/calories.csv')
exercise = pd.read_csv('../Dataset/exercise.csv')
df = pd.merge(exercise, calories, on = 'User_ID')
#Mapper male til 1 og kvinder til 0 for at spare data
df['Gender'] = df['Gender'].map({'male': 1, 'female': 0})
df = df.reset_index()
df.head()

Unnamed: 0,index,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,14733363,1,68,190.0,94.0,29.0,105.0,40.8,231.0
1,1,14861698,0,20,166.0,60.0,14.0,94.0,40.3,66.0
2,2,11179863,1,69,179.0,79.0,5.0,88.0,38.7,26.0
3,3,16180408,0,34,179.0,71.0,13.0,100.0,40.5,71.0
4,4,17771927,0,27,154.0,58.0,10.0,81.0,39.8,35.0


# Part 2


In [3]:
# DEFINE FEATURES
X = df[['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']]  
y = df['Gender']

# SPLIT INTO TRAIN AND TEST SETS
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)

# BASELINE MODEL
majority_class = y_train.value_counts().idxmax()
y_pred_baseline = np.full(shape=len(y_test), fill_value=majority_class)
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
print(f"Baseline Accuracy = {baseline_accuracy:.4f}")


# LOGISTIC REGRESSION

lambdas = [0.01, 0.1, 1, 10, 100, 150, 175, 200, 225, 250, 400]
logreg_accuracies = []

for lam in lambdas:
    if lam == 0:
        continue
    
    C_val = 1.0 / lam
    
    logreg = LogisticRegression(penalty='l2',
                                C=C_val, 
                                solver='lbfgs', 
                                max_iter=1000,
                                random_state=42)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    logreg_accuracies.append((lam, acc))

# Print logistic regression results
print("\nLogistic Regression (varying λ):")
for (lam, acc) in logreg_accuracies:
    print(f"  λ={lam} => Accuracy = {acc:.4f}")

# KNN CLASSIFIER

k_values = [1, 3, 5, 7, 9, 11, 13, 15]
knn_accuracies = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred_knn = knn.predict(X_test)
    acc_knn = accuracy_score(y_test, y_pred_knn)
    knn_accuracies.append((k, acc_knn))

# Print  results
print("\nKNN (varying k):")
for (k, acc) in knn_accuracies:
    print(f"  k={k} => Accuracy = {acc:.4f}")


Baseline Accuracy = 0.5037

Logistic Regression (varying λ):
  λ=0.01 => Accuracy = 0.9173
  λ=0.1 => Accuracy = 0.9173
  λ=1 => Accuracy = 0.9173
  λ=10 => Accuracy = 0.9177
  λ=100 => Accuracy = 0.9180
  λ=150 => Accuracy = 0.9183
  λ=175 => Accuracy = 0.9187
  λ=200 => Accuracy = 0.9183
  λ=225 => Accuracy = 0.9183
  λ=250 => Accuracy = 0.9183
  λ=400 => Accuracy = 0.9180

KNN (varying k):
  k=1 => Accuracy = 0.8887
  k=3 => Accuracy = 0.9077
  k=5 => Accuracy = 0.9023
  k=7 => Accuracy = 0.9053
  k=9 => Accuracy = 0.9020
  k=11 => Accuracy = 0.9043
  k=13 => Accuracy = 0.9060
  k=15 => Accuracy = 0.9033


# Part 3


In [4]:
# Define an error scorer 
def error_rate(y_true, y_pred):
    return 1.0 - accuracy_score(y_true, y_pred)

error_scorer = make_scorer(error_rate, greater_is_better=True)

# Outer Cross-validation Setup
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  

results = {
    'Fold': [],
    'lambda*': [], 'LogReg Error': [],
    'k*': [],      'KNN Error': [],
    'Baseline Error': []
}

# Define the hyperparameter grids
log_lambdas = [0.0001, 0.001, 0.01, 0.1, 1, 10]  
log_param_grid = {'C': [1.0 / lam for lam in log_lambdas]}

knn_k_values = [1, 3, 5, 7, 9, 11, 13]
knn_param_grid = {'n_neighbors': knn_k_values}


#Outer Loop for final evaluatioin
fold_idx = 1
for train_index, test_index in outer_cv.split(X, y):
    X_train_outer, X_test_outer = X.iloc[train_index], X.iloc[test_index]
    y_train_outer, y_test_outer = y.iloc[train_index], y.iloc[test_index]

    majority_class = y_train_outer.value_counts().idxmax()
    baseline_preds = np.full_like(y_test_outer, majority_class)
    baseline_error = error_rate(y_test_outer, baseline_preds)

    logreg = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)  
    logreg_gs = GridSearchCV(estimator=logreg,
                             param_grid=log_param_grid,
                             scoring=error_scorer,   
                             cv=inner_cv,
                             n_jobs=-1)
    logreg_gs.fit(X_train_outer, y_train_outer)

    # Best logistic regression model on this outer fold
    best_logreg = logreg_gs.best_estimator_
    logreg_preds = best_logreg.predict(X_test_outer)
    logreg_error = error_rate(y_test_outer, logreg_preds)

    knn = KNeighborsClassifier()
    knn_gs = GridSearchCV(estimator=knn,
                          param_grid=knn_param_grid,
                          scoring=error_scorer,
                          cv=inner_cv,
                          n_jobs=-1)
    knn_gs.fit(X_train_outer, y_train_outer)

    best_knn = knn_gs.best_estimator_
    knn_preds = best_knn.predict(X_test_outer)
    knn_error = error_rate(y_test_outer, knn_preds)

    # Collect results for this outer fold
    chosen_lambda = 1.0 / best_logreg.C
    chosen_k = best_knn.n_neighbors

    results['Fold'].append(fold_idx)
    results['lambda*'].append(chosen_lambda)
    results['LogReg Error'].append(logreg_error)
    results['k*'].append(chosen_k)
    results['KNN Error'].append(knn_error)
    results['Baseline Error'].append(baseline_error)

    fold_idx += 1

# Create a results DataFrame 
results_df = pd.DataFrame(results)
print("\nTwo-Level Cross-Validation Results:\n")
print(results_df)

# Compute average errors across folds 
avg_logreg_err = results_df['LogReg Error'].mean()
avg_knn_err = results_df['KNN Error'].mean()
avg_base_err = results_df['Baseline Error'].mean()

print("\nAverage test errors across outer folds:")
print(f"  Logistic Regression: {avg_logreg_err:.4f}")
print(f"  KNN:                {avg_knn_err:.4f}")
print(f"  Baseline:           {avg_base_err:.4f}")



Two-Level Cross-Validation Results:

   Fold  lambda*  LogReg Error  k*  KNN Error  Baseline Error
0     1   0.0001      0.092667   1   0.119333        0.496667
1     2   0.0001      0.068000   1   0.098000        0.496667
2     3  10.0000      0.088000   1   0.106000        0.496667
3     4   0.0001      0.086667   1   0.118000        0.496667
4     5   0.0001      0.090000   1   0.111333        0.496667
5     6  10.0000      0.090667   1   0.108000        0.496667
6     7   1.0000      0.083333   1   0.118667        0.496667
7     8   0.0001      0.084000   1   0.118000        0.496000
8     9   0.0001      0.091333   1   0.117333        0.496000
9    10   1.0000      0.068667   1   0.113333        0.496000

Average test errors across outer folds:
  Logistic Regression: 0.0843
  KNN:                0.1128
  Baseline:           0.4965


# Part 4


In [5]:
# Extract the vectors of errrors
logreg_err = results_df['LogReg Error'].values
knn_err = results_df['KNN Error'].values
baseline_err = results_df['Baseline Error'].values

def paired_ttest_with_ci(a, b, alpha=0.05):
    # Differences per fold
    d = a - b
    n = len(d)
    d_mean = np.mean(d)
    d_std = np.std(d, ddof=1)          
   
    t_stat, p_value = ttest_rel(a, b)
    
    t_crit = t.ppf(1 - alpha/2, df=n - 1)
   
    half_width = t_crit * (d_std / np.sqrt(n))
    ci_lower = d_mean - half_width
    ci_upper = d_mean + half_width
    return d_mean, p_value, (ci_lower, ci_upper), t_stat

#Pairwice comparisons
#   Logistic Regression vs KNN
diff_lr_knn, p_lr_knn, ci_lr_knn, tstat_lr_knn = paired_ttest_with_ci(logreg_err, knn_err)

#   Logistic Regression vs Baseline
diff_lr_base, p_lr_base, ci_lr_base, tstat_lr_base = paired_ttest_with_ci(logreg_err, baseline_err)

#   KNN vs Baseline
diff_knn_base, p_knn_base, ci_knn_base, tstat_knn_base = paired_ttest_with_ci(knn_err, baseline_err)

# Print results
print("Pairwise Paired T-Tests on Error Rates:\n")

print(f"Logistic Regression vs KNN:")
print(f"  Mean diff (LR - KNN): {diff_lr_knn:.4f}")
print(f"  95% CI for diff:      [{ci_lr_knn[0]:.4f}, {ci_lr_knn[1]:.4f}]")
print(f"  p-value:              {p_lr_knn:.4e}")
print(f"  t-statistic:          {tstat_lr_knn:.4f}\n")

print(f"Logistic Regression vs Baseline:")
print(f"  Mean diff (LR - Baseline): {diff_lr_base:.4f}")
print(f"  95% CI for diff:           [{ci_lr_base[0]:.4f}, {ci_lr_base[1]:.4f}]")
print(f"  p-value:                   {p_lr_base:.4e}")
print(f"  t-statistic:               {tstat_lr_base:.4f}\n")

print(f"KNN vs Baseline:")
print(f"  Mean diff (KNN - Baseline): {diff_knn_base:.4f}")
print(f"  95% CI for diff:            [{ci_knn_base[0]:.4f}, {ci_knn_base[1]:.4f}]")
print(f"  p-value:                    {p_knn_base:.4e}")
print(f"  t-statistic:                {tstat_knn_base:.4f}\n")


Pairwise Paired T-Tests on Error Rates:

Logistic Regression vs KNN:
  Mean diff (LR - KNN): -0.0285
  95% CI for diff:      [-0.0345, -0.0224]
  p-value:              2.1339e-06
  t-statistic:          -10.6378

Logistic Regression vs Baseline:
  Mean diff (LR - Baseline): -0.4121
  95% CI for diff:           [-0.4185, -0.4058]
  p-value:                   1.6307e-16
  t-statistic:               -146.5457

KNN vs Baseline:
  Mean diff (KNN - Baseline): -0.3837
  95% CI for diff:            [-0.3888, -0.3786]
  p-value:                    4.2339e-17
  t-statistic:                -170.2411


# Part 5

In [6]:
# Choose a suitable value of lambda (λ) 
best_lambda = 0.1
C_val = 1.0 / best_lambda

# Define our features and target
X = df[['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']]
y = df['Gender']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train logistic regression
logreg = LogisticRegression(
    penalty='l2',
    C=C_val,
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
logreg.fit(X_train, y_train)

#  Show how the logistic model makes predictions
y_pred = logreg.predict(X_test)

# Examine the learned coefficients
features = X.columns
coef_values = logreg.coef_.flatten()  
intercept_value = logreg.intercept_[0]

print("Chosen λ (lambda):", best_lambda)
print("Corresponding C   :", C_val)
print("\nLogistic Regression Coefficients:")
for feat, val in zip(features, coef_values):
    print(f"  {feat}: {val:.4f}")
print(f"Intercept (bias): {intercept_value:.4f}")

Chosen λ (lambda): 0.1
Corresponding C   : 10.0

Logistic Regression Coefficients:
  Age: -0.0454
  Height: -0.2918
  Weight: 0.5861
  Duration: -0.0398
  Heart_Rate: 0.0063
  Body_Temp: 0.2273
  Calories: 0.0020
Intercept (bias): 0.0057
