In [1]:
import pandas as pd

calories = pd.read_csv('../Dataset/calories.csv')
exercise = pd.read_csv('../Dataset/exercise.csv')
df = pd.merge(exercise, calories, on = 'User_ID')
#Mapper male til 1 og kvinder til 0 for at spare data
df['Gender'] = df['Gender'].map({'male': 1, 'female': 0})
df = df.reset_index()
df.head()

Unnamed: 0,index,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,14733363,1,68,190.0,94.0,29.0,105.0,40.8,231.0
1,1,14861698,0,20,166.0,60.0,14.0,94.0,40.3,66.0
2,2,11179863,1,69,179.0,79.0,5.0,88.0,38.7,26.0
3,3,16180408,0,34,179.0,71.0,13.0,100.0,40.5,71.0
4,4,17771927,0,27,154.0,58.0,10.0,81.0,39.8,35.0


# Part 2


In [6]:
import numpy as np
import pandas as pd

# Scikit-learn imports
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 2. DEFINE FEATURES (X) AND TARGET (y)
X = df[['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']]  # Example features
y = df['Gender']

# 3. SPLIT INTO TRAIN AND TEST SETS
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)

# 4. BASELINE MODEL

# Identify the majority class
majority_class = y_train.value_counts().idxmax()

# Predict the majority class for all test samples
y_pred_baseline = np.full(shape=len(y_test), fill_value=majority_class)

# Evaluate the baseline
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
print(f"Baseline Accuracy = {baseline_accuracy:.4f}")


# 5. LOGISTIC REGRESSION
#
#   We’ll vary λ >= 0. In scikit-learn, the regularization
#   parameter is `C = 1/λ`. For example, we’ll examine a 
#   few values of λ below. 

lambdas = [0.01, 0.1, 1, 10, 100, 150, 175, 200, 225, 250, 400]
logreg_accuracies = []

for lam in lambdas:
    if lam == 0:
        continue
    
    C_val = 1.0 / lam
    
    logreg = LogisticRegression(penalty='l2',
                                C=C_val, 
                                solver='lbfgs', 
                                max_iter=1000,
                                random_state=42)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    logreg_accuracies.append((lam, acc))

# Print logistic regression results
print("\nLogistic Regression (varying λ):")
for (lam, acc) in logreg_accuracies:
    print(f"  λ={lam} => Accuracy = {acc:.4f}")

###################################################
# 6. KNN CLASSIFIER
#
#   For method 2, we use KNN. We’ll vary k (the #neighbors)
#   as the complexity-controlling parameter.
###################################################

k_values = [1, 3, 5, 7, 9, 11, 13, 15]
knn_accuracies = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred_knn = knn.predict(X_test)
    acc_knn = accuracy_score(y_test, y_pred_knn)
    knn_accuracies.append((k, acc_knn))

# Print KNN results
print("\nKNN (varying k):")
for (k, acc) in knn_accuracies:
    print(f"  k={k} => Accuracy = {acc:.4f}")


Baseline Accuracy = 0.5037

Logistic Regression (varying λ):
  λ=0.01 => Accuracy = 0.9173
  λ=0.1 => Accuracy = 0.9173
  λ=1 => Accuracy = 0.9173
  λ=10 => Accuracy = 0.9177
  λ=100 => Accuracy = 0.9180
  λ=150 => Accuracy = 0.9183
  λ=175 => Accuracy = 0.9187
  λ=200 => Accuracy = 0.9183
  λ=225 => Accuracy = 0.9183
  λ=250 => Accuracy = 0.9183
  λ=400 => Accuracy = 0.9180

KNN (varying k):
  k=1 => Accuracy = 0.8887
  k=3 => Accuracy = 0.9077
  k=5 => Accuracy = 0.9023
  k=7 => Accuracy = 0.9053
  k=9 => Accuracy = 0.9020
  k=11 => Accuracy = 0.9043
  k=13 => Accuracy = 0.9060
  k=15 => Accuracy = 0.9033


# Part 3


In [15]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score

# 2. Define an error scorer 
#    error = 1 - accuracy
def error_rate(y_true, y_pred):
    return 1.0 - accuracy_score(y_true, y_pred)

error_scorer = make_scorer(error_rate, greater_is_better=True)

# 3. Outer Cross-validation Setup
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  # for example, 5 outer folds

# We'll store results in a list/dict for making a final table.
results = {
    'Fold': [],
    'lambda*': [], 'LogReg Error': [],
    'k*': [],      'KNN Error': [],
    'Baseline Error': []
}

# 4. Define the hyperparameter grids
#    Note scikit-learn's 'C' = 1/lambda. We’ll invert below.
log_lambdas = [0.0001, 0.001, 0.01, 0.1, 1, 10]  # Example range
log_param_grid = {'C': [1.0 / lam for lam in log_lambdas]}

knn_k_values = [1, 3, 5, 7, 9, 11, 13]
knn_param_grid = {'n_neighbors': knn_k_values}


# 5. Outer Loop for final evaluation
fold_idx = 1
for train_index, test_index in outer_cv.split(X, y):
    X_train_outer, X_test_outer = X.iloc[train_index], X.iloc[test_index]
    y_train_outer, y_test_outer = y.iloc[train_index], y.iloc[test_index]

    # === BASELINE ===
    majority_class = y_train_outer.value_counts().idxmax()
    baseline_preds = np.full_like(y_test_outer, majority_class)
    baseline_error = error_rate(y_test_outer, baseline_preds)

    # === INNER CROSS-VALIDATION (Hyperparameter Tuning) ===
    #  5a. Logistic Regression with GridSearch
    logreg = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)  # for tuning
    logreg_gs = GridSearchCV(estimator=logreg,
                             param_grid=log_param_grid,
                             scoring=error_scorer,   # measure error directly
                             cv=inner_cv,
                             n_jobs=-1)
    logreg_gs.fit(X_train_outer, y_train_outer)

    # Best logistic regression model on this outer fold
    best_logreg = logreg_gs.best_estimator_
    # Evaluate on the outer test fold
    logreg_preds = best_logreg.predict(X_test_outer)
    logreg_error = error_rate(y_test_outer, logreg_preds)

    #  5b. KNN with GridSearch
    knn = KNeighborsClassifier()
    knn_gs = GridSearchCV(estimator=knn,
                          param_grid=knn_param_grid,
                          scoring=error_scorer,
                          cv=inner_cv,
                          n_jobs=-1)
    knn_gs.fit(X_train_outer, y_train_outer)

    best_knn = knn_gs.best_estimator_
    knn_preds = best_knn.predict(X_test_outer)
    knn_error = error_rate(y_test_outer, knn_preds)

    # 6. Collect results for this outer fold
    # Extract "best" hyperparameters in the original scale (lambda, k)
    # logistic: we have best_logreg.C => lambda = 1/C
    chosen_lambda = 1.0 / best_logreg.C
    chosen_k = best_knn.n_neighbors

    results['Fold'].append(fold_idx)
    results['lambda*'].append(chosen_lambda)
    results['LogReg Error'].append(logreg_error)
    results['k*'].append(chosen_k)
    results['KNN Error'].append(knn_error)
    results['Baseline Error'].append(baseline_error)

    fold_idx += 1

# 7. Create a results DataFrame resembling your “Table 2”
results_df = pd.DataFrame(results)
print("\nTwo-Level Cross-Validation Results:\n")
print(results_df)

# 8. Compute average errors across folds for final summary
avg_logreg_err = results_df['LogReg Error'].mean()
avg_knn_err = results_df['KNN Error'].mean()
avg_base_err = results_df['Baseline Error'].mean()

print("\nAverage test errors across outer folds:")
print(f"  Logistic Regression: {avg_logreg_err:.4f}")
print(f"  KNN:                {avg_knn_err:.4f}")
print(f"  Baseline:           {avg_base_err:.4f}")



Two-Level Cross-Validation Results:

   Fold  lambda*  LogReg Error  k*  KNN Error  Baseline Error
0     1   0.0001      0.092667   1   0.119333        0.496667
1     2   0.0001      0.068000   1   0.098000        0.496667
2     3  10.0000      0.088000   1   0.106000        0.496667
3     4   0.0001      0.086667   1   0.118000        0.496667
4     5   0.0001      0.090000   1   0.111333        0.496667
5     6  10.0000      0.090667   1   0.108000        0.496667
6     7   1.0000      0.083333   1   0.118667        0.496667
7     8   0.0001      0.084000   1   0.118000        0.496000
8     9   0.0001      0.091333   1   0.117333        0.496000
9    10   1.0000      0.068667   1   0.113333        0.496000

Average test errors across outer folds:
  Logistic Regression: 0.0843
  KNN:                0.1128
  Baseline:           0.4965


# Part 4


In [17]:
import numpy as np
import pandas as pd

# Suppose you have your final predictions from each of the three models on the entire dataset,
# each row corresponding to one data sample (never used in training for that model),
# and columns = ["Logistic", "KNN", "Baseline"] plus the true label "y_true".

# For example, you could store them as:
df_preds = pd.DataFrame({
    'y_true':       all_true_labels,          # shape (N,)
    'logreg_pred':  all_logreg_predictions,   # shape (N,)
    'knn_pred':     all_knn_predictions,      # shape (N,)
    'base_pred':    all_baseline_predictions, # shape (N,)
})

# Check dimension
print(df_preds.shape)
df_preds.head()


NameError: name 'all_true_labels' is not defined

In [None]:
# Option A: statsmodels built-in
from statsmodels.stats.contingency_tables import mcnemar

def run_mcnemar_test(y_true, predA, predB):
    """
    Runs McNemar's test comparing two classifiers (A and B).
    Returns the test statistic and p-value.
    """
    # 1) Convert to 'correct' or 'incorrect' for each classifier
    correctA = (predA == y_true).astype(int)
    correctB = (predB == y_true).astype(int)

    # 2) Construct the 2x2 contingency table
    #    a01 = # data points: A correct, B wrong
    #    a10 = # data points: A wrong, B correct
    a01 = np.sum((correctA == 1) & (correctB == 0))
    a10 = np.sum((correctA == 0) & (correctB == 1))

    # statsmodels wants the table in this form:
    # [[both_correct, a01],
    #  [a10, both_wrong]]
    # But for McNemar’s p-value, only a01 and a10 matter.
    both_correct = np.sum((correctA == 1) & (correctB == 1))
    both_wrong   = np.sum((correctA == 0) & (correctB == 0))
    table = [[both_correct, a01],
             [a10,         both_wrong]]

    # 3) Run McNemar test (can choose exact or approximate)
    result = mcnemar(table=table, exact=False, correction=True)  # continuity correction
    return result.statistic, result.pvalue


# Part 5

In [16]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic Regression with lambda = 175 => C ~ 0.0057
lambda_val = 175
C_val = 1.0 / lambda_val

logreg = LogisticRegression(penalty='l2', C=C_val, solver='lbfgs', max_iter=1000)
logreg.fit(X_train, y_train)

# Evaluate
y_pred = logreg.predict(X_test)
test_error = 1.0 - accuracy_score(y_test, y_pred)
print(f"Test Error Rate (lambda={lambda_val}) = {test_error:.4f}")

# Inspect coefficients
print("Coefficients (w):", logreg.coef_)
print("Intercept (w0):", logreg.intercept_)


Test Error Rate (lambda=175) = 0.0813
Coefficients (w): [[-4.19014625e-02 -2.65063534e-01  5.53112708e-01 -2.57832265e-02
   1.21864425e-02  1.53195433e-01  1.02837791e-04]]
Intercept (w0): [0.00344442]
