In [38]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, LeaveOneOut
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scipy.stats import ttest_rel

# Load data

In [33]:
# Mikel path
data = pd.read_csv('/Users/mikelyu/Desktop/Uni/02455 Statistical Evaluation/02455-projects/HR_data.csv')

# Karl path

# Display the first few rows of the dataset
print(data.head())

   Unnamed: 0    HR_Mean  HR_Median    HR_std  HR_Min  HR_Max     HR_AUC  \
0           0  77.965186      78.00  3.345290   73.23   83.37  22924.945   
1           1  70.981097      70.57  2.517879   67.12   78.22  21930.400   
2           2  73.371959      73.36  3.259569   67.88   80.22  21647.085   
3           3  78.916822      77.88  4.054595   72.32   84.92  25258.905   
4           4  77.322226      74.55  6.047603   70.52   90.15  23890.565   

     Round   Phase  Individual  Puzzler  Frustrated Cohort  
0  round_3  phase3           1        1           1   D1_1  
1  round_3  phase2           1        1           5   D1_1  
2  round_3  phase1           1        1           0   D1_1  
3  round_2  phase3           1        1           1   D1_1  
4  round_2  phase2           1        1           5   D1_1  


In [34]:
# Check for missing values
print(data.isnull().sum())

Unnamed: 0    0
HR_Mean       0
HR_Median     0
HR_std        0
HR_Min        0
HR_Max        0
HR_AUC        0
Round         0
Phase         0
Individual    0
Puzzler       0
Frustrated    0
Cohort        0
dtype: int64


No missing values.

In [35]:
# Drop rows with missing target values (Frustrated)
data = data.dropna(subset=['Frustrated'])

# Encode categorical variables if necessary
data = pd.get_dummies(data, columns=['Puzzler', 'Cohort'], drop_first=True)

# Display the first few rows of the processed data
data.head()

Unnamed: 0.1,Unnamed: 0,HR_Mean,HR_Median,HR_std,HR_Min,HR_Max,HR_AUC,Round,Phase,Individual,Frustrated,Puzzler_1,Cohort_D1_2
0,0,77.965186,78.0,3.34529,73.23,83.37,22924.945,round_3,phase3,1,1,True,False
1,1,70.981097,70.57,2.517879,67.12,78.22,21930.4,round_3,phase2,1,5,True,False
2,2,73.371959,73.36,3.259569,67.88,80.22,21647.085,round_3,phase1,1,0,True,False
3,3,78.916822,77.88,4.054595,72.32,84.92,25258.905,round_2,phase3,1,1,True,False
4,4,77.322226,74.55,6.047603,70.52,90.15,23890.565,round_2,phase2,1,5,True,False


# Models

## Regression with backward selection

In [36]:
# Define the features and target variable
X = data.drop(columns=['Frustrated'])
y = data['Frustrated']

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Perform backward selection
def backward_elimination(X, y, significance_level=0.05):
    num_vars = len(X.columns)
    for i in range(num_vars):
        regressor_ols = sm.OLS(y, X).fit()
        max_p_value = max(regressor_ols.pvalues)
        if max_p_value > significance_level:
            max_p_var = regressor_ols.pvalues.idxmax()
            X = X.drop(columns=[max_p_var])
    return X

# Fit the model
X_selected = backward_elimination(X, y)
regressor = sm.OLS(y, X_selected).fit()

# Evaluate using Leave-One-Out Cross-Validation
loo = LeaveOneOut()
mse_scores = []

for train_index, test_index in loo.split(X_selected):
    X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = sm.OLS(y_train, X_train).fit()
    predictions = model.predict(X_test)
    mse_scores.append(mean_squared_error(y_test, predictions))

reg_loo_mse = np.mean(mse_scores)
print(f'Regression Model LOOCV MSE: {reg_loo_mse}')


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [40]:
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Build the ANN model function
def build_ann():
    model = Sequential()
    model.add(Dense(64, input_dim=X_scaled.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Evaluate using Leave-One-Out Cross-Validation
loo = LeaveOneOut()
ann_mse_scores = []

for train_index, test_index in loo.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    ann_model = build_ann()
    ann_model.fit(X_train, y_train, epochs=100, batch_size=10, verbose=0)
    predictions = ann_model.predict(X_test)
    ann_mse_scores.append(mean_squared_error(y_test, predictions))

ann_loo_mse = np.mean(ann_mse_scores)
print(f'ANN Model LOOCV MSE: {ann_loo_mse}')

ValueError: could not convert string to float: 'round_3'

In [39]:
# Perform a paired t-test on the LOOCV results
t_stat, p_value = ttest_rel(mse_scores, ann_mse_scores)
print(f'Paired t-test results: t-statistic = {t_stat}, p-value = {p_value}')

NameError: name 'mse_scores' is not defined

PCR first?

Leave one out cross validation

Regression - Backward selection?

ANN?