In [None]:
import pandas as pd

In [None]:
import os

# Directory holding the pupillometry time-bin exports. The default is the
# original author's local folder; set the PUPILLOMETRY_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'PUPILLOMETRY_DATA_DIR',
    '/Users/jk1/temp/cereblink/direct_R_fallback/pupillometry_timebins',
)
# Kept as a plain string so any code that treats data_path as str keeps working.
data_path = os.path.join(DATA_DIR, 'reassembled_pupillometry_24h_timebin_normalised.csv')

In [None]:
# Load the CSV located at data_path into the working frame.
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index written out with the
# CSV -- confirm against the export step). A non-mutating drop with
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Columns excluded from the feature set.
id_columns = ['pNr', 'Name', 'Date_birth', 'label', 'timebin_end']
# Everything that is not listed above counts as a feature; column order is
# preserved as it appears in df.
feature_columns = df.columns[~df.columns.isin(id_columns)].tolist()

In [None]:
# Name of the single column evaluated in the cells below.
metric = "CV_inter_eye_min_timebin_max"

In [None]:
from sklearn.metrics import roc_auc_score

# Rows with a missing metric cannot be scored, so restrict to complete rows once.
non_missing_df = df.dropna(subset=[metric])
# The metric is negated before scoring, i.e. lower values are treated as
# indicating the positive label.
roc_auc_score(non_missing_df['label'], non_missing_df[metric].mul(-1))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from testing import youdens_index, test_predictor

# 5-fold stratified cross-validation of a single-metric threshold classifier.
# For every fold:
#   1. rows with a missing metric are removed from the train and test split,
#   2. the direction of the metric is read from the training medians,
#   3. a cut-off is obtained from youdens_index on the training split,
#   4. the cut-off is applied to the test split and scored with test_predictor.
# One row per fold is collected and assembled into results_df at the end.
fold_frames = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(skf.split(df, df['label'])):
    # dropna() returns a new frame, so the iloc slices of df are never mutated
    # in place (this avoids pandas' SettingWithCopyWarning).
    train_df = df.iloc[train_index].dropna(subset=[metric])
    test_df = df.iloc[test_index].dropna(subset=[metric])

    train_n_pos = train_df['label'].sum()
    test_n_pos = test_df['label'].sum()

    # Direction of the comparison: if the positive class has the higher median
    # in the training split, threshold above; otherwise threshold below.
    pos_median = train_df.loc[train_df['label'] == 1, metric].median()
    neg_median = train_df.loc[train_df['label'] == 0, metric].median()
    higher_in_pos = pos_median > neg_median

    if higher_in_pos:
        youdens = youdens_index(train_df['label'], train_df[metric])
        y_pred_binary = test_df[metric] > youdens
    else:
        # The helper is fed the negated metric, and its result is negated back
        # so that the stored cut-off is on the original scale of the metric.
        youdens = -1 * youdens_index(train_df['label'], -1 * train_df[metric])
        y_pred_binary = test_df[metric] <= youdens
    y_pred_binary = y_pred_binary.astype(int)

    fold_results = test_predictor(test_df['label'], y_pred_binary)
    # NOTE(review): the AUC is mirrored around 0.5 using the test fold itself,
    # not the direction learned on the training split -- confirm this is intended.
    fold_roc_auc = roc_auc_score(test_df['label'], test_df[metric])
    if fold_roc_auc < 0.5:
        fold_roc_auc = 1 - fold_roc_auc
    fold_results['roc_auc'] = fold_roc_auc
    fold_results['youdens'] = youdens
    fold_results['n_pos'] = test_n_pos
    # Count negatives among the rows that were actually evaluated; test_index
    # still includes rows that were dropped because the metric was missing.
    fold_results['n_neg'] = len(test_df) - test_n_pos
    fold_results['fold'] = fold

    fold_frames.append(pd.DataFrame(fold_results, index=[0]))

# Single concat instead of growing the frame inside the loop; ignore_index
# gives every fold its own row label instead of five rows all labelled 0.
results_df = pd.concat(fold_frames, ignore_index=True)

# roc_auc_score rejects NaN scores, so the overall AUC is computed on the rows
# that have a value for the metric.
scored_df = df.dropna(subset=[metric])
overall_roc_auc = roc_auc_score(scored_df['label'], scored_df[metric])
if overall_roc_auc < 0.5:
    overall_roc_auc = 1 - overall_roc_auc
results_df['overall_roc_auc'] = overall_roc_auc

results_df['metric'] = metric

    
    
    

In [None]:
# Number of rows in test_df as left behind by the last loop iteration.
test_df.shape[0]

In [None]:
# Display the per-fold results table assembled by the cross-validation cell.
results_df

In [None]:
# plot roc curve
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# train_df is the training split left over from the last cross-validation
# iteration; the metric is negated so that lower values score as positive.
fpr, tpr, thresholds = roc_curve(train_df['label'], -1 * train_df[metric])

# Explicit figure/axes so the plot is self-describing when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=metric)
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve (last CV training split, negated metric)')
ax.legend();

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, confusion_matrix

# Toy example to check how the cells of a binary confusion matrix are indexed.
# (To inspect the real predictions instead, pass test_df['label'] and
# y_pred_binary.)
mcm = confusion_matrix([0,1,1,0], [0,0,0,1])
# The indexing used here is mcm[true, predicted]:
# first row -> (tn, fp), second row -> (fn, tp).
(tn, fp), (fn, tp) = mcm



In [None]:
# Specificity: true negatives over all actual negatives.
tn / (fp + tn)

In [None]:
    # Scratch fragment computing rates from the confusion-matrix counts
    # tn, fp and fn. The indentation suggests it was lifted from inside a
    # function body -- presumably a scoring helper; verify before reuse.
    # Specificity: true negatives over all actual negatives.
    specificity = tn / (tn + fp)
    # NOTE(review): results_dict is not defined in any cell visible in this
    # notebook, so this line needs it to exist in the kernel beforehand.
    results_dict['specificity'] = specificity
    # Negative predictive value: true negatives over all predicted negatives.
    neg_pred_value = tn / (tn + fn)

In [None]:
# True when the positive class has the higher median of the metric in train_df.
train_df.loc[train_df['label'] == 1, metric].median() > train_df.loc[train_df['label'] == 0, metric].median()

In [None]:
from pupillometry.marker_evaluation.threshold_and_test_metric import test_pupillometry_metrics

# Run the packaged evaluation on the full frame. Presumably this applies the
# per-metric procedure prototyped above to the columns of df -- confirm against
# the module. Note that this rebinds results_df, replacing the table built by
# the cross-validation cell.
results_df = test_pupillometry_metrics(df)

In [None]:
# Display the table returned by test_pupillometry_metrics.
results_df

In [None]:
import re

# Sample file name used to try out the extraction below.
data_filename = 'reassembled_pupillometry_112h_timebin_normalised.csv'

# Capture the run of digits sitting between '_' and 'h_' and convert it to int;
# indexing the match with [1] selects the first capture group.
int(re.search(r'_(\d+)h_', data_filename)[1])

In [None]:
from sklearn.utils import resample

# How many distinct pNr values end up in a row-level bootstrap sample?
# The rows with a value for the metric do not change between iterations,
# so they are selected once up front.
metric_df = df.dropna(subset=[metric])
for i in range(5):
    # Seed each draw with its iteration number so the printed counts are
    # identical on every run while the five samples still differ.
    df_bs = resample(metric_df, replace=True, random_state=i)
    print(df_bs.pNr.nunique())

In [None]:
# Number of distinct pNr values in the full frame, for comparison with the
# bootstrap counts.
df['pNr'].nunique()