In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the three input tables. The first two are gzip-compressed CSVs in the
# working directory; the predictions CSV is read from an absolute path.
# NOTE(review): the absolute predictions path is machine-specific — consider
# moving it to a configurable data directory.
df_ecg, df_patient = (
    pd.read_csv(gz_name, compression='gzip')
    for gz_name in ('final_ecg_rows_2.csv.gz', 'patient_rows_2.csv.gz')
)
df_pred = pd.read_csv('/media/data1/gjabbour/predictions.csv')

In [3]:
# Quick sanity check on what was loaded: (rows, cols) of the ECG and patient
# tables, then the column labels of the predictions table, one per line.
print(df_ecg.shape, df_patient.shape, df_pred.columns, sep='\n')

(795546, 12)
(159608, 27)
Index(['waveform_path', 'pred'], dtype='object')


In [4]:
# Report the shape of the row subsets where dt_af, then y_5y, is missing.
for null_col in ('dt_af', 'y_5y'):
    is_missing = df_ecg[null_col].isnull()
    print(df_ecg[is_missing].shape)

# Missing y_5y labels are set to 0.
# NOTE(review): this labels every row without a y_5y value as 0 — confirm
# that is the intended meaning of a missing label.
df_ecg['y_5y'] = df_ecg['y_5y'].fillna(0)

(453753, 12)
(453753, 12)


In [5]:
# Attach the columns of df_pred to df_ecg. Rows are matched on
# df_ecg['path'] == df_pred['waveform_path']; the inner join keeps only rows
# that have a match on both sides.
df_ecg = df_ecg.merge(
    df_pred,
    how='inner',
    left_on='path',
    right_on='waveform_path',
)

In [6]:
# Convert the pred column to float.
# When pred is not already numeric, every '[' and ']' character is removed
# before the cast. The pattern is an explicit regex character class, so the
# result does not depend on the pandas-version-specific default of `regex`
# in Series.str.replace. The dtype guard makes the cell safe to re-run:
# once pred is numeric, `.str` would raise AttributeError.
pred_values = df_ecg['pred']
if not pd.api.types.is_numeric_dtype(pred_values):
    pred_values = pred_values.str.replace(r'[\[\]]', '', regex=True)
df_ecg['pred'] = pred_values.astype(float)

In [7]:
# Keep only the rows of df_ecg that satisfy all four conditions:
#   - sinus equals 1
#   - dt_af is negative or missing
#   - age_at_ecg is at least 18
#   - dt_max is strictly positive
sinus_mask = df_ecg['sinus'] == 1
dt_af_mask = (df_ecg['dt_af'] < 0) | df_ecg['dt_af'].isnull()
age_mask = df_ecg['age_at_ecg'] >= 18
dt_max_mask = df_ecg['dt_max'] > 0
df_ecg = df_ecg[sinus_mask & dt_af_mask & age_mask & dt_max_mask]

In [8]:
# Summary statistics of the filtered frame's numeric columns, as plain text.
ecg_summary = df_ecg.describe()
print(ecg_summary)

         subject_id       afl      afib     sinus         dt_af  \
count  4.422810e+05  442281.0  442281.0  442281.0  76688.000000   
mean   1.501549e+07       0.0       0.0       1.0   -755.408604   
std    2.876757e+06       0.0       0.0       0.0    887.402740   
min    1.000003e+07       0.0       0.0       1.0  -4299.000000   
25%    1.253470e+07       0.0       0.0       1.0  -1207.000000   
50%    1.500225e+07       0.0       0.0       1.0   -394.000000   
75%    1.751054e+07       0.0       0.0       1.0    -33.000000   
max    1.999999e+07       0.0       0.0       1.0     -1.000000   

                y_5y         dt_max     age_at_ecg           pred  
count  442281.000000  442281.000000  442281.000000  442281.000000  
mean        0.149299     985.860677      61.156665       0.557496  
std         0.356383    1054.337130      16.461466       0.241019  
min         0.000000       1.000000      18.000000       0.001300  
25%         0.000000      54.000000      51.000000      

In [9]:
# ROC AUC of the pred column against the y_5y labels, over the filtered rows.
from sklearn.metrics import roc_auc_score

y_true = df_ecg['y_5y']
y_score = df_ecg['pred']
auc = roc_auc_score(y_true, y_score)
print(auc)

0.5838532586434837


In [10]:
# List the column labels present after the merge and filter.
ecg_columns = df_ecg.columns
print(ecg_columns)

Index(['subject_id', 'ecg_time_x', 'path', 'afl', 'afib', 'sinus',
       'aggregated_report', 'dt_af', 'y_5y', 'dt_max', 'sex', 'age_at_ecg',
       'waveform_path', 'pred'],
      dtype='object')


In [11]:
# Write the waveform_path column of the filtered rows to a gzip-compressed
# CSV (header row included, index omitted).
waveform_paths = df_ecg.loc[:, ['waveform_path']]
waveform_paths.to_csv('pred_waveform_path.csv.gz', index=False, compression='gzip')