In [None]:
import pandas as pd
import os

In [None]:
# Root folder of the study on the local machine; both inputs live below it.
study_root = '/Users/jk1/stroke_datasets/ptiO2-Studie'
# Directory holding one sub-directory per year, each with one folder per subject.
main_data_path = os.path.join(study_root, 'moberg_data')
# CSV with one row per subject (read into pbto2_meta_data_df below).
pbto2_meta_data_path = os.path.join(study_root, 'pbto2_for_extraction.csv')

In [None]:
# Load the per-subject metadata table. The extraction below relies on its
# 'pat_nr' and 'Recording Start Time' columns.
pbto2_meta_data_df = pd.read_csv(pbto2_meta_data_path)

In [None]:
from utils import format_date_column

# Normalise the recording start column with the project helper.
# NOTE(review): the extraction loop later parses the first 17 characters of this
# column with the format "%Y %b %d %H:%M" - presumably format_date_column
# returns strings in that layout; confirm against utils.
pbto2_meta_data_df['Recording Start Time'] = format_date_column(pbto2_meta_data_df['Recording Start Time'])

In [None]:
# Display the metadata table after the recording start column was reformatted.
pbto2_meta_data_df

In [None]:
# Subject folder name -> file-name prefixes of additional files that are read as
# PbtO2 data in the extraction loop, on top of the files starting with 'PbtO2'.
manually_verified_supplementary_data_files = {
    '23_03': ['P3', 'P4'],
    '23_04': ['P3'],
    '23_05': ['P3'],
    '23_06': ['P3', 'P4'],
    '23_09': ['P3'],
}
# List of P3/P4 files that do not seem to be PbtO2 or that overlap with another PbtO2 file.
# NOTE(review): this list is not referenced anywhere in the visible code; the
# entries are only left out because they are absent from the dict above.
manually_excluded = ['21_08 P3', '20_06 P3', '20_12']

In [None]:
# Check, for every subject folder, whether at least one raw ICP export is present.
# A file counts as a raw ICP export when its name starts with 'ICP', does not end
# with 'quality.txt' and does not contain 'AvgTime'.
# The result is printed per subject; subjects with at least one such file are
# collected in subj_with_icp_files.

subj_with_icp_files = []
# os.listdir returns entries in arbitrary order; sort so that the printed report
# and the collected list are reproducible between runs and machines.
for year_dir in sorted(os.listdir(main_data_path)):
    year_path = os.path.join(main_data_path, year_dir)
    if not os.path.isdir(year_path):
        continue
    for subj_dir in sorted(os.listdir(year_path)):
        subj_path = os.path.join(year_path, subj_dir)
        if not os.path.isdir(subj_path):
            continue

        icp_files = [
            file for file in sorted(os.listdir(subj_path))
            if file.startswith('ICP') and not file.endswith('quality.txt') and 'AvgTime' not in file
        ]

        if icp_files:
            print(subj_dir, icp_files)
            subj_with_icp_files.append(subj_dir)
        else:
            print(subj_dir, 'ICP file not found')

In [None]:
# One empty accumulator frame per extracted variable; the extraction loop
# appends every subject's data to these.
(
    cpp_df,
    ptio2_df,
    temperature_df,
    hr_df,
    lpr_df,
    ci_df,
    etco2_df,
    prx_df,
    icp_df,
) = (pd.DataFrame() for _ in range(9))

def _load_variable_files(subj_path, file_names, value_column=None, inc_column=None, **read_csv_kwargs):
    """Read and stack all exported files of one variable for one subject.

    Parameters
    ----------
    subj_path : str
        Directory that contains the subject's exported files.
    file_names : list of str
        Names of the tab-separated files inside ``subj_path`` to read.
    value_column : str, optional
        New name given to the last column of every file. If None, the column
        names are left as found in the file.
    inc_column : str or callable, optional
        Only applied to files whose name contains 'inc': such files are reduced
        to the columns 'Day#', 'ClockTime' and this header. A callable receives
        the file name and returns the header. If None, no column selection
        takes place.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of all files, or an empty frame of shape (0, 0)
        when ``file_names`` is empty.
    """
    frames = []
    for file_name in file_names:
        single_df = pd.read_csv(os.path.join(subj_path, file_name), sep='\t', **read_csv_kwargs)
        if inc_column is not None and 'inc' in file_name:
            column = inc_column(file_name) if callable(inc_column) else inc_column
            single_df = single_df[['Day#', 'ClockTime', column]]
        if value_column is not None:
            # the measured value is the last column of the file
            single_df.columns = [*single_df.columns[:-1], value_column]
        frames.append(single_df)
    # concatenate once instead of growing a frame file by file
    return pd.concat(frames) if frames else pd.DataFrame()


# Walk main_data_path/<year>/<subject>, read every variable of interest for each
# subject, attach the subject id and a datetime string, and append the result to
# the per-variable accumulator frames.
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the resulting frames reproducible.
for year_dir in sorted(os.listdir(main_data_path)):
    year_path = os.path.join(main_data_path, year_dir)
    if not os.path.isdir(year_path):
        continue
    for subj_dir in sorted(os.listdir(year_path)):
        subj_path = os.path.join(year_path, subj_dir)
        if not os.path.isdir(subj_path):
            continue

        # Without a metadata row there is no recording start date, so no
        # datetime can be built; report and skip instead of failing with an
        # IndexError on `.values[0]`.
        subj_recording_start = pbto2_meta_data_df.loc[
            pbto2_meta_data_df['pat_nr'] == subj_dir, 'Recording Start Time'
        ]
        if subj_recording_start.empty:
            print(subj_dir, 'not found in PbtO2 metadata, subject skipped')
            continue
        subj_recording_start_date = pd.to_datetime(
            subj_recording_start.str[:17], format="%Y %b %d %H:%M"
        ).dt.date.values[0]

        # Prefixes of manually verified extra PbtO2 files for this subject
        # (empty tuple -> startswith() is always False).
        supplementary_prefixes = tuple(manually_verified_supplementary_data_files.get(subj_dir, ()))

        pbto2_files = []
        cpp_files = []
        hr_files = []
        temperature_files = []
        lpr_files = []
        ci_files = []
        etco2_files = []
        prx_files = []
        icp_files = []
        for file in sorted(os.listdir(subj_path)):
            # quality files and 'AvgTime' exports are never used, for any variable
            if file.endswith('quality.txt') or 'AvgTime' in file:
                continue
            if file.startswith('PbtO2'):
                pbto2_files.append(file)
            if file.startswith(supplementary_prefixes):
                pbto2_files.append(file)
            if file.startswith('CPP') and 'CPP2' not in file:
                cpp_files.append(file)
            if file.startswith('HR') and 'PiCCO' not in file:
                hr_files.append(file)
            if file.startswith('Tcore'):
                temperature_files.append(file)
            if file.startswith('LPR'):
                lpr_files.append(file)
            if file.startswith('CI'):
                ci_files.append(file)
            if file.startswith('EtCO2'):
                etco2_files.append(file)
            if file.startswith('PRx'):
                prx_files.append(file)
            if file.startswith('ICP'):
                icp_files.append(file)

        # Drop PbtO2 files with 'inc' in the name, unless all files have 'inc' in the name.
        pbto2_files_without_inc = [f for f in pbto2_files if 'inc' not in f]
        if pbto2_files_without_inc:
            pbto2_files = pbto2_files_without_inc

        subj_pbto2_df = _load_variable_files(
            subj_path, pbto2_files, 'ptio2',
            inc_column='PbtO2,na,Numeric,Float,LicoxLCX02 (mmHg):Med')
        subj_cpp_df = _load_variable_files(
            subj_path, cpp_files, 'cpp',
            inc_column='CPP,na,Numeric,Float,CARESCAPE (mmHg):Med')
        subj_temperature_df = _load_variable_files(
            subj_path, temperature_files, 'temperature',
            inc_column='Tcore,na,Numeric,Float,CARESCAPE (C):Med',
            encoding_errors='ignore')
        # HR files get no 'inc' column selection
        subj_hr_df = _load_variable_files(subj_path, hr_files, 'hr')
        subj_lpr_df = _load_variable_files(
            subj_path, lpr_files, 'lpr',
            inc_column=lambda f: f"{f.split(',')[0]},na,SparseNumeric,Float,ManualEntry:Med")
        # CI files are stacked as they are: no column selection and no renaming
        subj_ci_df = _load_variable_files(subj_path, ci_files)
        subj_etco2_df = _load_variable_files(
            subj_path, etco2_files, 'etco2',
            inc_column=lambda f: f"{f.split(',')[0]},na,Numeric,Float,CARESCAPE (mmHg):Med")
        # header example: PRx,na,Numeric,Float,Reader Plugins,ICP=ICP (CARESCAPE),ABP=ART (CARESCAPE):Med
        subj_prx_df = _load_variable_files(
            subj_path, prx_files, 'prx',
            inc_column=lambda f: f"{f.split('_')[0]}:Med")
        subj_icp_df = _load_variable_files(
            subj_path, icp_files, 'icp',
            inc_column=lambda f: f"{f.split(',')[0]},Mean,Numeric,Float,CARESCAPE (mmHg):Med")

        for subj_var_df in [subj_pbto2_df, subj_cpp_df, subj_temperature_df, subj_hr_df, subj_lpr_df, subj_ci_df, subj_etco2_df, subj_prx_df,
                            subj_icp_df]:
            if subj_var_df.shape == (0,0):
                continue
            # pad clock times of at most 5 characters with ':00'
            subj_var_df['ClockTime'] = subj_var_df['ClockTime'].apply(lambda x: x if len(x) > 5 else x + ':00')
            subj_var_df['recording_start_date'] = subj_recording_start_date
            # NOTE(review): 'Day#' is added to the start date as a day offset -
            # confirm whether the export counts days from 0 or from 1.
            subj_var_df['datetime'] = subj_var_df['recording_start_date'] + pd.to_timedelta(subj_var_df['Day#'], unit='d')
            subj_var_df['datetime'] = subj_var_df['datetime'].astype(str) + ' ' + subj_var_df['ClockTime']

            subj_var_df['pat_nr'] = subj_dir

        cpp_df = pd.concat([cpp_df, subj_cpp_df])
        ptio2_df = pd.concat([ptio2_df, subj_pbto2_df])
        temperature_df = pd.concat([temperature_df, subj_temperature_df])
        hr_df = pd.concat([hr_df, subj_hr_df])
        lpr_df = pd.concat([lpr_df, subj_lpr_df])
        ci_df = pd.concat([ci_df, subj_ci_df])
        etco2_df = pd.concat([etco2_df, subj_etco2_df])
        prx_df = pd.concat([prx_df, subj_prx_df])
        icp_df = pd.concat([icp_df, subj_icp_df])
        
# Remove the 'MRN' column from every aggregated frame.
# errors='ignore' keeps this from raising a KeyError for a frame without that
# column, e.g. a variable for which no file was found (the frame is still empty)
# or one built only from 'inc' files, which are reduced to 'Day#', 'ClockTime'
# and the value column.
# The drop stays in place because the loop variable cannot rebind the global names.
for var_df in [cpp_df, ptio2_df, temperature_df, hr_df, lpr_df, ci_df, etco2_df, prx_df, icp_df]:
    var_df.drop(columns=['MRN'], inplace=True, errors='ignore')

In [None]:
# Summary statistics of the extracted 'icp' values as a quick plausibility check.
icp_df.icp.describe()

In [None]:
# Preview the CI frame; its value column keeps the header found in the file
# because no renaming is applied to CI files.
ci_df.head()

In [None]:
# Preview the temperature frame (value column 'temperature').
temperature_df.head()

In [None]:
# Preview the CPP frame (value column 'cpp').
cpp_df.head()

In [None]:
# Preview the PbtO2 frame (value column 'ptio2').
ptio2_df.head()

In [None]:
# Preview the LPR frame (value column 'lpr').
lpr_df.head()

In [None]:
# Preview the HR frame (value column 'hr').
hr_df.head()

In [None]:
# Preview the ICP frame (value column 'icp').
icp_df.head()

In [None]:
# cpp_df.to_csv('/Users/jk1/Downloads/cpp_df.csv', index=False)
# ptio2_df.to_csv('/Users/jk1/Downloads/ptio2_df.csv', index=False)
# hr_df.to_csv('/Users/jk1/Downloads/hr_df.csv', index=False)
# lpr_df.to_csv('/Users/jk1/Downloads/lpr_df.csv', index=False)
# temperature_df.to_csv('/Users/jk1/Downloads/temperature_df.csv', index=False)
# ci_df.to_csv('/Users/jk1/Downloads/ci_df.csv', index=False)
# etco2_df.to_csv('/Users/jk1/Downloads/etco2_df.csv', index=False)
# prx_df.to_csv('/Users/jk1/Downloads/prx_df.csv', index=False)
# icp_df.to_csv('/Users/jk1/Downloads/icp_df.csv', index=False)