In [None]:
import pandas as pd
import os

In [None]:
# Root directory of the study export; the screening loop expects
# <main_data_path>/<year>/<pat_nr>/ sub-directories below it.
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
main_data_path = '/Users/jk1/stroke_datasets/ptiO2-Studie'
# The registry spreadsheet is stored directly inside the study root.
registry_path = f'{main_data_path}/moberg_registry_kssg.xlsx'

In [None]:
# Patient directory name -> additional file-name prefixes. For these patients
# the screening loop also accepts files starting with one of the listed
# prefixes (besides 'PbtO2') as PbtO2 signal files.
manually_verified_supplementary_data_files = dict([
    ('23_03', ['P3', 'P4']),
    ('23_04', ['P3']),
    ('23_05', ['P3']),
    ('23_06', ['P3', 'P4']),
    ('23_09', ['P3']),
])

In [None]:
# Screen every patient directory below main_data_path (<year>/<pat_nr>/) and
# build one summary row per patient: identity fields taken from the annotation
# export plus the time span and row count of the PbtO2 signal files.

columns = ['pat_nr', 'mrn', 'first_name', 'last_name', 'dob', 'year', 'gender', 'recording_start_time', 'recording_end_time', 'first_pbtO2_recording_day', 'last_pbtO2_recording_day', 'first_pbtO2_recording_time', 'last_pbtO2_recording_time', 'n_pbtO2_recordings']

# Summary column -> field name in the (transposed) '*Annotations.csv' export.
_ANNOTATION_FIELDS = {
    'first_name': 'Patient First Name',
    'last_name': 'Patient Last Name',
    'mrn': 'Medical Record Number',
    'dob': 'Date of Birth',
    'gender': 'Gender',
    'recording_start_time': 'Recording Start Time',
    'recording_end_time': 'Recording End Time',
}


def _screen_patient_dir(subj_path, pat_nr, year, supplementary_prefixes=()):
    """Summarise the annotation and PbtO2 files found in one patient directory.

    Args:
        subj_path: Path of the patient directory.
        pat_nr: Patient identifier (the directory name); stored in the row and
            used in the printed messages.
        year: Name of the parent year directory; stored in the row.
        supplementary_prefixes: File-name prefixes that, in addition to
            'PbtO2', mark a file as a PbtO2 signal file for this patient.

    Returns:
        A tuple ``(record, has_pbto2)``. ``record`` is a dict holding a subset
        of the keys in ``columns`` (fields that could not be determined are
        left out); ``has_pbto2`` is True when at least one PbtO2 file was read.

    Raises:
        KeyError: If the annotation file lacks one of the expected fields, or
            a PbtO2 file lacks the 'Day#' / 'ClockTime' columns.
    """
    record = {'pat_nr': pat_nr, 'year': year}

    signal_prefixes = ('PbtO2', *supplementary_prefixes)
    pbto2_files = []
    annotation_files = []
    # Sorted so that the file order (and the annotation file picked below) does
    # not depend on the arbitrary order returned by os.listdir.
    for file in sorted(os.listdir(subj_path)):
        is_signal_file = not file.endswith('quality.txt') and 'AvgTime' not in file
        # A single prefix check so that a file matching several prefixes is
        # listed (and later counted) only once.
        if is_signal_file and file.startswith(signal_prefixes):
            pbto2_files.append(file)
        if file.endswith('Annotations.csv'):
            annotation_files.append(file)

    if annotation_files:
        # The file is read as key/value rows; transposing turns the keys into
        # column names. Only the first annotation file is used.
        annotation_file_path = os.path.join(subj_path, annotation_files[0])
        annotation_df = pd.read_csv(annotation_file_path, header=None, on_bad_lines='skip').set_index(0).T
        for column, annotation_field in _ANNOTATION_FIELDS.items():
            record[column] = annotation_df[annotation_field].values[0]
    else:
        print(f'No annotation file found for patient {pat_nr}')

    if len(pbto2_files) > 1:
        # remove files with 'inc' in the name
        # NOTE(review): if every candidate contains 'inc', the patient ends up
        # without PbtO2 files - confirm this is intended.
        pbto2_files = [f for f in pbto2_files if 'inc' not in f]

    if not pbto2_files:
        print(f'No pbto2 file found for patient {pat_nr}')
        return record, False

    signal_frames = []
    for pbto2_file in pbto2_files:
        single_pbto2_df = pd.read_csv(os.path.join(subj_path, pbto2_file), sep='\t')
        # The last column holds the signal under a file-specific name; give it
        # a common name so that the files can be stacked.
        single_pbto2_df.columns = [*single_pbto2_df.columns[:-1], 'ptio2']
        signal_frames.append(single_pbto2_df)
    pbto2_df = pd.concat(signal_frames)

    min_day = pbto2_df['Day#'].min()
    max_day = pbto2_df['Day#'].max()
    # NOTE(review): min/max of 'ClockTime' follow the column's dtype ordering
    # (lexicographic for strings) - assumes zero-padded clock times, confirm.
    record['first_pbtO2_recording_day'] = min_day
    record['last_pbtO2_recording_day'] = max_day
    record['first_pbtO2_recording_time'] = pbto2_df[pbto2_df['Day#'] == min_day]['ClockTime'].min()
    record['last_pbtO2_recording_time'] = pbto2_df[pbto2_df['Day#'] == max_day]['ClockTime'].max()
    record['n_pbtO2_recordings'] = pbto2_df.shape[0]
    return record, True


n_patients_with_pbto2_data = 0
patient_records = []

for year_dir in sorted(os.listdir(main_data_path)):
    year_path = os.path.join(main_data_path, year_dir)
    if not os.path.isdir(year_path):
        continue
    for subj_dir in sorted(os.listdir(year_path)):
        subj_path = os.path.join(year_path, subj_dir)
        if not os.path.isdir(subj_path):
            continue
        record, has_pbto2 = _screen_patient_dir(
            subj_path,
            subj_dir,
            year_dir,
            supplementary_prefixes=manually_verified_supplementary_data_files.get(subj_dir, ()),
        )
        patient_records.append(record)
        if has_pbto2:
            n_patients_with_pbto2_data += 1

# Build the frame once from all rows instead of concatenating inside the loop;
# keys missing from a record become NaN.
screened_population_df = pd.DataFrame(patient_records, columns=columns)

print(f'Number of patients with pbto2 data: {n_patients_with_pbto2_data}')

In [None]:
# Inspect the per-patient screening table (includes names, date of birth and MRN).
screened_population_df

In [None]:
# Load the patient registry spreadsheet with the pandas defaults.
registry_df = pd.read_excel(io=registry_path)

In [None]:
# Remove the registry's 'Nr.' and 'Jahr' columns, which are not used further on.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
registry_df = registry_df.drop(columns=['Nr.', 'Jahr'], errors='ignore')

In [None]:
# Inspect the registry after the 'Nr.' and 'Jahr' columns were dropped.
registry_df

In [None]:
# Attach the registry columns to every screened patient. A left join keeps
# patients that have no registry entry (their registry columns are NaN).
joined_population_df = pd.merge(
    screened_population_df,
    registry_df,
    how='left',
    left_on='pat_nr',
    right_on='Pat. Nr.',
)

In [None]:
# Inspect the joined table (screening columns plus registry columns).
joined_population_df

In [None]:
# Fill gaps in one identity source with the value from the other one:
# 'mrn' (annotation export) falls back to the registry value, while the registry
# name / date of birth / gender columns fall back to the annotation export.

# Text form of the registry MRN. A trailing '.0' (float-typed column) is removed
# only when it is actually present, and missing values stay missing. Blindly
# cutting the last two characters would turn NaN into the string 'n' and would
# truncate MRNs that are not float-formatted.
registry_mrn = joined_population_df['Medical Record Number']
registry_mrn_text = (
    registry_mrn.astype(str)
    .str.replace(r'\.0$', '', regex=True)
    .where(registry_mrn.notna())
)
joined_population_df['mrn'] = joined_population_df['mrn'].fillna(registry_mrn_text)

joined_population_df['Patient First Name'] = joined_population_df['Patient First Name'].fillna(joined_population_df['first_name'])
joined_population_df['Patient Last Name'] = joined_population_df['Patient Last Name'].fillna(joined_population_df['last_name'])
# NOTE(review): the two date-of-birth sources may use different types/formats
# (spreadsheet vs. CSV text) - confirm before relying on this column.
joined_population_df['Date of Birth'] = joined_population_df['Date of Birth'].fillna(joined_population_df['dob'])
joined_population_df['Gender'] = joined_population_df['Gender'].fillna(joined_population_df['gender'])

In [None]:
# Helper columns that were only needed for the join and for filling gaps.
to_drop = ['Pat. Nr.', 'Medical Record Number', 'first_name', 'last_name', 'dob', 'gender']
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
joined_population_df = joined_population_df.drop(columns=to_drop, errors='ignore')

In [None]:
# List the columns that remain after the helper columns were dropped.
joined_population_df.columns

Prepare for extraction

In [None]:
# Columns handed over for data extraction: patient identity, the recording
# window from the annotation export, the PbtO2 time span, and the registry's
# own recording window.
extraction_columns = [
    'pat_nr',
    'mrn',
    'Patient First Name',
    'Patient Last Name',
    'Date of Birth',
    'recording_start_time',
    'recording_end_time',
    'first_pbtO2_recording_day',
    'last_pbtO2_recording_day',
    'first_pbtO2_recording_time',
    'last_pbtO2_recording_time',
    'Recording Start Time',
    'Recording End Time',
]
for_extraction_df = joined_population_df[extraction_columns]
for_extraction_df

In [None]:
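# Export is disabled by default because the table contains patient identifiers
# (names, date of birth, MRN). Uncomment the next line to write the CSV.
# for_extraction_df.to_csv('/Users/jk1/Downloads/pbto2_for_extraction.csv', index=False)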