In [None]:
import os
import pandas as pd

In [None]:
# Input locations used by the rest of the notebook.
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.

# CSV that is loaded into `ptio2_df` below.
ptio2_path = '/Users/jk1/stroke_datasets/ptiO2-Studie/moberg_extracted_data/ptio2_df.csv'
# Root folder that is walked two levels deep (first-level folder / subject folder)
# to find the per-subject '*quality.txt' files.
main_data_path = '/Users/jk1/stroke_datasets/ptiO2-Studie/moberg_data'
# CSV that is loaded into `pbto2_meta_data_df`; its 'pat_nr' and
# 'Recording Start Time' columns are used during the quality extraction.
pbto2_meta_data_path = '/Users/jk1/stroke_datasets/ptiO2-Studie/pbto2_for_extraction.csv'

In [None]:
# Extra quality files to include for specific subjects.
# Keys are subject folder names (compared against the folder name during extraction);
# values are filename prefixes: a file in that subject's folder is also picked up
# when its name starts with one of these prefixes (and ends with 'quality.txt',
# without 'AvgTime' in the name).
manually_verified_supplementary_data_files = {
    '23_03': ['P3', 'P4'],
    '23_04': ['P3'],
    '23_05': ['P3'],
    '23_06': ['P3', 'P4'],
    '23_09': ['P3'],
}

In [None]:
# Load the measurement table (later filtered per 'pat_nr' and compared on 'datetime')
# and the per-subject metadata table.
ptio2_df = pd.read_csv(ptio2_path)
pbto2_meta_data_df = pd.read_csv(pbto2_meta_data_path)

In [None]:
# `format_date_column` comes from the project-local `utils` module.
from utils import format_date_column

# Normalise the recording start time column in place.
# NOTE(review): the extraction cell later parses the first 17 characters of this
# column with the format "%Y %b %d %H:%M", so the helper presumably returns strings
# in that layout - confirm against utils.format_date_column.
pbto2_meta_data_df['Recording Start Time'] = format_date_column(pbto2_meta_data_df['Recording Start Time'])

In [None]:
# Preview the first rows of the loaded measurement table.
ptio2_df.head()

# Extract quality data

In [None]:
# Build `ptio2_quality_df`: one row per data-quality annotation, for all subjects.
#
# Directory layout walked here: <main_data_path>/<first-level dir>/<subject dir>/<files>.
# For every subject folder the tab-separated '*quality.txt' files are read, their
# last column is renamed to 'ptio2_quality', and a 'datetime' string is derived
# from the subject's recording start date, the 'Day#' column and 'ClockTime'.
#
# Per-subject frames are collected in a list and concatenated once at the end
# (instead of repeatedly concatenating onto an empty DataFrame, which is quadratic
# and relies on pandas' deprecated handling of empty entries in concat).
subj_quality_frames = []

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# row order of the result reproducible across runs and machines.
for year_dir in sorted(os.listdir(main_data_path)):
    year_path = os.path.join(main_data_path, year_dir)
    if not os.path.isdir(year_path):
        continue
    for subj_dir in sorted(os.listdir(year_path)):
        subj_path = os.path.join(year_path, subj_dir)
        if not os.path.isdir(subj_path):
            continue

        # Accepted filename prefixes: 'PbtO2' for everyone, plus the manually
        # verified supplementary prefixes registered for this subject (if any).
        accepted_prefixes = ('PbtO2',) + tuple(manually_verified_supplementary_data_files.get(subj_dir, ()))

        # Keep files that start with an accepted prefix, end with 'quality.txt'
        # and do not contain 'AvgTime'. A single combined check guarantees that
        # a file matching several prefixes is only added once.
        pbto2_quality_files = [
            file for file in sorted(os.listdir(subj_path))
            if file.startswith(accepted_prefixes)
            and file.endswith('quality.txt')
            and 'AvgTime' not in file
        ]

        if len(pbto2_quality_files) > 1:
            # several candidates: drop the ones with 'inc' in the name
            pbto2_quality_files = [f for f in pbto2_quality_files if 'inc' not in f]

        if not pbto2_quality_files:
            # nothing to extract for this folder; skip before touching the meta
            # data so folders without quality files never need a meta data entry
            continue

        single_quality_frames = []
        for pbto2_quality_file in pbto2_quality_files:
            pbto2_path = os.path.join(subj_path, pbto2_quality_file)
            single_pbto2_df = pd.read_csv(pbto2_path, sep='\t')
            # the last column holds the quality flag; give it a uniform name
            single_pbto2_df.columns = [*single_pbto2_df.columns[:-1], 'ptio2_quality']
            single_quality_frames.append(single_pbto2_df)
        subj_pbto2_quality_df = pd.concat(single_quality_frames)

        if subj_pbto2_quality_df.empty:
            continue

        # Recording start date of this subject, taken from the meta data table.
        subj_recording_start = pbto2_meta_data_df.loc[
            pbto2_meta_data_df['pat_nr'] == subj_dir, 'Recording Start Time'
        ]
        if subj_recording_start.empty:
            # fail with a clear message instead of an opaque IndexError on .values[0]
            raise ValueError(
                f"Subject '{subj_dir}' has quality files in {subj_path} "
                "but no row in pbto2_meta_data_df (column 'pat_nr')."
            )
        # only the first 17 characters ("YYYY Mon DD HH:MM") are parsed; the first match is used
        subj_recording_start_date = pd.to_datetime(
            subj_recording_start.str[:17], format="%Y %b %d %H:%M"
        ).dt.date.values[0]

        # pad clock times of 5 or fewer characters (e.g. 'HH:MM') with seconds
        subj_pbto2_quality_df['ClockTime'] = subj_pbto2_quality_df['ClockTime'].apply(
            lambda x: x if len(x) > 5 else x + ':00'
        )
        subj_pbto2_quality_df['recording_start_date'] = subj_recording_start_date
        # calendar day of each row = recording start date + 'Day#' days (Day# is used as-is)
        subj_pbto2_quality_df['datetime'] = (
            pd.to_datetime(subj_pbto2_quality_df['recording_start_date'])
            + pd.to_timedelta(subj_pbto2_quality_df['Day#'], unit='d')
        )
        # final 'datetime' is a string: "<date> <ClockTime>"
        subj_pbto2_quality_df['datetime'] = (
            subj_pbto2_quality_df['datetime'].astype(str) + ' ' + subj_pbto2_quality_df['ClockTime']
        )
        subj_pbto2_quality_df['pat_nr'] = subj_dir

        subj_quality_frames.append(subj_pbto2_quality_df)

ptio2_quality_df = pd.concat(subj_quality_frames) if subj_quality_frames else pd.DataFrame()
# The 'MRN' column is not needed downstream; errors='ignore' keeps the cell
# runnable when no quality file was found at all (empty frame without columns).
ptio2_quality_df = ptio2_quality_df.drop(columns=['MRN'], errors='ignore')

In [None]:
# Display the combined quality annotations of all subjects.
ptio2_quality_df

In [None]:
# Count how often each distinct quality flag occurs.
ptio2_quality_df.ptio2_quality.value_counts()

In [None]:
# Quality flags act as state switches in the filtering cell below:
# - a "bad" flag marks every measurement at or after the flag's time as bad quality,
# - a "good" flag marks every measurement at or after the flag's time as good again.
# Flags that appear in neither list leave the current state unchanged.
# Note that 'Unapproved Source' on its own is treated as good quality.
bad_data_quality_flags = ['Value Out Of Range', 'Suppress Data; Unapproved Source', 'Suppress Data', 'Value Out Of Range; Unapproved Source']
good_data_quality_flags = ['Data Quality Normal', 'Unapproved Source']

In [None]:
# Add a 'bad_quality_data' column (0 = good, 1 = bad) to the measurements.
#
# For each subject the quality flags are replayed in chronological order: every
# bad flag sets all measurements at or after its time to 1, every good flag sets
# them back to 0, so the last flag before a measurement decides its state.
# Measurements before the first flag (or of subjects without flags) stay 0.
#
# NOTE(review): 'datetime' is compared as-is. Both columns appear to be strings
# ("<date> <time>"), so the ordering is lexicographic and only correct while both
# tables use the same zero-padded layout - confirm.
filtered_subj_frames = []  # concatenated once after the loop (avoids quadratic concat)
for subj in ptio2_df['pat_nr'].unique():
    # .copy(): the boolean-mask selection must be an independent frame before a
    # column is added, otherwise pandas warns (SettingWithCopy)
    subj_df = ptio2_df[ptio2_df['pat_nr'] == subj].copy()
    subj_df['bad_quality_data'] = 0

    # Stable sort: flags sharing the same timestamp keep their original relative
    # order, so the result no longer depends on the sort algorithm's tie handling.
    subj_quality_df = ptio2_quality_df[ptio2_quality_df['pat_nr'] == subj].sort_values(
        by='datetime', kind='mergesort'
    )

    for flag_time, flag in zip(subj_quality_df['datetime'], subj_quality_df['ptio2_quality']):
        if flag in bad_data_quality_flags:
            subj_df.loc[subj_df['datetime'] >= flag_time, 'bad_quality_data'] = 1
        elif flag in good_data_quality_flags:
            subj_df.loc[subj_df['datetime'] >= flag_time, 'bad_quality_data'] = 0

    filtered_subj_frames.append(subj_df)

filtered_ptio2_df = pd.concat(filtered_subj_frames) if filtered_subj_frames else pd.DataFrame()
            

In [None]:
# Display the measurements together with the new 'bad_quality_data' column.
filtered_ptio2_df

In [None]:
# Per subject: fraction of measurements with bad_quality_data == 0 vs == 1
# (normalize=True returns proportions instead of counts).
filtered_ptio2_df.groupby('pat_nr')['bad_quality_data'].value_counts(normalize=True)

In [None]:
# Spot check: quality annotations of subject '20_12'.
ptio2_quality_df[ptio2_quality_df['pat_nr'] == '20_12']

In [None]:
# Spot check: measurements of subject '20_12' that were marked as bad quality.
# NOTE(review): `temp` is not displayed here and is reassigned inside the plotting
# loop of the next cell before any visible use.
temp = filtered_ptio2_df[(filtered_ptio2_df['pat_nr'] == '20_12') & (filtered_ptio2_df['bad_quality_data'] == 1)]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# subj list 19_15, 20_01, 20_06, 20_09, 20_12, 21_12 21_16
# NOTE(review): `subj_list` is defined but not used; the loop below covers every
# subject in `filtered_ptio2_df`. Confirm whether only these subjects were meant.
subj_list = ['19_15', '20_01', '20_06', '20_09', '20_12', '21_12', '21_16']

# One histogram per subject of the 'ptio2' values that were marked as bad quality.
for subj in filtered_ptio2_df['pat_nr'].unique():
    bad_quality_subj_df = filtered_ptio2_df[
        (filtered_ptio2_df['pat_nr'] == subj) & (filtered_ptio2_df['bad_quality_data'] == 1)
    ]
    # Coerce to numbers explicitly: unparseable entries become NaN and are dropped,
    # rather than aborting the whole subject behind a silent bare `except`.
    bad_quality_ptio2 = pd.to_numeric(bad_quality_subj_df['ptio2'], errors='coerce').dropna()
    if bad_quality_ptio2.empty:
        # nothing to plot; skip instead of leaving an empty figure behind
        continue

    fig, ax = plt.subplots()
    sns.histplot(bad_quality_ptio2, ax=ax)
    ax.set_title(subj)


In [None]:
# filtered_ptio2_df.to_csv('/Users/jk1/stroke_datasets/ptiO2-Studie/moberg_extracted_data/ptio2_df_filtered.csv', index=False)