In [None]:
import pandas as pd
import numpy as np

In [None]:
# Input locations handed to assemble_variable_database.
# NOTE(review): absolute, machine-specific paths — they must be edited before the notebook can run elsewhere.
data_path = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815'
admission_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx'
patient_selection_path = '/Users/jk1/temp/opsum_extraction_output/high_frequency_data_patient_selection_with_details.csv'

In [None]:
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# Assemble the feature table from the three input locations configured earlier in the notebook.
feature_df = assemble_variable_database(data_path, admission_data_path, patient_selection_path)

## Elaborating transformation function

In [None]:
# Preview the first rows of the assembled table.
feature_df.head()

In [None]:
# Layout used to parse sample_date: day.month.year hour:minute (24-hour clock).
datatime_format = '%d.%m.%Y %H:%M'

In [None]:
# Parse sample_date with the configured layout, then take the earliest
# timestamp of every case_admission_id.
feature_df['sample_date'] = pd.to_datetime(
    feature_df['sample_date'],
    format=datatime_format,
)
first_sample_date = feature_df.groupby('case_admission_id')['sample_date'].min()

In [None]:
# Preview the earliest sample date per admission.
first_sample_date.head()

In [None]:
# Move case_admission_id out of the index into a regular column so it can be used as a merge key.
# NOTE(review): not idempotent — re-running this cell on the already-reset frame adds an extra index column.
first_sample_date = first_sample_date.reset_index(level=0)


In [None]:
# Use str keys on both sides of the upcoming merge and give the per-admission
# minimum a name that cannot clash with feature_df's own sample_date column.
feature_df['case_admission_id'] = feature_df['case_admission_id'].astype(str)
first_sample_date = (
    first_sample_date
    .assign(case_admission_id=first_sample_date['case_admission_id'].astype(str))
    .rename(columns={'sample_date': 'first_sample_date'})
)

In [None]:
# Attach the per-admission first sample date to every row (default inner join on case_admission_id).
feature_df = feature_df.merge(first_sample_date, on='case_admission_id')


In [None]:
# Preview the table after the merge.
feature_df.head()

In [None]:
# Hours elapsed between each sample and the first sample of the same admission.
# `.dt.total_seconds()` is required here: `.dt.seconds` only returns the seconds
# *component* of a Timedelta (0..86399) and silently drops whole days, so every
# sample taken 24h or more after the first one would wrap back into the 0-24h range.
feature_df['relative_sample_date'] = (
    pd.to_datetime(feature_df['sample_date'], format=datatime_format)
    - pd.to_datetime(feature_df['first_sample_date'], format=datatime_format)
).dt.total_seconds() / (60 * 60)

In [None]:
# Inspect the first 500 rows, including the relative_sample_date column.
# NOTE(review): large output — consider clearing it before sharing the notebook.
feature_df.head(500)

In [None]:
# Sanity check: expected False, because first_sample_date is the per-admission minimum of sample_date.
(feature_df['relative_sample_date'] < 0).any()

#### Finding edge cases where the EHR starts too early

In [None]:
# Work on a copy so the exploration below leaves feature_df untouched.
temp_df = feature_df.copy()

In [None]:
# Same day.month.year hour:minute layout as defined earlier in the notebook.
datatime_format = '%d.%m.%Y %H:%M'

# Make sure sample_date is datetime-typed on the working copy.
# (The per-admission minima are computed in the following cells.)
temp_df['sample_date'] = pd.to_datetime(temp_df['sample_date'], format=datatime_format)

In [None]:
import numpy as np
# find number of case admission ids in feature_df where sample date of source == stroke registry is after first sample date of source == EHR

# Earliest timestamp per admission, computed separately for each data source.
first_ehr_sample_date = (
    temp_df.loc[temp_df.source == 'EHR']
    .groupby('case_admission_id')['sample_date'].min()
    .reset_index(level=0)
    .rename(columns={'sample_date': 'first_ehr_sample_date'})
)
first_registry_sample_date = (
    temp_df.loc[temp_df.source == 'stroke_registry']
    .groupby('case_admission_id')['sample_date'].min()
    .reset_index(level=0)
    .rename(columns={'sample_date': 'first_registry_sample_date'})
)

# Offset of the first EHR sample relative to the first registry sample
# (negative = EHR starts earlier), both as a Timedelta and in hours.
merged_df = (
    first_ehr_sample_date
    .merge(first_registry_sample_date, on='case_admission_id')
    .assign(
        delta_first_sample_date_dt=lambda df: df['first_ehr_sample_date'] - df['first_registry_sample_date'],
        delta_first_sample_date_h=lambda df: df['delta_first_sample_date_dt'] / np.timedelta64(1, 'h'),
    )
)

In [None]:
# Earliest EHR-recorded NIHSS timestamp per admission.
first_NIHSS_sample_date = (
    temp_df[(temp_df.sample_label == 'NIHSS') & (temp_df.source == 'EHR')]
    .groupby('case_admission_id').sample_date.min()
    .reset_index(level=0)
)

# Admissions without any EHR NIHSS sample get an explicit missing timestamp so
# that they are not dropped by the inner merge below. Sorting makes the row
# order deterministic (a bare set has no defined order).
cids_without_NIHSS = sorted(
    set(temp_df.case_admission_id.unique())
    .difference(first_NIHSS_sample_date.case_admission_id.unique())
)
missing_NIHSS_sample_date = pd.DataFrame({'case_admission_id': cids_without_NIHSS})
# pd.NaT (rather than np.nan) keeps the column datetime-typed, so the
# concatenated column stays usable in datetime arithmetic.
missing_NIHSS_sample_date['sample_date'] = pd.NaT

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
first_NIHSS_sample_date = pd.concat(
    [first_NIHSS_sample_date, missing_NIHSS_sample_date],
    ignore_index=True,
).rename(columns={'sample_date': 'first_NIHSS_sample_date'})

merged_df = merged_df.merge(first_NIHSS_sample_date, on='case_admission_id')

In [None]:
# Hours from the first registry sample to the first EHR NIHSS sample (negative = NIHSS recorded first).
merged_df['delta_first_NIHSS_to_registry_start_date_h'] = (merged_df['first_NIHSS_sample_date'] - merged_df['first_registry_sample_date']) / np.timedelta64(1, 'h')


In [None]:
# Admissions whose first EHR sample lies more than 24h before their first registry sample.
merged_df[merged_df['delta_first_sample_date_h'] < -24]

In [None]:
# Ids of the admissions whose first EHR sample lies more than 24h before their first registry sample.
cids_with_problematic_delta = merged_df[merged_df['delta_first_sample_date_h'] < -24].case_admission_id.unique()

In [None]:
# Inspect all rows of a single admission.
temp_df[temp_df.case_admission_id == '207196_7187']

In [None]:
# find first EHR sample in the 24h before first registry sample

def find_first_EHR_in_24h_from_registry_start(pa_id):
    """Return the earliest EHR sample date later than (registry start - 24h) for one admission.

    Reads the notebook-level frames ``merged_df`` (column
    ``first_registry_sample_date``) and ``temp_df`` (columns
    ``case_admission_id``, ``source``, ``sample_date``).

    Parameters
    ----------
    pa_id
        The ``case_admission_id`` to look up.

    Returns
    -------
    The minimum ``sample_date`` among this admission's rows with
    ``source == 'EHR'`` that are strictly later than 24h before its first
    registry sample, or ``NaT`` when no row qualifies. There is no upper
    bound: if no EHR sample falls inside the 24h window, the first EHR sample
    after the registry start is returned.

    Raises
    ------
    IndexError
        If ``pa_id`` has no row in ``merged_df``.
    """
    registry_start = merged_df.loc[merged_df.case_admission_id == pa_id, 'first_registry_sample_date']
    if registry_start.empty:
        raise IndexError(f'No first registry sample date available for case_admission_id {pa_id!r}')
    window_start = registry_start.iloc[0] - pd.Timedelta(hours=24)

    # All masks are built on temp_df itself, so they share one index. This avoids
    # filtering a per-subject slice with a mask indexed on the full frame, and
    # avoids writing a helper column onto a slice (SettingWithCopyWarning).
    is_subject = temp_df.case_admission_id == pa_id
    is_ehr = temp_df.source == 'EHR'
    is_after_window_start = temp_df.sample_date > window_start
    return temp_df.loc[is_subject & is_ehr & is_after_window_start, 'sample_date'].min()


In [None]:
# plot number of samples per sample_date for a given case_admission_id
import matplotlib.pyplot as plt
import seaborn as sns

def plot_samples_per_sample_date(cid):
    """Scatter the number of samples per timestamp for one admission, with reference markers.

    Reads the notebook-level frames ``temp_df``, ``first_registry_sample_date``
    and ``first_NIHSS_sample_date``. Besides the per-timestamp sample counts,
    three markers are drawn at fixed heights so they do not overlap:

    - red (y=10): first registry sample date
    - green (y=11): first NIHSS sample date
    - purple (y=12): result of ``find_first_EHR_in_24h_from_registry_start``

    The admission id, registry start and first NIHSS date are also printed,
    and the figure is shown.

    Parameters
    ----------
    cid
        The ``case_admission_id`` to plot.
    """
    # Restrict to this admission *before* counting; grouping the whole table on
    # every call is wasted work when the function is run in a loop.
    cid_df = (
        temp_df[temp_df.case_admission_id == cid]
        .groupby(['case_admission_id', 'sample_date']).sample_label.count()
        .reset_index()
        .rename(columns={'sample_label': 'n_samples'})
    )

    # Look the reference dates up once and reuse them for plotting and printing.
    registry_start = first_registry_sample_date.loc[
        first_registry_sample_date.case_admission_id == cid, 'first_registry_sample_date'].iloc[0]
    first_NIHSS = first_NIHSS_sample_date.loc[
        first_NIHSS_sample_date.case_admission_id == cid, 'first_NIHSS_sample_date'].iloc[0]
    first_EHR_in_24h_from_registry_start = find_first_EHR_in_24h_from_registry_start(cid)

    ax = sns.scatterplot(x='sample_date', y='n_samples', data=cid_df)
    sns.scatterplot(x=[registry_start], y=[10], color='red', ax=ax)
    sns.scatterplot(x=[first_NIHSS], y=[11], color='green', ax=ax)
    # x is wrapped in a list like the other markers so x and y are both 1-element vectors.
    sns.scatterplot(x=[first_EHR_in_24h_from_registry_start], y=[12], color='purple', ax=ax)
    ax.set_title(f'Samples per sample date: {cid}')
    # rotate x axis labels
    plt.setp(ax.get_xticklabels(), rotation=45)
    print(f'Subj: {cid}')
    print('Registry start:', registry_start)
    print('first NIHSS', first_NIHSS)
    plt.show()

In [None]:
# One figure per admission whose first EHR sample precedes the registry start by more than 24h.
for cid in cids_with_problematic_delta:
    plot_samples_per_sample_date(cid)

Possible rule for handling this edge case:
- default reference for the start: first sample date of the EHR
- when the first sample date of the EHR is more than 1 day before the first sample date of the stroke registry:
-> use the first EHR sample within the 24h before the start according to the registry


Old way, not used anymore:
- when first sample date of EHR is more than 1 day before first sample date of stroke registry:
    - if first sample of NIHSS is before first sample of stroke registry -> reference is first sample of EHR (as first sample of NIHSS occurs after other data is available)
    - if first sample of NIHSS is after first sample of stroke registry -> reference is first sample of stroke registry
    -> remove samples occurring before reference


## Testing transformation function

In [None]:
# Start the transformation test from a fresh copy of feature_df.
temp_df = feature_df.copy()

In [None]:
# Number of distinct admissions before the transformation.
len(temp_df.case_admission_id.unique())

In [None]:
from preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps

# Run the project's relative-timestamp transformation on the working copy.
# The meaning of each keyword is defined in preprocessing.variable_assembly.relative_timestamps.
temp_df = transform_to_relative_timestamps(
    temp_df,
    drop_old_columns=False,
    restrict_to_time_range=True,
    desired_time_range=72,
    enforce_min_time_range=True,
    min_time_range=12,
    log_dir='/Users/jk1/temp/opsum_extraction_output/test',
)

In [None]:
# Number of distinct admissions after the transformation (compare with the count taken before it).
len(temp_df.case_admission_id.unique())


In [None]:
# Preview the transformed table.
temp_df.head()

#### Testing functional selection of starting date

In [None]:
# Earliest EHR timestamp per admission; FIO2 rows are left out of the search
first_ehr_sample_date = (
    temp_df.loc[(temp_df.source == 'EHR') & (temp_df.sample_label != 'FIO2')]
    .groupby('case_admission_id')['sample_date'].min()
    .reset_index(level=0)
    .rename(columns={'sample_date': 'first_ehr_sample_date'})
)

# Earliest stroke-registry timestamp per admission
first_registry_sample_date = (
    temp_df.loc[temp_df.source == 'stroke_registry']
    .groupby('case_admission_id')['sample_date'].min()
    .reset_index(level=0)
    .rename(columns={'sample_date': 'first_registry_sample_date'})
)

# Join both and express the EHR-minus-registry offset in hours
merged_first_sample_dates_df = (
    first_ehr_sample_date
    .merge(first_registry_sample_date, on='case_admission_id')
    .assign(
        delta_first_sample_date_h=lambda df: (
            df['first_ehr_sample_date'] - df['first_registry_sample_date']
        ) / np.timedelta64(1, 'h')
    )
)

In [None]:
# Display the first EHR / registry dates and their offset in hours for every admission.
merged_first_sample_dates_df

The start-date selection is functional if the stroke registry date and the first EHR sample date are less than 24h apart.

##### Check restriction to minimum sampling range

In [None]:
# Threshold on relative_sample_date used below to flag admissions with a short sampling range.
# Same value as the min_time_range passed to transform_to_relative_timestamps (presumably hours — TODO confirm).
min_time_range = 12

In [None]:
# check range of relative_sample_date for each case_admission_id:
# maximum over all non-registry rows, then show the admissions whose maximum is below 24
max_sampling_dates = temp_df[temp_df.source != 'stroke_registry'].groupby('case_admission_id').relative_sample_date.max().reset_index()
max_sampling_dates[max_sampling_dates.relative_sample_date < 24]

In [None]:
# Largest relative_sample_date per admission, ignoring stroke-registry rows.
max_sampling_dates = (
    temp_df.loc[temp_df.source != 'stroke_registry']
    .groupby('case_admission_id')['relative_sample_date'].max()
    .reset_index()
)
# Admissions whose largest relative_sample_date stays below min_time_range.
cid_with_short_range = (
    max_sampling_dates
    .loc[max_sampling_dates.relative_sample_date < min_time_range, 'case_admission_id']
    .unique()
)

In [None]:
# Show the admissions whose sampling range is below min_time_range.
cid_with_short_range

In [None]:
# Inspect the sampling timeline of a single admission.
plot_samples_per_sample_date('268410_7611')

268410_7611 is an exception because the patient was transferred between two hospitals.