In [1]:
import pandas as pd
import seaborn as sns
import os
import numpy as np
from gsd_pipeline.dataset_tools.filter_for_clinical_param import filter_for_clinical_param
from gsd_pipeline.clinical_data.from_2018.extract_clinical_outcomes import extract_clinical_outcomes as extract_2018
from gsd_pipeline.clinical_data.years_2015_2016_2017.extract_clinical_outcomes import extract_clinical_outcomes as extract_2015
from gsd_pipeline.clinical_data.join_multi_annual_outcome_df import join_multi_annual_outcome_df

In [2]:
# Input locations for the clinical data extraction.
# NOTE(review): these are absolute, machine-specific paths; the notebook only runs
# unchanged on a machine that has the same directory layout.

# Spreadsheet listing the selected variables. The path gets its own name so that
# `variable_names` only ever refers to the loaded DataFrame.
variable_names_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/scope/variables/scope_variable_selection.xlsx'

# 2018 cohort: id key file and clinical export.
patient_ids_2018 = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_dataset/imaging_keys/2018/patient_ids_keys_2018.xlsx'
clinical_data_2018 = '/Users/jk1/temp/scope_clinical_data_extraction/SSR_cases_of_2018_(Adm,_Hosp_and_FU).xlsx'

# 2015-2017 cohort: id key file and clinical export.
patient_ids_2015_2016_2017 = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_dataset/imaging_keys/2015-2017/anonymisation_key_pCT_2016_2017.xlsx'
clinical_data_2015_2016_2017 = '/Users/jk1/temp/scope_clinical_data_extraction/190419_Données 2015-16-17.xlsx'

# Read without a header row so that rows and columns are addressed by integer label
# (later cells select rows 0, 1 and 2 with `.loc`).
variable_names = pd.read_excel(variable_names_path, header=None)

Extracting data for 2018

In [21]:
# Row 0 of the selection sheet (from column 1 onwards) holds the variable names
# used for the 2018 export; columns containing any missing entry are dropped first.
complete_selection = variable_names.dropna(axis=1)
variable_names_2018 = complete_selection.loc[0, 1:].values.tolist()

extract_2018(
    patient_ids_2018,
    clinical_data_2018,
    id_sheet='Sheet1',
    info_sheet='Export cases registered in.',
    anonymise=False,
    selected_outcomes=variable_names_2018,
)

Output may contain duplicates, please remove them manually as not all duplicate entries are the same.


Extracting data for 2015-2017

In [28]:
# Row 1 of the selection sheet (from column 1 onwards) holds the variable names
# used for the 2015-2017 export. Unlike the 2018 selection, columns with missing
# entries are not dropped here.
selection_row_2015_2017 = variable_names.loc[1, 1:]
variable_names_2015_2016_2017 = selection_row_2015_2017.values.tolist()

extract_2015(
    patient_ids_2015_2016_2017,
    clinical_data_2015_2016_2017,
    id_sheet='Sheet1',
    info_sheet='Sheet1 (2)',
    anonymise=False,
    selected_outcomes=variable_names_2015_2016_2017,
)

For 2015-2017 patients, additional data is added manually to mRS follow-up


Joining data

In [36]:
# Extracted (2018) and manually complemented (2015-2017) outcome files to be joined.
extracted_2018_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_dataset/clinical_data/2018/scope_extracted_variables/extracted_clinical_outcomes_SSR_cases_of_2018_(Adm,_Hosp_and_FU).xlsx'
complemented_extracted_2015_2016_2017_data = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_dataset/clinical_data/2015-2017/scope_extracted_variables_data/complemented_extracted_clinical_outcomes_190419_Données 2015-16-17.xlsx'

# Row 2 of the selection sheet (from column 1 onwards), restricted to columns
# without missing entries, gives the 2015-2017 variable names used for the join.
selection_without_gaps = variable_names.dropna(axis=1)
modified_variable_names_2015_2016_2017 = selection_without_gaps.loc[2, 1:].values.tolist()

join_multi_annual_outcome_df(
    extracted_2018_data_path,
    complemented_extracted_2015_2016_2017_data,
    'pid',
    'anonymised_id',
    variable_names_2018,
    modified_variable_names_2015_2016_2017,
)



#### Action required

Intermediary steps
- delete duplicates in 2018 data
- manual completion of missing outcomes in the database

## Data curation

In [149]:
# Joined multi-annual outcome table (after the manual completion steps listed above).
# Its directory is reused further down as the output location for the curated files.
joined_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_dataset/clinical_data/multi_annual_joined_data/scope_joined_variables/completed_joined_anon_outcome_df.xlsx'
joined_data = pd.read_excel(joined_data_path)

In [150]:
# Number of patients without a recorded 3-month mRS.
joined_data['3M mRS'].isna().sum()

36

Exclude patients with either a missing 3-month mRS outcome or a missing NIHSS on admission (n=1)

In [151]:
# Keep only patients that have both a 3-month mRS and an admission NIHSS.
has_outcome = joined_data['3M mRS'].notnull()
has_nihss = joined_data['NIH on admission'].notnull()

# Take an explicit copy: the curation cells below assign into this frame with
# `.loc`, which on a filtered selection triggers pandas' SettingWithCopyWarning.
joined_data = joined_data[has_outcome & has_nihss].copy()

In [152]:
# Distribution of the 3-month mRS among the remaining patients.
joined_data['3M mRS'].value_counts()

0.0    44
1.0    34
4.0    29
2.0    24
3.0    22
6.0     8
5.0     7
Name: 3M mRS, dtype: int64

In [153]:
# Remaining missing values, counted per column.
joined_data.isna().sum(axis=0)

Unnamed: 0                        0
pid                               0
Age (calc.)                       0
Sex                               0
Time of symptom onset known       0
Referral                          0
Prestroke disability (Rankin)     0
NIH on admission                  0
1st syst. bp                      0
1st diast. bp                     0
BMI                              13
1st glucose                       3
1st creatinine                    0
Antiplatelet drugs                0
Anticoagulants                    0
IVT with rtPA                     0
IAT                               1
MedHist Stroke                    0
MedHist TIA                       0
MedHist ICH                       0
MedHist Hypertension              0
MedHist Diabetes                  0
MedHist Hyperlipidemia            0
MedHist Smoking                   0
MedHist Atrial Fibr.              0
3M mRS                            0
Door to image (min.)              7
dtype: int64

Curate BMI variable

In [154]:
# '?' is used as a placeholder for an unknown BMI: blank those entries out,
# then cast the whole column to float.
bmi_raw = joined_data['BMI']
joined_data['BMI'] = bmi_raw.mask(bmi_raw == '?', np.nan).astype('float')

Curate Sex variable

In [155]:
# Expand the single-letter codes to the full labels.
sex_labels = {'F': 'Female', 'M': 'Male'}
joined_data['Sex'] = joined_data['Sex'].replace(sex_labels)
joined_data['Sex'].value_counts()

Male      92
Female    76
Name: Sex, dtype: int64

Curate Referral variable

In [156]:
# Map the spelling variants onto one label each, then lower-case everything so
# that labels differing only in case collapse into one category.
referral_labels = {
    'Emergency service (144)': 'Emergency service',
    'General practionner': 'general practitioner',
    'in hospital stroke': 'in-hospital event',
}
harmonised_referral = joined_data['Referral'].replace(referral_labels)
joined_data['Referral'] = harmonised_referral.str.lower()
joined_data['Referral'].value_counts()

emergency service       131
self referral            19
in-hospital event         9
other hospital            7
general practitioner      2
Name: Referral, dtype: int64

Curate IVT treatment variable

In [157]:
# Remove surrounding whitespace first, then translate the French answers to English.
ivt_labels = {'oui': 'yes', 'non': 'no'}
ivt_stripped = joined_data['IVT with rtPA'].str.strip()
joined_data['IVT with rtPA'] = ivt_stripped.replace(ivt_labels)
joined_data['IVT with rtPA'].value_counts()

yes    138
no      30
Name: IVT with rtPA, dtype: int64

Strip whitespace from all medical history columns

In [158]:
# All medical history columns share the 'MedHist' prefix.
filter_col = [name for name in joined_data.columns if name.startswith('MedHist')]

# Strip leading/trailing whitespace column by column.
for medhist_col in filter_col:
    joined_data[medhist_col] = joined_data[medhist_col].str.strip()

joined_data['MedHist Hyperlipidemia'].value_counts()

no     106
yes     62
Name: MedHist Hyperlipidemia, dtype: int64

Convert categorical variables to integers

*Note: missing values are encoded as -1 by `pd.factorize`; they are then set back to NaN.*

In [159]:
# Columns with object dtype are treated as the categorical (string) variables.
column_dtypes = joined_data.dtypes
char_cols = column_dtypes[column_dtypes == 'object'].index
# Ignore pid column
char_cols = char_cols.drop('pid')

# Per column: the category labels, positioned by the integer code assigned to them.
label_mapping = {}

for c in char_cols:
    # pd.factorize numbers categories in order of first appearance and encodes
    # missing values as -1.
    codes, label_mapping[c] = pd.factorize(joined_data[c])
    if (codes < 0).any():
        # Restore missing values as NaN before assigning the column. Writing NaN
        # into an already-assigned int64 column through `.loc` relies on silent
        # upcasting, which pandas has deprecated.
        codes = np.where(codes < 0, np.nan, codes)
    joined_data[c] = codes

#### Action required
Check the label mapping for duplicates (these should be removed before converting categories to binaries)

In [160]:
# Show the category labels per column; the position of a label in each Index is
# the integer code it was given by pd.factorize.
label_mapping


{'Sex': Index(['Female', 'Male'], dtype='object'),
 'Time of symptom onset known': Index(['yes', 'wake up', 'no'], dtype='object'),
 'Referral': Index(['emergency service', 'in-hospital event', 'self referral',
        'other hospital', 'general practitioner'],
       dtype='object'),
 'Antiplatelet drugs': Index(['no', 'yes'], dtype='object'),
 'Anticoagulants': Index(['no', 'yes'], dtype='object'),
 'IVT with rtPA': Index(['yes', 'no'], dtype='object'),
 'IAT': Index(['no', 'yes'], dtype='object'),
 'MedHist Stroke': Index(['no', 'yes'], dtype='object'),
 'MedHist TIA': Index(['no', 'yes'], dtype='object'),
 'MedHist ICH': Index(['no', 'yes'], dtype='object'),
 'MedHist Hypertension': Index(['no', 'yes'], dtype='object'),
 'MedHist Diabetes': Index(['no', 'yes'], dtype='object'),
 'MedHist Hyperlipidemia': Index(['yes', 'no'], dtype='object'),
 'MedHist Smoking': Index(['no', 'yes'], dtype='object'),
 'MedHist Atrial Fibr.': Index(['no', 'yes'], dtype='object')}

In [161]:
# save curated data
curated_data_path = os.path.join(os.path.dirname(joined_data_path), 'curated_completed_joined_anon_outcome_df.xlsx')
joined_data.to_excel(curated_data_path)

## Restrict to patients with imaging data available

In [162]:
imaging_dataset_path = '/Users/jk1/stroke_datasets/dataset_files/perfusion_data_sets/noGT_datasets/noGT_pmaps_15-19_dataset.npz'

# Read the subject ids from the imaging archive. The context manager closes the
# underlying .npz file handle once the array has been read.
# NOTE(review): allow_pickle=True unpickles object arrays, which can execute
# arbitrary code - only load archives from a trusted source.
with np.load(imaging_dataset_path, allow_pickle=True) as imaging_dataset:
    ids = imaging_dataset['ids']

In [163]:
# Keep only the patients whose pid appears among the imaging dataset ids,
# then save that subset next to the joined input file.
has_imaging = joined_data['pid'].isin(ids)
data_with_imaging = joined_data[has_imaging]

with_imaging_path = os.path.join(
    os.path.dirname(joined_data_path),
    'with_imaging_curated_completed_joined_anon_outcome_df.xlsx',
)
data_with_imaging.to_excel(with_imaging_path)

## Filter imaging dataset for subjects having all clinical variables

In [103]:
# Filter the imaging dataset against the curated clinical table, using the
# '3M mRS' column as the clinical parameter and 'pid' as the id column.
filter_for_clinical_param(
    imaging_dataset_path,
    curated_data_path,
    '3M mRS',
    'pid',
)


Loading a total of 201 subjects.
Sequences used: {'ct_sequences': ['wcoreg_Tmax', 'wcoreg_CBF', 'wcoreg_MTT', 'wcoreg_CBV'], 'mri_sequences': ['masked_wcoreg_VOI']}
0 subjects had been excluded.
subj-799c5528 not found in clinical database. Will be removed.
subj-5ca2adc0 not found in clinical database. Will be removed.
subj-96beef92 not found in clinical database. Will be removed.
subj-b34c2105 not found in clinical database. Will be removed.
subj-61f13baa not found in clinical database. Will be removed.
subj-60c7d944 not found in clinical database. Will be removed.
subj-dcdfb2af not found in clinical database. Will be removed.
subj-b1a35c04 not found in clinical database. Will be removed.
subj-9b85c60e not found in clinical database. Will be removed.
subj-56a1e19d not found in clinical database. Will be removed.
subj-e54598a6 not found in clinical database. Will be removed.
subj-f082265f not found in clinical database. Will be removed.
subj-538a58f1 not found in clinical database. Wil

In [104]:
# IAT distribution among patients with imaging.
# NOTE(review): the stored output shows string labels although 'IAT' is converted
# to integer codes earlier in the notebook; the output presumably comes from an
# earlier, out-of-order run - re-run top to bottom to confirm.
data_with_imaging['IAT'].value_counts()

no     90
yes    71
Name: IAT, dtype: int64

In [105]:
# IVT distribution among patients with imaging.
# NOTE(review): the stored output still contains 'oui'/'non', which the IVT
# curation cell replaces with 'yes'/'no'; the output presumably predates that
# curation - re-run top to bottom to confirm.
data_with_imaging['IVT with rtPA'].value_counts()

yes     97
oui     38
no      23
non      4
Name: IVT with rtPA, dtype: int64