In [2]:
import pandas as pd
import numpy as np
import os

In [5]:
data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_dataset/volumetric_perfusion_data/original/Total2016_2019IVTEVT_RAPID_IMAGE.xlsx'
selected_variables_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/scope/variables/onset/volumetric_onset_scope_variable_selection.xlsx'

In [6]:
data_df = pd.read_excel(data_path)
selected_variables_df = pd.read_excel(selected_variables_path, header=None)
selected_variables = selected_variables_df.values.tolist()[0]


Cleaning Timings:
- Negative timings are due to missing or erroneous CT time / onset time

In [7]:
print(data_df['TimeOnsetCT'].isnull().sum())
(data_df['TimeOnsetCT'] < 0).sum()

161


7

In [8]:
data_df = data_df[(data_df['TimeOnsetCT'].isnull() == False) & (data_df['TimeOnsetCT'] > 0)]

data_df['TimeOnsetCT'].describe()

count     724.000000
mean      268.675414
std       337.738977
min        19.000000
25%        85.750000
50%       132.000000
75%       260.500000
max      2020.000000
Name: TimeOnsetCT, dtype: float64

Cleaning Volumetric Perfusion Parameters

In [9]:
print(data_df['CBF'].isnull().sum())
print(data_df['T4'].isnull().sum())
print(data_df['T6'].isnull().sum())
print(data_df['T8'].isnull().sum())
print(data_df['T10'].isnull().sum())
data_df = data_df[data_df['CBF'].isnull() == False]


222
222
222
221
222


In [10]:
print(data_df['NIH on admission'].isnull().sum())
data_df = data_df[data_df['NIH on admission'].isnull() == False]
data_df['NIH on admission'].describe()

3


count    499.000000
mean      11.026052
std        7.733599
min        0.000000
25%        4.000000
50%        9.000000
75%       17.000000
max       37.000000
Name: NIH on admission, dtype: float64

In [11]:
data_df[selected_variables].isnull().sum(axis = 0)

Age (calc.)                        0
Sex                                0
Time of symptom onset known        0
Referral                           0
Prestroke disability (Rankin)      6
NIH on admission                   0
BMI                              124
Antiplatelet drugs                 0
Anticoagulants                     1
MedHist Stroke                     2
MedHist TIA                        2
MedHist ICH                        2
MedHist Hypertension               0
MedHist Diabetes                   0
MedHist Hyperlipidemia             0
MedHist Smoking                    0
MedHist Atrial Fibr.               0
TimeOnsetCT                        0
T10                                0
T8                                 0
T6                                 0
T4                                 0
CBF                                0
dtype: int64

Because of too many missing values for BMI, this variable will be dropped

For missing variables of Patient history, absence is considered default

In [12]:
selected_variables.remove('BMI')

In [13]:
data_df.loc[data_df['Anticoagulants'].isnull(), 'Anticoagulants'] = 'no'
data_df.loc[data_df['MedHist Stroke'].isnull(), 'MedHist Stroke'] = 'no'
data_df.loc[data_df['MedHist TIA'].isnull(), 'MedHist TIA'] = 'no'
data_df.loc[data_df['MedHist ICH'].isnull(), 'MedHist ICH'] = 'no'
data_df.loc[data_df['Prestroke disability (Rankin)'].isnull(), 'Prestroke disability (Rankin)'] = 0


## Curate clinical variables

Curate Referral variable

In [14]:
data_df.loc[data_df['Referral'] == 'Emergency service (144)', 'Referral'] = 'Emergency service'
data_df.loc[data_df['Referral'] == 'SAMU', 'Referral'] = 'Emergency service'
data_df.loc[data_df['Referral'] == 'General practionner', 'Referral'] = 'general practitioner'
data_df.loc[data_df['Referral'] == 'in hospital stroke', 'Referral'] = 'in-hospital event'
data_df['Referral'] = data_df['Referral'].str.lower()
data_df['Referral'].value_counts()

emergency service       368
self referral            52
other hospital           50
in-hospital event        23
general practitioner      6
Name: Referral, dtype: int64

In [15]:
selected_data_df = data_df[selected_variables]

Strip whitespaces in all medical history columns

In [16]:
filter_col = [col for col in selected_data_df if col.startswith('MedHist')]
selected_data_df[filter_col] = selected_data_df[filter_col].apply(lambda column: column.str.strip())
selected_data_df['MedHist Hyperlipidemia'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


no     340
yes    159
Name: MedHist Hyperlipidemia, dtype: int64

Convert categorical variables to integers

*Note: missing variables are encoded as -1 -> there are then removed again*

In [17]:
char_cols = selected_data_df.dtypes.pipe(lambda x: x[x == 'object']).index
# Ignore onset known column
char_cols = char_cols.drop('Time of symptom onset known')
label_mapping = {}

for c in char_cols:
    selected_data_df[c], label_mapping[c] = pd.factorize(selected_data_df[c])
    selected_data_df.loc[selected_data_df[c] < 0, c] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data_df[c], label_mapping[c] = pd.factorize(selected_data_df[c])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data_df[c], label_mapping[c] = pd.factorize(selected_data_df[c])
A value is tr

In [18]:
selected_data_df['Time of symptom onset known'].value_counts()

yes        390
wake up     60
no          49
Name: Time of symptom onset known, dtype: int64

In [19]:
onset_known_df = selected_data_df[selected_data_df['Time of symptom onset known'] == 'yes']

# The following subpopulation is probably suboptimally selected as only patients with an estimated onset where selected above
onset_unknown_df = selected_data_df[selected_data_df['Time of symptom onset known'] == 'no']
wake_up_df = selected_data_df[selected_data_df['Time of symptom onset known'] == 'wake up']


In [20]:
curated_data_path = os.path.join(os.path.dirname(data_path), 'curated_onset_known_volumetric_data.xlsx')
onset_known_df.to_excel(curated_data_path)