In [52]:

"""Predicting which patients are at a higher risk of complications"""
import warnings
import os
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer, MyKerasClassifier, \
    create_model
from mlinspect.utils import get_project_root




# Preprocessing



## adding labels


In [24]:

# FutureWarning: Sklearn 0.24 made a change that breaks remainder='drop', that change will be fixed
#  in an upcoming version: https://github.com/scikit-learn/scikit-learn/pull/19263
warnings.filterwarnings('ignore')

COUNTIES_OF_INTEREST = ['county2', 'county3'] # domain: 1, 2, 3, 4

patients = pd.read_csv(os.path.join(str(get_project_root()), "example_pipelines", "healthcare",
                                    "patients.csv"), na_values='?')
histories = pd.read_csv(os.path.join(str(get_project_root()), "example_pipelines", "healthcare",
                                     "histories.csv"), na_values='?')
print("len(patients) = {}, len(histories) = {}".format(len(patients), len(histories)))

data = patients.merge(histories, on=['ssn'])
print("length after merging = {}".format(len(data)))
print(data[:5])
data.to_csv(r"patients_merged_w_histories.csv", index=False)


len(patients) = 887, len(histories) = 887
length after merging = 887
   id  first_name last_name   race   county  num_children     income  \
0   1       Tabby      Ward  race3  county3             1  159027.00   
1   4        Romy   Hawkins  race3  county1             0    8868.00   
2   6       Cozmo     Frank  race1  county3             3  120171.75   
3   8  Georgeanna     Frank  race1      NaN             2  220113.00   
4  13        Pepe        Le  race1      NaN             4   87996.00   

  age_group          ssn smoker  complications  
0    group3  670-18-6875  False              2  
1    group3  117-52-3454  False              1  
2    group3  751-43-6257  False              2  
3    group3  844-92-1943   True             10  
4    group3  421-81-5857  False              1  


In [53]:

data = pd.read_csv("patients_merged_w_histories.csv")

complications = data.groupby('age_group') \
    .agg(mean_complications=('complications', 'mean'))

complications

Unnamed: 0_level_0,mean_complications
age_group,Unnamed: 1_level_1
group1,5.206406
group2,4.959732
group3,5.269481


In [54]:

data = data.merge(complications, on=['age_group'])
data[:4]



Unnamed: 0,id,first_name,last_name,race,county,num_children,income,age_group,ssn,smoker,complications,mean_complications
0,1,Tabby,Ward,race3,county3,1,159027.0,group3,670-18-6875,False,2,5.269481
1,4,Romy,Hawkins,race3,county1,0,8868.0,group3,117-52-3454,False,1,5.269481
2,6,Cozmo,Frank,race1,county3,3,120171.75,group3,751-43-6257,False,2,5.269481
3,8,Georgeanna,Frank,race1,,2,220113.0,group3,844-92-1943,True,10,5.269481


In [55]:

# groupby and size
for att in ['smoker', 'county', 'num_children', 'race', 'age_group', 'complications', 'label']:
    print(data.groupby(att).size())



smoker
False    464
True     353
dtype: int64
county
county1    259
county2    154
county3    136
county4    155
dtype: int64
num_children
0    155
1    157
2    156
3    149
4    146
5    124
dtype: int64
race
race1    284
race2    232
race3    276
dtype: int64
age_group
group1    281
group2    298
group3    308
dtype: int64
complications
0     76
1     68
2     87
3     80
4     57
5     93
6     90
7     81
8     86
9     96
10    73
dtype: int64


KeyError: 'label'

In [56]:

data['label'] = data['complications'] > 1.2 * data['mean_complications']
data[:9]



Unnamed: 0,id,first_name,last_name,race,county,num_children,income,age_group,ssn,smoker,complications,mean_complications,label
0,1,Tabby,Ward,race3,county3,1,159027.0,group3,670-18-6875,False,2,5.269481,False
1,4,Romy,Hawkins,race3,county1,0,8868.0,group3,117-52-3454,False,1,5.269481,False
2,6,Cozmo,Frank,race1,county3,3,120171.75,group3,751-43-6257,False,2,5.269481,False
3,8,Georgeanna,Frank,race1,,2,220113.0,group3,844-92-1943,True,10,5.269481,True
4,13,Pepe,Le,race1,,4,87996.0,group3,421-81-5857,False,1,5.269481,False
5,17,Devonna,Elliott,,county4,3,121710.0,group3,527-20-0150,False,5,5.269481,False
6,18,Kelcie,Hawkins,,county1,2,352186.5,group3,709-91-9697,True,8,5.269481,True
7,21,Brady,Watson,race2,,1,12029.25,group3,582-90-4020,False,3,5.269481,False
8,25,Munroe,Cervantes,race1,county4,4,333417.0,group3,551-10-6630,,9,5.269481,True


## selection based on county


In [21]:

# attributes used in training
# not used in my pipeline analysis
data = data[['smoker', 'last_name', 'county', 'num_children', 'race', 'income', 'label']]
data = data[data['county'].isin(COUNTIES_OF_INTEREST)]
print(len(data))
print(data[:5])
data.to_csv(r"data_after_pipeline_for_training.csv", index=False)


290
   smoker  last_name   county  num_children   race     income  label
0   False       Ward  county3             1  race3  159027.00  False
2   False      Frank  county3             3  race1  120171.75  False
9    True  Wilkerson  county2             1  race1  241864.50   True
14   True    Hawkins  county3             4  race1  207534.00   True
15  False  Wilkerson  county2             1  race1  153892.50  False


In [57]:

data = data[['smoker', 'county', 'num_children', 'race', 'income', 'age_group', 'complications', 'label']]
data.to_csv(r"before_selection.csv", index=False)



In [None]:

# full attributes used in training
# this will be used in my analysis
data = data[['smoker', 'last_name', 'county', 'num_children', 'race', 'income', 'age_group', 'complications', 'label']]
data = data[data['county'].isin(COUNTIES_OF_INTEREST)]
print(len(data))
print(data[:5])
data.to_csv(r"data_after_pipeline_full_attributes.csv", index=False)


## analyze attributes


In [61]:
data=pd.read_csv(r"before_selection.csv")

COUNTIES_OF_INTEREST = ['county1', 'county2', 'county3'] # domain: 1, 2, 3, 4

data = data[data['county'].isin(COUNTIES_OF_INTEREST)]
data.groupby('race').size()

# full_attributes = ['smoker', 'county', 'num_children', 'race', 'age_group', 'complications', 'label'] # without names, income
# for att in full_attributes:
#     print("group by {}".format(att))
#     print(data.groupby(att).size())


race
race1    171
race2    138
race3    186
dtype: int64

In [62]:
data=pd.read_csv(r"before_selection.csv")
COUNTIES_OF_INTEREST = ['county2', 'county3', 'county4'] # domain: 1, 2, 3, 4

data = data[data['county'].isin(COUNTIES_OF_INTEREST)]
data.groupby('race').size()



race
race1    170
race2    151
race3     66
dtype: int64



## imputation and training


In [None]:

impute_and_one_hot_encode = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])
featurisation = ColumnTransformer(transformers=[
    ("impute_and_one_hot_encode", impute_and_one_hot_encode, ['smoker', 'county', 'race']),
    ('word2vec', MyW2VTransformer(min_count=2), ['last_name']),
    ('numeric', StandardScaler(), ['num_children', 'income']),
], remainder='drop')
neural_net = MyKerasClassifier(build_fn=create_model, epochs=10, batch_size=1, verbose=0)
pipeline = Pipeline([
    ('features', featurisation),
    ('learner', neural_net)])

train_data, test_data = train_test_split(data)
model = pipeline.fit(train_data, train_data['label'])
print("Mean accuracy: {}".format(model.score(test_data, test_data['label'])))