In [None]:
# import and load the required libraries and modules 
import random
random.seed(732)
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
# Display preferences for the Jupyter notebook.
# NOTE(review): seaborn's set() may reset matplotlib rc params (including the
# font size set on the next line) — confirm the 14pt font actually takes effect.
plt.rc("font", size=14)
# One seaborn call only: the former sns.set(style="white") was immediately
# overridden by this one, so it had no effect and has been dropped.
sns.set(style="whitegrid", color_codes=True)
# Allow long and wide frames to be displayed without truncation.
pd.options.display.max_rows = 2000
pd.options.display.max_columns = 1000

In [None]:
# Load the raw dataset from the project-relative CSV file and preview the first rows.
# NOTE(review): recent pandas versions appear to treat the literal string 'None'
# as a missing value by default when parsing CSV — confirm that the 'None'
# categories recoded later (max_glu_serum, A1Cresult) survive this read.
df = pd.read_csv("dataset_diabetes/diabetic_data.csv")
df.head(5)

In [None]:
# Declare missing values: every '?' entry is turned into NaN.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
df = df.replace('?', np.nan)

In [None]:
# Define a helper that reports the percentage of missing values per column.
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, with the
        columns 'Missing Values' (count) and '% of Total Values' (rounded to
        one decimal), sorted by percentage in descending order.

    Side effects
    ------------
    Prints the total number of columns and how many of them have missing values.
    """
    # Count the missing entries once and reuse the result for the percentages.
    mis_val = df.isnull().sum()
    n_rows = len(df)
    # Guard the empty frame: 0/0 would produce NaN percentages, which used to
    # pass the "!= 0" filter and report every column as having missing values.
    mis_val_percent = 100 * mis_val / n_rows if n_rows else mis_val.astype(float)

    table = pd.concat([mis_val, mis_val_percent], axis=1)
    table.columns = ['Missing Values', '% of Total Values']
    # Filter on the raw count so the decision never depends on float arithmetic.
    table = (
        table[table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print(f"Your selected dataframe has {df.shape[1]} columns.\n"
          f"There are {table.shape[0]} columns that have missing values.")
    return table
# Report which columns of df contain missing values; the returned table is the cell's output.
missing_values_table(df)

In [None]:
# These three features all have a lot of missing values, so they are excluded.
df = df.drop(columns=['weight', 'payer_code', 'medical_specialty'])

In [None]:
# Exclude rows with missing or erroneous values:
#   - all three diagnosis codes (diag_1, diag_2, diag_3) are missing,
#   - gender is recorded as 'Unknown/Invalid',
#   - discharge_disposition_id equals 11 (presumably "expired" — confirm against the id mapping).
# A boolean mask replaces the former set-of-index-labels + iloc approach: iloc
# is positional, so feeding it labels only worked while the index was the
# default 0..n-1 range, and the row order depended on set iteration order.
no_diagnosis = df[['diag_1', 'diag_2', 'diag_3']].isna().all(axis=1)
invalid_gender = df['gender'] == 'Unknown/Invalid'
excluded_discharge = df['discharge_disposition_id'] == 11
# .copy() so that later column assignments do not write into a view of the original frame.
df = df.loc[~(no_diagnosis | invalid_gender | excluded_discharge)].copy()

In [None]:
# Drop the columns that hold a single constant value.
df = df.drop(columns=['citoglipton', 'examide'])

In [None]:
# Feature engineering: create {service_utilization, n_medication_changes, n_medications}.

# 1- service_utilization
# Summarise outpatient, emergency and inpatient visit counts into one column.
df['service_utilization'] = df['number_outpatient'] + df['number_emergency'] + df['number_inpatient']

# 2- n_medication_changes
# Count, per row, the drugs whose value is anything other than 'No' or 'Steady'.
meds = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide', 'metformin-pioglitazone','metformin-rosiglitazone', 'glimepiride-pioglitazone', 'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide']
# Vectorised: no temporary "<drug>_" columns are created and deleted any more.
df['n_medication_changes'] = (~df[meds].isin(['No', 'Steady'])).sum(axis=1)

# 3- n_medications
# Recode every drug column to 0 (not given) / 1 (given), then count per row.
# An explicit map + astype guarantees integer columns instead of relying on
# replace() silently downcasting object columns (deprecated in recent pandas).
# Any value outside the four known levels becomes NaN and makes astype raise,
# rather than breaking the sum further down.
med_taken = {'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1}
df[meds] = df[meds].apply(lambda col: col.map(med_taken)).astype('int64')
df['n_medications'] = df[meds].sum(axis=1)

In [None]:
# Mapping of the numeric id columns to readable categories.

# admission_type: 4 categories (Emergency, Elective, Newborn, N/A).
# Ids that are not listed keep their numeric value.
df['admission_type'] = df['admission_type_id'].replace({
    1: 'Emergency',
    2: 'Emergency',
    3: 'Elective',
    4: 'Newborn',
    5: 'N/A',
    6: 'N/A',
    7: 'Emergency',
    8: 'N/A',
})

# admission_source: 6 categories (Referral, Transfer, ER, Law_Enforcement, Birth, N/A).
# Ids that are not listed keep their numeric value.
df['admission_source'] = df['admission_source_id'].replace({
    1: 'Referral',
    2: 'Referral',
    3: 'Referral',
    4: 'Transfer',
    5: 'Transfer',
    6: 'Transfer',
    7: 'ER',
    8: 'Law_Enforcement',
    9: 'N/A',
    10: 'Transfer',
    11: 'Birth',
    13: 'Birth',
    14: 'Birth',
    15: 'N/A',
    17: 'N/A',
    20: 'N/A',
    21: 'N/A',
    22: 'Transfer',
    25: 'Transfer',
})

# discharge_disposition: 7 categories
# (Home, Transferred, AMA, Outpatient, Expired, To_Psychiatric, N/A).
# Ids that are not listed keep their numeric value.
df['discharge_disposition'] = df['discharge_disposition_id'].replace({
    1: "Home",
    2: "Transferred",
    3: "Transferred",
    4: "Transferred",
    5: "Transferred",
    6: "Home",
    7: "AMA",
    8: "Home",
    9: "Home",
    10: "Outpatient",
    12: "Outpatient",
    13: "Home",
    14: "Transferred",
    15: "Outpatient",
    16: "Outpatient",
    17: "Outpatient",
    18: "N/A",
    19: "Expired",
    20: "Expired",
    22: "Transferred",
    23: "Transferred",
    24: "Transferred",
    25: "N/A",
    26: "N/A",
    27: "Transferred",
    28: "To_Psychiatric",
})

In [None]:
# Group the primary diagnosis code (diag_1) into disease categories; the
# category definitions for the codes are in the paper.
# pd.to_numeric replaces Series.convert_objects, which no longer exists in
# pandas; codes that cannot be parsed as numbers become NaN.
diag_code = pd.to_numeric(df['diag_1'], errors='coerce')
df['diag_1'] = diag_code

# The ranges below are mutually exclusive, so the order of the conditions does not matter.
diag_conditions = [
    diag_code.between(390, 459) | (diag_code == 785),
    diag_code.between(460, 519) | (diag_code == 786),
    diag_code.between(520, 579) | (diag_code == 787),
    diag_code.between(250.00, 250.99),
    diag_code.between(800, 999),
    diag_code.between(710, 739),
    diag_code.between(580, 629) | (diag_code == 788),
    diag_code.between(140, 239),
]
diag_labels = [
    'Circulatory',
    'Respiratory',
    'Digestive',
    'Diabetes',
    'Injury',
    'Musculoskeletal',
    'Genitourinary',
    'Neoplasms',
]
# Everything else is 'other': the remaining code ranges, missing/unparseable
# codes, and any numeric code that falls between the listed ranges (these used
# to leak through as stringified numbers and would become their own category).
df['diag_1_label'] = np.select(diag_conditions, diag_labels, default='other')

In [None]:
# Encode the categorical columns as numeric codes, one mapping per column.

# Target: 1 only for '<30'; both '>30' and 'NO' are coded 0.
df['readmitted'] = df['readmitted'].replace({'>30': 0, '<30': 1, 'NO': 0})

df['change'] = df['change'].replace({'Ch': 1, 'No': 0})

df['gender'] = df['gender'].replace({'Male': 1, 'Female': 0})

df['diabetesMed'] = df['diabetesMed'].replace({'Yes': 1, 'No': 0})

# Test results: 1 for either elevated level, 0 for 'Norm', -99 for 'None'.
df['max_glu_serum'] = df['max_glu_serum'].replace({'>200': 1, '>300': 1, 'Norm': 0, 'None': -99})

df['A1Cresult'] = df['A1Cresult'].replace({'>7': 1, '>8': 1, 'Norm': 0, 'None': -99})

In [None]:
# Transform each 10-year age bracket ("[0-10)" ... "[90-100)") into its midpoint (5 ... 95).
df['age'] = (
    df['age']
    .map({f"[{low}-{low + 10})": low + 5 for low in range(0, 100, 10)})
    .astype('int64')
)

In [None]:
# Keep one encounter per patient to avoid bias: after sorting by encounter_id,
# only the first row of each patient_nbr is retained.
df = df.sort_values(by='encounter_id').drop_duplicates(subset='patient_nbr', keep='first')

In [None]:
# Log transformation of the heavily skewed numeric columns.
num_col = ['time_in_hospital',
 'number_inpatient',
 'number_diagnoses',
 'num_medications',
 'number_outpatient',
 'service_utilization',
 'n_medication_changes',
 'number_emergency',
 'num_lab_procedures',
 'num_procedures',
 'age',
 'n_medications']

# One record per transformed column. The column name is now recorded too; the
# name list was never filled before, so the summary table had no names.
skew_records = []
for col in num_col:
    skew_before = df[col].skew()
    # Only columns with |skewness| > 2 get a log1p companion column.
    if abs(skew_before) > 2:
        # Keep rows where the column is non-negative (this also drops NaN rows);
        # .copy() avoids assigning into a slice of the previous frame.
        df = df[df[col] >= 0].copy()
        log_col = col + "_log1p"
        df[log_col] = np.log1p(df[col])
        skew_records.append({
            'column': col,
            'skew': skew_before,
            'skew after logp1': df[log_col].skew(),
        })

skewed_col = pd.DataFrame(skew_records, columns=['column', 'skew', 'skew after logp1'])
# print skewness table
skewed_col

In [None]:
# One-hot encode the categorical features; drop_first removes the first level of each.
df_pd = pd.get_dummies(
    df,
    columns=[
        'race',
        'gender',
        'admission_type',
        'discharge_disposition',
        'admission_source',
        'max_glu_serum',
        'A1Cresult',
        'diag_1_label',
    ],
    drop_first=True,
)

In [None]:
# Define a function for standardisation using the z-score.
def z_score(data):
    """Standardise ``data`` column-wise: (value - mean) / standard deviation.

    Parameters
    ----------
    data : array-like (NumPy array or pandas object)
        Values to standardise; statistics are taken along axis 0.

    Returns
    -------
    Same kind of object as ``data - mean`` with each column centred and scaled.
    A constant column (zero standard deviation) is returned as all zeros
    instead of NaN/inf.

    NOTE(review): for pandas inputs np.std may delegate to the object's own
    ``std`` (sample standard deviation) — confirm which ddof is intended.
    """
    centered = data - np.mean(data, axis=0)
    spread = np.std(data, axis=0)
    # Replace a zero spread by 1 so constant columns map to 0 rather than 0/0.
    spread = np.where(spread == 0, 1.0, spread)
    return centered / spread
# Standardise the numeric features.
# `col_numeric` was never defined in this notebook (NameError on a fresh kernel);
# it is rebuilt here from num_col plus the *_log1p columns created by the log
# transformation — assumed intent, confirm the column selection.
col_numeric = [c for c in df_pd.columns if c in num_col or c.endswith('_log1p')]
df_pd[col_numeric] = z_score(df_pd[col_numeric])

In [None]:
# Oversampling of the minority class using SMOTE.
# NOTE(review): X keeps every remaining column of df_pd except the target
# (identifiers, raw id/diagnosis columns, ...) — confirm they are all numeric
# and NaN-free before resampling.
# NOTE(review): resampling happens before the train/test split further down, so
# synthetic samples derived from future test rows can leak into training —
# consider splitting first and resampling the training portion only.
X = df_pd.drop(columns=['readmitted'])
y = df_pd['readmitted']
from imblearn.over_sampling import SMOTE

from collections import Counter
print('Before SMOTE {}'.format(Counter(y)))
sm = SMOTE(random_state=0)
# fit_resample is the current API; fit_sample was removed from imbalanced-learn.
os_data_X, os_data_y = sm.fit_resample(X, y)
print('After SMOTE {}'.format(Counter(os_data_y)))

In [None]:
# Feature selection using recursive feature elimination (RFE) driven by a
# random forest classifier.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
# random_state pins the forest so the selection is reproducible; random.seed()
# in the first cell only seeds Python's own generator, not scikit-learn.
rf = RandomForestClassifier(n_estimators=10, max_depth=25, random_state=0)
# Create the RFE model around the random forest and keep 15 attributes.
# n_features_to_select is passed by keyword, as current scikit-learn requires.
rfe = RFE(rf, n_features_to_select=15)
X_new = rfe.fit_transform(os_data_X, os_data_y)
# Print summaries for the selection of attributes.
print(rfe.support_)
print(rfe.ranking_)

In [None]:
# Build the list of selected features with their importances.
# rfe.estimator_ is fitted on the selected features only, so its importances
# must be paired with the selected column names (rfe.support_ mask). Zipping
# them with all column names, as before, labelled the bars with the first
# columns of X instead of the selected ones.
feature_names = X.columns[rfe.support_]
feature_imports = rfe.estimator_.feature_importances_
most_imp_features = pd.DataFrame(
    {"Feature": feature_names, "Importance": feature_imports}
).sort_values("Importance")  # ascending, so the most important bar is drawn at the top
# Plot the feature importances.
plt.figure(figsize=(8,8))
plt.barh(range(len(most_imp_features)), most_imp_features.Importance, align='center', alpha=0.8)
plt.yticks(range(len(most_imp_features)), most_imp_features.Feature, fontsize=14)
plt.xlabel('Importance')
plt.title('Features importance - Random Forest')
plt.show()

In [None]:
# Split the selected features and the target into train and test sets
# (20% held out for testing, fixed random_state for a repeatable split).
# NOTE(review): the data being split has already been oversampled by SMOTE, so
# synthetic points derived from test rows can end up in the training set —
# consider splitting first and resampling only the training portion.
X_train, X_test, y_train, y_test = train_test_split(X_new, os_data_y, test_size=0.2, random_state=9)

In [None]:
# Logistic regression classifier tuned with GridSearchCV.
from sklearn.linear_model import LogisticRegression
c_values = [0.01, 0.1, 1, 10, 100]
# The grid is split per penalty: 'lbfgs' and 'newton-cg' only support the l2
# penalty, so pairing them with 'l1' (as the single grid did) makes those fits
# fail. The l1 penalty is searched with 'liblinear', which supports it.
grid_param = [
    {'C': c_values, 'penalty': ['l2'], 'solver': ['lbfgs', 'newton-cg']},
    {'C': c_values, 'penalty': ['l1'], 'solver': ['liblinear']},
]
logreg=LogisticRegression()
gd_sr = GridSearchCV(estimator=logreg,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)
gd_sr.fit(X_train, y_train)
# Best hyper-parameters and their mean cross-validated accuracy.
best_parameters = gd_sr.best_params_
print(best_parameters)
best_result = gd_sr.best_score_
print(best_result)
print(gd_sr.cv_results_)

In [None]:
# K-nearest-neighbours classifier tuned with GridSearchCV.
from sklearn.neighbors import KNeighborsClassifier
# Candidate neighbourhood sizes: 1 through 15.
grid_param = {'n_neighbors': list(range(1, 16))}
knn = KNeighborsClassifier()
grid = GridSearchCV(estimator=knn,
                    param_grid=grid_param,
                    scoring='accuracy',
                    cv=5,
                    n_jobs=-1)
grid.fit(X_train, y_train)
# Show the cross-validation results as the cell output.
grid.cv_results_

In [1]:
# Random forest classifier tuned with grid search.
# NOTE: this cell needs X_train / y_train from the train/test split cell, so
# the notebook has to be run top to bottom.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# random_state makes the search reproducible; random.seed() in the first cell
# only seeds Python's own generator, not scikit-learn.
clf=RandomForestClassifier(random_state=0)
grid_param = {
    'bootstrap': [True,False],
    'max_depth': [15,25, 50],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [10, 100, 200, 1000]
}
grid = GridSearchCV(clf, grid_param, cv=5, scoring='accuracy', n_jobs= -1)
grid.fit(X_train, y_train)
# Show the cross-validation results as the cell output.
grid.cv_results_

NameError: name 'X_train' is not defined

In [None]:
# XGBoost classifier tuned with grid search.
from xgboost import XGBClassifier
model = XGBClassifier()
# Dict entries use ':' — the former 'learning_rate'=[...] was a SyntaxError
# that prevented the whole cell from running.
grid_param = {
    'learning_rate': [0.01,0.1],
    'max_depth': [15,25, 50],
    'n_estimators': [10, 100, 200, 1000]
}
grid = GridSearchCV(model, grid_param, cv=5, scoring='accuracy', n_jobs= -1)
grid.fit(X_train, y_train)
# Show the cross-validation results as the cell output.
grid.cv_results_

In [None]:
# SVM classifier tuned with grid search.
from sklearn.svm import SVC
# The initial gamma/C values are placeholders; the grid below overrides them.
svc = SVC(gamma=1,C=1)
# Dict entries use ':' — the former 'key'=[...] form was a SyntaxError.
grid_param = {
    'gamma': [0.001,0.01,1],
    'C': [0.001,0.01,1],
    'kernel': ['linear','poly', 'rbf']
}
# Search over the SVC defined above; the cell used to pass `model`, i.e. the
# XGBoost estimator left over from the previous cell.
grid = GridSearchCV(svc, grid_param, cv=5, scoring='accuracy', n_jobs= -1)
grid.fit(X_train, y_train)
# Show the cross-validation results as the cell output.
grid.cv_results_

In [None]:
# Tree visualisation for the readmission model.
# export_graphviz needs a single fitted decision tree, so the first tree of the
# random forest fitted by RFE on the selected features is exported. The cell
# used to pass `model` (the XGBoost classifier), which is not a decision tree.
# NOTE(review): confirm this is the tree that should be shown.
from sklearn.tree import export_graphviz
export_graphviz(rfe.estimator_.estimators_[0], out_file='tree_limited.dot',
                # Names of the selected features: the RFE support mask applied to X's
                # columns (indexing with 0..14 picked the first 15 columns instead).
                feature_names=list(X.columns[rfe.support_]), max_depth=2,
                # Binary target as coded earlier: 0 = '>30' or 'NO', 1 = '<30'
                # (the former three length-of-stay labels belonged to another task).
                class_names=[">30 or NO", "<30"],
                rounded = True, proportion = False, precision = 2,filled=True)