In [1]:
%matplotlib inline

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Tree and Ensamble Methods
## Classification: Live Demos

In [3]:
diabetes_data = pd.read_csv("diabetic_data.csv")

In [4]:
diabetes_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
diabetes_data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [6]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [7]:
diabetes_data.shape

(101766, 50)

In [8]:
diabetes_target = diabetes_data["readmitted"]
diabetes_attributes = diabetes_data.drop("readmitted", axis = 1)

In [9]:
diabetes_attributes = pd.get_dummies(diabetes_attributes)

In [10]:
diabetes_attributes.shape

(101766, 2472)

In [11]:
diabetes_attributes_scaled = MinMaxScaler().fit_transform(diabetes_attributes)

In [14]:
attributes_train, attributes_test, target_train, target_test = train_test_split(
    diabetes_attributes_scaled,
    diabetes_target,
    test_size=0.2,
    random_state=42
)

In [29]:
tree = DecisionTreeClassifier(max_depth=12)

In [30]:
tree.fit(attributes_train, target_train)

DecisionTreeClassifier(max_depth=12)

In [31]:
tree.score(attributes_train, target_train)

0.650972829558296

In [32]:
tree.score(attributes_test, target_test)

0.5844060135599882

In [33]:
tree.feature_importances_

array([0.1557312 , 0.12918801, 0.00819086, ..., 0.00274824, 0.00679798,
       0.00520765])

## Forest bit

In [74]:
forest = RandomForestClassifier(n_estimators=200, max_depth=25)

In [75]:
forest.fit(attributes_train, target_train)

RandomForestClassifier(max_depth=25, n_estimators=200)

In [76]:
forest.score(attributes_train, target_train)

0.6690537021569302

In [77]:
forest.score(attributes_test, target_test)

0.5789525400412695

In [78]:
ada_boost = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth=7), n_estimators = 7)

In [79]:
ada_boost.fit(attributes_train, target_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   n_estimators=20)

In [80]:
ada_boost.score(attributes_train, target_train)

0.5903060973812214

In [81]:
ada_boost.score(attributes_test, target_test)

0.587452097867741

In [None]:
plt.scatter