In [380]:
import pandas as pd
import numpy as np
import math

## Import dataset

In [381]:
df=pd.read_csv('dataset_output/raw_training_data_cleaned_up.csv')

## Shuffle dataset and reset index
reset_index tells Pandas to replace the existing index column instead of creating a new one.

frac is the fraction of rows to return; in this case 100% of them, in random order

In [382]:
df = df.sample(frac=1).reset_index(drop=True)

In [383]:
df

Unnamed: 0.1,Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted
0,75657,228000030,84415995,3,2,8,1,1,7,7,...,1,1,1,1,1,1,1,0,1,2
1,34374,107836062,90052641,3,2,5,2,1,7,3,...,1,2,1,1,1,1,1,1,1,2
2,2157,14055972,92806155,3,1,8,1,1,7,5,...,1,1,1,1,1,1,1,0,0,2
3,24761,83606496,20991267,3,1,7,1,6,7,4,...,1,2,1,1,1,1,1,1,1,1
4,1249,8860284,94419315,4,2,6,6,1,17,3,...,1,1,1,1,1,1,1,1,1,0
5,83440,261992604,43332093,3,2,6,1,11,7,4,...,1,4,1,1,1,1,1,1,1,0
6,35515,110127240,1939779,6,2,9,2,1,9,3,...,1,1,1,1,1,1,1,0,0,0
7,8097,37271340,18110268,3,2,8,2,1,1,7,...,1,1,1,1,1,1,1,0,1,0
8,48870,148770738,57994317,1,1,7,1,1,7,6,...,1,3,1,1,1,1,1,1,1,2
9,90623,300272162,53611668,1,2,4,1,1,7,2,...,1,2,1,1,1,1,1,1,1,0


## Clean up the dataframe a bit

In [384]:
df.rename(columns={'glyburide.metformin':'glyburide_metformin', 
                   'glipizide.metformin':'glipizide_metformin',
                   'glimepiride.pioglitazone':'glimepiride_pioglitazone', 
                   'metformin.rosiglitazone':'metformin_rosiglitazone',
                   'metformin.pioglitazone':'metformin_pioglitazone'}, 
          inplace=True)

In [385]:
X = df.drop(['Unnamed: 0', 'encounter_id','patient_nbr'], axis=1, inplace=True)

In [386]:
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100702 entries, 0 to 100701
Data columns (total 45 columns):
race                        100702 non-null int64
gender                      100702 non-null int64
age                         100702 non-null int64
admission_type_id           100702 non-null int64
discharge_disposition_id    100702 non-null int64
admission_source_id         100702 non-null int64
time_in_hospital            100702 non-null int64
num_lab_procedures          100702 non-null int64
num_procedures              100702 non-null int64
num_medications             100702 non-null int64
number_outpatient           100702 non-null int64
number_emergency            100702 non-null int64
number_inpatient            100702 non-null int64
diag_1                      100702 non-null int64
diag_2                      100702 non-null int64
diag_3                      100702 non-null int64
number_diagnoses            100702 non-null int64
max_glu_serum               100702 non-

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_rosiglitazone,metformin_pioglitazone,change,diabetesMed,readmitted
0,3,2,8,1,1,7,7,32,0,10,...,1,1,1,1,1,1,1,0,1,2
1,3,2,5,2,1,7,3,60,0,13,...,1,2,1,1,1,1,1,1,1,2
2,3,1,8,1,1,7,5,39,0,8,...,1,1,1,1,1,1,1,0,0,2
3,3,1,7,1,6,7,4,37,0,12,...,1,2,1,1,1,1,1,1,1,1
4,4,2,6,6,1,17,3,46,2,23,...,1,1,1,1,1,1,1,1,1,0
5,3,2,6,1,11,7,4,81,4,40,...,1,4,1,1,1,1,1,1,1,0
6,6,2,9,2,1,9,3,30,0,2,...,1,1,1,1,1,1,1,0,0,0
7,3,2,8,2,1,1,7,72,5,26,...,1,1,1,1,1,1,1,0,1,0
8,1,1,7,1,1,7,6,42,3,22,...,1,3,1,1,1,1,1,1,1,2
9,1,2,4,1,1,7,2,36,2,12,...,1,2,1,1,1,1,1,1,1,0


#### All columns are using encoded int's. Great! 

# Random forest

#### How to pick parameters: http://scikit-learn.org/stable/modules/ensemble.html#parameters

scikit-learn uses randomly selected features to determine how to split a node when constructing a tree (http://scikit-learn.org/stable/modules/ensemble.html#random-forests)

In [387]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### 1) Construct training and testing 
TODO: validation sets

In [400]:
# Separate input and output features 
# Original df is not mutated
X = df.drop(['discharge_disposition_id','admission_type_id'], axis=1)

# columns of the values we want to predict which we use to train our model
y_disposition = df['discharge_disposition_id']
y_admission = df['admission_type_id']

# For disposition statuses
X_disposition_train, X_disposition_test, y_disposition_train, y_disposition_test = train_test_split(X, y_disposition, test_size=0.2)

# For admission statuses
X_admission_train, X_admission_test, y_admission_train, y_admission_test = train_test_split(X, y_admission, test_size=0.2)

### 2) Run RandomForestClassifier to train our model

In [401]:
admission_classifier = RandomForestClassifier(n_estimators=40, max_features=6, 
                                    verbose=2, bootstrap=False,
                                   n_jobs=-1, )
disposition_classifier = RandomForestClassifier(n_estimators=40, max_features=6, 
                                    verbose=2, bootstrap=False,
                                   n_jobs=-1, )

admission_classifier.fit(X_admission_train, y_admission_train)
disposition_classifier.fit(X_disposition_train, y_disposition_train)

building tree 3 of 40building tree 1 of 40building tree 2 of 40building tree 4 of 40



building tree 5 of 40
building tree 6 of 40
building tree 7 of 40
building tree 8 of 40
building tree 9 of 40
building tree 10 of 40
building tree 11 of 40
building tree 12 of 40
building tree 13 of 40
building tree 14 of 40
building tree 15 of 40
building tree 16 of 40
building tree 17 of 40
building tree 18 of 40
building tree 19 of 40
building tree 20 of 40
building tree 21 of 40
building tree 22 of 40
building tree 23 of 40
building tree 24 of 40
building tree 25 of 40
building tree 26 of 40
building tree 27 of 40
building tree 28 of 40
building tree 29 of 40
building tree 30 of 40
building tree 31 of 40
building tree 32 of 40
building tree 33 of 40
building tree 34 of 40
building tree 35 of 40
building tree 36 of 40


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.6s


building tree 37 of 40
building tree 38 of 40
building tree 39 of 40
building tree 40 of 40


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    5.2s finished


building tree 1 of 40building tree 2 of 40building tree 3 of 40building tree 4 of 40



building tree 5 of 40
building tree 6 of 40
building tree 7 of 40
building tree 8 of 40
building tree 9 of 40
building tree 10 of 40
building tree 11 of 40
building tree 12 of 40
building tree 13 of 40
building tree 14 of 40
building tree 15 of 40
building tree 16 of 40
building tree 17 of 40
building tree 18 of 40
building tree 19 of 40
building tree 20 of 40
building tree 21 of 40
building tree 22 of 40
building tree 23 of 40
building tree 24 of 40
building tree 25 of 40
building tree 26 of 40
building tree 27 of 40
building tree 28 of 40
building tree 29 of 40
building tree 30 of 40
building tree 31 of 40
building tree 32 of 40
building tree 33 of 40
building tree 34 of 40
building tree 35 of 40
building tree 36 of 40


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.6s


building tree 37 of 40
building tree 38 of 40
building tree 39 of 40
building tree 40 of 40


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    7.3s finished


RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=6, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=40, n_jobs=-1, oob_score=False, random_state=None,
            verbose=2, warm_start=False)

### 3) Predict!

In [402]:
# predict classes for data in test_set
predict_admission = admission_classifier.predict(X_admission_test) 
predict_disposition = disposition_classifier.predict(X_disposition_test)

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:    1.2s finished


### 4) Evaluate our model

~~~Since scikit doesn't support multiclass-multilabel, we have to do this manually in a simple, basic way for now.~~~ (I was using multiclass multilabel classifiers. But I'm experimenting multiclass single-label now.)

In [403]:
from sklearn.metrics import classification_report,confusion_matrix

In [404]:
print(classification_report(predict_admission, y_admission_test))

             precision    recall  f1-score   support

          1       0.95      0.86      0.90     11872
          2       0.39      0.57      0.46      2455
          3       0.77      0.68      0.72      4187
          4       0.00      0.00      0.00         1
          5       0.65      0.70      0.67       901
          6       0.69      0.95      0.80       725
          7       0.00      0.00      0.00         0
          8       0.00      0.00      0.00         0

avg / total       0.82      0.78      0.80     20141



  'recall', 'true', average, warn_for)


In [405]:
print(classification_report(predict_disposition, y_disposition_test))

             precision    recall  f1-score   support

          1       0.94      0.66      0.78     17007
          2       0.00      0.00      0.00         5
          3       0.38      0.45      0.41      2325
          4       0.00      0.00      0.00         0
          5       0.00      0.00      0.00         1
          6       0.10      0.37      0.16       710
          7       0.00      0.00      0.00         1
          8       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         0
         10       0.00      0.00      0.00         0
         11       0.02      0.60      0.04        10
         13       0.00      0.00      0.00         2
         14       0.00      0.00      0.00         1
         15       0.00      0.00      0.00         0
         16       0.00      0.00      0.00         0
         17       0.00      0.00      0.00         0
         18       0.03      0.41      0.06        58
         19       0.00      0.00      0.00   

  'recall', 'true', average, warn_for)


## Models to try:


Adaboost

Gradient Tree Boosting 

Try VotingClassifier from scikit at the end


## Things to try:
predict time in hospital