# Analysis of the National Survey on Drug Use and Health (2012)

## Jason Piccone, Ph.D.

## Part III: Advanced Analysis

# Import Libraries

In [None]:
from matplotlib import style 
from sklearn import cross_validation as cv, linear_model, ensemble, metrics, svm
from sklearn.cross_validation import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, VotingClassifier, BaggingRegressor, ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random


# Parameters

In [None]:
# Seed and model-size settings read by the functions defined further down.
rand_state = 42  # random_state passed to the train/test split and seeded estimators
trees = 100  # number of trees in the random forest

# Input location: `path` ends with a slash because the file name is appended
# to it by plain string concatenation when the data is read.
d_file = 'df2.csv'
path = "/home/paco/Documents/data science/DS projects/national survey on drug use and health 2012/ICPSR_34933/DS0001/"

# Plot appearance for every figure drawn in this notebook.
style.use('ggplot')


# Read Data

In [None]:
# Load the CSV named by the parameters above into a DataFrame.
csv_location = path + d_file
df = pd.read_csv(csv_location)

# Quick sanity check: dimensions first, then the leading rows.
for preview in (df.shape, df.head()):
    print(preview)

# Establish Functions

In [None]:
# to make train/test split

def split_data(df, y):
    '''Divide the sample into train and test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set. It must contain the 'BOOKED' and 'HEALTH' columns
        as well as the column(s) named by ``y``.
    y : str (or list/tuple of str)
        Name of the column to use as the prediction target.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split`` with a 25% test share, seeded with the
        module-level ``rand_state``.
    '''
    target = df[y]

    # 'BOOKED' and 'HEALTH' are always removed from the predictors.  The
    # requested target is removed as well, so that choosing any other column
    # as the outcome can never leave that outcome inside the feature matrix.
    excluded = ['BOOKED', 'HEALTH']
    requested = list(y) if isinstance(y, (list, tuple)) else [y]
    excluded.extend(col for col in requested if col not in excluded)
    features = df.drop(excluded, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=rand_state)

    return (X_train, X_test, y_train, y_test)
    



# Compare a set of voting weights for the soft-voting ensemble (this section
# of code is adapted from Julian, 2016):
def vclas(w1, w2, w3, w4, w5, X_train, X_test, y_train, y_test):
    '''Fit five classifiers plus a weighted soft-voting ensemble and plot scores.

    The training data is split again (60% fit / 40% validation) and every
    model is scored on both parts; the scores are drawn as paired bars.

    Parameters
    ----------
    w1, w2, w3, w4, w5 : float
        Voting weights for, in order, logistic regression, Gaussian naive
        Bayes, random forest, extra trees and gradient boosting.
    X_train, y_train :
        Features and labels that are split into the fit and validation parts.
    X_test, y_test :
        Accepted to keep the call signature stable but never read: all scores
        come from the internal split of the training data.

    Returns
    -------
    None
        The function prints each base classifier and shows a matplotlib
        figure.
    '''
    # Seeded so that repeated runs compare weights on the same partition.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.4, random_state=rand_state)

    named_models = [
        ('lr', LogisticRegression(random_state=rand_state)),
        ('gnb', GaussianNB()),
        ('rf', RandomForestClassifier(n_estimators=10, bootstrap=True,
                                      random_state=rand_state)),
        ('et', ExtraTreesClassifier(n_estimators=10, bootstrap=True,
                                    random_state=rand_state)),
        ('gb', GradientBoostingClassifier(n_estimators=10,
                                          random_state=rand_state)),
    ]
    base_models = [model for _, model in named_models]

    eclf = VotingClassifier(estimators=named_models,
                            voting='soft',
                            weights=[w1, w2, w3, w4, w5])

    # The base models are fitted on their own as well because each of them is
    # scored individually next to the ensemble.
    all_models = base_models + [eclf]
    for model in all_models:
        model.fit(X_fit, y_fit)

    for i, clf in enumerate(base_models):
        print(clf, i)

    fit_scores = [model.score(X_fit, y_fit) for model in all_models]
    val_scores = [model.score(X_val, y_val) for model in all_models]

    width = 0.3
    positions = np.arange(len(all_models))
    fig, ax = plt.subplots()

    # Every model, the ensemble included, gets the same pair layout:
    # training bar at its position, test bar one bar-width to the right.
    train_bars = ax.bar(positions, fit_scores, width=width,
                        color="blue", alpha=0.5)
    test_bars = ax.bar(positions + width, val_scores, width=width,
                       color="red", alpha=0.5)

    # Dashed divider centred in the gap between the last base classifier and
    # the ensemble.
    plt.axvline(len(base_models) - 0.5 + width / 2,
                color='k', linestyle='dashed')

    # Put each label under the middle of its pair of bars.
    ax.set_xticks(positions + width / 2)
    ax.set_xticklabels(['LogisticRegression',
                        'GaussianNB',
                        'RandomForestClassifier',
                        'ExtraTrees',
                        'GradientBoosting',
                        'VotingClassifier'],
                       rotation=40,
                       ha='right')
    plt.title('Training and Test Score for Different Classifiers')
    plt.legend([train_bars[0], test_bars[0]], ['training', 'test'],
               loc='lower left')
    plt.show()




In [None]:
# Partition the data, using the BOOKED column as the outcome.
X_train, X_test, y_train, y_test = split_data(df, y='BOOKED')

# Fit and plot the individual classifiers and the weighted voting ensemble.
vclas(w1=2, w2=1.5, w3=2, w4=2, w5=2,
      X_train=X_train, X_test=X_test,
      y_train=y_train, y_test=y_test)


In [None]:
#Fit final model:

def fit_final(X_train, y_train, X_test, y_test, weights=(2, 1.5, 2, 2, 2)):
    '''Fit the weighted soft-voting ensemble and print its test-set AUC.

    Parameters
    ----------
    X_train, y_train :
        Features and labels the ensemble is fitted on.
    X_test, y_test :
        Features and labels used to compute the ROC curve.  ``roc_curve``
        is given the second probability column, so a two-class target is
        assumed -- TODO confirm for targets other than the one used here.
    weights : sequence of 5 numbers, optional
        Voting weights for, in order, logistic regression, Gaussian naive
        Bayes, random forest, extra trees and gradient boosting.  Defaults
        to the weights previously hard-coded in this function.

    Returns
    -------
    None
        The AUC is printed as ``Final AUC = <value>``.
    '''
    estimators = [
        ('lr', LogisticRegression(random_state=rand_state)),
        ('gnb', GaussianNB()),
        ('rf', RandomForestClassifier(n_estimators=trees,
                                      random_state=rand_state)),
        ('et', ExtraTreesClassifier(n_estimators=trees, bootstrap=True,
                                    random_state=rand_state)),
        ('gb', GradientBoostingClassifier(n_estimators=trees,
                                          random_state=rand_state)),
    ]

    # VotingClassifier.fit clones and fits every base estimator itself, so
    # fitting them separately beforehand would only repeat the training.
    eclf = VotingClassifier(estimators=estimators,
                            voting='soft',
                            weights=list(weights))
    eclf.fit(X_train, y_train)

    # Column 1 holds the predicted probability of the second class.
    second_class_proba = eclf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, second_class_proba)
    print('Final AUC = ' + str(metrics.auc(fpr, tpr)))

# Train the final ensemble on the training partition and report its AUC on
# the held-out test partition.
fit_final(X_train=X_train, y_train=y_train,
          X_test=X_test, y_test=y_test)