In [4]:
import numpy as np
from pandas import DataFrame, read_csv, to_numeric
import pandas as pd
from sklearn import cluster
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import RandomizedSearchCV
import matplotlib as mpl
import matplotlib.pyplot as plt

In [5]:
# Read the pre-split train and test CSV files from the working directory.
df_train = pd.read_csv('diabetes_train.csv')
df_test = pd.read_csv('diabetes_test.csv')
# Cell output: (rows, columns) of the test frame.
df_test.shape

(119, 9)

In [6]:
# The column at position 8 is used as the target; every other column is a feature.
train_label = df_train.columns[8]
test_label = df_test.columns[8]

X_train, y_train = df_train.drop(columns=train_label), df_train[train_label]
X_test, y_test = df_test.drop(columns=test_label), df_test[test_label]

## Decision Tree Classifier

In [7]:
from sklearn.tree import DecisionTreeClassifier

def decision_tree_classifier(df_train,df_test,depth_k,features_i,leaf_j,random_state=None):
    """Fit a gini decision tree on ``df_train`` and print its train/test error.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Frames whose column at position 8 is the target; all other columns
        are used as features.
    depth_k
        Forwarded as ``max_depth``.
    features_i
        Forwarded as ``max_features``.
    leaf_j
        Forwarded as ``min_samples_leaf``.
    random_state : optional
        Forwarded to the estimator. The default ``None`` keeps the previous
        unseeded behaviour; pass an int for repeatable results.

    Returns
    -------
    None. One summary line is printed.
    """
    # Split the frames that were actually passed in. The previous version
    # ignored both arguments and silently read the notebook-level
    # X_train / y_train / X_test / y_test globals instead.
    label_train = df_train.columns[8]
    label_test = df_test.columns[8]
    features_train, target_train = df_train.drop(columns=label_train), df_train[label_train]
    features_test, target_test = df_test.drop(columns=label_test), df_test[label_test]

    DT_classifier = DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        max_depth=depth_k,
        max_features=features_i,
        min_samples_leaf=leaf_j,
        random_state=random_state,
    )
    DT_classifier.fit(features_train, target_train)

    # Error is reported as 1 - accuracy.
    train_error = 1 - metrics.accuracy_score(target_train, DT_classifier.predict(features_train))
    test_error = 1 - metrics.accuracy_score(target_test, DT_classifier.predict(features_test))
    print("criterion gini splitter best max_depth {} max_features {} min_samples_leaf {} Train Error is {} Test Error is {} " .format(depth_k ,features_i, leaf_j, '%.4f' % train_error, '%.4f' % test_error))
    
# Candidate values for max_depth, max_features and min_samples_leaf.
depth = [2,10]
features = [4,8]
leaf = [1,3]

# Evaluate every combination, varying leaf fastest and depth slowest.
dt_grid = [(d, f, m) for d in depth for f in features for m in leaf]
for k, i, j in dt_grid:
    decision_tree_classifier(df_train, df_test, k, i, j)

criterion gini splitter best max_depth 2 max_features 4 min_samples_leaf 1 Train Error is 0.2357 Test Error is 0.2101 
criterion gini splitter best max_depth 2 max_features 4 min_samples_leaf 3 Train Error is 0.2851 Test Error is 0.3193 
criterion gini splitter best max_depth 2 max_features 8 min_samples_leaf 1 Train Error is 0.2280 Test Error is 0.2269 
criterion gini splitter best max_depth 2 max_features 8 min_samples_leaf 3 Train Error is 0.2280 Test Error is 0.2269 
criterion gini splitter best max_depth 10 max_features 4 min_samples_leaf 1 Train Error is 0.0431 Test Error is 0.2773 
criterion gini splitter best max_depth 10 max_features 4 min_samples_leaf 3 Train Error is 0.0971 Test Error is 0.2857 
criterion gini splitter best max_depth 10 max_features 8 min_samples_leaf 1 Train Error is 0.0185 Test Error is 0.2857 
criterion gini splitter best max_depth 10 max_features 8 min_samples_leaf 3 Train Error is 0.0724 Test Error is 0.3025 


## MLP Classifier

In [12]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPClassifier

# Silence only scikit-learn's ConvergenceWarning (presumably the warning the
# original blanket filter was aimed at -- TODO confirm). A bare
# filterwarnings("ignore") hides every warning for the rest of the kernel
# session, including deprecation notices raised by later cells.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

def multilayerperceptron_classifier(df_train,df_test,hiddenlayer_k,learningrate_i,alpha_j,random_state=None):
    """Fit a ReLU multi-layer perceptron on ``df_train`` and print its train/test error.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Frames whose column at position 8 is the target; all other columns
        are used as features.
    hiddenlayer_k
        Forwarded as ``hidden_layer_sizes``.
    learningrate_i
        Forwarded as ``learning_rate``.
    alpha_j
        Forwarded as ``alpha``.
    random_state : optional
        Forwarded to the estimator. The default ``None`` keeps the previous
        unseeded behaviour; pass an int for repeatable results.

    Returns
    -------
    None. One summary line is printed.
    """
    # Split the frames that were actually passed in. The previous version
    # ignored both arguments and silently read the notebook-level
    # X_train / y_train / X_test / y_test globals instead.
    label_train = df_train.columns[8]
    label_test = df_test.columns[8]
    features_train, target_train = df_train.drop(columns=label_train), df_train[label_train]
    features_test, target_test = df_test.drop(columns=label_test), df_test[label_test]

    mlp_classifier = MLPClassifier(
        activation='relu',
        batch_size='auto',
        hidden_layer_sizes=hiddenlayer_k,
        learning_rate=learningrate_i,
        alpha=alpha_j,
        random_state=random_state,
    )
    mlp_classifier.fit(features_train, target_train)

    # Error is reported as 1 - accuracy.
    train_error = 1 - metrics.accuracy_score(target_train, mlp_classifier.predict(features_train))
    test_error = 1 - metrics.accuracy_score(target_test, mlp_classifier.predict(features_test))
    print("activation relu batch_size auto hidden_layer_sizes {} learning_rate {} alpha {} Train Error is {} Test Error is {} " .format(hiddenlayer_k ,learningrate_i, alpha_j, '%.4f' % train_error, '%.4f' % test_error))
    
# Candidate values for hidden_layer_sizes, learning_rate and alpha.
hiddenlayer = [(10, ), (50, )]
learningrate = ['constant','adaptive']
alpha = [0.001,0.05]

# Evaluate every combination, varying alpha fastest and hidden layers slowest.
mlp_grid = [(h, r, a) for h in hiddenlayer for r in learningrate for a in alpha]
for k, i, j in mlp_grid:
    multilayerperceptron_classifier(df_train, df_test, k, i, j)

activation relu batch_size auto hidden_layer_sizes (10,) learning_rate constant alpha 0.001 Train Error is 0.2881 Test Error is 0.3529 
activation relu batch_size auto hidden_layer_sizes (10,) learning_rate constant alpha 0.05 Train Error is 0.3374 Test Error is 0.4202 
activation relu batch_size auto hidden_layer_sizes (10,) learning_rate adaptive alpha 0.001 Train Error is 0.2789 Test Error is 0.3193 
activation relu batch_size auto hidden_layer_sizes (10,) learning_rate adaptive alpha 0.05 Train Error is 0.2989 Test Error is 0.3193 
activation relu batch_size auto hidden_layer_sizes (50,) learning_rate constant alpha 0.001 Train Error is 0.2881 Test Error is 0.3613 
activation relu batch_size auto hidden_layer_sizes (50,) learning_rate constant alpha 0.05 Train Error is 0.2743 Test Error is 0.3613 
activation relu batch_size auto hidden_layer_sizes (50,) learning_rate adaptive alpha 0.001 Train Error is 0.2250 Test Error is 0.3277 
activation relu batch_size auto hidden_layer_sizes 

## Random Forest Classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier

def random_forest_classifier(df_train,df_test,estimator_k,depth_i,split_j,random_state=None):
    """Fit an entropy random forest on ``df_train`` and print its train/test error.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Frames whose column at position 8 is the target; all other columns
        are used as features.
    estimator_k
        Forwarded as ``n_estimators``.
    depth_i
        Forwarded as ``max_depth``.
    split_j
        Forwarded as ``min_samples_split``.
    random_state : optional
        Forwarded to the estimator. The default ``None`` keeps the previous
        unseeded behaviour; pass an int for repeatable results.

    Returns
    -------
    None. One summary line is printed.
    """
    # Split the frames that were actually passed in. The previous version
    # ignored both arguments and silently read the notebook-level
    # X_train / y_train / X_test / y_test globals instead.
    label_train = df_train.columns[8]
    label_test = df_test.columns[8]
    features_train, target_train = df_train.drop(columns=label_train), df_train[label_train]
    features_test, target_test = df_test.drop(columns=label_test), df_test[label_test]

    rf_classifier = RandomForestClassifier(
        # 'auto' was removed in scikit-learn 1.3; for classifiers it was an
        # alias of 'sqrt', so the fitted model is configured the same way.
        max_features='sqrt',
        criterion='entropy',
        n_estimators=estimator_k,
        max_depth=depth_i,
        min_samples_split=split_j,
        random_state=random_state,
    )
    rf_classifier.fit(features_train, target_train)

    # Error is reported as 1 - accuracy. The printed label keeps the
    # historical "auto" wording so the output format is unchanged.
    train_error = 1 - metrics.accuracy_score(target_train, rf_classifier.predict(features_train))
    test_error = 1 - metrics.accuracy_score(target_test, rf_classifier.predict(features_test))
    print("max_features auto criterion entropy n_estimators {} max_depth {} min_samples_split {} Train Error is:  {} Test Error is:  {} " .format(estimator_k ,depth_i, split_j, '%.4f' % train_error, '%.4f' % test_error))
    
# Candidate values for n_estimators, max_depth and min_samples_split.
estimator = [200,500]
depth = [10,20]
split = [5,10]

# Evaluate every combination, varying split fastest and estimator count slowest.
rf_grid = [(n, d, s) for n in estimator for d in depth for s in split]
for k, i, j in rf_grid:
    random_forest_classifier(df_train, df_test, k, i, j)

max_features auto criterion entropy n_estimators 200 max_depth 10 min_samples_split 5 Train Error is:  0.0262 Test Error is:  0.2269 
max_features auto criterion entropy n_estimators 200 max_depth 10 min_samples_split 10 Train Error is:  0.0678 Test Error is:  0.2437 
max_features auto criterion entropy n_estimators 200 max_depth 20 min_samples_split 5 Train Error is:  0.0046 Test Error is:  0.2353 
max_features auto criterion entropy n_estimators 200 max_depth 20 min_samples_split 10 Train Error is:  0.0478 Test Error is:  0.2437 
max_features auto criterion entropy n_estimators 500 max_depth 10 min_samples_split 5 Train Error is:  0.0247 Test Error is:  0.2437 
max_features auto criterion entropy n_estimators 500 max_depth 10 min_samples_split 10 Train Error is:  0.0555 Test Error is:  0.2353 
max_features auto criterion entropy n_estimators 500 max_depth 20 min_samples_split 5 Train Error is:  0.0077 Test Error is:  0.2185 
max_features auto criterion entropy n_estimators 500 max_de

## Gradient Boosting Classifier

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

def gradient_boosting_classifier(df_train,df_test,estimator_k,depth_i,split_j,random_state=None):
    """Fit a gradient boosting classifier on ``df_train`` and print its train/test error.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Frames whose column at position 8 is the target; all other columns
        are used as features.
    estimator_k
        Forwarded as ``n_estimators``.
    depth_i
        Forwarded as ``max_depth``.
    split_j
        Forwarded as ``min_samples_split``.
    random_state : optional
        Forwarded to the estimator. The default ``None`` keeps the previous
        unseeded behaviour; pass an int for repeatable results.

    Returns
    -------
    None. One summary line is printed.
    """
    # Split the frames that were actually passed in. The previous version
    # ignored both arguments and silently read the notebook-level
    # X_train / y_train / X_test / y_test globals instead.
    label_train = df_train.columns[8]
    label_test = df_test.columns[8]
    features_train, target_train = df_train.drop(columns=label_train), df_train[label_train]
    features_test, target_test = df_test.drop(columns=label_test), df_test[label_test]

    gb_classifier = GradientBoostingClassifier(
        # 'auto' was removed in scikit-learn 1.3; for the classifier it meant
        # sqrt(n_features), i.e. the same as 'sqrt'.
        max_features='sqrt',
        # `loss` is left at its default: that default is the loss formerly
        # spelled 'deviance' (renamed 'log_loss' in 1.1, old name removed in
        # 1.3), so omitting it works on both old and new releases.
        n_estimators=estimator_k,
        max_depth=depth_i,
        min_samples_split=split_j,
        random_state=random_state,
    )
    gb_classifier.fit(features_train, target_train)

    # Error is reported as 1 - accuracy. The printed label keeps the
    # historical "auto"/"deviance" wording so the output format is unchanged.
    train_error = 1 - metrics.accuracy_score(target_train, gb_classifier.predict(features_train))
    test_error = 1 - metrics.accuracy_score(target_test, gb_classifier.predict(features_test))
    print("max_features auto loss deviance n_estimators {} max_depth {} min_samples_split {} Train Error is {} Test Error is {} " .format(estimator_k ,depth_i, split_j, '%.4f' % train_error, '%.4f' % test_error))
    
# Candidate values for n_estimators, max_depth and min_samples_split.
estimator = [20,80]
depth = [5,10]
split = [200,400]

# Evaluate every combination, varying split fastest and estimator count slowest.
gb_grid = [(n, d, s) for n in estimator for d in depth for s in split]
for k, i, j in gb_grid:
    gradient_boosting_classifier(df_train, df_test, k, i, j)


max_features auto loss deviance n_estimators 20 max_depth 5 min_samples_split 200 Train Error is 0.1695 Test Error is 0.2437 
max_features auto loss deviance n_estimators 20 max_depth 5 min_samples_split 400 Train Error is 0.2034 Test Error is 0.2269 
max_features auto loss deviance n_estimators 20 max_depth 10 min_samples_split 200 Train Error is 0.1726 Test Error is 0.2521 
max_features auto loss deviance n_estimators 20 max_depth 10 min_samples_split 400 Train Error is 0.2034 Test Error is 0.2269 
max_features auto loss deviance n_estimators 80 max_depth 5 min_samples_split 200 Train Error is 0.1109 Test Error is 0.2269 
max_features auto loss deviance n_estimators 80 max_depth 5 min_samples_split 400 Train Error is 0.1572 Test Error is 0.2269 
max_features auto loss deviance n_estimators 80 max_depth 10 min_samples_split 200 Train Error is 0.0616 Test Error is 0.2353 
max_features auto loss deviance n_estimators 80 max_depth 10 min_samples_split 400 Train Error is 0.1402 Test Error

## Voting Classifier

### Combine the classifiers with the best test error from Q1–Q4 using `VotingClassifier`, and measure the training and test error for each of the following cases:

### Give equal weight to each classifier

In [18]:
from sklearn.ensemble import VotingClassifier

# One configuration per model family, as selected by the author from the
# grids above.
dt = DecisionTreeClassifier(criterion='gini', max_depth=2, max_features=4, min_samples_leaf=1, splitter='best')
mlp = MLPClassifier(activation='relu', batch_size='auto',hidden_layer_sizes=(10,), learning_rate='constant', alpha=0.05)
# max_features='auto' was removed in scikit-learn 1.3; 'sqrt' is what it
# meant for both of these classifiers.
rf = RandomForestClassifier(max_features='sqrt', criterion='entropy', n_estimators=500, max_depth=20, min_samples_split=5)
# loss is left at its default, which is the loss formerly spelled 'deviance'
# (that spelling was removed in scikit-learn 1.3).
gb = GradientBoostingClassifier(max_features='sqrt', n_estimators=80, max_depth=10, min_samples_split=400)

# Majority vote with every classifier weighted equally.
ensemble_clf = VotingClassifier(estimators=[('dt',dt),('mlp',mlp),('rf',rf),('gb',gb)], voting='hard', weights=[0.25,0.25,0.25,0.25])
ensemble_clf.fit(X_train,y_train)

# Error is reported as 1 - accuracy.
pred_train = ensemble_clf.predict(X_train)
print("Training Error is %.4f" % (1-accuracy_score(y_train,pred_train)))
pred_test = ensemble_clf.predict(X_test)
print("Test Error is %.4f" % (1-accuracy_score(y_test,pred_test)))


Training Error is 0.1572
Test Error is 0.2437


### Give each classifier a weight proportional to $1/(1 + \text{training error})$

In [25]:
from sklearn.ensemble import VotingClassifier

# Fixed seed so that the models scored for the weights below behave the same
# as the clones VotingClassifier fits internally.
SEED = 0

# One configuration per model family, as selected by the author from the
# grids above.
dt = DecisionTreeClassifier(criterion='gini', max_depth=2, max_features=4, min_samples_leaf=1, splitter='best', random_state=SEED)
mlp = MLPClassifier(activation='relu', batch_size='auto',hidden_layer_sizes=(10,), learning_rate='constant', alpha=0.05, random_state=SEED)
# max_features='auto' was removed in scikit-learn 1.3; 'sqrt' is what it
# meant for both of these classifiers.
rf = RandomForestClassifier(max_features='sqrt', criterion='entropy', n_estimators=500, max_depth=20, min_samples_split=5, random_state=SEED)
# loss is left at its default, which is the loss formerly spelled 'deviance'
# (that spelling was removed in scikit-learn 1.3).
gb = GradientBoostingClassifier(max_features='sqrt', n_estimators=80, max_depth=10, min_samples_split=400, random_state=SEED)

named_estimators = [('dt',dt),('mlp',mlp),('rf',rf),('gb',gb)]

# weight = 1 / (1 + training error), measured on the models themselves.
# The previous version pasted training errors in by hand, and those numbers
# did not match the recorded grid outputs for these configurations.
vote_weights = []
for est_name, est in named_estimators:
    est.fit(X_train, y_train)
    est_train_error = 1 - accuracy_score(y_train, est.predict(X_train))
    vote_weights.append(1 / (1 + est_train_error))
dt_w, mlp_w, rf_w, gb_w = vote_weights

ensemble_clf = VotingClassifier(estimators=named_estimators, voting='hard', weights=[dt_w,mlp_w,rf_w,gb_w])
ensemble_clf.fit(X_train,y_train)

# Error is reported as 1 - accuracy.
pred_train = ensemble_clf.predict(X_train)
print("Training Error is %.4f" % (1-accuracy_score(y_train,pred_train)))
pred_test = ensemble_clf.predict(X_test)
print("Test Error is %.4f" % (1-accuracy_score(y_test,pred_test)))

Training Error is 0.0678
Test Error is 0.2101
