In [1]:
# Bagged Decision Trees for Classification
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = pd.read_csv(filename, names=names)
df

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
X = df.iloc[:,0:8]
Y = df.iloc[:,8]

In [6]:
seed = 7
kfold = KFold(n_splits=10, random_state=seed, shuffle = True)

#if you want to specify random_state them shuffle must be True

#kfold = KFold(n_splits=10)

In [7]:
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator = cart,
                          n_estimators = num_trees,
                          random_state = seed)

In [8]:
results = cross_val_score(model, X, Y, cv = kfold)
results

array([0.76623377, 0.75324675, 0.74025974, 0.77922078, 0.80519481,
       0.79220779, 0.66233766, 0.75324675, 0.78947368, 0.73684211])

In [9]:
print(results.mean())

0.7578263841421736


In [10]:
# Random Forest Classification
num_trees = 100
model = RandomForestClassifier(n_estimators = num_trees,
                               max_features = 3)

In [12]:
results = cross_val_score(model, X, Y, cv = kfold)
print(results.mean())

0.7695146958304854


In [13]:
# AdaBoost Classification
num_trees = 10
seed=7
model = AdaBoostClassifier(n_estimators = num_trees,
                           random_state = seed)

In [14]:
results = cross_val_score(model, X, Y, cv = kfold)
print(results.mean())

0.7669685577580314


In [15]:
# Stacking Ensemble for Classification

# create the sub models
estimators = []

model1 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model1))

model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))

model3 = SVC()
estimators.append(('svm', model3))

# create the ensemble model
ensemble = VotingClassifier(estimators)

In [16]:
estimators

[('logistic', LogisticRegression(max_iter=500)),
 ('cart', DecisionTreeClassifier()),
 ('svm', SVC())]

In [18]:
results = cross_val_score(ensemble, X, Y, cv = kfold)
print(results.mean())

0.773462064251538
