In [29]:
import pandas
from sklearn import cross_validation
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [5]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['pregnant', 'plasma', 'pressure', 'thickness', 'insulin', 'bmi', 'diabetes', 'age', 'class']

data_df = pandas.read_csv(url, names=names)
data_df.head(2)

Unnamed: 0,pregnant,plasma,pressure,thickness,insulin,bmi,diabetes,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [7]:
X_train = data_df.ix[:,0:8]
Y_train = data_df.ix[:,8]

### Bagging Algorithms

In [16]:
num_folds = 5
num_instances = len(X_train)
seed = 7
num_trees = 50
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)

#### 1. Decision Tree

In [14]:
base_clf = DecisionTreeClassifier()
model = BaggingClassifier(base_estimator=base_clf, n_estimators=num_trees, random_state=seed)

In [15]:
results = cross_validation.cross_val_score(model, X_train, Y_train, cv=kfold)
print(results.mean())

0.770885323827


#### 2. Random Forest

In [19]:
max_features = 4
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)

In [21]:
results = cross_validation.cross_val_score(model, X_train, Y_train, cv=kfold)
print(results.mean())

0.764391817333


### Boosting Algorithms

#### 1. AdaBoost

In [24]:
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)

In [25]:
results = cross_validation.cross_val_score(model, X_train, Y_train, cv=kfold)
print(results.mean())

0.744868856634


#### 2. SGD

In [27]:
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)

In [28]:
results = cross_validation.cross_val_score(model, X_train, Y_train, cv=kfold)
print(results.mean())

0.774789915966


### Voting Algorithms

In [31]:
estimators = []

model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('decision', model2))
model3 = SVC()
estimators.append(('svm', model2))

ensemble = VotingClassifier(estimators)

In [32]:
results = cross_validation.cross_val_score(ensemble, X_train, Y_train, cv=kfold)
print(results.mean())

0.721509209745
