In [1]:
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data
y = iris.target

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
dt = DecisionTreeClassifier(max_depth=4)

#Evaluate the list of MSE obtained by 10-fold CV
# Set n_jobs=-1 in order to exploit all CPU cores in computation
MSE_CV = -cross_val_score(dt, X_train, y_train, cv=10, scoring ='neg_mean_squared_error', n_jobs=-1)

dt.fit(X_train, y_train)
y_predict_train = dt.predict(X_train)
y_predict_test = dt.predict(X_test)

print('CV_MSE: {:.2f}'.format(MSE_CV.mean()))
print('Train MSE {:.2f}'.format(MSE(y_train, y_predict_train)))
print('Test MSE {:.2f}'.format(MSE(y_test,y_predict_test)))

CV_MSE: 0.04
Train MSE 0.01
Test MSE 0.07


[1 2 1 1 1 1 1 1 1 0 0 0 0 1 2 2 2 1 2 0 1 0 2 1 2 0 2 2 1 0 1 2 2 1 0 1 2
 0 1 2 0 0 2 0 2 2 2 1 1 0 2 0 1 0 0 2 0 2 0 1 1 1 1 0 0 2 2 0 2 2 1 1 0 1
 1 1 2 1 1 0 2 0 1 1 2 2 2 0 2 1 2 0 0 2 0 1 2 2 0 1 2 0 1 2 0]


In [4]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

bc = BaggingClassifier(base_estimator=dt,n_estimators=300)
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of Bagging Classifier {:.3f}'.format(accuracy))
print('Accuracy of Decision Tree {:.3f}'.format(accuracy_score(y_test,y_predict_test)))

Accuracy of Bagging Classifier 0.911
Accuracy of Decision Tree 0.844


In [5]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(base_estimator=dt,n_estimators=100)
ada.fit(X_train, y_train)

print 'Accuracy score {:.3f}'.format(accuracy_score(y_test,ada.predict(X_test)))

# from sklearn.metrics import roc_auc_score
# y_pred_prob=ada.predict_proba(X_test)[:,1]
# print('ROC AUC score:{:.2f}'.format(roc_auc_score(y_test,y_pred_prob)))

Accuracy score 0.844


In [6]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=400, min_samples_leaf=0.12)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

rmse_test = MSE(y_test, y_pred)**(1/2)
print('Test set RMSE of Random Forest: {:.2f}'.format(rmse_test))
print('Accuracy of Random Forest: {:.2f}'.format(accuracy_score(y_test,y_pred)))

import matplotlib.pyplot as plt
import pandas as pd

importances_rf = pd.Series(rf.feature_importances_,index=iris.feature_names)
sorted_importances_rf = importances_rf.sort_values()

sorted_importances_rf.plot(kind='barh', color='lightgreen')

Test set RMSE of Random Forest: 1.00
Accuracy of Random Forest: 0.93


<matplotlib.axes._subplots.AxesSubplot at 0xc2ed2e8>

In [8]:
from sklearn.metrics import classification_report
print classification_report(y_test,rf.predict(X_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.83      1.00      0.91        15
           2       1.00      0.77      0.87        13

   micro avg       0.93      0.93      0.93        45
   macro avg       0.94      0.92      0.93        45
weighted avg       0.94      0.93      0.93        45

