In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier, AdaBoostClassifier

# Fix the seed so the train/test split below is repeatable across
# Restart & Run All. Seeding NumPy's global RNG as well is intended to make
# estimators left at random_state=None (which fall back to that global RNG)
# repeatable too.
SEED = 0
np.random.seed(SEED)

dataset = pd.read_csv('Data/spambase.data', header=None)
# Every column except the last is used as a feature; the last column is the target.
X = dataset.iloc[:, :-1].values
t = dataset.iloc[:, -1].values
# Hold out one third of the rows as a test split.
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=1/3, random_state=SEED)

In [None]:
# Decision tree classifier
def tree_clf(iteration):
    """Cross-validated error of single decision trees of increasing size.

    For each i in range(iteration) an entropy-criterion, random-splitter
    DecisionTreeClassifier capped at i + 2 leaves is scored with 5-fold
    cross-validation on the module-level X and t.

    Parameters
    ----------
    iteration : int
        Number of tree sizes to evaluate; max_leaf_nodes runs from 2 to
        iteration + 1.

    Returns
    -------
    numpy.ndarray
        Array of length ``iteration``; entry i is the fold-averaged mean
        squared error for max_leaf_nodes = i + 2.
        NOTE(review): for labels coded 0/1 this MSE equals the
        misclassification rate; confirm the coding of t.
    """
    err = np.zeros(iteration)
    for i in range(iteration):
        # i + 2 so that the smallest tree evaluated has two leaves.
        model = DecisionTreeClassifier(criterion='entropy', splitter='random', max_leaf_nodes=i + 2)
        # cross_val_score clones the estimator and fits the clone on every
        # fold, so a separate fit beforehand would be discarded work.
        # The scorer returns negated MSE, hence the sign flip.
        err[i] = -cross_val_score(model, X, t, cv=5, scoring='neg_mean_squared_error').mean()
    return err

# Bagging classifier
def bag_clf(iteration):
    """Cross-validated error of bagged decision trees for growing ensemble sizes.

    For each i in range(iteration) a BaggingClassifier over default
    DecisionTreeClassifier base estimators with (i + 1) * 50 estimators is
    scored with 5-fold cross-validation on the module-level X and t.

    Parameters
    ----------
    iteration : int
        Number of ensemble sizes to evaluate; n_estimators runs
        50, 100, ..., 50 * iteration.

    Returns
    -------
    numpy.ndarray
        Array of length ``iteration``; entry i is the fold-averaged mean
        squared error for n_estimators = (i + 1) * 50.
    """
    err = np.zeros(iteration)
    for i in range(iteration):
        model = BaggingClassifier(DecisionTreeClassifier(), n_jobs=-1, n_estimators=(i + 1) * 50)
        # cross_val_score clones the estimator and fits the clone on every
        # fold, so a separate fit beforehand would be discarded work.
        # The scorer returns negated MSE, hence the sign flip.
        err[i] = -cross_val_score(model, X, t, cv=5, scoring='neg_mean_squared_error').mean()
    return err

# Random forest classifier
def rf_clf(iteration):
    """Cross-validated error of random forests for growing ensemble sizes.

    For each i in range(iteration) a RandomForestClassifier with
    (i + 1) * 50 trees is scored with 5-fold cross-validation on the
    module-level X and t.

    Parameters
    ----------
    iteration : int
        Number of ensemble sizes to evaluate; n_estimators runs
        50, 100, ..., 50 * iteration.

    Returns
    -------
    numpy.ndarray
        Array of length ``iteration``; entry i is the fold-averaged mean
        squared error for n_estimators = (i + 1) * 50.
    """
    err = np.zeros(iteration)
    for i in range(iteration):
        model = RandomForestClassifier(n_jobs=-1, n_estimators=(i + 1) * 50)
        # cross_val_score clones the estimator and fits the clone on every
        # fold, so a separate fit beforehand would be discarded work.
        # The scorer returns negated MSE, hence the sign flip.
        err[i] = -cross_val_score(model, X, t, cv=5, scoring='neg_mean_squared_error').mean()
    return err

# Adaboost classifiers with decision stumps
def ab_clf(iteration):
    """Cross-validated error of AdaBoost with its default base estimator.

    For each i in range(iteration) an AdaBoostClassifier with
    (i + 1) * 50 estimators and no explicit base estimator (scikit-learn's
    default, a depth-1 decision tree) is scored with 5-fold cross-validation
    on the module-level X and t.

    Parameters
    ----------
    iteration : int
        Number of ensemble sizes to evaluate; n_estimators runs
        50, 100, ..., 50 * iteration.

    Returns
    -------
    numpy.ndarray
        Array of length ``iteration``; entry i is the fold-averaged mean
        squared error for n_estimators = (i + 1) * 50.
    """
    err = np.zeros(iteration)
    for i in range(iteration):
        model = AdaBoostClassifier(n_estimators=(i + 1) * 50)
        # cross_val_score clones the estimator and fits the clone on every
        # fold, so a separate fit beforehand would be discarded work.
        # The scorer returns negated MSE, hence the sign flip.
        err[i] = -cross_val_score(model, X, t, cv=5, scoring='neg_mean_squared_error').mean()
    return err

# Adaboost classifiers with decision trees with maximum 10 leaves
def ab_clf_tree(iteration):
    """Cross-validated error of AdaBoost over trees capped at 10 leaves.

    For each i in range(iteration) an AdaBoostClassifier whose base
    estimator is DecisionTreeClassifier(max_leaf_nodes=10), with
    (i + 1) * 50 estimators, is scored with 5-fold cross-validation on the
    module-level X and t.

    Parameters
    ----------
    iteration : int
        Number of ensemble sizes to evaluate; n_estimators runs
        50, 100, ..., 50 * iteration.

    Returns
    -------
    numpy.ndarray
        Array of length ``iteration``; entry i is the fold-averaged mean
        squared error for n_estimators = (i + 1) * 50.
    """
    err = np.zeros(iteration)
    for i in range(iteration):
        model = AdaBoostClassifier(DecisionTreeClassifier(max_leaf_nodes=10), n_estimators=(i + 1) * 50)
        # cross_val_score clones the estimator and fits the clone on every
        # fold, so a separate fit beforehand would be discarded work.
        # The scorer returns negated MSE, hence the sign flip.
        err[i] = -cross_val_score(model, X, t, cv=5, scoring='neg_mean_squared_error').mean()
    return err

# Adaboost classifiers with decision trees with no restriction
def ab_clf_tree_free(iteration):
    """Cross-validated error of AdaBoost over unrestricted decision trees.

    For each i in range(iteration) an AdaBoostClassifier whose base
    estimator is a default-constructed DecisionTreeClassifier, with
    (i + 1) * 50 estimators, is scored with 5-fold cross-validation on the
    module-level X and t.

    Parameters
    ----------
    iteration : int
        Number of ensemble sizes to evaluate; n_estimators runs
        50, 100, ..., 50 * iteration.

    Returns
    -------
    numpy.ndarray
        Array of length ``iteration``; entry i is the fold-averaged mean
        squared error for n_estimators = (i + 1) * 50.
    """
    err = np.zeros(iteration)
    for i in range(iteration):
        model = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=(i + 1) * 50)
        # cross_val_score clones the estimator and fits the clone on every
        # fold, so a separate fit beforehand would be discarded work.
        # The scorer returns negated MSE, hence the sign flip.
        err[i] = -cross_val_score(model, X, t, cv=5, scoring='neg_mean_squared_error').mean()
    return err

In [None]:
# Run every experiment. Each call performs one 5-fold cross-validation per
# configuration, so this cell is the expensive part of the notebook.
# The assignments are kept as separate statements so that results already
# computed survive if the cell is interrupted part-way through.

# Single decision trees: 500 sizes, max_leaf_nodes = 2 .. 501.
tree_err = tree_clf(500)
# Ensembles: 50 sizes each, n_estimators = 50, 100, ..., 2500.
bag_err = bag_clf(50)
rf_err = rf_clf(50)
ab_err = ab_clf(50)
ab_err_tree = ab_clf_tree(50)
ab_err_tree_free = ab_clf_tree_free(50)

In [None]:
# Entry i of tree_err was computed with max_leaf_nodes = i + 2, so plot
# against the actual leaf cap rather than the array index.
tree_sizes = np.arange(len(tree_err)) + 2

fig_tree, ax_tree = plt.subplots(figsize=(10, 6), dpi=100)
ax_tree.plot(tree_sizes, tree_err)
ax_tree.set_xlabel("Maximum number of leaves")
ax_tree.set_ylabel("MSE")
ax_tree.set_title("MSE of a single decision tree vs Maximum number of leaves")
plt.show()

# Entry i of every ensemble error array was computed with (i + 1) * 50
# estimators; derive the axis from the data instead of hard-coding 50 points.
x = (np.arange(len(bag_err)) + 1) * 50

# Give the comparison plot its own explicitly sized figure; previously the
# sized figure was consumed by the tree plot above and this one fell back to
# the default size.
fig_ens, ax_ens = plt.subplots(figsize=(10, 6), dpi=100)
# The best single-tree error is drawn as a flat baseline across all sizes.
ax_ens.plot(x, np.full(len(x), np.min(tree_err)), label='Decision tree')
ax_ens.plot(x, bag_err, label='Bagging')
ax_ens.plot(x, rf_err, label='Random Forest')
ax_ens.plot(x, ab_err, label='Ab with decision stumps')
ax_ens.plot(x, ab_err_tree, label='Ab with trees of max 10 leaves')
ax_ens.plot(x, ab_err_tree_free, label='Ab with trees of no restriction', color='y')
ax_ens.set_xlabel("Number of predictors")
ax_ens.set_ylabel("MSE")
ax_ens.set_title("MSE of 5 ensemble methods vs Number of predictors")
ax_ens.legend()
plt.show()