In [16]:
%run "00-common"
%matplotlib inline

import sklearn as sk
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Load the dataset used by every later cell. `pd` is not imported in this cell;
# presumably it is provided by the `%run "00-common"` line above (verify there).
csv_path = '../feature_selection.csv'
df = pd.read_csv(csv_path)

In [4]:
# Separate the label column from the predictor columns.
label_col = "koi_disposition"
y = df[label_col]
X = df.drop(columns=[label_col])

# Hold out 30% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

# Bagging

In [7]:
# Number of cross-validation folds and the ensemble sizes to compare.
# Both names are read again by later cells.
cv = 50
estimators = [1, 2, 5, 10, 20, 50, 100, 200]

# Mean cross-validated accuracy per ensemble size, in the order of `estimators`.
lb = []
for nest in estimators:
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2, so the positional
    # form is the spelling accepted by both older and newer releases.
    # A fixed random_state makes the bootstrap draws (and therefore the
    # printed accuracies) repeatable from run to run.
    bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=nest, random_state=1)
    scores = cross_val_score(bagging, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    mean_accuracy = scores.mean()
    print("Accuracy: %0.3f [%s]" % (mean_accuracy, nest))
    lb.append(mean_accuracy)

Accuracy: 0.871 [1]
Accuracy: 0.871 [2]
Accuracy: 0.896 [5]
Accuracy: 0.901 [10]
Accuracy: 0.902 [20]
Accuracy: 0.912 [50]
Accuracy: 0.916 [100]
Accuracy: 0.915 [200]


In [12]:
# Same sweep over ensemble sizes as before, but each base learner is trained on
# a random 30% of the features (max_features=0.3).
# Mean cross-validated accuracy per ensemble size, in the order of `estimators`.
lb2 = []
print()
for nest in estimators:
    # Base learner passed positionally so the call works whether the installed
    # scikit-learn names that parameter `base_estimator` or `estimator`.
    # random_state pins both the bootstrap samples and the feature subsets,
    # making the printed accuracies repeatable.
    bagging = BaggingClassifier(
        DecisionTreeClassifier(),
        n_estimators=nest,
        max_features=0.3,
        random_state=1,
    )
    scores = cross_val_score(bagging, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    mean_accuracy = scores.mean()
    print("Accuracy: %0.3f [%s]" % (mean_accuracy, nest))
    lb2.append(mean_accuracy)


Accuracy: 0.831 [1]
Accuracy: 0.843 [2]
Accuracy: 0.891 [5]
Accuracy: 0.896 [10]
Accuracy: 0.912 [20]
Accuracy: 0.910 [50]
Accuracy: 0.909 [100]
Accuracy: 0.909 [200]


In [13]:
# Fit a 100-tree bagging ensemble on the training split and evaluate it on the
# held-out test split.
# The base learner is passed positionally so the call works whether the
# installed scikit-learn names that parameter `base_estimator` or `estimator`;
# random_state makes the fitted ensemble (and the numbers below) repeatable.
bg = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100, random_state=1)
bg.fit(X_train, y_train)
pred = bg.predict(X_test)

# Call the metric functions by their imported names instead of reaching through
# `sk.metrics`, which only resolves if some other import has already loaded
# that submodule.
print("\nConfusion matrix on test set:\n", confusion_matrix(y_test, pred))
print("\nAccuracy on test set: ", accuracy_score(y_test, pred))


Confusion matrix on test set:
 [[386  22]
 [ 24 168]]

Accuracy on test set:  0.9233333333333333


In [27]:
# Hyper-parameter grid: ensemble size x max_features value given to each
# bagging ensemble (8 x 10 = 80 candidates).
n_estimators = [1, 2, 5, 10, 20, 50, 100, 200]
max_features = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

param_grid = { 'max_features': max_features, 'n_estimators': n_estimators }

# Base learner passed positionally so the call works whether the installed
# scikit-learn names that parameter `base_estimator` or `estimator`.
# random_state gives every candidate the same seed, so differences in score
# come from the hyper-parameters rather than from sampling noise.
# scoring is spelled out to match the cross_val_score calls in the other cells.
grid_search = GridSearchCV(
    BaggingClassifier(DecisionTreeClassifier(), random_state=1),
    param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Flat array with one mean validation score per candidate.
scores = grid_search.cv_results_['mean_test_score']

In [None]:
# Turn the flat list of per-candidate scores into a 2-D grid for plotting.
# The parameter grid is expanded with its keys in sorted order and the last key
# varying fastest, i.e. 'max_features' is the slow axis and 'n_estimators' the
# fast one, so rows index max_features and columns index n_estimators.
scores = grid_search.cv_results_['mean_test_score']
scores = np.array(scores).reshape(len(param_grid['max_features']), len(param_grid['n_estimators']))

# Heat-map of mean CV accuracy over the grid.
plt.matshow(scores)
plt.xlabel('n estimators')
plt.ylabel('max features')
plt.colorbar()
# Switch the grid off explicitly. The previous `b=None` keyword toggled the
# current state instead of disabling it, and the `b` keyword itself has been
# renamed to `visible` in newer Matplotlib releases.
plt.grid(False)
plt.xticks(np.arange(len(param_grid['n_estimators'])), param_grid['n_estimators'], rotation='vertical')
plt.yticks(np.arange(len(param_grid['max_features'])), param_grid['max_features'])

plt.savefig("../figures/bagging.pdf")

parval = grid_search.best_params_
print("\nBest combination of parameters found: ", parval)


# Re-score the winning configuration with the same folds. The base learner is
# passed positionally for compatibility across scikit-learn versions, and
# random_state makes the reported accuracy repeatable.
best_bagging = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=parval['n_estimators'],
    max_features=parval['max_features'],
    random_state=1,
)
cvacc = cross_val_score(best_bagging, X=X_train, y=y_train, cv=cv, scoring='accuracy', n_jobs=-1)
print('\nAcc. 50-fold cross on train data= ', cvacc.mean())


Best combination of parameters found:  {'max_features': 0.9, 'n_estimators': 200}


In [20]:
# Display the grid of mean cross-validated scores as this cell's output.
# NOTE(review): the saved output below prints 10 values per row, which does not
# match the (10, 8) reshape done in the plotting cell -- it looks like it came
# from an earlier run; re-execute the notebook top to bottom to confirm.
scores

array([[0.71142857, 0.74785714, 0.8       , 0.83071429, 0.85214286,
        0.875     , 0.87714286, 0.88214286, 0.80357143, 0.83857143],
       [0.86857143, 0.88571429, 0.90142857, 0.90357143, 0.91142857,
        0.90714286, 0.82714286, 0.83714286, 0.90571429, 0.89928571],
       [0.90357143, 0.90857143, 0.90785714, 0.91285714, 0.86071429,
        0.86357143, 0.905     , 0.90285714, 0.91      , 0.91142857],
       [0.91357143, 0.91357143, 0.84857143, 0.86642857, 0.89357143,
        0.90071429, 0.905     , 0.91142857, 0.90714286, 0.90857143],
       [0.85928571, 0.85928571, 0.89785714, 0.89714286, 0.90357143,
        0.90714286, 0.90571429, 0.90714286, 0.85857143, 0.86142857],
       [0.89142857, 0.90214286, 0.90642857, 0.90428571, 0.90571429,
        0.905     , 0.86428571, 0.87214286, 0.89785714, 0.89      ],
       [0.90714286, 0.89857143, 0.90428571, 0.90642857, 0.87142857,
        0.86357143, 0.89571429, 0.90571429, 0.90428571, 0.90142857],
       [0.90785714, 0.90785714, 0.8657142