# Random Forest

In [8]:
import sklearn.ensemble as en
import numpy as np
import pandas as pd
import sklearn.model_selection as cv
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('../feature_selection.csv')
df.head()

Unnamed: 0,koi_period,koi_period_err1,koi_period_err2,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_duration_err1,koi_duration_err2,koi_depth,koi_prad,...,koi_model_snr,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,dec,koi_disposition
0,101.110701,0.000953,-0.000953,0.00806,-0.00806,0.046,0.21,-0.21,878.1,1.61,...,24.1,4133.0,74.0,-82.0,0.023,-0.033,0.561,0.033,41.452209,1
1,4.800654,4e-06,-4e-06,0.000745,-0.000745,0.785,0.055,-0.055,15304.0,16.91,...,298.0,6715.0,161.0,-241.0,0.056,-0.224,1.241,0.441,38.999008,0
2,39.593105,0.000615,-0.000615,0.014,-0.014,0.0044,0.598,-0.598,156.6,1.22,...,5.7,6046.0,172.0,-218.0,0.044,-0.298,0.972,0.411,41.659611,0
3,31.158825,5.7e-05,-5.7e-05,0.00138,-0.00138,0.029,0.0504,-0.0504,959.0,3.56,...,73.6,5951.0,107.0,-119.0,0.12,-0.12,1.155,0.184,38.710232,1
4,613.82905,0.0326,-0.0326,0.0319,-0.0319,0.2616,1.4,-1.4,148.2,2.85,...,10.2,5636.0,173.0,-148.0,0.459,-0.224,2.357,1.027,43.824032,0


In [9]:
y_kepler = df['koi_disposition']
X_kepler = df.drop('koi_disposition', axis=1)
(X_train, X_test, y_train, y_test) = cv.train_test_split(X_kepler, y_kepler, test_size=.3, stratify = y_kepler, random_state=1)

In [11]:
rf = en.RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Confusion matrix on test set:\n", confusion_matrix(y_test, y_pred))
print("\nAccuracy on test set: ", accuracy_score(y_test, y_pred))

Confusion matrix on test set:
 [[386  22]
 [ 25 167]]

Accuracy on test set:  0.9216666666666666


### Hyperparameter tuning

In [15]:
params = {
    'bootstrap': [True, False],
    'max_depth': [50, 150],
    'max_features': [5, 10],
    'min_samples_leaf': [5, 10],
    'min_samples_split': [5, 10],
    'n_estimators': [100, 500, 1000]
}

rf = en.RandomForestClassifier()

# Instantiate the grid search model
gs = GridSearchCV(estimator = rf, param_grid = params, cv = 5, n_jobs = -1, verbose = 2)
gs.fit(X_train, y_train)
gs.best_params_

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  5.2min finished


{'bootstrap': False,
 'max_depth': 150,
 'max_features': 5,
 'min_samples_leaf': 5,
 'min_samples_split': 5,
 'n_estimators': 100}

In [12]:
best_params = gs.best_params_

In [13]:
rf = en.RandomForestClassifier(**best_params)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Confusion matrix on test set:\n", confusion_matrix(y_test, y_pred))
print("\nAccuracy on test set: ", accuracy_score(y_test, y_pred))

Confusion matrix on test set:
 [[385  23]
 [ 23 169]]

Accuracy on test set:  0.9233333333333333
