# Random Forest

In [None]:
import sklearn.ensemble as en
import numpy as np
import pandas as pd
import sklearn.model_selection as cv
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

# Load the dataset from the CSV at the relative path below and preview it.
csv_path = '../feature_selection.csv'
df = pd.read_csv(csv_path)
# Last expression in the cell, so the notebook renders the first rows.
df.head()

In [None]:
# Separate the label column ('koi_disposition') from the remaining columns,
# which are used as the feature matrix.
X_kepler = df.drop(columns='koi_disposition')
y_kepler = df['koi_disposition']

# Hold out 30% of the rows for testing. Stratifying on the label keeps the
# class proportions approximately equal in both partitions, and the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = cv.train_test_split(
    X_kepler,
    y_kepler,
    test_size=0.3,
    stratify=y_kepler,
    random_state=1,
)

In [None]:
# Baseline: a random forest with default hyperparameters, fitted on the
# training partition and evaluated on the held-out test partition.
# random_state is fixed (same seed as the train/test split) because the
# forest draws random bootstrap samples and feature subsets; without a seed
# the confusion matrix and accuracy below change on every run.
rf = en.RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Confusion matrix on test set:\n", confusion_matrix(y_test, y_pred))
print("\nAccuracy on test set: ", accuracy_score(y_test, y_pred))

### Hyperparameter tuning

In [None]:
# Hyperparameter grid to search exhaustively:
# 2 * 2 * 2 * 2 * 2 * 3 = 96 combinations, each scored with 5-fold
# cross-validation (480 fits, plus the final refit on the full training set
# that GridSearchCV performs by default).
params = {
    'bootstrap': [True, False],
    'max_depth': [50, 150],
    'max_features': [5, 10],
    'min_samples_leaf': [5, 10],
    'min_samples_split': [5, 10],
    'n_estimators': [100, 500, 1000],
}

# Seed the estimator so every candidate is scored with the same source of
# randomness; otherwise the cross-validation scores -- and therefore the
# selected best_params_ -- can differ between runs of this cell.
rf = en.RandomForestClassifier(random_state=1)

# Instantiate the grid search model. n_jobs=-1 uses all available cores and
# verbose=2 prints progress for each fit.
gs = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)
# Last expression in the cell, so the notebook displays the winning combination.
gs.best_params_

In [None]:
# Keep the best hyperparameter combination found by the grid search so the
# final model can be built from it.
best_params = gs.best_params_

In [None]:
# Final model: a random forest built from the tuned hyperparameters, fitted
# on the training partition and evaluated on the held-out test partition.
# random_state is fixed so the reported metrics are reproducible and can be
# compared against the baseline without run-to-run noise from the forest's
# random sampling.
rf = en.RandomForestClassifier(random_state=1, **best_params)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Confusion matrix on test set:\n", confusion_matrix(y_test, y_pred))
print("\nAccuracy on test set: ", accuracy_score(y_test, y_pred))