# Random forest

## Setup

### Run the Data Preprocessing notebook once to import the get_data method

In [None]:
%run data_preprocessing.ipynb

### Run the Data Evaluation notebook once to import the show_evaluation method

In [None]:
%run data_evaluation.ipynb

### Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import graphviz 
from sklearn import tree
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### Hyperparameters

In [None]:
# Settings consumed by the get_data call further down in this notebook.

# Feature-engineering switches.
enable_feature_engineering_alco_smoking = True
enable_feature_engineering_gluc_chol = True
enable_feature_engineering_height_weight = True
enable_feature_engineering_gender = True

# Cleaning, scaling and encoding options.
use_one_hot_encoding = True
normalize = 'minmax'
enable_outlier_handling = True

# Split fractions; presumably (train, validation, test) -- TODO confirm against get_data.
split_size = (0.8, 0.0, 0.2)

### Get the data

In [None]:
# Fetch the label/feature splits. The settings are passed positionally, so
# their order here must stay exactly as get_data declares them.
y_train, x_train, y_val, x_val, y_test, x_test = get_data(
    enable_feature_engineering_gender,
    enable_feature_engineering_height_weight,
    enable_feature_engineering_gluc_chol,
    enable_feature_engineering_alco_smoking,
    enable_outlier_handling,
    normalize,
    use_one_hot_encoding,
    split_size,
)

## Model

### Create and train the model

In [None]:
# Baseline random forest with scikit-learn's default hyperparameters.
# fit() returns the estimator itself, so construction and training are chained.
# The labels are flattened to a 1-D array with .values.ravel() before fitting.
rf = RandomForestClassifier().fit(x_train, y_train.values.ravel())
print(rf)

## Evaluation

### Predict the test set

In [None]:
# Probability column for the second class listed in rf.classes_.
y_proba = rf.predict_proba(x_test)[:, 1]

# Despite its name, y_prob holds the class labels returned by predict();
# y_pred is the rounded copy that the evaluation cells below consume.
y_prob = rf.predict(x_test)
y_pred = np.round(y_prob)

In [None]:

# Share of test samples whose predicted label equals the true label,
# printed as a percentage with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy,))

### Apply the evaluation metrics

In [None]:
# Report the baseline forest on the test split: true labels, predicted labels
# and the predict_proba column computed above are handed to show_evaluation
# (made available by running data_evaluation.ipynb in the Setup section).
show_evaluation(y_test, y_pred, y_proba)

In [None]:
# Candidate values for the random-forest grid search; reportedly narrowed down
# by an earlier random search (not shown in this notebook section).
# 1 * 4 * 2 * 3 * 3 * 4 = 288 combinations are tried.
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000],
}

# 3-fold cross-validated search on 4 parallel workers with progress output.
# GridSearchCV.fit returns the search object, so it is chained directly.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=4,
    verbose=2,
).fit(x_train, y_train.values.ravel())

# Best-scoring forest found by the search.
best_grid = grid_search.best_estimator_


In [None]:
# Evaluate the tuned forest on the test split. Calling predict / predict_proba
# on the search object delegates to its best estimator.
y_proba1 = grid_search.predict_proba(x_test)[:, 1]

# y_prob1 holds predicted class labels (not probabilities); y_pred1 is the
# rounded copy passed on to the evaluation helper.
y_prob1 = grid_search.predict(x_test)
y_pred1 = np.round(y_prob1)

show_evaluation(y_test, y_pred1, y_proba1)

In [None]:
# Create a StandardScaler object
std_slc = StandardScaler()

# PCA without a fixed component count; the grid search below sets n_components.
pca = decomposition.PCA()

# Three-step pipeline: standardize the features, project them with PCA, then
# train the random forest `rf` on the projected data.
pipe = Pipeline(steps=[
    ('std_slc', std_slc),
    ('pca', pca),
    ('rf', rf),
])

# Search space.
# Try every PCA dimensionality from 1 up to the number of columns in x_train.
n_components = list(range(1, x_train.shape[1] + 1))

# Candidate split criteria and depth limits for the random forest.
criterion = ['gini', 'entropy']
max_depth = [2, 4, 6, 8, 10, 12]

# Parameters of a pipeline step are addressed as '<step name>__<parameter>'.
parameters = {
    'pca__n_components': n_components,
    'rf__criterion': criterion,
    'rf__max_depth': max_depth,
}

# Grid search over the whole pipeline using GridSearchCV's default settings.
clf_GS = GridSearchCV(pipe, parameters)

# Fit the search on the training split (labels flattened to 1-D).
clf_GS.fit(x_train, y_train.values.ravel())


In [None]:
# Report the winning pipeline configuration. The parameter dict is fetched once
# instead of calling get_params() for every print.
best_params = clf_GS.best_estimator_.get_params()
print('Best Criterion:', best_params['rf__criterion'])
print('Best max_depth:', best_params['rf__max_depth'])
print('Best Number Of Components:', best_params['pca__n_components'])
print()
print(best_params['rf'])

# Predict labels and probabilities from the same, uncast x_test. The search was
# fitted on uncast data and predict_proba already used x_test as-is, so casting
# only the predict() input to float32 could make labels and probabilities
# disagree after scaling and PCA.
# y_prob2 holds predicted class labels (not probabilities); y_pred2 is the
# rounded copy used for evaluation.
y_prob2 = clf_GS.predict(x_test)
y_pred2 = np.round(y_prob2)
y_proba2 = clf_GS.predict_proba(x_test)[:, 1]

In [None]:
# Evaluate the PCA + random-forest pipeline on the test split.
# The third argument must be the probability column (y_proba2), as in the two
# earlier show_evaluation calls; y_prob2 only holds the hard class labels.
show_evaluation(y_test, y_pred2, y_proba2)

In [None]:
# Hand the pipeline model's test-set probabilities, together with the true
# labels, to savePredictedProbabilities (defined outside this section) under
# the label 'Random Forest'.
predicted_probabilities = y_proba2
method_name = 'Random Forest'
savePredictedProbabilities(method_name, y_test, predicted_probabilities)