# Ensembles of Decision Trees

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics

from sklearn import ensemble

In [2]:
# Load the data set used for the binary classification tasks below.
data_earthquakes = pd.read_csv('data_for_classification.csv')

# Note: this is not the last expression of the cell, so its value is not displayed.
data_earthquakes.head()

# Feature matrix shared by every classification target.
feature_columns = ['Latitude', 'Longitude', 'Depth', 'Magnitude', 'Distance']
X = data_earthquakes[feature_columns]

The function below performs classification with `RandomForestClassifier`.  
The name of the target variable is passed to it as an argument.

In [3]:
# target_name = ['Is-erupting' | 'Is-erupting-tomorrow' | 'Is-erupting-next-week']

def _print_split_report(title, confusion_header, y_true, y_predicted):
    """Print accuracy, recall, F1 and the confusion matrix for one data split."""
    print(title)

    print('\taccuracy_score: ', metrics.accuracy_score(y_true, y_predicted))
    print('\trecall_score: ', metrics.recall_score(y_true, y_predicted))
    print('\tf1_score: ', metrics.f1_score(y_true, y_predicted))
    print(confusion_header)
    print(metrics.confusion_matrix(y_true, y_predicted))


def random_forest_classication(target_name):
    """Tune, fit and evaluate a random forest for one binary target column.

    Steps:
      1. Stratified train/test split (33% test) of the notebook-level feature
         matrix ``X`` and the column ``data_earthquakes[target_name]``.
      2. Standardization fitted on the training part only.
      3. 10-fold grid search over ``n_estimators`` and ``max_depth``,
         optimizing recall.
      4. Refit with the best hyperparameters and report accuracy, recall, F1
         and the confusion matrix on both the test and the training split.
      5. Shuffled, stratified 5-fold cross-validation of recall.

    Parameters
    ----------
    target_name : str
        Name of the target column in the notebook-level ``data_earthquakes``
        frame.

    Returns
    -------
    None
        All results are printed.

    Notes
    -----
    Reads the notebook-level names ``data_earthquakes`` and ``X``; the cell
    that defines them must be executed first.
    """
    y = data_earthquakes[target_name]

    X_train_validation, X_test, y_train_validation, y_test = \
        model_selection.train_test_split(X, y, test_size=0.33, random_state=7, stratify=y)

    # standardization (statistics come from the training part only, so the
    # test rows do not leak into the scaler)
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train_validation)
    X_train_validation = scaler.transform(X_train_validation)
    X_test = scaler.transform(X_test)

    # using grid search to find the optimal hyperparameters of a model
    #
    # The estimator is seeded so that the selected hyperparameters and the
    # reported best score are the same on every run.
    grid_model = ensemble.RandomForestClassifier(random_state=7)

    params = {
        'n_estimators': range(50, 65, 5),
        'max_depth': range(35, 50, 5)
    }

    # Training-fold scores are not requested: cv_results_ is never inspected,
    # so computing them would only add run time.
    grid = model_selection.GridSearchCV(grid_model, param_grid=params, scoring='recall', cv=10)
    grid.fit(X_train_validation, y_train_validation)

    best_n_estimators = grid.best_params_['n_estimators']
    best_max_depth = grid.best_params_['max_depth']

    print('Best recall score: ', grid.best_score_)
    print('Best parameters: \n\tn_estimators: ', best_n_estimators,
          '\n\tmax_depth: ', best_max_depth)

    def build_best_model():
        # Fresh, unfitted forest with the tuned hyperparameters.
        return ensemble.RandomForestClassifier(n_estimators=best_n_estimators,
                                               max_depth=best_max_depth, random_state=7)

    # testing
    model = build_best_model()
    model.fit(X_train_validation, y_train_validation)
    y_test_predicted = model.predict(X_test)
    y_train_predicted = model.predict(X_train_validation)

    _print_split_report('\n\nTest results', '\nConfusion matrix on test data: ',
                        y_test, y_test_predicted)
    _print_split_report('\n\nTrain results', '\nConfusion matrix on train data',
                        y_train_validation, y_train_predicted)

    # cross validation
    #
    # NOTE(review): this runs on the full, unscaled X (training and test rows
    # together) with shuffled folds, whereas the grid search above used only
    # the scaled training part with cv=10 (presumably unshuffled stratified
    # folds - confirm against the scikit-learn docs). The two recall figures
    # are therefore not directly comparable.
    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    scores = model_selection.cross_val_score(build_best_model(), X, y, scoring='recall', cv=kf)
    print('\nCross validation mean recall score: ', scores.mean())

### Predicting whether an eruption is in progress

In [4]:
# Target column: whether an eruption is in progress.
random_forest_classication('Is-erupting')

Best recall score:  0.6564109270560883
Best parameters: 
	n_estimators:  55 
	max_depth:  35


Test results
	accuracy_score:  0.9445503378963784
	recall_score:  0.6950819672131148
	f1_score:  0.7989949748743719

Confusion matrix on test data: 
[[4815   41]
 [ 279  636]]


Train results
	accuracy_score:  0.9997439180537772
	recall_score:  0.9983844911147012
	f1_score:  0.9991915925626517

Confusion matrix on train data
[[9858    0]
 [   3 1854]]

Cross validation mean recall score:  0.8105929033726869


### Predicting whether there will be an eruption tomorrow

In [5]:
# Target column: whether there will be an eruption tomorrow.
random_forest_classication('Is-erupting-tomorrow')

Best recall score:  0.700250626566416
Best parameters: 
	n_estimators:  55 
	max_depth:  35


Test results
	accuracy_score:  0.9850979033096517
	recall_score:  0.7050359712230215
	f1_score:  0.8200836820083681

Confusion matrix on test data: 
[[5489    4]
 [  82  196]]


Train results
	accuracy_score:  0.9998292787025181
	recall_score:  0.9964539007092199
	f1_score:  0.9982238010657194

Confusion matrix on train data
[[11151     0]
 [    2   562]]

Cross validation mean recall score:  0.817061143984221


### Predicting whether there will be an eruption next week

In [6]:
# Target column: whether there will be an eruption next week.
random_forest_classication('Is-erupting-next-week')

Best recall score:  0.6403270914396887
Best parameters: 
	n_estimators:  55 
	max_depth:  45


Test results
	accuracy_score:  0.920291110726044
	recall_score:  0.6798418972332015
	f1_score:  0.7889908256880733

Confusion matrix on test data: 
[[4451   55]
 [ 405  860]]


Train results
	accuracy_score:  0.9998292787025181
	recall_score:  0.9992214869599065
	f1_score:  0.9996105919003115

Confusion matrix on train data
[[9146    0]
 [   2 2567]]

Cross validation mean recall score:  0.8145540081903315
