# Decision Trees

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics

from sklearn import tree

In [2]:
# Read the dataset used for the binary classification tasks below.
data_earthquakes = pd.read_csv('data_for_classification.csv')

# Predictor columns, in the order they are passed to the model.
feature_columns = ['Latitude', 'Longitude', 'Depth', 'Magnitude', 'Distance']

# Feature matrix; the target column is selected separately per task.
X = data_earthquakes[feature_columns]

The function below tunes, trains and evaluates a `DecisionTreeClassifier`.   
(It is parameterized by the name of the target column to predict.)

In [3]:
# target_name must be one of: 'Is-erupting', 'Is-erupting-tomorrow', 'Is-erupting-next-week'

def decision_tree_classication(target_name, random_state=7):
    """Tune, train and evaluate a decision tree for one binary target column.

    The function:
      1. takes ``data_earthquakes[target_name]`` as the target and the
         module-level ``X`` as the features,
      2. makes a stratified 67/33 train/test split,
      3. standardizes the features with a scaler fitted on the training part,
      4. grid-searches ``criterion`` and ``max_depth`` with 10-fold CV,
         optimizing recall,
      5. refits a tree with the best parameters and prints accuracy, recall,
         F1 and the confusion matrix on both the test and the training part,
      6. prints the mean recall of a shuffled, stratified 5-fold CV on the
         full data.

    Parameters
    ----------
    target_name : str
        Name of the column of ``data_earthquakes`` to predict.
    random_state : int, default 7
        Seed for the train/test split and for every decision tree built here.
        ``max_features=2`` makes the tree pick a random feature subset at each
        split, so the trees must be seeded for the results to be repeatable.

    Returns
    -------
    None
        All results are printed.
    """
    y = data_earthquakes[target_name]

    X_train_validation, X_test, y_train_validation, y_test = \
        model_selection.train_test_split(X, y, test_size=0.33,
                                         random_state=random_state, stratify=y)

    # standardization: fit on the training part only, then apply to both parts
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train_validation)
    X_train_validation = scaler.transform(X_train_validation)
    X_test = scaler.transform(X_test)

    # using grid search to find the optimal hyperparameters of a model
    # The estimator is seeded so that the search is reproducible and is run
    # under the same randomness as the final model below.
    grid_model = tree.DecisionTreeClassifier(max_features=2, random_state=random_state)

    params = {
        'criterion': ['entropy', 'gini'],
        'max_depth': range(30, 65, 3)
    }

    grid = model_selection.GridSearchCV(grid_model, param_grid=params, scoring='recall',
                                        cv=10, return_train_score=True)
    grid.fit(X_train_validation, y_train_validation)

    print('Best recall score: ', grid.best_score_)
    print('Best parameters: \n\tCriterion: ', grid.best_params_['criterion'],
          '\n\tMax_depth: ', grid.best_params_['max_depth'])

    # Settings shared by the final model and the cross-validated model.
    best_tree_kwargs = dict(criterion=grid.best_params_['criterion'], max_features=2,
                            max_depth=grid.best_params_['max_depth'],
                            random_state=random_state)

    # testing
    model = tree.DecisionTreeClassifier(**best_tree_kwargs)
    model.fit(X_train_validation, y_train_validation)
    y_test_predicted = model.predict(X_test)
    y_train_predicted = model.predict(X_train_validation)

    _print_classification_results('\n\nTest results', '\nConfusion matrix on test data: ',
                                  y_test, y_test_predicted)
    _print_classification_results('\n\nTrain results', '\nConfusion matrix on train data',
                                  y_train_validation, y_train_predicted)

    # cross validation
    # NOTE: this runs on the full, unscaled X (training and test rows together),
    # so it is an additional estimate and not a held-out score.
    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    model = tree.DecisionTreeClassifier(**best_tree_kwargs)

    scores = model_selection.cross_val_score(model, X, y, scoring='recall', cv=kf)
    print('\nCross validation mean recall score: ', scores.mean())


def _print_classification_results(heading, confusion_heading, y_true, y_predicted):
    """Print accuracy, recall, F1 and the confusion matrix for one set of predictions."""
    print(heading)

    print('\taccuracy_score: ', metrics.accuracy_score(y_true, y_predicted))
    print('\trecall_score: ', metrics.recall_score(y_true, y_predicted))
    print('\tf1_score: ', metrics.f1_score(y_true, y_predicted))
    print(confusion_heading)
    print(metrics.confusion_matrix(y_true, y_predicted))

### Predicting whether an eruption is in progress

In [4]:
decision_tree_classication('Is-erupting')

Best recall score:  0.7156814879395526
Best parameters: 
	Criterion:  gini 
	Max_depth:  54


Test results
	accuracy_score:  0.9180384682030844
	recall_score:  0.7366120218579235
	f1_score:  0.7402526084568918

Confusion matrix on test data: 
[[4624  232]
 [ 241  674]]


Train results
	accuracy_score:  1.0
	recall_score:  1.0
	f1_score:  1.0

Confusion matrix on train data
[[9858    0]
 [   0 1857]]

Cross validation mean recall score:  0.8336832861742609


### Predicting whether there will be an eruption tomorrow

In [5]:
decision_tree_classication('Is-erupting-tomorrow')

Best recall score:  0.7233082706766917
Best parameters: 
	Criterion:  entropy 
	Max_depth:  60


Test results
	accuracy_score:  0.9747010916652227
	recall_score:  0.7122302158273381
	f1_score:  0.7306273062730627

Confusion matrix on test data: 
[[5427   66]
 [  80  198]]


Train results
	accuracy_score:  1.0
	recall_score:  1.0
	f1_score:  1.0

Confusion matrix on train data
[[11151     0]
 [    0   564]]

Cross validation mean recall score:  0.8301141166525781


### Predicting whether there will be an eruption next week

In [6]:
decision_tree_classication('Is-erupting-next-week')

Best recall score:  0.7205192120622568
Best parameters: 
	Criterion:  gini 
	Max_depth:  51


Test results
	accuracy_score:  0.8925662796742332
	recall_score:  0.7581027667984189
	f1_score:  0.7557131599684792

Confusion matrix on test data: 
[[4192  314]
 [ 306  959]]


Train results
	accuracy_score:  1.0
	recall_score:  1.0
	f1_score:  1.0

Confusion matrix on train data
[[9146    0]
 [   0 2569]]

Cross validation mean recall score:  0.8555022620429533
