# Ensembles of Decision Trees

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics

from sklearn import tree
from sklearn import ensemble

In [2]:
# Load the earthquake dataset used for the binary classification tasks.
data_earthquakes = pd.read_csv('data_for_classification.csv')

# Feature matrix reused by the train/test splits further down.
X = data_earthquakes[['Latitude', 'Longitude', 'Depth', 'Magnitude', 'Distance']]

# Preview the first rows. This has to be the cell's final expression:
# the value of an expression in the middle of a cell is discarded, not displayed.
data_earthquakes.head()

### Predicting whether an eruption is in progress

In [3]:
# Target for this section: the 'Is-erupting' column.
y = data_earthquakes['Is-erupting']

# Seeded hold-out split (33% of the rows go to the test set), stratified on the target.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=7,
    stratify=y,
)

In [4]:
# Standardization: the scaler is fitted on the training split only,
# then the same fitted scaler is applied to the test split.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Seeded random forest, trained on the standardized training split.
model = ensemble.RandomForestClassifier(
    n_estimators=45,
    max_depth=25,
    random_state=7,
).fit(X_train, y_train)

# Predictions on both splits; the evaluation cells below compare them with the true labels.
y_train_predicted = model.predict(X_train)
y_test_predicted = model.predict(X_test)

Test set results:

In [6]:
# Test-set scores, printed one per line as "<label> <value>".
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_test, y_test_predicted))

accuracy_score:  0.9405648934326807
recall_score:  0.6721311475409836
f1_score:  0.7819453273998729


In [7]:
# Test-set confusion matrix: rows are the true classes, columns the predicted classes.
test_confusion = metrics.confusion_matrix(y_test, y_test_predicted)
print(test_confusion)

[[4813   43]
 [ 300  615]]


Train set results:

In [8]:
# Train-set scores, printed one per line as "<label> <value>".
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_train, y_train_predicted))

accuracy_score:  0.9976952624839949
recall_score:  0.9854604200323102
f1_score:  0.9926769731489016


In [9]:
# Train-set confusion matrix: rows are the true classes, columns the predicted classes.
train_confusion = metrics.confusion_matrix(y_train, y_train_predicted)
print(train_confusion)

[[9858    0]
 [  27 1830]]


### Predicting whether there will be an eruption tomorrow

In [10]:
# Target for this section: the 'Is-erupting-tomorrow' column.
y = data_earthquakes['Is-erupting-tomorrow']

# Seeded hold-out split (33% of the rows go to the test set), stratified on the target.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=7,
    stratify=y,
)

In [11]:
# Standardization: the scaler is fitted on the training split only,
# then the same fitted scaler is applied to the test split.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Seeded random forest, trained on the standardized training split.
model = ensemble.RandomForestClassifier(
    n_estimators=45,
    max_depth=25,
    random_state=7,
).fit(X_train, y_train)

# Predictions on both splits; the evaluation cells below compare them with the true labels.
y_train_predicted = model.predict(X_train)
y_test_predicted = model.predict(X_test)

Test set results:

In [13]:
# Test-set scores, printed one per line as "<label> <value>".
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_test, y_test_predicted))

accuracy_score:  0.9828452607866921
recall_score:  0.6510791366906474
f1_score:  0.7852494577006508


In [14]:
# Test-set confusion matrix: rows are the true classes, columns the predicted classes.
test_confusion = metrics.confusion_matrix(y_test, y_test_predicted)
print(test_confusion)

[[5491    2]
 [  97  181]]


Train set results:

In [15]:
# Train-set scores, printed one per line as "<label> <value>".
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_train, y_train_predicted))

accuracy_score:  0.998719590268886
recall_score:  0.973404255319149
f1_score:  0.986522911051213


In [16]:
# Train-set confusion matrix: rows are the true classes, columns the predicted classes.
train_confusion = metrics.confusion_matrix(y_train, y_train_predicted)
print(train_confusion)

[[11151     0]
 [   15   549]]


### Predicting whether there will be an eruption next week

In [17]:
# Target for this section: the 'Is-erupting-next-week' column.
y = data_earthquakes['Is-erupting-next-week']

# Seeded hold-out split (33% of the rows go to the test set), stratified on the target.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=7,
    stratify=y,
)

In [18]:
# Standardization: the scaler is fitted on the training split only,
# then the same fitted scaler is applied to the test split.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
# Seeded random forest, trained on the standardized training split.
model = ensemble.RandomForestClassifier(
    n_estimators=45,
    max_depth=25,
    random_state=7,
).fit(X_train, y_train)

# Predictions on both splits; the evaluation cells below compare them with the true labels.
y_train_predicted = model.predict(X_train)
y_test_predicted = model.predict(X_test)

Test set results:

In [20]:
# Test-set scores, printed one per line as "<label> <value>".
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_test, y_test_predicted))

accuracy_score:  0.9114538208282793
recall_score:  0.6379446640316205
f1_score:  0.7595294117647059


In [21]:
# Test-set confusion matrix: rows are the true classes, columns the predicted classes.
test_confusion = metrics.confusion_matrix(y_test, y_test_predicted)
print(test_confusion)

[[4453   53]
 [ 458  807]]


Train set results:

In [22]:
# Train-set scores, printed one per line as "<label> <value>".
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_train, y_train_predicted))

accuracy_score:  0.9943661971830986
recall_score:  0.9743090696769171
f1_score:  0.9869873817034701


In [23]:
# Train-set confusion matrix: rows are the true classes, columns the predicted classes.
train_confusion = metrics.confusion_matrix(y_train, y_train_predicted)
print(train_confusion)

[[9146    0]
 [  66 2503]]
