# Decision Trees

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics

from sklearn import tree
from sklearn import ensemble

In [2]:
# Load the earthquake records used for the binary-classification tasks.
data_earthquakes = pd.read_csv('data_for_classification.csv')

# Feature matrix reused by the train/test splits that follow.
X = data_earthquakes[['Latitude', 'Longitude', 'Depth', 'Magnitude', 'Distance']]

# Preview the loaded frame. This must be the cell's last expression: a bare
# expression in the middle of a cell is evaluated and then discarded, so the
# preview would never be rendered.
data_earthquakes.head()

### Predicting whether an eruption is in progress

In [3]:
# Target for this section: the 'Is-erupting' column.
y = data_earthquakes['Is-erupting']

# Hold out 33% of the rows for testing, stratified on the target,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=7,
)

In [4]:
# Standardization: the scaler is fitted on the training split only, and the
# same fitted transform is then applied to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [5]:
# Gini-criterion decision tree, at most 50 levels deep, looking at 2 candidate
# features per split; the fixed seed makes the fitted tree repeatable.
model = tree.DecisionTreeClassifier(
    criterion='gini',
    max_features=2,
    max_depth=50,
    random_state=7,
).fit(X_train, y_train)

# Predictions on both splits, so test and train scores can be compared below.
y_train_predicted = model.predict(X_train)
y_test_predicted = model.predict(X_test)

Test set results:

In [6]:
# Report accuracy, recall and F1 on the held-out test split.
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_test, y_test_predicted))

accuracy_score:  0.9180384682030844
recall_score:  0.7366120218579235
f1_score:  0.7402526084568918


In [7]:
# Confusion matrix for the test split (true labels vs. predictions).
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_test_predicted))

[[4624  232]
 [ 241  674]]


Train set results:

In [8]:
# Report accuracy, recall and F1 on the training split.
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_train, y_train_predicted))

accuracy_score:  1.0
recall_score:  1.0
f1_score:  1.0


In [9]:
# Confusion matrix for the training split (true labels vs. predictions).
print(metrics.confusion_matrix(y_true=y_train, y_pred=y_train_predicted))

[[9858    0]
 [   0 1857]]


### Predicting whether there will be an eruption tomorrow

In [10]:
# Target for this section: the 'Is-erupting-tomorrow' column.
y = data_earthquakes['Is-erupting-tomorrow']

# Hold out 33% of the rows for testing, stratified on the target,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=7,
)

In [11]:
# Standardization: the scaler is fitted on the training split only, and the
# same fitted transform is then applied to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [12]:
# Gini-criterion decision tree, at most 45 levels deep, looking at 2 candidate
# features per split; the fixed seed makes the fitted tree repeatable.
model = tree.DecisionTreeClassifier(
    criterion='gini',
    max_features=2,
    max_depth=45,
    random_state=7,
).fit(X_train, y_train)

# Predictions on both splits, so test and train scores can be compared below.
y_train_predicted = model.predict(X_train)
y_test_predicted = model.predict(X_test)

Test set results:

In [13]:
# Report accuracy, recall and F1 on the held-out test split.
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_test, y_test_predicted))

accuracy_score:  0.9766071738000347
recall_score:  0.7122302158273381
f1_score:  0.7457627118644069


In [14]:
# Confusion matrix for the test split (true labels vs. predictions).
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_test_predicted))

[[5438   55]
 [  80  198]]


Train set results:

In [15]:
# Report accuracy, recall and F1 on the training split.
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_train, y_train_predicted))

accuracy_score:  1.0
recall_score:  1.0
f1_score:  1.0


In [16]:
# Confusion matrix for the training split (true labels vs. predictions).
print(metrics.confusion_matrix(y_true=y_train, y_pred=y_train_predicted))

[[11151     0]
 [    0   564]]


### Predicting whether there will be an eruption next week

In [17]:
# Target for this section: the 'Is-erupting-next-week' column.
y = data_earthquakes['Is-erupting-next-week']

# Hold out 33% of the rows for testing, stratified on the target,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=7,
)

In [18]:
# Standardization: the scaler is fitted on the training split only, and the
# same fitted transform is then applied to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [19]:
# Gini-criterion decision tree, at most 50 levels deep, looking at 2 candidate
# features per split; the fixed seed makes the fitted tree repeatable.
model = tree.DecisionTreeClassifier(
    criterion='gini',
    max_features=2,
    max_depth=50,
    random_state=7,
).fit(X_train, y_train)

# Predictions on both splits, so test and train scores can be compared below.
y_train_predicted = model.predict(X_train)
y_test_predicted = model.predict(X_test)

Test set results:

In [20]:
# Report accuracy, recall and F1 on the held-out test split.
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_test, y_test_predicted))

accuracy_score:  0.8925662796742332
recall_score:  0.7581027667984189
f1_score:  0.7557131599684792


In [21]:
# Confusion matrix for the test split (true labels vs. predictions).
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_test_predicted))

[[4192  314]
 [ 306  959]]


Train set results:

In [22]:
# Report accuracy, recall and F1 on the training split.
for label, score_fn in (
    ("accuracy_score: ", metrics.accuracy_score),
    ("recall_score: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(label, score_fn(y_train, y_train_predicted))

accuracy_score:  1.0
recall_score:  1.0
f1_score:  1.0


In [23]:
# Confusion matrix for the training split (true labels vs. predictions).
print(metrics.confusion_matrix(y_true=y_train, y_pred=y_train_predicted))

[[9146    0]
 [   0 2569]]
