## All modeling

In [252]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils import shuffle
import pickle

In [253]:
df = pd.read_csv("full_dataset_v1.csv", low_memory='false')
df.shape

(8953, 194)

In [254]:
df = df[df['type']=='landslide']
df['severity'].value_counts()

na              3755
medium          2803
small            497
large            339
unknown           67
very_large        43
...                5
catastrophic       1
Name: severity, dtype: int64

In [255]:
# filter by severity. na is for non-landslide data
df = df[df['severity'].isin(["medium", "small", "large", "very_large", "na"])]

In [256]:
df['severity'].value_counts()
df = shuffle(df)
df.reset_index(inplace=True, drop=True)
print(len(df))

7437


In [257]:
X = df.copy()
X.drop(X.columns[[i for i in range(0, 151)]], axis = 1, inplace = True)
# X.drop(X.columns[[i for i in range(20, 35)]], axis = 1, inplace = True)
X["severity"] = df["severity"]
X.drop(X.columns[[0]], axis = 1, inplace = True)

X = X.dropna()
for i in range(0, 8):
    del X['air' + str(i)]
    del X['temp' + str(i)]
    del X['humidity' + str(i)]
    
X.drop(X.columns[[i for i in range(len(X.columns)-9, len(X.columns)-1)]], axis = 1, inplace = True)
X["slope"] = df["slope"]
# X.drop(columns=['severity'], inplace=True)
X

Unnamed: 0,precip7,wind7,precip6,wind6,precip5,wind5,precip4,wind4,precip3,wind3,severity,slope
0,0.2,7,1.4,9,1.3,9,4.6,10,2.9,15,medium,22
1,4.6,9,3.3,10,2.9,7,1.5,10,1.5,10,small,108
2,2.4,12,3.6,11,2.4,11,0.2,9,0.1,9,na,17576
3,0.0,9,0.4,12,0.7,17,1.1,16,7.4,16,small,99
4,10.0,10,11.4,9,5.6,9,5.5,6,0.7,9,na,53
...,...,...,...,...,...,...,...,...,...,...,...,...
7432,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,na,118
7433,2.4,7,2.7,7,4.4,8,2.3,4,1.4,7,medium,103
7434,1.9,14,1.1,11,1.2,16,6.0,13,6.5,11,medium,141
7435,0.0,11,0.2,9,0.2,4,3.1,6,7.3,7,na,40


### Generate Labels
For binary classification, pass `True` into the function call

In [258]:
def generate_labels(binary = False):
    y = []
    idx_to_severity = ["na", "small", "medium", "large", "very_large"]
    for severity in X.severity:
        if binary:
            y.append(idx_to_severity.index(severity))
        elif severity == "na":
            y.append(0)
        else:
            y.append(1)
    X.drop(X.columns[[-1]], axis = 1, inplace = True)
    print(y.count(1))
    return y

In [259]:
y = generate_labels(True)
print(sum(y))
print(len(y))

497
7292
7437


## Scaling

In [261]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X.drop(['severity'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Prediction

In [262]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [263]:
# Testing random forest
best = 1
highest = 0
best_rf = 0

for i in range(85, 150, 2):
    rf = RandomForestClassifier(n_estimators = i)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    score = round(accuracy_score(pred, y_test)*10000)/100
    print("n_estimators =", i, "  ACCURACY:", score)
    if score > highest:
        highest = score
        best = i
        best_rf = rf
        
print("# of trees:", best, highest)

n_estimators = 85   ACCURACY: 59.54
n_estimators = 87   ACCURACY: 59.34
n_estimators = 89   ACCURACY: 60.28
n_estimators = 91   ACCURACY: 59.68
n_estimators = 93   ACCURACY: 59.27
n_estimators = 95   ACCURACY: 58.74
n_estimators = 97   ACCURACY: 59.27
n_estimators = 99   ACCURACY: 59.95
n_estimators = 101   ACCURACY: 59.81
n_estimators = 103   ACCURACY: 59.61
n_estimators = 105   ACCURACY: 60.48
n_estimators = 107   ACCURACY: 59.54
n_estimators = 109   ACCURACY: 59.74
n_estimators = 111   ACCURACY: 60.08
n_estimators = 113   ACCURACY: 59.21
n_estimators = 115   ACCURACY: 60.55
n_estimators = 117   ACCURACY: 60.15
n_estimators = 119   ACCURACY: 59.14
n_estimators = 121   ACCURACY: 60.28
n_estimators = 123   ACCURACY: 59.95
n_estimators = 125   ACCURACY: 60.08
n_estimators = 127   ACCURACY: 59.54
n_estimators = 129   ACCURACY: 59.68
n_estimators = 131   ACCURACY: 60.62
n_estimators = 133   ACCURACY: 60.48
n_estimators = 135   ACCURACY: 60.15
n_estimators = 137   ACCURACY: 60.15
n_estimat

In [264]:
# import seaborn as sns

pred = best_rf.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))
matrix = confusion_matrix(pred, y_test)
matrix

ACCURACY: 0.6095430107526881


array([[594,  52, 252,  30,   5],
       [  0,  16,   7,   0,   0],
       [158,  29, 291,  33,   5],
       [  0,   0,   8,   6,   1],
       [  0,   0,   0,   1,   0]], dtype=int64)

In [124]:
best_svc = 0
best_score = 0

C_range =[1, 10, 100]
gamma_range = [0.1, 1, 10, 100]
for c in C_range:
    for g in gamma_range:
        svc = SVC(kernel='rbf', gamma=g,C=c, verbose = True)
        svc.fit(X_train, y_train)
        score = accuracy_score(y_test, svc.predict(X_test))
        print(c, g, ":",score)
        if score > best_score:
            best_score = score
            best_svc = svc

[LibSVM]1 0.1 : 0.603943661971831
[LibSVM]1 1 : 0.624225352112676
[LibSVM]1 10 : 0.6495774647887324
[LibSVM]1 100 : 0.5954929577464789
[LibSVM]10 0.1 : 0.5994366197183099
[LibSVM]10 1 : 0.6332394366197183
[LibSVM]10 10 : 0.6428169014084507
[LibSVM]10 100 : 0.5949295774647887
[LibSVM]

KeyboardInterrupt: 

In [125]:
# import seaborn as sns

pred = best_svc.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))
matrix = confusion_matrix(pred, y_test)
matrix

ACCURACY: 0.6495774647887324


array([[380, 121],
       [501, 773]], dtype=int64)

In [126]:
best = 1
highest = 0
best_knn = 0

for i in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    score = round(accuracy_score(pred, y_test)*10000)/100
    print("k =", i, "  ACCURACY:", score)
    if score > highest:
        highest = score
        best = i
        best_knn = knn
print("Best k:", best, highest)

k = 1   ACCURACY: 62.03
k = 2   ACCURACY: 59.83
k = 3   ACCURACY: 59.1
k = 4   ACCURACY: 59.38
k = 5   ACCURACY: 60.23
k = 6   ACCURACY: 57.35
k = 7   ACCURACY: 58.37
k = 8   ACCURACY: 57.52
k = 9   ACCURACY: 58.87
k = 10   ACCURACY: 58.54
k = 11   ACCURACY: 58.54
k = 12   ACCURACY: 57.69
k = 13   ACCURACY: 58.65
k = 14   ACCURACY: 58.42
Best k: 1 62.03


In [127]:
# import seaborn as sns

pred = best_knn.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))
matrix = confusion_matrix(pred, y_test)
matrix

ACCURACY: 0.6202816901408451


array([[592, 385],
       [289, 509]], dtype=int64)

In [128]:
# neural network
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15, 10), random_state=42,
              solver='lbfgs')

nn.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(15, 10), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [129]:
pred = nn.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))
matrix = confusion_matrix(pred, y_test)
matrix

ACCURACY: 0.5971830985915493


array([[550, 384],
       [331, 510]], dtype=int64)