## Random Forest Classifier
The model predicts the severity of the landslide (or if there will even be one) within the next 2 days, based on weather data from the past 5 days.
A Random Forest model with 113 trees yielded an accuracy of 81.21% when trained on slope data and precipitation and wind data over a 5 day period.

In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils import shuffle
import pickle

In [147]:
df = pd.read_csv("full_dataset_v1.csv", low_memory='false')

In [148]:
df['severity'].value_counts()

na              4476
medium          3300
small            613
large            427
unknown           75
very_large        56
...                5
catastrophic       1
Name: severity, dtype: int64

In [149]:
# filter by severity. na is for non-landslide data
df = df[df['severity'].isin(["medium", "small", "large", "very_large", "na"])]

In [150]:
df['severity'].value_counts()
df = shuffle(df)
df.reset_index(inplace=True, drop=True)
print(len(df))

8872


In [151]:
X = df.copy()

X.drop(X.columns[[i for i in range(0, 151)]], axis = 1, inplace = True)
# X.drop(X.columns[[i for i in range(20, 35)]], axis = 1, inplace = True)
X["severity"] = df["severity"]
X.drop(X.columns[[0]], axis = 1, inplace = True)

X = X.dropna()
for i in range(0, 8):
    del X['air' + str(i)]
    del X['temp' + str(i)]
    del X['humidity' + str(i)]
    
X.drop(X.columns[[i for i in range(len(X.columns)-8, len(X.columns)-2)]], axis = 1, inplace = True)
X.drop(columns=['label'], inplace=True)
X

Unnamed: 0,precip7,wind7,precip6,wind6,precip5,wind5,precip4,wind4,precip3,wind3,precip2,severity
0,1.2,7,1.3,8,0.1,8,0.0,8,0.2,6,0.4,na
1,3.9,8,2.6,6,1.1,9,2.9,14,1.1,10,0.9,medium
2,2.0,10,2.2,13,0.5,12,9.1,6,0.3,13,0.0,na
3,2.9,8,1.5,9,5.5,7,3.2,10,1.4,14,2.7,na
4,0.5,12,3.4,15,5.7,13,2.1,12,0.1,15,0.0,medium
...,...,...,...,...,...,...,...,...,...,...,...,...
8867,3.1,14,0.9,15,0.2,12,0.3,13,0.2,11,0.0,na
8868,1.6,15,0.1,12,0.6,12,0.8,13,1.1,12,0.0,na
8869,3.3,7,3.0,9,2.1,12,3.1,10,3.4,8,7.1,na
8870,0.0,7,0.0,9,0.0,7,0.0,7,0.0,8,0.0,na


### Generate Labels
For binary classification, pass `True` into the function call

In [152]:
def generate_labels(binary = False):
    y = []
    idx_to_severity = ["na", "small", "medium", "large", "very_large"]
    for severity in X.severity:
        if binary:
            y.append(idx_to_severity.index(severity))
        elif severity == "na":
            y.append(0)
        else:
            y.append(1)
    X.drop(X.columns[[-1]], axis = 1, inplace = True)
    print(y.count(1))
    return y

In [153]:
y = generate_labels(False)
print(sum(y))
print(len(y))

4396
4396
8872


## Scaling

In [154]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Prediction

In [155]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
model = RandomForestClassifier()
model.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [156]:
pred = model.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))

ACCURACY: 0.6309859154929578


In [157]:
best = 1
highest = 0
best_rf = 0

for i in range(85, 200, 2):
    rf = RandomForestClassifier(n_estimators = i)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    score = round(accuracy_score(pred, y_test)*10000)/100
    print("n_estimators =", i, "  ACCURACY:", score)
    if score > highest:
        highest = score
        best = i
        best_rf = rf
        
print("# of trees:", best, highest)

n_estimators = 85   ACCURACY: 64.45
n_estimators = 87   ACCURACY: 65.18
n_estimators = 89   ACCURACY: 64.9
n_estimators = 91   ACCURACY: 64.45
n_estimators = 93   ACCURACY: 63.89
n_estimators = 95   ACCURACY: 64.79
n_estimators = 97   ACCURACY: 64.73
n_estimators = 99   ACCURACY: 65.13
n_estimators = 101   ACCURACY: 64.85
n_estimators = 103   ACCURACY: 64.9
n_estimators = 105   ACCURACY: 66.08
n_estimators = 107   ACCURACY: 63.77
n_estimators = 109   ACCURACY: 65.01
n_estimators = 111   ACCURACY: 64.34
n_estimators = 113   ACCURACY: 64.45
n_estimators = 115   ACCURACY: 65.8
n_estimators = 117   ACCURACY: 65.18
n_estimators = 119   ACCURACY: 65.13
n_estimators = 121   ACCURACY: 65.63
n_estimators = 123   ACCURACY: 64.73
n_estimators = 125   ACCURACY: 65.35
n_estimators = 127   ACCURACY: 63.89
n_estimators = 129   ACCURACY: 64.45
n_estimators = 131   ACCURACY: 64.79
n_estimators = 133   ACCURACY: 65.86
n_estimators = 135   ACCURACY: 65.01
n_estimators = 137   ACCURACY: 65.63
n_estimators

In [158]:
pred = best_rf.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))

ACCURACY: 0.6608450704225353


In [159]:
matrix = confusion_matrix(pred, y_test)

In [160]:
import seaborn as sns
sns.heatmap(matrix)

ModuleNotFoundError: No module named 'seaborn'