## Random Forest Classifier
The model predicts the severity of the landslide (or if there will even be one) within the next 2 days, based on weather data from the past 5 days.
A Random Forest model with 113 trees yielded an accuracy of 81.21% when trained on slope data and precipitation and wind data over a 5 day period.

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils import shuffle
import pickle

In [38]:
df = pd.read_csv("full_dataset_v1.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [39]:
df['severity'].value_counts()

medium          3300
na              1321
small            613
large            427
unknown           75
very_large        56
...                5
catastrophic       1
Name: severity, dtype: int64

In [40]:
# filter by severity. na is for non-landslide data
df = df[df['severity'].isin(["medium", "small", "large", "very_large", "na"])]

In [41]:
df['severity'].value_counts()
df = shuffle(df)
df.reset_index(inplace=True, drop=True)
print(len(df))

5717


In [42]:
X = df.copy()

X.drop(X.columns[[i for i in range(0, 151)]], axis = 1, inplace = True)
# X.drop(X.columns[[i for i in range(20, 35)]], axis = 1, inplace = True)
X["severity"] = df["severity"]
X.drop(X.columns[[0]], axis = 1, inplace = True)

X = X.dropna()
for i in range(0, 8):
    del X['air' + str(i)]
    del X['temp' + str(i)]
    del X['humidity' + str(i)]
    
X.drop(X.columns[[i for i in range(len(X.columns)-8, len(X.columns)-2)]], axis = 1, inplace = True)
X

Unnamed: 0,precip7,wind7,precip6,wind6,precip5,wind5,precip4,wind4,precip3,wind3,slope,severity
0,6.4,6.0,31.4,10.0,11.6,11.0,6.3,6.0,16.2,6.0,-1.0,large
1,0.0,15.0,1.0,29.0,5.3,12.0,1.2,37.0,13.3,51.0,5.0,large
2,0.3,8.0,0.4,6.0,0.0,6.0,0.1,6.0,0.0,6.0,67.0,na
3,0.0,14.0,0.0,17.0,0.0,16.0,0.0,15.0,0.0,15.0,125.0,medium
4,6.2,9.0,5.0,9.0,6.3,10.0,3.3,9.0,4.1,10.0,83.0,medium
...,...,...,...,...,...,...,...,...,...,...,...,...
5712,4.5,10.0,5.3,13.0,1.1,16.0,1.9,15.0,3.8,11.0,105.0,large
5713,7.3,26.0,1.1,10.0,0.5,11.0,1.0,13.0,1.7,11.0,80.0,medium
5714,0.0,13.0,0.0,16.0,0.0,14.0,0.0,17.0,0.0,21.0,94.0,na
5715,6.8,14.0,3.6,13.0,2.1,12.0,0.7,10.0,2.4,11.0,53.0,medium


### Generate Labels
For binary classification, pass `True` into the function call

In [43]:
def generate_labels(binary = False):
    y = []
    idx_to_severity = ["na", "small", "medium", "large", "very_large"]
    for severity in X.severity:
        if binary:
            y.append(idx_to_severity.index(severity))
        elif severity == "na":
            y.append(0)
        else:
            y.append(1)
    X.drop(X.columns[[-1]], axis = 1, inplace = True)
    print(y.count(1))
    return y

In [44]:
y = generate_labels(False)

4396


## Scaling

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Prediction

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [47]:
pred = model.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))

ACCURACY: 0.8015734265734266


In [25]:
best = 1
highest = 0

for i in range(85, 200, 2):
    rf = RandomForestClassifier(n_estimators = i)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    score = round(accuracy_score(pred, y_test)*10000)/100
    print("n_estimators =", i, "  ACCURACY:", score)
    if score > highest:
        highest = score
        best = i
print("# of trees:", best, highest)

n_estimators = 85   ACCURACY: 80.16
n_estimators = 87   ACCURACY: 80.07
n_estimators = 89   ACCURACY: 80.42
n_estimators = 91   ACCURACY: 80.51
n_estimators = 93   ACCURACY: 80.42
n_estimators = 95   ACCURACY: 80.59
n_estimators = 97   ACCURACY: 80.86
n_estimators = 99   ACCURACY: 80.59
n_estimators = 101   ACCURACY: 80.33
n_estimators = 103   ACCURACY: 80.77
n_estimators = 105   ACCURACY: 80.42
n_estimators = 107   ACCURACY: 80.24
n_estimators = 109   ACCURACY: 81.03
n_estimators = 111   ACCURACY: 80.51
n_estimators = 113   ACCURACY: 81.21
n_estimators = 115   ACCURACY: 80.16
n_estimators = 117   ACCURACY: 80.94
n_estimators = 119   ACCURACY: 80.42
n_estimators = 121   ACCURACY: 80.24
n_estimators = 123   ACCURACY: 80.86
n_estimators = 125   ACCURACY: 80.51
n_estimators = 127   ACCURACY: 80.24
n_estimators = 129   ACCURACY: 80.42
n_estimators = 131   ACCURACY: 80.68
n_estimators = 133   ACCURACY: 80.33
n_estimators = 135   ACCURACY: 80.42
n_estimators = 137   ACCURACY: 80.68
n_estimat