## Random Forest Classifier
The model predicts the severity of the landslide (or if there will even be one) within the next 2 days, based on weather data from the past 5 days.
A Random Forest model with 113 trees yielded an accuracy of 81.21% when trained on slope data and precipitation and wind data over a 5 day period.

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils import shuffle
import pickle

In [47]:
df = pd.read_csv("../../data/dataset.csv")
len(df)

17808

In [48]:
df['severity'].value_counts()

medium          5594
small           2612
unknown          834
large            679
...              359
very_large       102
Medium            72
Unknown           19
Large             17
Small             10
catastrophic       4
landslide          2
Very_large         1
Very...large       1
Name: severity, dtype: int64

In [49]:
df['severity'].value_counts()
df = shuffle(df)
df.reset_index(inplace=True, drop=True)
print(len(df))

17808


In [50]:
X = df.copy()
y = X.landslide
columns=[]
for i in range(9, 4, -1):
    columns.append('humidity' + str(i))
    columns.append('ARI' + str(i))
columns.append('slope')
columns.append('forest2')
columns.append('osm')
X = X[columns]
X

IndentationError: expected an indented block (<ipython-input-50-3d4b7ed2e5de>, line 7)

## Scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

X_train
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Prediction

In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [53]:
pred = model.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))

ACCURACY: 0.8790005614823133


In [44]:
best = 1
highest = 0

for i in range(85, 150, 2):
    rf = RandomForestClassifier(n_estimators = i)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    score = round(accuracy_score(pred, y_test)*10000)/100
    print("n_estimators =", i, "  ACCURACY:", score)
    if score > highest:
        highest = score
        best = i
print("# of trees:", best, highest)

n_estimators = 85   ACCURACY: 88.1
n_estimators = 87   ACCURACY: 88.04
n_estimators = 89   ACCURACY: 88.1
n_estimators = 91   ACCURACY: 88.15
n_estimators = 93   ACCURACY: 87.68
n_estimators = 95   ACCURACY: 87.98
n_estimators = 97   ACCURACY: 88.27
n_estimators = 99   ACCURACY: 88.27
n_estimators = 101   ACCURACY: 88.21
n_estimators = 103   ACCURACY: 88.1
n_estimators = 105   ACCURACY: 88.35
n_estimators = 107   ACCURACY: 88.35
n_estimators = 109   ACCURACY: 88.35
n_estimators = 111   ACCURACY: 88.21
n_estimators = 113   ACCURACY: 88.18
n_estimators = 115   ACCURACY: 88.07
n_estimators = 117   ACCURACY: 88.21
n_estimators = 119   ACCURACY: 88.6
n_estimators = 121   ACCURACY: 88.35
n_estimators = 123   ACCURACY: 88.15
n_estimators = 125   ACCURACY: 88.46
n_estimators = 127   ACCURACY: 88.15
n_estimators = 129   ACCURACY: 88.35
n_estimators = 131   ACCURACY: 88.38
n_estimators = 133   ACCURACY: 88.27
n_estimators = 135   ACCURACY: 88.35
n_estimators = 137   ACCURACY: 88.1
n_estimators =

KeyboardInterrupt: 

In [None]:
rf = RandomForestClassifier(n_estimators = 139)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
print(accuracy_score(pred, y_test))
from sklearn.metrics import confusion_matrix
array = confusion_matrix(y_test, pred)
array

In [None]:
array = [[1254,245],[161,1902]]

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
binary=True
if binary:
    df_cm = pd.DataFrame(array, index = [i for i in ["No", "Yes"]],
                    columns = [i for i in ["No", "Yes"]])
else:
    df_cm = pd.DataFrame(array, index = [i for i in ["None", "Small", "Medium", "Large", "Very Large"]],
                  columns = [i for i in ["None", "Small", "Medium", "Large", "Very Large"]])

plt.figure(figsize = (10,7))

ax = sn.heatmap(df_cm, cmap="Greens", annot=True, annot_kws={"size":50}, fmt='g')
ax.tick_params(axis='both', which='major', labelsize=27)
plt.xlabel('Predicted', fontsize = 40) 
# plt.title("KNN Confusion Matrix", fontsize = 50)
plt.ylabel('Actual', fontsize = 40) 
plt.savefig("RF Matrix", bbox_inches="tight")


plt.show()

In [None]:
print({i: j for i, j in zip(X.columns, rf.feature_importances_)})
plt.barh(X.columns, rf.feature_importances_)
plt.title("Random Forest Feature Importances")
plt.xlabel("Importance (normalized)")
plt.ylabel("Feature")
plt.savefig("RF-Importances.png")