## Random Forest Classifier
The model predicts the severity of a landslide (or whether one will occur at all) within the next 2 days, based on weather data from the past 5 days.
A Random Forest model with 113 trees yielded an accuracy of 81.21% when trained on slope data together with precipitation and wind data over a 5-day period.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils import shuffle
import pickle

In [None]:
# Read the dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="dataset.csv")

# Number of rows in the freshly loaded frame (shown as the cell output).
df.shape[0]

In [None]:
# Tally how many rows carry each distinct value of the 'severity' column.
severity_counts = df["severity"].value_counts()
severity_counts

In [None]:
# NOTE(review): this tally is neither assigned nor the last expression of the
# cell, so its result is discarded.
df["severity"].value_counts()

# Randomly permute the rows, then renumber the index from 0 so the old row
# labels are dropped.
df = shuffle(df).reset_index(drop=True)
print(df.shape[0])

In [None]:
import random
from collections import Counter

# Upper bound on the number of "no landslide" samples that are kept
# (presumably to keep the classes roughly balanced -- TODO confirm).
MAX_NEGATIVE_SAMPLES = 2650
# Number of consecutive days of readings collected per sample, counting
# backwards from the randomly chosen last day.
WINDOW_DAYS = 7
# Label assigned to rows whose `landslide` column is not 1.
NO_LANDSLIDE = -1

X = []  # one feature vector per kept row
y = []  # one label per kept row, aligned with X
days = Counter()  # how many kept samples used each `lastday`
negative_count = 0  # running count of NO_LANDSLIDE labels (avoids rescanning y)

for idx, row in df.iterrows():
    # Last day of the feature window, drawn uniformly from 6..9 inclusive.
    lastday = random.randint(6, 9)

    if row.landslide == 1:
        # Landslide rows are labelled 1..4 depending on the chosen last day.
        y.append(lastday - 5)
    else:
        # Skip further negatives once the cap is reached.
        if negative_count == MAX_NEGATIVE_SAMPLES:
            continue
        y.append(NO_LANDSLIDE)
        negative_count += 1

    # Count the first occurrence of a day as 1 (the previous code started at 0).
    days[lastday] += 1

    # Per-day readings, newest day first: humidity, ARI, wind.
    temp = []
    for i in range(WINDOW_DAYS):
        day = str(lastday - i)
        temp.append(row['humidity' + day])
        temp.append(row['ARI' + day])
        temp.append(row['wind' + day])

    # Static (non-daily) features for the row.
    temp.append(row['slope'])
    temp.append(row['forest2'])
    temp.append(row['realosm'])
    X.append(temp)

    if idx == 0:
        # Debug output for the first row only: the last two characters of the
        # date parsed as an integer, next to the row's forest_year value.
        year = int(str(row.date)[-2:])
        print(year, row.forest_year)

# Class distribution of the labels that were kept.
print(y.count(-1), y.count(1), y.count(2), y.count(3), y.count(4))

## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the remainder is used for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Prediction

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Baseline forest using the library's default hyperparameters.
model = RandomForestClassifier()
# Train on the training split; as the cell's last expression, the fitted
# estimator is also what the cell displays.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the test split and report the fraction that match.
pred = model.predict(X=X_test)
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print("ACCURACY:", baseline_accuracy)

In [None]:
# Pick the number of trees by out-of-bag (OOB) accuracy rather than by accuracy
# on the test split: choosing a hyperparameter on the test data would make the
# test accuracy reported afterwards optimistically biased. The OOB estimate is
# computed from training samples each tree did not see, so it needs no extra
# fits and leaves X_test / y_test untouched.
best = 1      # tree count with the highest OOB accuracy seen so far
highest = 0   # that OOB accuracy, as a percentage rounded to 2 decimals

for i in range(50, 150, 3):
    rf = RandomForestClassifier(n_estimators=i, oob_score=True)
    rf.fit(X_train, y_train)
    # Convert the 0..1 accuracy into a percentage with two decimal places.
    score = round(rf.oob_score_ * 10000) / 100
    print("n_estimators =", i, "  OOB ACCURACY:", score)
    # Strict comparison keeps the smallest tree count among ties.
    if score > highest:
        highest = score
        best = i
print("# of trees:", best, highest)

In [None]:
from sklearn.metrics import confusion_matrix

# Refit a forest with the selected tree count and score it on the test split.
rf = RandomForestClassifier(n_estimators=best).fit(X_train, y_train)
pred = rf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=pred))

# Confusion matrix: actual labels along the rows, predicted labels along the
# columns. Shown as the cell output.
array = confusion_matrix(y_true=y_test, y_pred=pred)
array

In [None]:
# array = [[1254,245],[161,1902]]

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn

# Switch between the two-class and the five-class axis labelling.
binary = False
class_labels = ["No", "Yes"] if binary else ["No", "6", "7", "8", "9"]

# Wrap the confusion matrix in a frame so the heatmap picks up the tick labels.
df_cm = pd.DataFrame(array, index=class_labels, columns=class_labels)

plt.figure(figsize=(10, 7))

# Annotated heatmap of the counts, using the general number format.
ax = sn.heatmap(
    df_cm,
    cmap="Greens",
    annot=True,
    annot_kws={"size": 30},
    fmt='g',
)
ax.tick_params(axis='both', which='major', labelsize=27)
plt.xlabel('Predicted', fontsize=34)
# plt.title("Forecasting Landslide Day Confusion Matrix", fontsize = 30)
plt.ylabel('Actual', fontsize=34)
# The file name has no extension, so matplotlib falls back to its default
# output format.
plt.savefig("Equal Class Proper OSM Random Forest Days in Advance", bbox_inches="tight")

plt.show()

In [None]:
# Number of samples labelled -1 (rows without a landslide).
sum(1 for label in y if label == -1)

In [None]:
# Number of samples labelled 3.
sum(1 for label in y if label == 3)

In [None]:
# Add up every cell of the confusion matrix: first per column, then overall.
column_totals = array.sum(axis=0)
total = column_totals.sum()
total

In [None]:
# Start from the top-left cell, then for each remaining predicted class add the
# cells on and below the diagonal of its column (rows col..4), printing each
# column's partial sum along the way.
score = array[0][0]
for col in range(1, 5):
    cur = array[col:5, col].sum()
    print(cur)
    score += cur
score

In [None]:
# Early forecast score: the accumulated count as a fraction of all samples in
# the confusion matrix.
score = score / total
score

In [None]:
# Snapshot of a 5x5 confusion matrix, kept as a plain nested list.
stored = [
    [443, 20, 15, 18, 20],
    [29, 202, 111, 109, 94],
    [38, 118, 143, 84, 118],
    [37, 105, 110, 164, 113],
    [35, 78, 108, 93, 187],
]

In [None]:
# Raw accuracy
diag = 0
for i in range(5):
    diag += stored[i][i]
print(diag, diag/2592)