## Random Forest Classifier
The model predicts the severity of the landslide (or if there will even be one) within the next 2 days, based on weather data from the past 5 days.
A Random Forest model with 113 trees yielded an accuracy of 81.21% when trained on slope data and precipitation and wind data over a 5 day period.

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils import shuffle
import pickle

In [25]:
# df = pd.read_csv("full_dataset_v1.csv")
# df = pd.read_csv("/Users/ishaanjavali/Documents/Science Fair/2020/Code/API/full_dataset_v1.csv")
df = pd.read_csv("/Users/ishaanjavali/Documents/Science Fair/2020/Code/API/full_dataset_good.csv")

In [26]:
df['severity'].value_counts()

na              4476
medium          3300
small            612
large            426
unknown           75
very_large        56
...                5
landslide          2
catastrophic       1
Name: severity, dtype: int64

In [27]:
# filter by severity. na is for non-landslide data
df = df[df['severity'].isin(["medium", "small", "large", "very_large", "na"])]
# Remove -1 slopes
df = df.loc[~(df.slope == -1)]

In [28]:
df['severity'].value_counts()
df = shuffle(df)
df.reset_index(inplace=True, drop=True)
print(len(df))

8696


In [29]:
def getX():
    X = df.copy()

    X.drop(X.columns[[i for i in range(0, 151)]], axis = 1, inplace = True)
    # X.drop(X.columns[[i for i in range(20, 35)]], axis = 1, inplace = True)
    X["severity"] = df["severity"]
    X.drop(X.columns[[0]], axis = 1, inplace = True)

    X = X.dropna()
    for i in range(0, 8):
        del X['air' + str(i)]
        del X['temp' + str(i)]
        del X['humidity' + str(i)]
        
    X.drop(X.columns[[i for i in range(len(X.columns)-8, len(X.columns)-2)]], axis = 1, inplace = True)
    return X
X = getX()
X

Unnamed: 0,precip7,wind7,precip6,wind6,precip5,wind5,precip4,wind4,precip3,wind3,slope,severity
0,0.1,21,0.0,20,0.0,18,0.7,15,0.4,14,50,na
1,1.4,14,4.6,13,8.0,13,5.4,9,3.4,10,96,na
2,0.2,6,6.7,5,7.9,8,2.8,5,1.2,6,59,na
3,0.2,15,2.1,13,0.4,13,0.2,19,0.5,17,185,na
4,4.0,8,3.5,5,6.4,5,1.3,5,9.5,8,29,na
...,...,...,...,...,...,...,...,...,...,...,...,...
8691,0.0,9,0.0,10,0.0,15,0.0,12,0.0,15,112,medium
8692,5.1,9,0.6,9,5.5,10,3.6,10,6.0,10,101,medium
8693,0.0,11,0.0,10,0.0,6,0.0,11,0.0,9,51,na
8694,2.3,6,6.4,7,7.1,3,2.3,4,1.7,9,86,na


### Generate Labels
For binary classification, pass `True` into the function call

In [30]:
def generate_labels(binary = False):
    global X
    X = getX()
    y = []
    idx_to_severity = ["na", "small", "medium", "large", "very_large"]
    for severity in X.severity:
        if not binary:
            y.append(idx_to_severity.index(severity))
        elif severity == "na":
            y.append(0)
        else:
            y.append(1)
    X.drop(X.columns[[-1]], axis = 1, inplace = True)
    print(y.count(1))
    print("Y Length:",len(y))
    return y

In [31]:
y = generate_labels(True)

4311
Y Length: 8696


In [32]:
for i in range(5):
    print(i, y.count(i))
X

0 4385
1 4311
2 0
3 0
4 0


Unnamed: 0,precip7,wind7,precip6,wind6,precip5,wind5,precip4,wind4,precip3,wind3,slope
0,0.1,21,0.0,20,0.0,18,0.7,15,0.4,14,50
1,1.4,14,4.6,13,8.0,13,5.4,9,3.4,10,96
2,0.2,6,6.7,5,7.9,8,2.8,5,1.2,6,59
3,0.2,15,2.1,13,0.4,13,0.2,19,0.5,17,185
4,4.0,8,3.5,5,6.4,5,1.3,5,9.5,8,29
...,...,...,...,...,...,...,...,...,...,...,...
8691,0.0,9,0.0,10,0.0,15,0.0,12,0.0,15,112
8692,5.1,9,0.6,9,5.5,10,3.6,10,6.0,10,101
8693,0.0,11,0.0,10,0.0,6,0.0,11,0.0,9,51
8694,2.3,6,6.4,7,7.1,3,2.3,4,1.7,9,86


## Scaling

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

X_train
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Prediction

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [35]:
pred = model.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))

ACCURACY: 0.6224137931034482


In [14]:
best = 1
highest = 0

for i in range(85, 200, 2):
    rf = RandomForestClassifier(n_estimators = i)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    score = round(accuracy_score(pred, y_test)*10000)/100
    print("n_estimators =", i, "  ACCURACY:", score)
    if score > highest:
        highest = score
        best = i
print("# of trees:", best, highest)

n_estimators = 85   ACCURACY: 62.97
n_estimators = 87   ACCURACY: 62.63
n_estimators = 89   ACCURACY: 62.12
n_estimators = 91   ACCURACY: 63.25
n_estimators = 93   ACCURACY: 61.61
n_estimators = 95   ACCURACY: 62.18
n_estimators = 97   ACCURACY: 62.74
n_estimators = 99   ACCURACY: 61.27
n_estimators = 101   ACCURACY: 62.97
n_estimators = 103   ACCURACY: 62.4
n_estimators = 105   ACCURACY: 63.7
n_estimators = 107   ACCURACY: 63.13
n_estimators = 109   ACCURACY: 62.06
n_estimators = 111   ACCURACY: 62.74
n_estimators = 113   ACCURACY: 63.42
n_estimators = 115   ACCURACY: 63.59
n_estimators = 117   ACCURACY: 63.47
n_estimators = 119   ACCURACY: 61.89
n_estimators = 121   ACCURACY: 62.18
n_estimators = 123   ACCURACY: 64.77
n_estimators = 125   ACCURACY: 62.4
n_estimators = 127   ACCURACY: 63.36
n_estimators = 129   ACCURACY: 63.42
n_estimators = 131   ACCURACY: 63.36
n_estimators = 133   ACCURACY: 61.1
n_estimators = 135   ACCURACY: 63.42
n_estimators = 137   ACCURACY: 63.75
n_estimators 