## Random Forest Classifier
The model predicts the severity of the landslide (or if there will even be one) within the next 2 days, based on weather data from the past 5 days.
A Random Forest model with 113 trees yielded an accuracy of 81.21% when trained on slope data and precipitation and wind data over a 5 day period.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils import shuffle
import pickle

In [2]:
# df = pd.read_csv("full_dataset_v1.csv")
# df = pd.read_csv("/Users/ishaanjavali/Documents/Science Fair/2020/Code/API/full_dataset_v1.csv")
df = pd.read_csv("/Users/ishaanjavali/Documents/Science Fair/2020/Code/API/full_dataset_good.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df['severity'].value_counts()

na              4476
medium          3300
small            612
large            426
unknown           75
very_large        56
...                5
landslide          2
catastrophic       1
Name: severity, dtype: int64

In [4]:
# filter by severity. na is for non-landslide data
df = df[df['severity'].isin(["medium", "small", "large", "very_large", "na"])]
# Remove -1 slopes
df = df.loc[~(df.slope == -1)]

In [5]:
df['severity'].value_counts()
df = shuffle(df)
df.reset_index(inplace=True, drop=True)
print(len(df))

8696


In [6]:
def getX():
    X = df.copy()

    X.drop(X.columns[[i for i in range(0, 151)]], axis = 1, inplace = True)
    # X.drop(X.columns[[i for i in range(20, 35)]], axis = 1, inplace = True)
    X["severity"] = df["severity"]
    X.drop(X.columns[[0]], axis = 1, inplace = True)

    X = X.dropna()
    for i in range(0, 8):
        del X['air' + str(i)]
        del X['temp' + str(i)]
        del X['humidity' + str(i)]
        
    X.drop(X.columns[[i for i in range(len(X.columns)-8, len(X.columns)-2)]], axis = 1, inplace = True)
    return X
X = getX()
X

Unnamed: 0,precip7,wind7,precip6,wind6,precip5,wind5,precip4,wind4,precip3,wind3,slope,severity
0,0.8,4,0.3,5,6.1,12,16.5,29,2.1,10,121,medium
1,3.6,9,6.5,14,1.9,12,1.1,13,1.2,13,91,na
2,0.3,13,0.2,9,1.2,15,0.0,12,0.1,14,26,na
3,0.1,9,5.0,12,7.2,12,2.9,11,3.7,9,151,medium
4,2.7,11,8.0,15,2.2,7,0.5,7,0.0,11,54,na
...,...,...,...,...,...,...,...,...,...,...,...,...
8691,2.2,10,0.3,9,2.2,8,2.4,7,1.1,10,77,medium
8692,0.0,12,0.0,10,0.0,10,0.0,12,0.0,11,7,na
8693,1.9,23,1.5,24,0.1,19,4.2,28,1.5,27,52,medium
8694,3.6,7,2.9,8,2.7,9,12.1,8,0.4,6,78,small


### Generate Labels
For binary classification, pass `True` into the function call

In [7]:
def generate_labels(binary = False):
    global X
    X = getX()
    y = []
    idx_to_severity = ["na", "small", "medium", "large", "very_large"]
    for severity in X.severity:
        if not binary:
            y.append(idx_to_severity.index(severity))
        elif severity == "na":
            y.append(0)
        else:
            y.append(1)
    X.drop(X.columns[[-1]], axis = 1, inplace = True)
    print(y.count(1))
    print("Y Length:",len(y))
    return y

In [8]:
y = generate_labels(True)

4311
Y Length: 8696


In [9]:
for i in range(5):
    print(i, y.count(i))
X

0 4385
1 4311
2 0
3 0
4 0


Unnamed: 0,precip7,wind7,precip6,wind6,precip5,wind5,precip4,wind4,precip3,wind3,slope
0,0.8,4,0.3,5,6.1,12,16.5,29,2.1,10,121
1,3.6,9,6.5,14,1.9,12,1.1,13,1.2,13,91
2,0.3,13,0.2,9,1.2,15,0.0,12,0.1,14,26
3,0.1,9,5.0,12,7.2,12,2.9,11,3.7,9,151
4,2.7,11,8.0,15,2.2,7,0.5,7,0.0,11,54
...,...,...,...,...,...,...,...,...,...,...,...
8691,2.2,10,0.3,9,2.2,8,2.4,7,1.1,10,77
8692,0.0,12,0.0,10,0.0,10,0.0,12,0.0,11,7
8693,1.9,23,1.5,24,0.1,19,4.2,28,1.5,27,52
8694,3.6,7,2.9,8,2.7,9,12.1,8,0.4,6,78


## Scaling

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

X_train
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Prediction

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [12]:
pred = model.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))

ACCURACY: 0.635632183908046


In [13]:
best = 1
highest = 0

for i in range(85, 200, 2):
    rf = RandomForestClassifier(n_estimators = i)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    score = round(accuracy_score(pred, y_test)*10000)/100
    print("n_estimators =", i, "  ACCURACY:", score)
    if score > highest:
        highest = score
        best = i
print("# of trees:", best, highest)

n_estimators = 85   ACCURACY: 64.08
n_estimators = 87   ACCURACY: 63.68
n_estimators = 89   ACCURACY: 63.1
n_estimators = 91   ACCURACY: 64.77
n_estimators = 93   ACCURACY: 63.05
n_estimators = 95   ACCURACY: 63.62
n_estimators = 97   ACCURACY: 63.91
n_estimators = 99   ACCURACY: 62.7
n_estimators = 101   ACCURACY: 62.87
n_estimators = 103   ACCURACY: 63.97
n_estimators = 105   ACCURACY: 64.31
n_estimators = 107   ACCURACY: 63.45
n_estimators = 109   ACCURACY: 62.64
n_estimators = 111   ACCURACY: 63.74
n_estimators = 113   ACCURACY: 62.93
n_estimators = 115   ACCURACY: 64.48
n_estimators = 117   ACCURACY: 62.59
n_estimators = 119   ACCURACY: 63.85
n_estimators = 121   ACCURACY: 63.45
n_estimators = 123   ACCURACY: 63.97
n_estimators = 125   ACCURACY: 63.45
n_estimators = 127   ACCURACY: 62.13
n_estimators = 129   ACCURACY: 63.62
n_estimators = 131   ACCURACY: 63.68
n_estimators = 133   ACCURACY: 62.93
n_estimators = 135   ACCURACY: 64.83
n_estimators = 137   ACCURACY: 63.33
n_estimator