## KNN Classifier
The model predicts the severity of a landslide (or whether one will occur at all) within the next 2 days, based on weather data from the past 5 days.
Binary classification yielded a maximum accuracy of 77.53%; severity classification (multiple classes) reached around 56%.

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils import shuffle
import pickle

In [78]:
# Load the full dataset used by every later cell.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# DtypeWarning — confirm, and consider passing explicit dtypes instead.
df = pd.read_csv("full_dataset_v1.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [79]:
# Class balance check: number of rows per raw severity label.
df["severity"].value_counts()

medium          3300
na              1321
small            613
large            427
unknown           75
very_large        56
...                5
catastrophic       1
Name: severity, dtype: int64

In [80]:
# Keep only the severity classes used for modelling; "na" marks non-landslide rows.
kept_severities = ["medium", "small", "large", "very_large", "na"]
df = df.loc[df["severity"].isin(kept_severities)]

In [81]:
# Shuffle the rows with a fixed seed so that Restart & Run All reproduces the
# same ordering, then renumber the index from 0 (the old index is discarded).
df = shuffle(df, random_state=42).reset_index(drop=True)
print(len(df))

5717


In [88]:
# Build the feature frame X from df, selecting columns by position and by name.
# NOTE(review): the positional offsets below depend on the exact column layout
# of the loaded CSV — confirm them whenever the dataset schema changes.
N_LEADING_COLS = 151  # number of leading columns discarded up front

X = df.copy()
X = X.drop(columns=X.columns[:N_LEADING_COLS])

# Re-attach the label column, then discard the first remaining column.
X["severity"] = df["severity"]
X = X.drop(columns=X.columns[0])

# Incomplete rows are removed while the air/temp/humidity columns (and the
# columns dropped at the end) are still present, so gaps in those columns
# also cause a row to be removed.
X = X.dropna()

# The air, temperature and humidity readings (suffixes 0-7) are not used as features.
unused_columns = [
    name + str(idx)
    for idx in range(8)
    for name in ("air", "temp", "humidity")
]
X = X.drop(columns=unused_columns)

# Remove the six columns sitting immediately before the final two columns
# (per the saved output, the final two are `slope` and `severity`).
X = X.drop(columns=X.columns[-8:-2])
X

Unnamed: 0,precip7,wind7,precip6,wind6,precip5,wind5,precip4,wind4,precip3,wind3,slope,severity
0,4.4,9.0,4.3,10.0,4.1,8.0,5.3,9.0,5.7,9.0,99.0,medium
1,0.5,17.0,0.4,15.0,0.0,16.0,0.4,13.0,0.6,17.0,147.0,medium
2,3.5,7.0,0.5,8.0,3.1,6.0,1.5,7.0,7.4,8.0,106.0,large
3,0.2,17.0,2.5,10.0,4.1,11.0,3.6,9.0,1.0,14.0,5.0,medium
4,3.6,5.0,3.1,7.0,4.4,7.0,1.9,6.0,8.4,4.0,78.0,medium
...,...,...,...,...,...,...,...,...,...,...,...,...
5712,43.1,25.0,2.1,18.0,0.9,14.0,1.8,9.0,2.4,11.0,77.0,medium
5713,1.2,15.0,4.3,18.0,0.2,18.0,0.1,15.0,1.3,15.0,35.0,na
5714,1.6,11.0,4.8,9.0,4.7,10.0,1.7,12.0,0.9,10.0,20.0,medium
5715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,medium


### Generate Labels
For binary classification (landslide vs. no landslide), pass `binary=True` to `generate_labels`; by default it returns one class index per severity level.

In [89]:
def generate_labels(binary = False):
    """Build the label list from ``X["severity"]`` and remove that column from ``X``.

    Parameters
    ----------
    binary : bool, default False
        If False, each severity is encoded as its position in
        ``["na", "small", "medium", "large", "very_large"]`` (i.e. 0-4).
        If True, "na" becomes 0 and every other severity becomes 1.

    Returns
    -------
    list of int
        One label per row of ``X``, in row order.

    Raises
    ------
    KeyError
        If ``X`` has no "severity" column (for example because this function
        has already been run and removed it).
    ValueError
        If ``binary`` is False and a severity value is not one of the five
        known classes.

    Side effects
    ------------
    Drops the "severity" column from the global ``X`` in place (only after all
    labels were built successfully) and prints how many labels equal 1: the
    number of "small" rows in multi-class mode, or the number of non-"na" rows
    in binary mode.
    """
    if "severity" not in X.columns:
        raise KeyError(
            "X has no 'severity' column; rebuild X before calling generate_labels again"
        )

    idx_to_severity = ["na", "small", "medium", "large", "very_large"]
    # Dictionary lookup instead of list.index() for every row.
    severity_to_idx = {name: idx for idx, name in enumerate(idx_to_severity)}

    y = []
    for severity in X["severity"]:
        if binary:
            y.append(0 if severity == "na" else 1)
        elif severity in severity_to_idx:
            y.append(severity_to_idx[severity])
        else:
            raise ValueError("unknown severity label: %r" % (severity,))

    # Drop the label column by name, so a feature column can never be removed
    # by accident when "severity" is not the last column.
    X.drop(columns="severity", inplace=True)
    print(y.count(1))
    return y

In [90]:
# Multi-class labels (one class index per severity level); pass binary=True
# instead for landslide / no-landslide labels.
y = generate_labels(binary=False)

613


## Train/Test Split and Scaling

In [91]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed keeps the split (and
# therefore every accuracy reported below) reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, so no test-set statistics leak
# into the features, then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Prediction

In [92]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier with k = 17 on the scaled training split.
knn = KNeighborsClassifier(n_neighbors=17).fit(X_train, y_train)
knn

KNeighborsClassifier(n_neighbors=17)

In [93]:
from sklearn.metrics import accuracy_score

# Score the fitted model on the held-out split.
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the
# printed value is unchanged, but the arguments now match the documented order.
pred = knn.predict(X_test)
print("ACCURACY:", accuracy_score(y_test, pred))

ACCURACY: 0.5708041958041958


In [44]:
# Sweep k from 1 to 129, print the test-set accuracy for each k (as a percentage
# rounded to two decimals), and remember the first k that reaches the maximum.
# NOTE(review): k is selected on the same test split it is scored on, so
# `highest` is an optimistic estimate — consider cross-validating on the
# training split instead.
# NOTE(review): after the loop, `knn` and `pred` belong to the last k tried
# (129), not to `best`; refit with n_neighbors=best before reusing the model.
# NOTE(review): this cell's saved execution count (44) is lower than that of
# the cells before it, so its saved output may come from an earlier run — re-run
# the notebook top to bottom before relying on it.
best = 1
highest = 0

for i in range(1, 130):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    pred = knn.predict(X_test)
    # accuracy_score takes (y_true, y_pred).
    score = round(accuracy_score(y_test, pred) * 10000) / 100
    print("k =", i, "  ACCURACY:", score)
    # Strict ">" keeps the smallest k when several values tie.
    if score > highest:
        highest = score
        best = i
print("Best k:", best, highest)

k = 1   ACCURACY: 73.95
k = 2   ACCURACY: 65.3
k = 3   ACCURACY: 73.86
k = 4   ACCURACY: 70.37
k = 5   ACCURACY: 75.26
k = 6   ACCURACY: 73.25
k = 7   ACCURACY: 75.52
k = 8   ACCURACY: 73.95
k = 9   ACCURACY: 74.83
k = 10   ACCURACY: 73.43
k = 11   ACCURACY: 75.17
k = 12   ACCURACY: 74.56
k = 13   ACCURACY: 75.26
k = 14   ACCURACY: 75.09
k = 15   ACCURACY: 76.05
k = 16   ACCURACY: 75.44
k = 17   ACCURACY: 75.79
k = 18   ACCURACY: 75.17
k = 19   ACCURACY: 75.87
k = 20   ACCURACY: 75.96
k = 21   ACCURACY: 77.1
k = 22   ACCURACY: 76.66
k = 23   ACCURACY: 77.01
k = 24   ACCURACY: 76.66
k = 25   ACCURACY: 76.84
k = 26   ACCURACY: 76.84
k = 27   ACCURACY: 76.92
k = 28   ACCURACY: 76.84
k = 29   ACCURACY: 77.01
k = 30   ACCURACY: 76.84
k = 31   ACCURACY: 76.66
k = 32   ACCURACY: 76.57
k = 33   ACCURACY: 76.92
k = 34   ACCURACY: 76.92
k = 35   ACCURACY: 76.75
k = 36   ACCURACY: 77.01
k = 37   ACCURACY: 77.1
k = 38   ACCURACY: 77.19
k = 39   ACCURACY: 77.27
k = 40   ACCURACY: 77.27
k = 41   ACC