## KNN Classifier
The model predicts the severity of the landslide (or if there will even be one) within the next 2 days, based on weather data from the past 5 days.
Binary Classification yielded a maximum accuracy of 77.53%. Severity Classification (multiple classes) was around 56%.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils import shuffle
import pickle

In [2]:
df = pd.read_csv("full_dataset_v1.csv")

In [3]:
df['severity'].value_counts()

medium          6604
small           1223
large            851
unknown          150
very_large       112
...               11
catastrophic       2
Name: severity, dtype: int64

In [4]:
# filter by severity. na is for non-landslide data
df = df[df['severity'].isin(["medium", "small", "large", "very_large", "na"])]

In [5]:
df['severity'].value_counts()
df = shuffle(df)
df.reset_index(inplace=True, drop=True)
print(len(df))

8790


In [6]:
X = df.copy()

X.drop(X.columns[[i for i in range(0, 151)]], axis = 1, inplace = True)
# X.drop(X.columns[[i for i in range(20, 35)]], axis = 1, inplace = True)
X["severity"] = df["severity"]
X.drop(X.columns[[0]], axis = 1, inplace = True)

X = X.dropna()
for i in range(0, 8):
    del X['air' + str(i)]
    del X['temp' + str(i)]
    del X['humidity' + str(i)]
    
X.drop(X.columns[[i for i in range(len(X.columns)-8, len(X.columns)-2)]], axis = 1, inplace = True)
X

Unnamed: 0,precip7,wind7,precip6,wind6,precip5,wind5,precip4,wind4,precip3,wind3,precip2,label,severity
0,1.2,11,3.3,8,0.5,9,1.6,11,0.4,14,0.7,0,medium
1,2.0,9,0.0,10,1.1,11,2.9,10,0.6,10,3.7,1,medium
2,0.3,21,3.7,25,10.5,32,6.1,28,12.9,31,7.9,0,medium
3,0.0,7,0.2,4,0.1,6,0.0,4,0.1,5,0.6,0,small
4,0.2,10,0.3,7,0.8,10,0.5,10,11.5,8,0.1,1,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8785,0.1,12,0.1,12,0.0,14,0.0,13,0.5,14,4.4,1,small
8786,0.8,9,1.3,9,0.2,9,1.5,9,3.1,7,0.7,0,medium
8787,2.4,17,0.7,15,0.7,15,0.4,15,0.8,17,0.0,1,medium
8788,0.0,12,0.1,12,1.9,13,1.6,12,0.5,10,0.7,1,medium


### Generate Labels
For binary classification, pass `True` into the function call

In [7]:
def generate_labels(binary = False):
    y = []
    idx_to_severity = ["na", "small", "medium", "large", "very_large"]
    for severity in X.severity:
        if binary:
            y.append(idx_to_severity.index(severity))
        elif severity == "na":
            y.append(0)
        else:
            y.append(1)
    X.drop(X.columns[[-1]], axis = 1, inplace = True)
    print(y.count(1))
    return y

In [8]:
y = generate_labels()

8790


## Scaling

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Prediction

In [10]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=17)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=17, p=2,
                     weights='uniform')

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix

pred = knn.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))
matrix = confusion_matrix(pred, y_test)

ACCURACY: 1.0


In [None]:
best = 1
highest = 0

for i in range(1, 130):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    score = round(accuracy_score(pred, y_test)*10000)/100
    print("k =", i, "  ACCURACY:", score)
    if score > highest:
        highest = score
        best = i
print("Best k:", best, highest)

k = 1   ACCURACY: 100.0
k = 2   ACCURACY: 100.0
k = 3   ACCURACY: 100.0
k = 4   ACCURACY: 100.0
k = 5   ACCURACY: 100.0
k = 6   ACCURACY: 100.0
k = 7   ACCURACY: 100.0
k = 8   ACCURACY: 100.0
k = 9   ACCURACY: 100.0
k = 10   ACCURACY: 100.0
k = 11   ACCURACY: 100.0
k = 12   ACCURACY: 100.0
k = 13   ACCURACY: 100.0
k = 14   ACCURACY: 100.0
k = 15   ACCURACY: 100.0
k = 16   ACCURACY: 100.0
k = 17   ACCURACY: 100.0
k = 18   ACCURACY: 100.0
k = 19   ACCURACY: 100.0
k = 20   ACCURACY: 100.0
k = 21   ACCURACY: 100.0
k = 22   ACCURACY: 100.0
k = 23   ACCURACY: 100.0
k = 24   ACCURACY: 100.0
k = 25   ACCURACY: 100.0
k = 26   ACCURACY: 100.0
k = 27   ACCURACY: 100.0
k = 28   ACCURACY: 100.0
k = 29   ACCURACY: 100.0
k = 30   ACCURACY: 100.0
k = 31   ACCURACY: 100.0
k = 32   ACCURACY: 100.0
k = 33   ACCURACY: 100.0
k = 34   ACCURACY: 100.0
k = 35   ACCURACY: 100.0
k = 36   ACCURACY: 100.0
k = 37   ACCURACY: 100.0
k = 38   ACCURACY: 100.0
k = 39   ACCURACY: 100.0
k = 40   ACCURACY: 100.0
k = 41   