In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils import shuffle
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("full_dataset_v1.csv")

In [3]:
df.head()


Unnamed: 0.1,Unnamed: 0,id,date,lat,lon,country,fatalities,injuries,type,trigger,...,air1,humidity1,wind1,precip0,temp0,air0,humidity0,wind0,slope,label
0,0.0,9204.0,03-03-2016,13.9791,121.8982,Philippines,0.0,0.0,landslide,flooding,...,1017,90,25,0.9,85,1016,87,23,54,1
1,1.0,12369.0,8/15/2015,20.8404,105.7341,Vietnam,0.0,0.0,riverbank_collapse,downpour,...,1006,91,10,0.0,98,1007,80,12,4,1
2,2.0,6842.0,03-10-2017,33.370552,75.193916,India,0.0,0.0,landslide,rain,...,1012,81,19,1.4,66,1012,72,11,115,1
3,3.0,8580.0,07-09-2017,24.827676,93.51825,India,0.0,0.0,landslide,continuous_rain,...,1005,99,4,6.5,81,1008,100,4,107,1
4,4.0,9452.0,07-03-2011,27.3167,86.5,Nepal,0.0,0.0,landslide,continuous_rain,...,1004,99,8,3.5,76,1006,99,11,77,1


In [4]:
df['severity'].value_counts()


medium          3300
small            613
large            427
unknown           75
very_large        56
...                5
catastrophic       1
Name: severity, dtype: int64

In [5]:
# filter by severity. na is for non-landslide data
df = df[df['severity'].isin(["medium", "small", "large", "very_large", "na"])]

In [6]:
df['severity'].value_counts()

medium        3300
small          613
large          427
very_large      56
Name: severity, dtype: int64

In [7]:
df = shuffle(df)
df.reset_index(inplace=True, drop=True)
print(len(df))

4396


In [8]:
X = df.copy()
df_col_length = len(df.columns)

In [9]:
X.drop(X.columns[[0]], axis = 1, inplace = True)

In [10]:
X

Unnamed: 0,id,date,lat,lon,country,fatalities,injuries,type,trigger,severity,...,air1,humidity1,wind1,precip0,temp0,air0,humidity0,wind0,slope,label
0,2821.0,6/23/2011,29.031300,114.538900,China,0.0,0.0,debris_flow,downpour,very_large,...,1003,97,11,0.0,99,1002,97,10,58,1
1,5809.0,2/16/2011,33.330800,75.177400,India,0.0,0.0,landslide,downpour,medium,...,1014,89,11,2.2,63,1018,82,9,121,1
2,6704.0,2/25/2009,-7.911500,112.343000,Indonesia,0.0,0.0,mudslide,downpour,medium,...,1008,99,6,4.1,84,1008,99,7,0,1
3,1847.0,07-09-2017,27.084494,93.605060,India,0.0,0.0,landslide,continuous_rain,medium,...,1006,99,5,12.7,77,1009,99,6,49,1
4,2231.0,6/20/2010,7.504800,124.311300,Philippines,0.0,0.0,landslide,downpour,medium,...,1013,97,6,11.6,92,1012,98,6,62,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4391,6161.0,5/15/2017,15.416025,75.616704,India,0.0,0.0,landslide,rain,medium,...,1009,79,28,5.7,96,1010,80,28,19,1
4392,7269.0,07-05-2016,18.772300,99.285300,Thailand,0.0,0.0,landslide,downpour,medium,...,1011,94,12,0.0,90,1008,92,9,95,1
4393,7097.0,10/25/2013,16.517500,80.611700,India,0.0,4.0,landslide,rain,medium,...,1007,98,18,0.0,90,1007,97,12,56,1
4394,7773.0,05-09-2018,33.370670,74.319656,India,1.0,0.0,landslide,rain,small,...,1007,44,14,0.0,91,1009,47,11,60,1


In [11]:
def generate_labels(binary = False):
    y = []
    idx_to_severity = [ "large","medium","na", "small","very_large"]
    for severity in X.severity:
        y.append(idx_to_severity.index(severity))
    X.drop(X.columns[[-1]], axis = 1, inplace = True)
    print(y.count(1))
    return y

In [12]:
y = generate_labels(False)

3300


In [13]:
X.drop(X.columns[[0,1]],axis = 1, inplace = True)
df = X

In [14]:
def cat(string):
    df[string] = df[string].astype('category')

cat('country')
cat('type')
cat('trigger')
cat('location')
cat('severity')

In [15]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
def label_encode(string):
    le.fit(df[string])
    df[string] = le.transform(df[string])

label_encode('country')
label_encode('type')
label_encode('trigger')
label_encode('location')
label_encode('severity')

In [16]:
df.dropna(axis='columns')
df.fillna(X.mean(), inplace=True)
df.head()
X = df

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=1, stratify=y)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
model = LogisticRegressionCV()
model.fit(X_train,y_train)

LogisticRegressionCV()

In [19]:
pred = model.predict(X_test)
print("ACCURACY:", accuracy_score(pred, y_test))

ACCURACY: 0.9986216402481047


In [20]:
print(pred)

[3 3 1 ... 3 1 1]


In [21]:
best = 1
highest = 0

for i in range(85, 120, 2):
    rf = LogisticRegressionCV(max_iter=i)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    score = round(accuracy_score(pred, y_test)*10000)/100
    print("n_estimators =", i, "  ACCURACY:", score)
    if score > highest:
        highest = score
        best = i
print("# of trees = ", best)
print("Highest accuracy = ", highest)

n_estimators = 85   ACCURACY: 99.86
n_estimators = 87   ACCURACY: 99.86
n_estimators = 89   ACCURACY: 99.86
n_estimators = 91   ACCURACY: 99.86
n_estimators = 93   ACCURACY: 99.86
n_estimators = 95   ACCURACY: 99.86
n_estimators = 97   ACCURACY: 99.86
n_estimators = 99   ACCURACY: 99.86
n_estimators = 101   ACCURACY: 99.86
n_estimators = 103   ACCURACY: 99.86
n_estimators = 105   ACCURACY: 99.86
n_estimators = 107   ACCURACY: 99.86
n_estimators = 109   ACCURACY: 99.86
n_estimators = 111   ACCURACY: 99.86
n_estimators = 113   ACCURACY: 99.86
n_estimators = 115   ACCURACY: 99.86
n_estimators = 117   ACCURACY: 99.86
n_estimators = 119   ACCURACY: 99.86
# of trees =  85
Highest accuracy =  99.86
