In [147]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split
# Load the landslide catalog; the path is relative to the notebook's working directory.
df = pd.read_csv('GLC.csv')

In [148]:
df

Unnamed: 0,landslide_category,landslide_size,fatality_count,event_date,landslide_trigger,landslide_setting,country_name,country_code,admin_division_population,longitude,latitude
0,landslide,large,11.0,8/1/2008 0:00,rain,mine,China,CN,0.0,107.450000,32.562500
1,mudslide,small,0.0,1/2/2009 2:00,downpour,unknown,United States,US,36619.0,-122.663000,45.420000
2,landslide,large,10.0,1/19/2007 0:00,downpour,unknown,Peru,PE,14708.0,-75.358700,-11.129500
3,landslide,medium,1.0,7/31/2009 0:00,monsoon,unknown,Nepal,NP,20908.0,81.708000,28.837800
4,landslide,medium,0.0,10/16/2010 12:00,tropical_cyclone,unknown,Philippines,PH,798634.0,123.897800,10.333600
...,...,...,...,...,...,...,...,...,...,...,...
11028,landslide,medium,27.0,4/1/2017 13:34,rain,natural_slope,,,,111.679944,-7.853409
11029,landslide,small,2.0,3/25/2017 17:32,other,natural_slope,,,,75.680611,33.403080
11030,landslide,small,1.0,12/15/2016 5:00,unknown,urban,,,,91.772042,26.181606
11031,translational_slide,large,24.0,4/29/2017 19:03,downpour,natural_slope,,,,73.472379,40.886395


In [149]:
# Replace missing labels in the four categorical landslide columns with an explicit
# 'not_provided' sentinel so they are counted and encoded like any other value.
df = df.fillna({
    'landslide_size': 'not_provided',
    'landslide_category': 'not_provided',
    'landslide_trigger': 'not_provided',
    'landslide_setting': 'not_provided',
})


In [150]:
# Count how many events fall in each landslide category (most frequent first).
a = df['landslide_category'].value_counts()

print(a)

landslide_category
landslide              7648
mudslide               2100
rock_fall               671
complex                 232
debris_flow             194
other                    68
unknown                  38
riverbank_collapse       37
snow_avalanche           15
translational_slide       9
lahar                     7
earth_flow                7
creep                     5
not_provided              1
topple                    1
Name: count, dtype: int64


In [151]:
# Distinct category labels as a plain Python list, in order of first appearance.
distinct_values_list = df['landslide_category'].unique().tolist()

In [152]:
distinct_values_list

['landslide',
 'mudslide',
 'complex',
 'rock_fall',
 'debris_flow',
 'riverbank_collapse',
 'other',
 'unknown',
 'lahar',
 'snow_avalanche',
 'creep',
 'earth_flow',
 'translational_slide',
 'not_provided',
 'topple']

In [153]:
# Ordinal codes for the landslide_size labels (small < medium < ... < catastrophic).
landslide_size_code_transform = {
    'unknown': 0,  # 'unknown' is coded 0 because it feels more likely to be small than catastrophic
    'small': 1,
    'medium': 2,
    'large': 3,
    'very_large': 4,
    'catastrophic': 5,
    # Missing sizes are replaced with the string 'not_provided' before encoding, so the
    # lookup key has to be that sentinel. The original 'NaN' key could never match and
    # left those rows without a code.
    # NOTE(review): 6 ranks missing sizes above 'catastrophic' - confirm that is intended.
    'not_provided': 6,
    'NaN': 6,  # legacy key, kept so any existing lookups of the literal string 'NaN' still work
}

In [154]:
# Add an integer-coded companion column for each string feature.
# Category, trigger, setting and country are nominal, so they use pandas' categorical
# codes; a missing value is encoded as -1 rather than NaN.
df['landslide_category_codes'] = df['landslide_category'].astype('category').cat.codes
# Size is ordinal, so it goes through the hand-written mapping instead of categorical codes.
# Series.map looks each value up directly (no positional loop, so it does not depend on
# the frame having a default 0..n-1 index); labels absent from the mapping become NaN.
df['landslide_size_codes'] = df['landslide_size'].map(landslide_size_code_transform)
df['landslide_trigger_codes'] = df['landslide_trigger'].astype('category').cat.codes
df['landslide_setting_codes'] = df['landslide_setting'].astype('category').cat.codes
df['country_code_codes'] = df['country_code'].astype('category').cat.codes

In [155]:
df

Unnamed: 0,landslide_category,landslide_size,fatality_count,event_date,landslide_trigger,landslide_setting,country_name,country_code,admin_division_population,longitude,latitude,landslide_category_codes,landslide_size_codes,landslide_trigger_codes,landslide_setting_codes,country_code_codes
0,landslide,large,11.0,8/1/2008 0:00,rain,mine,China,CN,0.0,107.450000,32.562500,5,3.0,13,8,29
1,mudslide,small,0.0,1/2/2009 2:00,downpour,unknown,United States,US,36619.0,-122.663000,45.420000,6,1.0,3,13,129
2,landslide,large,10.0,1/19/2007 0:00,downpour,unknown,Peru,PE,14708.0,-75.358700,-11.129500,5,3.0,3,13,99
3,landslide,medium,1.0,7/31/2009 0:00,monsoon,unknown,Nepal,NP,20908.0,81.708000,28.837800,5,2.0,9,13,95
4,landslide,medium,0.0,10/16/2010 12:00,tropical_cyclone,unknown,Philippines,PH,798634.0,123.897800,10.333600,5,2.0,15,13,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11028,landslide,medium,27.0,4/1/2017 13:34,rain,natural_slope,,,,111.679944,-7.853409,5,2.0,13,9,-1
11029,landslide,small,2.0,3/25/2017 17:32,other,natural_slope,,,,75.680611,33.403080,5,1.0,12,9,-1
11030,landslide,small,1.0,12/15/2016 5:00,unknown,urban,,,,91.772042,26.181606,5,1.0,16,14,-1
11031,translational_slide,large,24.0,4/29/2017 19:03,downpour,natural_slope,,,,73.472379,40.886395,13,3.0,3,9,-1


In [156]:
# Remove the original string columns now that each has a numeric *_codes counterpart.
# Dropping them is probably not strictly required, but it leaves a purely numeric frame.
df = df.drop(columns=[
    'landslide_category',
    'landslide_size',
    'landslide_trigger',
    'landslide_setting',
    'country_name',
    'country_code',
])

In [157]:
df

Unnamed: 0,fatality_count,event_date,admin_division_population,longitude,latitude,landslide_category_codes,landslide_size_codes,landslide_trigger_codes,landslide_setting_codes,country_code_codes
0,11.0,8/1/2008 0:00,0.0,107.450000,32.562500,5,3.0,13,8,29
1,0.0,1/2/2009 2:00,36619.0,-122.663000,45.420000,6,1.0,3,13,129
2,10.0,1/19/2007 0:00,14708.0,-75.358700,-11.129500,5,3.0,3,13,99
3,1.0,7/31/2009 0:00,20908.0,81.708000,28.837800,5,2.0,9,13,95
4,0.0,10/16/2010 12:00,798634.0,123.897800,10.333600,5,2.0,15,13,101
...,...,...,...,...,...,...,...,...,...,...
11028,27.0,4/1/2017 13:34,,111.679944,-7.853409,5,2.0,13,9,-1
11029,2.0,3/25/2017 17:32,,75.680611,33.403080,5,1.0,12,9,-1
11030,1.0,12/15/2016 5:00,,91.772042,26.181606,5,1.0,16,14,-1
11031,24.0,4/29/2017 19:03,,73.472379,40.886395,13,3.0,3,9,-1


In [158]:
# Parse the event timestamp and derive numeric calendar features from it.
# The parsed datetimes are held in a local Series instead of a temporary column,
# so nothing has to be added to the frame and then deleted again.
event_datetime = pd.to_datetime(df['event_date'], format='%m/%d/%Y %H:%M')

df['year'] = event_datetime.dt.year
df['day_of_year'] = event_datetime.dt.dayofyear
df['minute_of_day'] = event_datetime.dt.hour * 60 + event_datetime.dt.minute

# The raw date string is replaced by the features above.
# (The former unix_timestamp column was computed and deleted within this same cell,
# so it never reached any later step and is no longer computed.)
del df['event_date']

In [159]:
# Keep only the rows that have a value in every modelling column.
# Note: the categorical *_codes columns use -1 (not NaN) for missing labels, so they
# never cause a row to be dropped here on their own.
required_columns = [
    'fatality_count', 'admin_division_population', 'longitude', 'latitude',
    'landslide_category_codes', 'landslide_size_codes', 'landslide_trigger_codes',
    'landslide_setting_codes', 'country_code_codes',
    'year', 'day_of_year', 'minute_of_day',
]
nan_filtered_df = df.dropna(subset=required_columns)
nan_filtered_df




Unnamed: 0,fatality_count,admin_division_population,longitude,latitude,landslide_category_codes,landslide_size_codes,landslide_trigger_codes,landslide_setting_codes,country_code_codes,year,day_of_year,minute_of_day
0,11.0,0.0,107.4500,32.5625,5,3.0,13,8,29,2008,214,0
1,0.0,36619.0,-122.6630,45.4200,6,1.0,3,13,129,2009,2,120
2,10.0,14708.0,-75.3587,-11.1295,5,3.0,3,13,99,2007,19,0
3,1.0,20908.0,81.7080,28.8378,5,2.0,9,13,95,2009,212,0
4,0.0,798634.0,123.8978,10.3336,5,2.0,15,13,101,2010,289,720
...,...,...,...,...,...,...,...,...,...,...,...,...
9889,0.0,31089.0,124.7333,6.3666,5,2.0,3,13,101,2011,18,0
9918,0.0,2689.0,125.4782,7.1426,5,2.0,3,13,101,2009,15,0
9939,0.0,16671.0,125.9667,7.6000,5,2.0,15,13,101,2014,12,0
10005,0.0,4534.0,-122.9542,38.4738,5,1.0,3,13,129,2011,48,540


In [160]:
# Target is the fatality count; the features are every other column of the filtered frame.
y = nan_filtered_df['fatality_count']
X = nan_filtered_df.drop(columns=['fatality_count'])

In [161]:
# Hold out 25% of the rows for evaluation. The fixed seed makes the split, and every
# score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=88)

In [162]:
# Baseline: linear regression fitted on the training split only.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

In [163]:
# NOTE(review): this repeats the import and fits a second LinearRegression with the same
# settings on the same training data as `model` in the previous cell - confirm both
# names are actually needed.
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, y_train)

In [164]:
# k-nearest-neighbours regressor that predicts from the 2 closest fitted samples.
from sklearn.neighbors import KNeighborsRegressor
KNNTLA = KNeighborsRegressor(n_neighbors=2)

In [165]:
# Fit on the training split only, consistent with the linear models. Fitting on the full
# X, y would let the model see the held-out rows, so any score on X_test would be
# optimistically biased.
KNNTLA.fit(X_train, y_train)

In [166]:
#KNN, logistic, SVM, MLP... the only question is which of those to fiddle around with a lot to get it to perform better.
#See which one looks best first.