In [9]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split
df = pd.read_csv('GLC.csv')

In [10]:
df

Unnamed: 0,landslide_category,landslide_size,fatality_count,event_date,landslide_trigger,landslide_setting,country_name,country_code,admin_division_population,longitude,latitude
0,landslide,large,11.0,8/1/2008 0:00,rain,mine,China,CN,0.0,107.450000,32.562500
1,mudslide,small,0.0,1/2/2009 2:00,downpour,unknown,United States,US,36619.0,-122.663000,45.420000
2,landslide,large,10.0,1/19/2007 0:00,downpour,unknown,Peru,PE,14708.0,-75.358700,-11.129500
3,landslide,medium,1.0,7/31/2009 0:00,monsoon,unknown,Nepal,NP,20908.0,81.708000,28.837800
4,landslide,medium,0.0,10/16/2010 12:00,tropical_cyclone,unknown,Philippines,PH,798634.0,123.897800,10.333600
...,...,...,...,...,...,...,...,...,...,...,...
11028,landslide,medium,27.0,4/1/2017 13:34,rain,natural_slope,,,,111.679944,-7.853409
11029,landslide,small,2.0,3/25/2017 17:32,other,natural_slope,,,,75.680611,33.403080
11030,landslide,small,1.0,12/15/2016 5:00,unknown,urban,,,,91.772042,26.181606
11031,translational_slide,large,24.0,4/29/2017 19:03,downpour,natural_slope,,,,73.472379,40.886395


In [11]:
#ADD: Make any data that have easily diterminable order use that order (small, medium, large, etc.)

In [12]:
#Adds numeric codes for each string feature.
df['landslide_category_codes'] = df['landslide_category'].astype('category').cat.codes
df['landslide_size_codes'] = df['landslide_size'].astype('category').cat.codes
df['landslide_trigger_codes'] = df['landslide_trigger'].astype('category').cat.codes
df['landslide_setting_codes'] = df['landslide_setting'].astype('category').cat.codes
df['country_code_codes'] = df['country_code'].astype('category').cat.codes

In [13]:
#This is just for reference. We probably don't actually need to drop this data, but this sends the strings away.
del df['landslide_category']
del df['landslide_size']
del df['landslide_trigger']
del df['landslide_setting']
del df['country_name']
del df['country_code']

In [14]:
df

Unnamed: 0,fatality_count,event_date,admin_division_population,longitude,latitude,landslide_category_codes,landslide_size_codes,landslide_trigger_codes,landslide_setting_codes,country_code_codes
0,11.0,8/1/2008 0:00,0.0,107.450000,32.562500,5,1,12,8,29
1,0.0,1/2/2009 2:00,36619.0,-122.663000,45.420000,6,3,3,12,129
2,10.0,1/19/2007 0:00,14708.0,-75.358700,-11.129500,5,1,3,12,99
3,1.0,7/31/2009 0:00,20908.0,81.708000,28.837800,5,2,9,12,95
4,0.0,10/16/2010 12:00,798634.0,123.897800,10.333600,5,2,14,12,101
...,...,...,...,...,...,...,...,...,...,...
11028,27.0,4/1/2017 13:34,,111.679944,-7.853409,5,2,12,9,-1
11029,2.0,3/25/2017 17:32,,75.680611,33.403080,5,3,11,9,-1
11030,1.0,12/15/2016 5:00,,91.772042,26.181606,5,3,15,13,-1
11031,24.0,4/29/2017 19:03,,73.472379,40.886395,12,1,3,9,-1


In [16]:
df['datetime'] = pd.to_datetime(df['event_date'], format='%m/%d/%Y %H:%M')

df['year'] = df['datetime'].dt.year
df['day_of_year'] = df['datetime'].dt.dayofyear
df['minute_of_day'] = df['datetime'].dt.hour * 60 + df['datetime'].dt.minute
df['unix_timestamp'] = df['datetime'].astype('int64') // 10**9  # Convert to seconds

del df['datetime']
del df['event_date']

KeyError: 'event_date'

In [21]:
nan_filtered_df = df[df[['fatality_count', 'admin_division_population', 'longitude', 'latitude', 
                         'landslide_category_codes', 'landslide_size_codes', 'landslide_trigger_codes', 
                         'landslide_setting_codes', 'country_code_codes', 'year', 'day_of_year', 
                         'minute_of_day', 'unix_timestamp']].notnull().all(1)]
nan_filtered_df




Unnamed: 0,fatality_count,admin_division_population,longitude,latitude,landslide_category_codes,landslide_size_codes,landslide_trigger_codes,landslide_setting_codes,country_code_codes,year,day_of_year,minute_of_day,unix_timestamp
0,11.0,0.0,107.4500,32.5625,5,1,12,8,29,2008,214,0,1217548800
1,0.0,36619.0,-122.6630,45.4200,6,3,3,12,129,2009,2,120,1230861600
2,10.0,14708.0,-75.3587,-11.1295,5,1,3,12,99,2007,19,0,1169164800
3,1.0,20908.0,81.7080,28.8378,5,2,9,12,95,2009,212,0,1248998400
4,0.0,798634.0,123.8978,10.3336,5,2,14,12,101,2010,289,720,1287230400
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9889,0.0,31089.0,124.7333,6.3666,5,2,3,12,101,2011,18,0,1295308800
9918,0.0,2689.0,125.4782,7.1426,5,2,3,12,101,2009,15,0,1231977600
9939,0.0,16671.0,125.9667,7.6000,5,2,14,12,101,2014,12,0,1389484800
10005,0.0,4534.0,-122.9542,38.4738,5,3,3,12,129,2011,48,540,1297933200


In [25]:
X = nan_filtered_df.drop(columns='fatality_count')
y = nan_filtered_df['fatality_count']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y) #test_size=0.25, random_state=88

In [29]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

In [28]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, y_train)

In [11]:
from sklearn.neighbors import KNeighborsRegressor
KNNTLA = KNeighborsRegressor(n_neighbors=2)

In [12]:
KNNTLA.fit(X, y)

ValueError: Input X contains NaN.
KNeighborsRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [22]:
#Knn, logistic, svm, mlp... only q is which of those to fiddle around with a lot to get it to preform better. 
#See which one looks best first.