In [2]:
import sys
import pathlib
import pandas as pd
import numpy as np
import boto3

# Put the parent of the current working directory on the import path so that
# project-local modules (e.g. `helpers`) can be imported from this notebook.
ROOT = pathlib.Path.cwd().parent.as_posix()
if ROOT not in sys.path:
    sys.path.append(ROOT)
    
from helpers import *

In [7]:
# Fetch the two population CSV tiles from S3 and parse each into a DataFrame.
client = boto3.client('s3')
bucket = 'asdi-hackathon'

# Object keys of the two tiles to combine.
file_key1 = 'population-data/popcsv/longitude0.csv'
file_key2 = 'population-data/popcsv/longitudeNeg10.csv'

# Request both objects first, then read their streaming bodies in the same order.
obj1, obj2 = (
    client.get_object(Bucket=bucket, Key=key)
    for key in (file_key1, file_key2)
)
df1, df2 = (pd.read_csv(obj['Body']) for obj in (obj1, obj2))

In [8]:
# Check the column dtypes of df1: are the numeric columns float64 or float32?
df1.dtypes

Unnamed: 0      int64
latitude      float64
longitude     float64
population    float64
dtype: object

In [9]:
# Preview the first rows of df1 (read from longitude0.csv).
df1.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,population
0,1308823,51.560139,0.227083,5.40624
1,1308824,51.560694,0.22625,5.40624
2,1308825,51.560417,0.22625,5.40624
3,1308826,51.560417,0.228472,5.40624
4,1308827,51.559861,0.227361,5.40624


In [10]:
# Preview the first rows of df2 (read from longitudeNeg10.csv).
df2.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,population
0,126943,51.620417,-0.288472,46.080284
1,126944,51.620417,-0.289306,46.080284
2,126945,51.620417,-0.289028,46.080284
3,126946,51.620139,-0.288194,46.080284
4,126947,51.620139,-0.28875,46.080284


In [11]:
# Stack both tiles into one frame with a fresh 0..n-1 index, then discard the
# 'Unnamed: 0' column that came along from the CSV files.
df = pd.concat([df1, df2], ignore_index=True).drop(columns='Unnamed: 0')
df

Unnamed: 0,latitude,longitude,population
0,51.560139,0.227083,5.406240
1,51.560694,0.226250,5.406240
2,51.560417,0.226250,5.406240
3,51.560417,0.228472,5.406240
4,51.559861,0.227361,5.406240
...,...,...,...
1625276,51.554583,-0.208472,33.506246
1625277,51.553750,-0.207639,33.506246
1625278,51.554028,-0.207639,33.506246
1625279,51.554861,-0.208750,33.506246


In [12]:
# Summary statistics of the population column.
df['population'].describe()

count    1.625281e+06
mean     6.951418e+00
std      8.148915e+00
min      1.374282e-01
25%      3.012382e+00
50%      5.275737e+00
75%      8.442115e+00
max      6.827970e+02
Name: population, dtype: float64

In [13]:
# Count missing values per column.
df.isna().sum()

latitude      0
longitude     0
population    0
dtype: int64

In [14]:
# Flag every row whose (latitude, longitude) pair appears more than once
# (keep=False marks all members of a duplicate group) and print how many there are.
coord_dupes = df.duplicated(subset=['latitude', 'longitude'], keep=False)
print(coord_dupes.sum())

0


In [15]:
# Latitude extent of the data: print the maximum, display the minimum.
# Series.max()/min() are vectorised; the Python builtins would iterate the
# whole column element by element and behave unpredictably if NaNs were present.
latitudes = df['latitude']
print(latitudes.max())
latitudes.min()

51.69291666666667


51.26513888888889

In [16]:
# Longitude extent of the data: print the maximum, display the minimum.
# Series.max()/min() are vectorised; the Python builtins would iterate the
# whole column element by element and behave unpredictably if NaNs were present.
longitudes = df['longitude']
print(longitudes.max())
longitudes.min()

0.56125


-0.7465277777777778

In [17]:
# Keep only rows inside the bounding box below (degrees, inclusive on every edge).
# NOTE(review): the box presumably matches the extent of the plotting grid in
# points_df -- confirm where these bounds come from.
LAT_MIN, LAT_MAX = 51.239405, 51.737184
LON_MIN, LON_MAX = -0.625211, 0.328289

nrow_before = len(df)
in_bbox = (
    df['latitude'].between(LAT_MIN, LAT_MAX)
    & df['longitude'].between(LON_MIN, LON_MAX)
)
# .copy() makes the result an independent frame, so later cells that assign
# columns on `df` do not trigger SettingWithCopyWarning.
df_filter = df.loc[in_bbox].copy()
nrow_after = len(df_filter)
print('Number of rows removed:', nrow_before - nrow_after)
df = df_filter

Number of rows removed: 264834


In [18]:
# Display the frame after the bounding-box filter.
df

Unnamed: 0,latitude,longitude,population
0,51.560139,0.227083,5.406240
1,51.560694,0.226250,5.406240
2,51.560417,0.226250,5.406240
3,51.560417,0.228472,5.406240
4,51.559861,0.227361,5.406240
...,...,...,...
1625276,51.554583,-0.208472,33.506246
1625277,51.553750,-0.207639,33.506246
1625278,51.554028,-0.207639,33.506246
1625279,51.554861,-0.208750,33.506246


In [19]:
import sys
import pathlib

# Put the parent of the current working directory on the import path.
ROOT = pathlib.Path().absolute().parent.as_posix()
if ROOT not in sys.path:
    sys.path.append(ROOT)

# Load the grid of target points. Fail loudly if the file is missing: the
# previous bare `except` only printed a hint and then crashed on the rename
# below (or silently reused a stale `points_df` from an earlier run).
points_path = pathlib.Path(ROOT) / 'Spikes' / 'Dash' / 'data' / 'points_df.csv'
try:
    points_df = pd.read_csv(points_path, index_col=0)
except FileNotFoundError as err:
    raise FileNotFoundError(
        '{} not found. Make sure points_df.csv has been initialised via '
        'create_df_with_all_spatial_points.ipynb with the desired resolution'.format(points_path)
    ) from err

# Use the same lowercase column names as the population frame.
points_df = points_df.rename(columns={'Latitude': 'latitude', 'Longitude': 'longitude'})

In [20]:
# Show the dtypes of points_df, a 50-dash separator, then the dtypes of df.
print(points_df.dtypes, '-' * 50, sep='\n')
df.dtypes

latitude     float64
longitude    float64
dtype: object
--------------------------------------------------


latitude      float64
longitude     float64
population    float64
dtype: object

In [21]:
# points_df = points_df.sort_values(by=['latitude'], ascending = False)
# df = df.sort_values(by=['latitude'], ascending = False)
# print(points_df.head())
# print(df.head())

In [22]:
# Left-join the population data onto the target grid: every row of points_df is
# kept, and population is filled in only where df contains exactly the same
# (latitude, longitude) pair; grid points without a match get NaN.
join_keys = ['latitude', 'longitude']
df_merged = points_df.merge(df, how="left", left_on=join_keys, right_on=join_keys)

In [23]:
# Row counts, in order: target grid, population data, merged result.
for frame in (points_df, df, df_merged):
    print(len(frame))

58248
1360447
58248


In [24]:
# Count missing values per column of the merged frame; a NaN population marks a
# points_df row that found no exact (latitude, longitude) match in df.
df_merged.isna().sum()

latitude          0
longitude         0
population    58248
dtype: int64

In [25]:
# Display the merged frame.
df_merged

Unnamed: 0,latitude,longitude,population
0,51.737184,-0.620643,
1,51.737184,-0.617012,
2,51.737184,-0.613382,
3,51.737183,-0.609751,
4,51.737183,-0.606120,
...,...,...,...
58243,51.238843,0.312049,
58244,51.238815,0.315640,
58245,51.238786,0.319231,
58246,51.238757,0.322822,


In [26]:
# The dataset is huge (already finer than the resolution we can plot), so to avoid computation issues we can optionally take a 50% sample of the data
# print(len(df))
# df = df.sample(frac=0.50)
# print(len(df))

In [27]:
# Nearest-neighbour search with KNN on data this dense is very slow, so the
# population column is downcast from float64 to float32 to cut memory use and
# compute time. The precision loss should be minimal, and it matters less for
# population than it would for latitude/longitude.
# Rough decimal precision by width:
#   16-bit: 0.1235
#   32-bit: 0.12345679
#   64-bit: 0.12345678912121212
population_f32 = df['population'].astype('float32')
df['population'] = population_f32

df.dtypes

latitude      float64
longitude     float64
population    float32
dtype: object

In [28]:
#haversine requires latitude and longitude in radians
import math  # not used by this cell any more; kept in case other cells rely on it
# Convert both coordinate columns in one vectorised pass instead of calling
# math.radians once per row. assign() builds a new frame rather than writing
# into the filtered one, so no SettingWithCopyWarning is raised.
# NOTE: this cell is not idempotent -- running it twice converts the
# coordinates twice. Re-run the notebook from the load cells if unsure.
df = df.assign(
    latitude=np.radians(df['latitude']),
    longitude=np.radians(df['longitude']),
)

In [29]:
from sklearn.neighbors import KNeighborsRegressor

# Train on all of the data: features are (latitude, longitude), target is population.
X_train = df[['latitude', 'longitude']]
# to_numpy() yields the same 1-D array as the deprecated Series.ravel()
# (pandas >= 2.2 emits a FutureWarning for ravel()).
y_train = df['population'].to_numpy()

# A low k works better for accuracy on a high-resolution dataset such as
# population density. With k = 1 the prediction is simply the nearest
# neighbour's value, so weights='distance' has no effect on the result.
k = 1
# Tree-based searches are more computationally efficient than brute force.
# The KD-tree does not support the haversine metric; the ball tree does.
model = KNeighborsRegressor(
    n_neighbors=k,
    weights='distance',
    algorithm='ball_tree',
    metric='haversine',
    n_jobs=-1,
)
model.fit(X_train, y_train)

In [30]:
# pickle final model locally
# import pickle
# filename = ROOT + '/Pickles/popdensity_model.pkl'
# pickle.dump(model, open(filename, 'wb'))

In [3]:
# Upload the fitted model as a pickle into the S3 bucket's pickles/ folder.
# NOTE(review): upload_pickle_to_s3 is presumably provided by `from helpers import *`
# -- confirm. The execution count shows this cell was run out of order; it needs
# `model` from the training cell to exist.
s3_bucket = 'asdi-hackathon'
model_key = 'pickles/popdensity_model.pkl'
upload_pickle_to_s3(s3_bucket, model, model_key)

Successful upload
