In [1]:
import pandas as pd
from sklearn.neighbors import BallTree
import math
import numpy as np
import statistics

### Sets the relative root path to search for files, here we take the parent folder as the root

In [21]:
import sys
import pathlib
import pandas as pd

ROOT = pathlib.Path().absolute().parent.as_posix()
if ROOT not in sys.path:
    sys.path.append(ROOT)
    
from helpers import *

In [39]:
df = pd.read_csv(ROOT_FOLDER_PATH + '/land_type_025.csv', index_col = 0)

#preprocess land type columns, convert to binary from boolean
for i in ['Airport', 'Water', 'Building', 'Green_Space', 'Railway_Station', 'Urban_Area']:
    df[i] = df[i].astype(int)

In [40]:
df_filter = df[(df['Latitude'] >= 51.46506) & (df['Latitude'] <= 51.47855)]
df_filter = df_filter[(df_filter['Longitude'] >= -0.4865776) & (df_filter['Longitude'] <= -0.4216181)]

df_filter['Airport'] = 1

df_filter

Unnamed: 0,Latitude,Longitude,Airport,Water,Building,Green_Space,Railway_Station,Urban_Area
30264,51.478545,-0.483491,1,0,1,0,0,1
30265,51.478541,-0.479881,1,0,1,0,0,1
30266,51.478536,-0.476271,1,0,1,0,0,1
30267,51.478532,-0.472661,1,0,1,0,0,1
30268,51.478527,-0.469051,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...
31597,51.467238,-0.436138,1,0,0,0,0,1
31598,51.467232,-0.432529,1,0,1,0,0,1
31599,51.467226,-0.428920,1,0,1,0,0,1
31600,51.467220,-0.425311,1,0,1,0,0,1


In [41]:
df.update(df_filter)

In [42]:
df.to_csv(ROOT_FOLDER_PATH + '/land_type_025.csv', index = True)
upload_df_to_s3(bucket = 'asdi-hackathon', df = df, key = 'land_type_025.csv')

Successful upload


In [4]:
df

Unnamed: 0,Latitude,Longitude,Airport,Water,Building,Green_Space,Railway_Station,Urban_Area
0,51.737184,-0.620643,0,0,0,1,0,0
1,51.737184,-0.617012,0,0,0,0,0,0
2,51.737184,-0.613382,0,0,1,1,0,0
3,51.737183,-0.609751,0,0,1,1,0,0
4,51.737183,-0.606120,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
58243,51.238843,0.312049,0,0,0,1,0,0
58244,51.238815,0.315640,0,0,0,1,0,0
58245,51.238786,0.319231,0,1,1,1,0,0
58246,51.238757,0.322822,0,1,1,0,0,0


In [5]:
df_greenspace1 = df.loc[df.Green_Space == 1, :]
df_greenspace1 = df_greenspace1[['Latitude', 'Longitude']]
df_greenspace1 = df_greenspace1.apply(np.radians)

df_greenspace0_1 = df
df_greenspace0_1 = df_greenspace0_1[['Latitude', 'Longitude']]
df_greenspace0_1 = df_greenspace0_1.apply(np.radians)

### BallTree search algorithm to find the nearest 3 neighbours of the greenspace = 1 data points and find an average distance

#### Given the green space land type considers both large and small greenspaces, to avoid finding a single, small nearest green space (which would reflect a lesser benefit), we consider an aggregated distance of 3 green space points. Nearby larger green spaces like parks are better reflected.

In [6]:
tree = BallTree(df_greenspace1, leaf_size=40, metric = 'haversine')   
dist, ind = tree.query(df_greenspace0_1, k=3)

In [7]:
dist[1][0]

3.924025344306556e-05

In [8]:
print(dist)

[[0.00000000e+00 3.92536504e-05 7.84805069e-05]
 [3.92402534e-05 3.92402534e-05 5.47755427e-05]
 [0.00000000e+00 3.92402534e-05 5.62220690e-05]
 ...
 [0.00000000e+00 3.92383804e-05 3.92402534e-05]
 [3.92383149e-05 3.92402534e-05 3.92402534e-05]
 [0.00000000e+00 3.92382493e-05 5.62069776e-05]]


#### Take the mean of the distances from the nearest 3 greenspaces. Mean over median because we want the 0 values (i.e. greenspace currently at coordinate) to be fully reflected in final aggregated distance. Extremes on the other end should be limited by the fact only considering 3 closest neighbours, the likelihood of a really high distance skewing the aggregated distance is small at k=3

In [9]:
for i in range(len(dist)):
    dist[i] = statistics.mean(dist[i])
print(dist)

[[3.92447191e-05 3.92447191e-05 3.92447191e-05]
 [4.44186832e-05 4.44186832e-05 4.44186832e-05]
 [3.18207742e-05 3.18207742e-05 3.18207742e-05]
 ...
 [2.61595446e-05 2.61595446e-05 2.61595446e-05]
 [3.92396072e-05 3.92396072e-05 3.92396072e-05]
 [3.18150756e-05 3.18150756e-05 3.18150756e-05]]


In [11]:
print(len(df_greenspace0_1))
print(len(dist))

58248
58248


In [12]:
for i in range(len(dist)):
    print(dist[i][0])

3.924471910399256e-05
4.4418683198187253e-05
3.182077415923306e-05
2.6160168962043287e-05
2.6160168962043423e-05
2.6160168962043294e-05
2.616016896204362e-05
2.616451541569653e-05
3.9244582786609734e-05
2.6164481175236844e-05
2.6160168962043477e-05
3.133847639013996e-05
4.490499988132587e-05
4.442277972349102e-05
2.616439522178049e-05
2.616016896204345e-05
2.616436069945154e-05
3.924442788926077e-05
2.6164326096673016e-05
2.616016896204355e-05
2.616429141337164e-05
4.4418437102251496e-05
2.616425664943484e-05
2.616016896204355e-05
2.6160168962043524e-05
2.616016896204364e-05
2.6160168962043724e-05
2.6160168962043253e-05
2.6160168962043613e-05
2.6160168962043585e-05
2.6160168962043443e-05
2.616016896204395e-05
2.6160168962043735e-05
2.6160168962043375e-05
2.616016896204358e-05
2.6160168962043802e-05
2.6160168962043545e-05
2.6160168962043545e-05
2.616016896204372e-05
2.616016896204347e-05
2.6160168962043585e-05
3.133810075730728e-05
3.924398906229184e-05
2.6163886766357166e-05
2.61601689

In [13]:
distances = []
for i in range(len(dist)):
    distances.append(dist[i][0])
    
radius_earth = 6371   #radius of earth in km
#convert distances to km by multiplying by radius of the earth
distances_km = [item * radius_earth for item in distances]

In [14]:
distances_km

[0.2500281054115366,
 0.28299143065565097,
 0.20273015216847382,
 0.1666664364571778,
 0.16666643645717866,
 0.16666643645717782,
 0.1666664364571799,
 0.1666941277134026,
 0.2500272369334906,
 0.16669390956743393,
 0.166666436457179,
 0.19965743308158168,
 0.28608975424392713,
 0.28301752961836124,
 0.1666933619579635,
 0.16666643645717882,
 0.16669314201620578,
 0.25002625008248036,
 0.1666929215619038,
 0.16666643645717946,
 0.1666927005945907,
 0.2829898627784443,
 0.16669247911354937,
 0.16666643645717946,
 0.1666664364571793,
 0.16666643645718002,
 0.16666643645718057,
 0.16666643645717757,
 0.16666643645717985,
 0.16666643645717968,
 0.16666643645717877,
 0.16666643645718202,
 0.16666643645718063,
 0.16666643645717835,
 0.16666643645717963,
 0.16666643645718107,
 0.16666643645717943,
 0.16666643645717943,
 0.16666643645718054,
 0.16666643645717896,
 0.16666643645717968,
 0.19965503992480468,
 0.2500234543158613,
 0.1666901225884615,
 0.166666436457181,
 0.2027268160120744,
 0.25

In [15]:
column_values = pd.Series(distances_km)
df.insert(loc=8, column='Distance_Nearest_Greenspace', value=column_values)
df

Unnamed: 0,Latitude,Longitude,Airport,Water,Building,Green_Space,Railway_Station,Urban_Area,Distance_Nearest_Greenspace
0,51.737184,-0.620643,0,0,0,1,0,0,0.250028
1,51.737184,-0.617012,0,0,0,0,0,0,0.282991
2,51.737184,-0.613382,0,0,1,1,0,0,0.202730
3,51.737183,-0.609751,0,0,1,1,0,0,0.166666
4,51.737183,-0.606120,0,0,0,1,0,0,0.166666
...,...,...,...,...,...,...,...,...,...
58243,51.238843,0.312049,0,0,0,1,0,0,0.166663
58244,51.238815,0.315640,0,0,0,1,0,0,0.166663
58245,51.238786,0.319231,0,1,1,1,0,0,0.166662
58246,51.238757,0.322822,0,1,1,0,0,0,0.249996


In [16]:
df['Distance_Nearest_Greenspace'].value_counts()

0.166666    258
0.166666    239
0.166666    234
0.166666    232
0.166666    228
           ... 
0.166675      1
0.166675      1
0.282989      1
0.319037      1
0.202694      1
Name: Distance_Nearest_Greenspace, Length: 28238, dtype: int64

In [17]:
df['Distance_Nearest_Greenspace'].describe()

count    58248.000000
mean         0.199523
std          0.080860
min          0.166658
25%          0.166666
50%          0.166666
75%          0.166691
max          1.390527
Name: Distance_Nearest_Greenspace, dtype: float64

### Test function created based on this notebook from helpers.py

In [19]:
df = pd.read_csv(ROOT_FOLDER_PATH + '/land_type_025.csv', index_col = 0)

#preprocess land type columns, convert to binary from boolean
for i in ['Airport', 'Water', 'Building', 'Green_Space', 'Railway_Station', 'Urban_Area']:
    df[i] = df[i].astype(int)

from time import time
start = time()
df_processed = dist_nearest_greenspace_function(df)
end = time()
time_taken1 = round(end - start, 2)
print('dist_nearest_greenspace_function complete')                      
print('Time taken:', time_taken1)

dist_nearest_greenspace_function complete
Time taken: 1.72


In [20]:
df_processed

Unnamed: 0,Latitude,Longitude,Airport,Water,Building,Green_Space,Railway_Station,Urban_Area,Distance_Nearest_Greenspace
0,51.737184,-0.620643,0,0,0,1,0,0,0.250028
1,51.737184,-0.617012,0,0,0,0,0,0,0.282991
2,51.737184,-0.613382,0,0,1,1,0,0,0.202730
3,51.737183,-0.609751,0,0,1,1,0,0,0.166666
4,51.737183,-0.606120,0,0,0,1,0,0,0.166666
...,...,...,...,...,...,...,...,...,...
58243,51.238843,0.312049,0,0,0,1,0,0,0.166663
58244,51.238815,0.315640,0,0,0,1,0,0,0.166663
58245,51.238786,0.319231,0,1,1,1,0,0,0.166662
58246,51.238757,0.322822,0,1,1,0,0,0,0.249996
