In [2]:
import pandas as pd
from sklearn.neighbors import BallTree
import math
import numpy as np

In [1]:
import sys
import pathlib
import pandas as pd

# Absolute path of the parent of the current working directory, as a POSIX-style string.
ROOT = pathlib.Path().absolute().parent.as_posix()
# Add that directory to the import search path once (re-running the cell does not
# append it again) so that the `helpers` module below can be imported from it.
if ROOT not in sys.path:
    sys.path.append(ROOT)
    
# NOTE(review): the star import hides where names come from. ROOT_FOLDER_PATH and
# dist_nearest_greenspace_function are used later without a local definition, so
# they presumably come from helpers -- consider importing them explicitly.
from helpers import *

In [2]:
# Load the land-type grid; the first CSV column becomes the index.
land_type_csv = ROOT_FOLDER_PATH + '/land_type_025.csv'
df = pd.read_csv(land_type_csv, index_col=0)

# Preprocess the land-type columns: cast the boolean flags to 0/1 integers.
land_type_columns = ['Airport', 'Water', 'Building', 'Green_Space', 'Railway_Station', 'Urban_Area']
df = df.astype({column: int for column in land_type_columns})

In [3]:
df

Unnamed: 0,Latitude,Longitude,Airport,Water,Building,Green_Space,Railway_Station,Urban_Area
0,51.737184,-0.620643,0,0,0,1,0,0
1,51.737184,-0.617012,0,0,0,0,0,0
2,51.737184,-0.613382,0,0,1,1,0,0
3,51.737183,-0.609751,0,0,1,1,0,0
4,51.737183,-0.606120,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
58243,51.238843,0.312049,0,0,0,1,0,0
58244,51.238815,0.315640,0,0,0,1,0,0
58245,51.238786,0.319231,0,1,1,1,0,0
58246,51.238757,0.322822,0,1,1,0,0,0


In [4]:
# Coordinates of the cells, split by the Green_Space flag and converted from
# degrees to radians (the haversine metric used on them works in radians).
coordinate_columns = ['Latitude', 'Longitude']

# Cells flagged as green space.
df_greenspace1 = df.loc[df.Green_Space == 1, coordinate_columns].apply(np.radians)

# Cells not flagged as green space.
df_greenspace0 = df.loc[df.Green_Space == 0, coordinate_columns].apply(np.radians)

In [5]:
# Index the green-space cells with a ball tree using great-circle (haversine)
# distance; the inputs are (latitude, longitude) pairs in radians.
tree = BallTree(df_greenspace1, metric='haversine', leaf_size=40)

# For every non-green-space cell, look up its single nearest green-space cell.
# `dist` holds the distances (in radians) and `ind` the matching row positions
# within df_greenspace1; both have one column because k=1.
dist, ind = tree.query(df_greenspace0, k=1)

In [6]:
print(dist)

[[3.92402534e-05]
 [3.92402534e-05]
 [3.92402534e-05]
 ...
 [3.92402534e-05]
 [3.92402534e-05]
 [3.92383149e-05]]


In [7]:
print(len(df_greenspace0))
print(len(dist))

12295
12295


In [8]:
# Print every nearest-neighbour distance, one value per line.
# Column 0 is the only column because the tree was queried with k=1.
for nearest in dist[:, 0]:
    print(nearest)

3.924025344306556e-05
3.924025344306439e-05
3.924025344306552e-05
3.9240253443065725e-05
3.9240253443064675e-05
3.924025344306553e-05
3.924025344306503e-05
3.9240253443064804e-05
3.924025344306565e-05
3.9240253443064546e-05
3.9240253443064926e-05
3.9240253443065434e-05
3.9240253443064804e-05
3.924025344306528e-05
3.924025344306472e-05
3.9240253443066315e-05
5.477122674591321e-05
3.924025344306496e-05
3.9240253443064967e-05
3.924025344306592e-05
3.924025344306438e-05
3.924025344306814e-05
3.924025344306258e-05
3.9240253443066966e-05
5.476800890226668e-05
3.924025344306286e-05
3.924025344306515e-05
5.62145392310451e-05
3.924025344306427e-05
3.9240253443064255e-05
7.848050688613184e-05
5.6214188821276954e-05
3.9242204032666184e-05
5.476733614847613e-05
8.682491934484157e-05
7.84805068861299e-05
3.924025344306482e-05
3.924025344306281e-05
3.9240253443065786e-05
3.924025344306685e-05
3.9240253443068965e-05
3.924025344306222e-05
3.924025344306739e-05
3.9240253443066254e-05
3.9240253443065414

In [9]:
# `dist` has one row per query point and one column (k=1), so column 0 is the
# distance to the nearest green-space cell. Slice it directly instead of
# appending element by element in a Python loop.
distances = dist[:, 0].tolist()

radius_earth = 6371   # radius of earth in km
# Haversine distances come back as angles in radians; multiplying by the
# radius of the earth converts them to km. The multiplication is vectorised
# and then converted back to a plain list, as later cells expect.
distances_km = (dist[:, 0] * radius_earth).tolist()

In [10]:
distances_km

[0.2499996546857707,
 0.24999965468576324,
 0.24999965468577043,
 0.24999965468577173,
 0.24999965468576504,
 0.24999965468577048,
 0.2499996546857673,
 0.24999965468576588,
 0.24999965468577126,
 0.24999965468576424,
 0.24999965468576665,
 0.24999965468576987,
 0.24999965468576588,
 0.24999965468576887,
 0.24999965468576535,
 0.24999965468577548,
 0.3489474855982131,
 0.24999965468576685,
 0.2499996546857669,
 0.24999965468577298,
 0.24999965468576316,
 0.2499996546857871,
 0.2499996546857517,
 0.24999965468577964,
 0.34892698471634104,
 0.24999965468575347,
 0.24999965468576807,
 0.3581428294409883,
 0.24999965468576246,
 0.24999965468576238,
 0.49999930937154596,
 0.3581405969803555,
 0.25001208189211627,
 0.3489226986019414,
 0.5531615611459856,
 0.49999930937153364,
 0.24999965468576596,
 0.24999965468575316,
 0.24999965468577212,
 0.2499996546857789,
 0.24999965468579238,
 0.24999965468574942,
 0.24999965468578236,
 0.2499996546857751,
 0.24999965468576976,
 0.249999654685762,
 0

In [11]:
# Rebuild the non-green-space subset from the untouched frame, so Latitude and
# Longitude are the original values rather than the radian-converted ones, and
# renumber the rows 0..n-1 in their original order.
#
# The distances are attached with .assign rather than insert(loc=8, ...):
#   * the new column is always appended at the end, with no hard-coded position
#     that depends on how many columns the CSV has;
#   * passing the list directly requires its length to equal the number of
#     rows, so stale or mismatched `distances_km` raises instead of being
#     silently index-aligned into NaNs.
df_greenspace0 = (
    df.loc[df.Green_Space == 0, :]
    .reset_index(drop=True)
    .assign(Distance_Nearest_Greenspace=distances_km)
)
df_greenspace0

Unnamed: 0,Latitude,Longitude,Airport,Water,Building,Green_Space,Railway_Station,Urban_Area,Distance_Nearest_Greenspace
0,51.737184,-0.617012,0,0,0,0,0,0,0.250000
1,51.737180,-0.591598,0,0,1,0,0,0,0.250000
2,51.737176,-0.577076,0,0,0,0,0,0,0.250000
3,51.737174,-0.573445,0,0,0,0,0,0,0.250000
4,51.737168,-0.558923,0,0,0,0,0,0,0.250000
...,...,...,...,...,...,...,...,...,...
12290,51.241116,-0.039869,0,0,0,0,0,0,0.250000
12291,51.240988,-0.014731,0,0,1,0,0,0,0.250000
12292,51.240970,-0.011140,0,0,1,0,0,0,0.250000
12293,51.240875,0.006816,0,0,1,0,0,1,0.250000


In [12]:
# Attach the distances to the full grid by row position instead of merging on
# the float Latitude/Longitude pair. A left merge on coordinates silently
# duplicates rows if any coordinate pair repeats among the non-green-space
# cells; positional assignment cannot, and it raises if the lengths differ.
# df_greenspace0 was built from the same `Green_Space == 0` mask in the same
# row order, so its rows line up one-to-one with the masked rows here.
df_merged = df.reset_index(drop=True)   # new frame with a 0..n-1 index, as the merge produced
not_greenspace = df_merged.Green_Space == 0
df_merged['Distance_Nearest_Greenspace'] = np.nan   # green-space rows stay NaN, like unmatched merge rows
df_merged.loc[not_greenspace, 'Distance_Nearest_Greenspace'] = (
    df_greenspace0['Distance_Nearest_Greenspace'].to_numpy()
)

In [13]:
# Rows with no distance attached (the green-space cells) get a distance of 0.
nearest_greenspace = df_merged['Distance_Nearest_Greenspace']
df_merged['Distance_Nearest_Greenspace'] = nearest_greenspace.fillna(0.000000)

In [14]:
df_merged['Distance_Nearest_Greenspace'].value_counts()

0.000000    45953
0.250000       35
0.250000       34
0.250000       33
0.250000       31
            ...  
0.250015        1
0.250000        1
0.348931        1
0.250003        1
0.249987        1
Name: Distance_Nearest_Greenspace, Length: 5364, dtype: int64

In [15]:
df_merged

Unnamed: 0,Latitude,Longitude,Airport,Water,Building,Green_Space,Railway_Station,Urban_Area,Distance_Nearest_Greenspace
0,51.737184,-0.620643,0,0,0,1,0,0,0.000000
1,51.737184,-0.617012,0,0,0,0,0,0,0.250000
2,51.737184,-0.613382,0,0,1,1,0,0,0.000000
3,51.737183,-0.609751,0,0,1,1,0,0,0.000000
4,51.737183,-0.606120,0,0,0,1,0,0,0.000000
...,...,...,...,...,...,...,...,...,...
58243,51.238843,0.312049,0,0,0,1,0,0,0.000000
58244,51.238815,0.315640,0,0,0,1,0,0,0.000000
58245,51.238786,0.319231,0,1,1,1,0,0,0.000000
58246,51.238757,0.322822,0,1,1,0,0,0,0.249987


In [16]:
df

Unnamed: 0,Latitude,Longitude,Airport,Water,Building,Green_Space,Railway_Station,Urban_Area
0,51.737184,-0.620643,0,0,0,1,0,0
1,51.737184,-0.617012,0,0,0,0,0,0
2,51.737184,-0.613382,0,0,1,1,0,0
3,51.737183,-0.609751,0,0,1,1,0,0
4,51.737183,-0.606120,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
58243,51.238843,0.312049,0,0,0,1,0,0
58244,51.238815,0.315640,0,0,0,1,0,0
58245,51.238786,0.319231,0,1,1,1,0,0
58246,51.238757,0.322822,0,1,1,0,0,0


In [17]:
# `time` is still imported so the name stays available to any other cell that
# calls it; the measurement itself uses perf_counter, a monotonic
# high-resolution clock intended for timing intervals (wall-clock time() can
# jump if the system clock is adjusted).
from time import perf_counter, time

# Run the packaged helper on the full frame and report how long it took.
# NOTE(review): dist_nearest_greenspace_function is not defined in this
# notebook; presumably it comes from the helpers star import -- confirm.
start = perf_counter()
df_processed = dist_nearest_greenspace_function(df)
end = perf_counter()
time_taken1 = round(end - start, 2)   # elapsed seconds, rounded to 2 d.p.
print('dist_nearest_greenspace_function complete')
print('Time taken:', time_taken1)

dist_nearest_greenspace_function complete
Time taken: 0.35


In [18]:
df_processed

Unnamed: 0,Latitude,Longitude,Airport,Water,Building,Green_Space,Railway_Station,Urban_Area,Distance_Nearest_Greenspace
0,51.737184,-0.620643,0,0,0,1,0,0,0.000000
1,51.737184,-0.617012,0,0,0,0,0,0,0.250000
2,51.737184,-0.613382,0,0,1,1,0,0,0.000000
3,51.737183,-0.609751,0,0,1,1,0,0,0.000000
4,51.737183,-0.606120,0,0,0,1,0,0,0.000000
...,...,...,...,...,...,...,...,...,...
58243,51.238843,0.312049,0,0,0,1,0,0,0.000000
58244,51.238815,0.315640,0,0,0,1,0,0,0.000000
58245,51.238786,0.319231,0,1,1,1,0,0,0.000000
58246,51.238757,0.322822,0,1,1,0,0,0,0.249987
