# Using Machine Learning to Calculate Shortest Distance Between Two Points
### Calculating the shortest distances between all households with children in the United States

### Import statements
Need to install dask-geopandas and pygeos as well

In [6]:
# pip install dask-geopandas==0.1.0a4

In [None]:
# pip install pygeos

In [1]:
import dask
import pandas as pd
from dask import dataframe as dd
import dask_geopandas
import geopandas as gpd
import os



### Reading InfoUSA data

In [2]:
DATA_DIR = os.getcwd()
DATA_DIR = DATA_DIR.replace('processing', 'data')
DATA_DIR

'/hpc/home/at341/ondemand/codeplus-celine-dcc-package/data'

In [7]:
%%time
df_hh = pd.read_parquet(DATA_DIR + '/infousa_merged.parquet')
df_hh

CPU times: user 291 ms, sys: 104 ms, total: 395 ms
Wall time: 202 ms


Unnamed: 0,zip,county_fips,state,child_num,has_child,age_code,lat_h_4326,lon_h_4326,lat_h_3857,lon_h_3857
0,84606,37041,NH,12,0,H,47.536477,-123.299151,-1.372560e+07,6.030085e+06
1,66723,23407,OR,3,0,A,43.121187,-123.473248,-1.374498e+07,5.330436e+06
2,59965,50536,NV,7,1,I,38.462998,-111.014933,-1.235813e+07,4.645040e+06
3,38676,38340,SD,3,1,B,48.433631,-80.411200,-8.951334e+06,6.179301e+06
4,75640,24383,OR,15,1,J,42.203405,-73.802723,-8.215682e+06,5.191497e+06
...,...,...,...,...,...,...,...,...,...,...
99995,16202,52106,AL,5,1,M,41.305474,-71.274986,-7.934295e+06,5.057504e+06
99996,85007,53083,RI,6,1,B,40.467252,-90.764017,-1.010380e+07,4.934076e+06
99997,64030,18524,AZ,7,0,I,36.134337,-97.876163,-1.089552e+07,4.319122e+06
99998,32071,12458,LA,1,1,J,37.978958,-106.804916,-1.188947e+07,4.576454e+06


Filter for only households with children. Because the test data we are using os not as big as the original InfoUSA data, we don't have to filter for only households with children.

In [8]:
# %%time
# df_hh = df[(df['has_child'] == 1)]
# df_hh

### Use Dask to transform pandas dataframe to a geopandas dataframe
For the code we use to calculate the shortest distance from each household to a tank, we must convert our dataframe ```df_hh``` to a GeoDataFrame. However, as this dataframe has 53 million rows and 10 columns, converting it without using Dask is not feasible. We attempted it, and ran the code for three hours and it was still not done. Hence, we turned to Dask, an open-source Python library for parallel computing. It allows us to efficiently execute the transformation of our dataframe to a GeoDataFrame, even when working with over 53 million rows. 

To use Dask, we first converted our dataframe to a Dask dataframe, using Dask's ```.from_pandas()``` method. This method takes in our pandas dataframe along with the ```npartitions``` parameter, which is used to specify the number of 'sections' the dask dataframe will be split into.

In [9]:
df_dask = dd.from_pandas(df_hh, npartitions = 500)

Then, we specify what manipulation to the dask dataframe ```df_dask``` to compute. In this case, we use Dask Geopandas' ```.points_from_xy()``` method to convert the pandas dask dataframe into a geopandas dask dataframe.

In [10]:
%%time
df_dask['geometry'] = dask_geopandas.points_from_xy(df_dask, 'lon_h_4326', 'lat_h_4326')

CPU times: user 1.32 s, sys: 143 ms, total: 1.46 s
Wall time: 1.46 s


After, we convert the dask geodataframe into a geopandas dataframe:

In [11]:
%%time
gdf = dask_geopandas.from_dask_dataframe(df_dask)

CPU times: user 7.16 ms, sys: 1.21 ms, total: 8.37 ms
Wall time: 7.75 ms


Calling compute puts all the above code into action. Dask executes each set of commands on each partition, as specified above. This returns GeoDataFrame ```gdf_hh```, with over 53 million rows, in less than 20 seconds.

In [12]:
%%time
gdf_hh = gdf.compute()

CPU times: user 2.84 s, sys: 453 ms, total: 3.29 s
Wall time: 2.81 s


In [13]:
gdf_hh = gdf_hh.reset_index(drop = True)
gdf_hh

Unnamed: 0,zip,county_fips,state,child_num,has_child,age_code,lat_h_4326,lon_h_4326,lat_h_3857,lon_h_3857,geometry
0,84606,37041,NH,12,0,H,47.536477,-123.299151,-1.372560e+07,6.030085e+06,POINT (-123.29915 47.53648)
1,81738,01023,WY,7,0,D,43.645283,-108.865972,-1.211890e+07,5.410712e+06,POINT (-108.86597 43.64528)
2,74456,46241,DC,6,0,F,48.506191,-101.045728,-1.124836e+07,6.191484e+06,POINT (-101.04573 48.50619)
3,99348,28045,OR,9,1,C,42.126887,-107.605732,-1.197862e+07,5.180006e+06,POINT (-107.60573 42.12689)
4,80757,30520,SC,9,1,I,46.010545,-75.330427,-8.385745e+06,5.782039e+06,POINT (-75.33043 46.01055)
...,...,...,...,...,...,...,...,...,...,...,...
960402,66642,52066,IL,6,1,I,36.390018,-109.150041,-1.215053e+07,4.354421e+06,POINT (-109.15004 36.39002)
960403,23288,48421,MD,1,1,H,42.192245,-96.961705,-1.079373e+07,5.189820e+06,POINT (-96.96170 42.19224)
960404,27809,56087,KS,11,0,B,35.258688,-78.694535,-8.760236e+06,4.199092e+06,POINT (-78.69454 35.25869)
960405,55542,46332,WY,13,1,G,49.438345,-90.031454,-1.002226e+07,6.349569e+06,POINT (-90.03145 49.43834)


Filtering for only the ```geometry``` column, as it is the only one we need to run the code below.

In [14]:
gdf_hh = gdf_hh[['geometry']]
gdf_hh

Unnamed: 0,geometry
0,POINT (-123.29915 47.53648)
1,POINT (-108.86597 43.64528)
2,POINT (-101.04573 48.50619)
3,POINT (-107.60573 42.12689)
4,POINT (-75.33043 46.01055)
...,...
960402,POINT (-109.15004 36.39002)
960403,POINT (-96.96170 42.19224)
960404,POINT (-78.69454 35.25869)
960405,POINT (-90.03145 49.43834)


### Reading AST data
Converting it into a GeoDataFrame with point geometries from the center latitude and longitude from each tank. Then, filtering for only the columns we need.

In [15]:
df_tanks = gpd.read_file(DATA_DIR + '/tanks_risk_score.shp')
df_tanks

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,county,on_floodpl,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,adj_risk,geometry
0,Louisiana,closed_roof_tank,4.8,30.501991,-91.188296,-1.015103e+07,3.568241e+06,22033,0,4.149297,9.661013,14.415955,43.776313,9.471153,39.822684,20.216069,20.216069,POINT (-91.18830 30.50199)
1,Louisiana,closed_roof_tank,30.0,29.990189,-90.395876,-1.006282e+07,3.502289e+06,22089,0,1.208395,6.264728,13.189863,13.190995,17.685820,12.877608,10.736235,10.736235,POINT (-90.39588 29.99019)
2,Georgia,closed_roof_tank,20.4,34.221754,-83.783722,-9.326761e+06,4.058617e+06,13139,0,5.628088,12.104342,5.312985,31.912282,-1.000000,7.696209,10.442318,10.442318,POINT (-83.78372 34.22175)
3,Indiana,narrow_closed_roof_tank,4.8,37.906023,-87.926250,-9.787905e+06,4.566158e+06,18129,0,4.926164,10.959311,2.206652,12.846449,-1.000000,8.284501,6.537180,6.537180,POINT (-87.92625 37.90602)
4,New Mexico,closed_roof_tank,16.2,35.045340,-106.648430,-1.187205e+07,4.170044e+06,35001,0,18.185426,9.373074,-1.000000,15.079099,-1.000000,14.347347,9.497491,9.497491,POINT (-106.64843 35.04534)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,Iowa,closed_roof_tank,19.2,42.411899,-90.732966,-1.010035e+07,5.222881e+06,19061,0,1.575536,17.648163,4.544047,21.537919,-1.000000,12.580429,9.647682,9.647682,POINT (-90.73297 42.41190)
978,Wyoming,sedimentation_tank,24.0,42.862335,-106.293070,-1.183249e+07,5.291041e+06,56025,0,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098,3.745098,POINT (-106.29307 42.86233)
979,Missouri,closed_roof_tank,8.4,36.608666,-89.573830,-9.971313e+06,4.384699e+06,29143,0,17.807754,23.810359,8.253384,24.042775,-1.000000,18.432187,15.391077,15.391077,POINT (-89.57383 36.60867)
980,Rhode Island,closed_roof_tank,43.8,41.831766,-71.371080,-7.944992e+06,5.135812e+06,44007,0,9.400549,11.049468,5.819224,19.608082,7.130619,21.502062,12.418334,12.418334,POINT (-71.37108 41.83177)


In [16]:
gdf_tanks = gpd.GeoDataFrame(
    df_tanks, geometry=gpd.points_from_xy(df_tanks.lon_t_4326, df_tanks.lat_t_4326))

In [17]:
gdf_tanks = gdf_tanks.drop(['state', 'tank_type', 'diameter', 'county', 'on_floodpl', 'adj_risk'], axis = 1)
gdf_tanks

Unnamed: 0,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,geometry
0,30.501991,-91.188296,-1.015103e+07,3.568241e+06,4.149297,9.661013,14.415955,43.776313,9.471153,39.822684,20.216069,POINT (-91.18830 30.50199)
1,29.990189,-90.395876,-1.006282e+07,3.502289e+06,1.208395,6.264728,13.189863,13.190995,17.685820,12.877608,10.736235,POINT (-90.39588 29.99019)
2,34.221754,-83.783722,-9.326761e+06,4.058617e+06,5.628088,12.104342,5.312985,31.912282,-1.000000,7.696209,10.442318,POINT (-83.78372 34.22175)
3,37.906023,-87.926250,-9.787905e+06,4.566158e+06,4.926164,10.959311,2.206652,12.846449,-1.000000,8.284501,6.537180,POINT (-87.92625 37.90602)
4,35.045340,-106.648430,-1.187205e+07,4.170044e+06,18.185426,9.373074,-1.000000,15.079099,-1.000000,14.347347,9.497491,POINT (-106.64843 35.04534)
...,...,...,...,...,...,...,...,...,...,...,...,...
977,42.411899,-90.732966,-1.010035e+07,5.222881e+06,1.575536,17.648163,4.544047,21.537919,-1.000000,12.580429,9.647682,POINT (-90.73297 42.41190)
978,42.862335,-106.293070,-1.183249e+07,5.291041e+06,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098,POINT (-106.29307 42.86233)
979,36.608666,-89.573830,-9.971313e+06,4.384699e+06,17.807754,23.810359,8.253384,24.042775,-1.000000,18.432187,15.391077,POINT (-89.57383 36.60867)
980,41.831766,-71.371080,-7.944992e+06,5.135812e+06,9.400549,11.049468,5.819224,19.608082,7.130619,21.502062,12.418334,POINT (-71.37108 41.83177)


### Finding the closest tank to each household
To find the tanks nearest to each household, we use an algorithm developed by the University of Helsinki. This code is copyrighted and licensed under the Creative Commons Attribution-ShareAlike 4.0 International licence and is available to the public to share and adapt, as long as it is attributed correctly and re-shared if edits are made. The material can be found [here](https://automating-gis-processes.github.io/site/notebooks/L3/nearest-neighbor-faster.html). From this algorithm, we removed the code that calculates the distance between the two points. The reasoning for this is explained in further detail below.

These functions use the sklearn neighbors module, specifically the ```BallTree``` method, to use machine learning to identify the closest tank to each household. It returns a GeoDataFrame with the same number of indices inputted households GeoDataFrame, where each row corresponds to the row with the same index in the households GeoDataFrame. It also retains all the original columns in the inputted tanks GeoDataFrame.

In [18]:
from sklearn.neighbors import BallTree
import numpy as np

def get_nearest(src_points, candidates, k_neighbors=1):
    """Find nearest neighbors for all source points from a set of candidate points"""

    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=15)

    # Find closest points and distances
    distances, indices = tree.query(src_points, k=k_neighbors)

    # Transpose to get distances and indices into arrays
    distances = distances.transpose()
    indices = indices.transpose()

    # Get closest indices and distances (i.e. array at index 0)
    # note: for the second closest points, you would take index 1, etc.
    closest = indices[0]
    closest_dist = distances[0]

    # Return indices and distances
    return (closest, closest_dist)


def nearest_neighbor(left_gdf, right_gdf, return_dist=False):
    """
    For each point in left_gdf, find closest point in right GeoDataFrame and return them.

    NOTICE: Assumes that the input Points are in WGS84 projection (lat/lon).
    """

    left_geom_col = left_gdf.geometry.name
    right_geom_col = right_gdf.geometry.name

    # Ensure that index in right gdf is formed of sequential numbers
    right = right_gdf.copy().reset_index(drop=True)

    # Parse coordinates from points and insert them into a numpy array as RADIANS
    left_radians = np.array(left_gdf[left_geom_col].apply(lambda geom: (geom.x * (np.pi / 180), geom.y * (np.pi / 180))).to_list())
    right_radians = np.array(right[right_geom_col].apply(lambda geom: (geom.x * (np.pi / 180), geom.y * (np.pi / 180))).to_list())

    # Find the nearest points
    # -----------------------
    # closest ==> index in right_gdf that corresponds to the closest point
    # dist ==> distance between the nearest neighbors (in meters)

    closest, dist = get_nearest(src_points=left_radians, candidates=right_radians)

    # Return points from right GeoDataFrame that are closest to points in left GeoDataFrame
    closest_points = right.loc[closest]

    # Ensure that the index corresponds the one in left_gdf
    closest_points = closest_points.reset_index(drop=True)

    return closest_points

Here, you can see the outputted dataframe has 960,407 rows- the same number of rows as the inputted ```gdf_harris``` GeoDataFrame, and the same columns as the inputted ```df_tanks``` GeoDataFrame. Tank at index 0 in ```df_closest_tanks_harris``` is the tank nearest to household at index 0 in ```df_harris```, which is in the same order as ```gdf_harris``` and so on. 

In [19]:
%%time
df_closest_tanks = nearest_neighbor(gdf_hh, gdf_tanks)
df_closest_tanks

CPU times: user 59.8 s, sys: 308 ms, total: 1min
Wall time: 1min


Unnamed: 0,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,geometry
0,47.279640,-122.390263,-1.362442e+07,5.987838e+06,23.971695,6.226360,-1.000000,11.394691,21.238291,6.071374,11.483735,POINT (-122.39026 47.27964)
1,45.659215,-108.751859,-1.210620e+07,5.725905e+06,2.593052,7.262484,-1.000000,9.914697,-1.000000,7.868933,4.606528,POINT (-108.75186 45.65922)
2,48.226467,-101.369829,-1.128444e+07,6.144614e+06,0.407726,8.066726,-1.000000,9.625419,-1.000000,6.357442,4.076219,POINT (-101.36983 48.22647)
3,42.862335,-106.293070,-1.183249e+07,5.291041e+06,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098,POINT (-106.29307 42.86233)
4,44.457295,-73.223213,-8.151171e+06,5.536484e+06,3.862760,6.794995,3.156417,4.080984,-1.000000,5.920526,3.969280,POINT (-73.22321 44.45729)
...,...,...,...,...,...,...,...,...,...,...,...,...
960402,35.490080,-108.423861,-1.206969e+07,4.230682e+06,6.394724,4.414434,-1.000000,5.280541,-1.000000,41.749099,9.639800,POINT (-108.42386 35.49008)
960403,42.412615,-96.389272,-1.073000e+07,5.222989e+06,2.169482,24.438503,-1.000000,37.424939,-1.000000,12.827024,12.809991,POINT (-96.38927 42.41261)
960404,35.059399,-78.862264,-8.778907e+06,4.171956e+06,5.315547,15.156209,20.793640,27.745481,2.048231,19.620172,15.113213,POINT (-78.86226 35.05940)
960405,44.534765,-89.569527,-9.970834e+06,5.548575e+06,0.937095,14.637362,-1.000000,11.956780,-1.000000,4.829955,5.393532,POINT (-89.56953 44.53477)


In [20]:
df_closest_tanks = df_closest_tanks.drop(['geometry'], axis = 1)
df_closest_tanks

Unnamed: 0,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk
0,47.279640,-122.390263,-1.362442e+07,5.987838e+06,23.971695,6.226360,-1.000000,11.394691,21.238291,6.071374,11.483735
1,45.659215,-108.751859,-1.210620e+07,5.725905e+06,2.593052,7.262484,-1.000000,9.914697,-1.000000,7.868933,4.606528
2,48.226467,-101.369829,-1.128444e+07,6.144614e+06,0.407726,8.066726,-1.000000,9.625419,-1.000000,6.357442,4.076219
3,42.862335,-106.293070,-1.183249e+07,5.291041e+06,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098
4,44.457295,-73.223213,-8.151171e+06,5.536484e+06,3.862760,6.794995,3.156417,4.080984,-1.000000,5.920526,3.969280
...,...,...,...,...,...,...,...,...,...,...,...
960402,35.490080,-108.423861,-1.206969e+07,4.230682e+06,6.394724,4.414434,-1.000000,5.280541,-1.000000,41.749099,9.639800
960403,42.412615,-96.389272,-1.073000e+07,5.222989e+06,2.169482,24.438503,-1.000000,37.424939,-1.000000,12.827024,12.809991
960404,35.059399,-78.862264,-8.778907e+06,4.171956e+06,5.315547,15.156209,20.793640,27.745481,2.048231,19.620172,15.113213
960405,44.534765,-89.569527,-9.970834e+06,5.548575e+06,0.937095,14.637362,-1.000000,11.956780,-1.000000,4.829955,5.393532


Therefore, merging the two ```df_closest_tanks``` and ```df_hh_lat_lon``` will create a new dataframe, ```df_harris_dist``` with the coordinates of each household corresponding to that of the tank nearest to it. This information is what we use to calculate distance. We create new dataframe ```df_hh_lat_lon``` from ```df_hh``` and only keep the latitude and longitude of each household, as these are the only two columns necessary to merge with ```df_closest_tanks``` in order to compute the distance between the household coordinates and the tank coordinates for each household.

In [21]:
df_closest_tanks = df_closest_tanks.reset_index()
df_hh = df_hh.reset_index()

In [22]:
df_hh_lat_lon = df_hh[['lat_h_4326', 'lon_h_4326']]
df_hh_lat_lon = df_hh_lat_lon.reset_index()
df_hh_lat_lon

Unnamed: 0,index,lat_h_4326,lon_h_4326
0,0,47.536477,-123.299151
1,1,43.121187,-123.473248
2,2,38.462998,-111.014933
3,3,48.433631,-80.411200
4,4,42.203405,-73.802723
...,...,...,...
960402,960402,41.305474,-71.274986
960403,960403,40.467252,-90.764017
960404,960404,36.134337,-97.876163
960405,960405,37.978958,-106.804916


In [23]:
%%time
df_closest_tanks_hh = df_hh_lat_lon.merge(df_closest_tanks, left_index=True, right_index = True)
df_closest_tanks_hh = df_closest_tanks_hh.drop(['index_x', 'index_y'], axis = 1)
df_closest_tanks_hh

CPU times: user 105 ms, sys: 69.6 ms, total: 174 ms
Wall time: 172 ms


Unnamed: 0,lat_h_4326,lon_h_4326,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk
0,47.536477,-123.299151,47.279640,-122.390263,-1.362442e+07,5.987838e+06,23.971695,6.226360,-1.000000,11.394691,21.238291,6.071374,11.483735
1,43.121187,-123.473248,45.659215,-108.751859,-1.210620e+07,5.725905e+06,2.593052,7.262484,-1.000000,9.914697,-1.000000,7.868933,4.606528
2,38.462998,-111.014933,48.226467,-101.369829,-1.128444e+07,6.144614e+06,0.407726,8.066726,-1.000000,9.625419,-1.000000,6.357442,4.076219
3,48.433631,-80.411200,42.862335,-106.293070,-1.183249e+07,5.291041e+06,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098
4,42.203405,-73.802723,44.457295,-73.223213,-8.151171e+06,5.536484e+06,3.862760,6.794995,3.156417,4.080984,-1.000000,5.920526,3.969280
...,...,...,...,...,...,...,...,...,...,...,...,...,...
960402,41.305474,-71.274986,35.490080,-108.423861,-1.206969e+07,4.230682e+06,6.394724,4.414434,-1.000000,5.280541,-1.000000,41.749099,9.639800
960403,40.467252,-90.764017,42.412615,-96.389272,-1.073000e+07,5.222989e+06,2.169482,24.438503,-1.000000,37.424939,-1.000000,12.827024,12.809991
960404,36.134337,-97.876163,35.059399,-78.862264,-8.778907e+06,4.171956e+06,5.315547,15.156209,20.793640,27.745481,2.048231,19.620172,15.113213
960405,37.978958,-106.804916,44.534765,-89.569527,-9.970834e+06,5.548575e+06,0.937095,14.637362,-1.000000,11.956780,-1.000000,4.829955,5.393532


To compute the distance between the two sets of coordinates (the household ones and the ones of the nearest tank), we use the haversine library. This library calculates the distance between two coordinates in EPSG 4326 projection, in kilometers. We multiplied the value by 1,000 to find the distance in meters.

In [24]:
import haversine as hs

In [25]:
%%time
import pandas as pd
from geopy import distance

def distancer(row):
    coords_1 = (row['lat_h_4326'], row['lon_h_4326'])
    coords_2 = (row['lat_t_4326'], row['lon_t_4326'])
    return (hs.haversine(coords_1, coords_2) * 1000)

df_closest_tanks_hh['distance_m'] = df_closest_tanks_hh.apply(distancer, axis=1)
df_closest_tanks_hh



CPU times: user 14.8 s, sys: 59.2 ms, total: 14.9 s
Wall time: 14.9 s


Unnamed: 0,lat_h_4326,lon_h_4326,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_m
0,47.536477,-123.299151,47.279640,-122.390263,-1.362442e+07,5.987838e+06,23.971695,6.226360,-1.000000,11.394691,21.238291,6.071374,11.483735,7.411957e+04
1,43.121187,-123.473248,45.659215,-108.751859,-1.210620e+07,5.725905e+06,2.593052,7.262484,-1.000000,9.914697,-1.000000,7.868933,4.606528,1.201416e+06
2,38.462998,-111.014933,48.226467,-101.369829,-1.128444e+07,6.144614e+06,0.407726,8.066726,-1.000000,9.625419,-1.000000,6.357442,4.076219,1.334492e+06
3,48.433631,-80.411200,42.862335,-106.293070,-1.183249e+07,5.291041e+06,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098,2.093508e+06
4,42.203405,-73.802723,44.457295,-73.223213,-8.151171e+06,5.536484e+06,3.862760,6.794995,3.156417,4.080984,-1.000000,5.920526,3.969280,2.549651e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960402,41.305474,-71.274986,35.490080,-108.423861,-1.206969e+07,4.230682e+06,6.394724,4.414434,-1.000000,5.280541,-1.000000,41.749099,9.639800,3.275372e+06
960403,40.467252,-90.764017,42.412615,-96.389272,-1.073000e+07,5.222989e+06,2.169482,24.438503,-1.000000,37.424939,-1.000000,12.827024,12.809991,5.162534e+05
960404,36.134337,-97.876163,35.059399,-78.862264,-8.778907e+06,4.171956e+06,5.315547,15.156209,20.793640,27.745481,2.048231,19.620172,15.113213,1.720569e+06
960405,37.978958,-106.804916,44.534765,-89.569527,-9.970834e+06,5.548575e+06,0.937095,14.637362,-1.000000,11.956780,-1.000000,4.829955,5.393532,1.610221e+06


Dropping latitude and longitude coordinates in the 4326 projection not used in our GPU visualizaitons (that this data is processed for). Also dropping latitude and longitude for nearest tanks, because this is the data for plotting households. Then, calculating distance in miles, as stipulated by our researcher.

In [26]:
df_closest_tanks_hh = df_closest_tanks_hh.drop(['lat_t_4326', 'lon_t_4326'], axis = 1)

In [27]:
df_closest_tanks_hh['distance_mi']  = df_closest_tanks_hh['distance_m'] / 1609.344
df_closest_tanks_hh

Unnamed: 0,lat_h_4326,lon_h_4326,lat_t_3857,lon_t_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_m,distance_mi
0,47.536477,-123.299151,-1.362442e+07,5.987838e+06,23.971695,6.226360,-1.000000,11.394691,21.238291,6.071374,11.483735,7.411957e+04,46.055765
1,43.121187,-123.473248,-1.210620e+07,5.725905e+06,2.593052,7.262484,-1.000000,9.914697,-1.000000,7.868933,4.606528,1.201416e+06,746.525231
2,38.462998,-111.014933,-1.128444e+07,6.144614e+06,0.407726,8.066726,-1.000000,9.625419,-1.000000,6.357442,4.076219,1.334492e+06,829.214976
3,48.433631,-80.411200,-1.183249e+07,5.291041e+06,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098,2.093508e+06,1300.845825
4,42.203405,-73.802723,-8.151171e+06,5.536484e+06,3.862760,6.794995,3.156417,4.080984,-1.000000,5.920526,3.969280,2.549651e+05,158.427951
...,...,...,...,...,...,...,...,...,...,...,...,...,...
960402,41.305474,-71.274986,-1.206969e+07,4.230682e+06,6.394724,4.414434,-1.000000,5.280541,-1.000000,41.749099,9.639800,3.275372e+06,2035.221681
960403,40.467252,-90.764017,-1.073000e+07,5.222989e+06,2.169482,24.438503,-1.000000,37.424939,-1.000000,12.827024,12.809991,5.162534e+05,320.784963
960404,36.134337,-97.876163,-8.778907e+06,4.171956e+06,5.315547,15.156209,20.793640,27.745481,2.048231,19.620172,15.113213,1.720569e+06,1069.111893
960405,37.978958,-106.804916,-9.970834e+06,5.548575e+06,0.937095,14.637362,-1.000000,11.956780,-1.000000,4.829955,5.393532,1.610221e+06,1000.544765


Then, we categorize each household by its distances from the nearest tank. These boundaries were set by our researcher. Using the numpy library's ```.select()``` function, we can assign different values to each category. Households within 0.5 miles of a tank are marked as ```1```, households between 0.5 miles and one mile are marked as ```2``` and households between one and five miles from a tank are marked as ```3```. All other households are marked as ```4```.

In [28]:
import numpy as np
conditions = [(df_closest_tanks_hh['distance_mi'] <= 0.5),
              ((df_closest_tanks_hh['distance_mi'] > 0.5) & (df_closest_tanks_hh['distance_mi'] <= 1)),
              ((df_closest_tanks_hh['distance_mi'] > 1) & (df_closest_tanks_hh['distance_mi'] <= 5)),
              (df_closest_tanks_hh['distance_mi'] > 5)]



values = [1, 2, 3, 4]


df_closest_tanks_hh['distance_category'] = np.select(conditions, values)
df_closest_tanks_hh

Unnamed: 0,lat_h_4326,lon_h_4326,lat_t_3857,lon_t_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_m,distance_mi,distance_category
0,47.536477,-123.299151,-1.362442e+07,5.987838e+06,23.971695,6.226360,-1.000000,11.394691,21.238291,6.071374,11.483735,7.411957e+04,46.055765,4
1,43.121187,-123.473248,-1.210620e+07,5.725905e+06,2.593052,7.262484,-1.000000,9.914697,-1.000000,7.868933,4.606528,1.201416e+06,746.525231,4
2,38.462998,-111.014933,-1.128444e+07,6.144614e+06,0.407726,8.066726,-1.000000,9.625419,-1.000000,6.357442,4.076219,1.334492e+06,829.214976,4
3,48.433631,-80.411200,-1.183249e+07,5.291041e+06,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098,2.093508e+06,1300.845825,4
4,42.203405,-73.802723,-8.151171e+06,5.536484e+06,3.862760,6.794995,3.156417,4.080984,-1.000000,5.920526,3.969280,2.549651e+05,158.427951,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960402,41.305474,-71.274986,-1.206969e+07,4.230682e+06,6.394724,4.414434,-1.000000,5.280541,-1.000000,41.749099,9.639800,3.275372e+06,2035.221681,4
960403,40.467252,-90.764017,-1.073000e+07,5.222989e+06,2.169482,24.438503,-1.000000,37.424939,-1.000000,12.827024,12.809991,5.162534e+05,320.784963,4
960404,36.134337,-97.876163,-8.778907e+06,4.171956e+06,5.315547,15.156209,20.793640,27.745481,2.048231,19.620172,15.113213,1.720569e+06,1069.111893,4
960405,37.978958,-106.804916,-9.970834e+06,5.548575e+06,0.937095,14.637362,-1.000000,11.956780,-1.000000,4.829955,5.393532,1.610221e+06,1000.544765,4


Then, we merge the ```df_closest_tanks_hh``` dataframe with the ```df_hh``` dataframe to add back in the demographic data for each household, which we will use in our visualizations, and drop unnecessary columns.

In [29]:
df_hh = df_hh.drop(['index'], axis = 1)
df_hh = df_hh.reset_index()
df_closest_tanks_hh = df_closest_tanks_hh.reset_index()

In [30]:
df = df_hh.merge(df_closest_tanks_hh, left_index = True, right_index = True)
df

Unnamed: 0,index_x,zip,county_fips,state,child_num,has_child,age_code,lat_h_4326_x,lon_h_4326_x,lat_h_3857,...,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_m,distance_mi,distance_category
0,0,84606,37041,NH,12,0,H,47.536477,-123.299151,-1.372560e+07,...,23.971695,6.226360,-1.000000,11.394691,21.238291,6.071374,11.483735,7.411957e+04,46.055765,4
1,1,66723,23407,OR,3,0,A,43.121187,-123.473248,-1.374498e+07,...,2.593052,7.262484,-1.000000,9.914697,-1.000000,7.868933,4.606528,1.201416e+06,746.525231,4
2,2,59965,50536,NV,7,1,I,38.462998,-111.014933,-1.235813e+07,...,0.407726,8.066726,-1.000000,9.625419,-1.000000,6.357442,4.076219,1.334492e+06,829.214976,4
3,3,38676,38340,SD,3,1,B,48.433631,-80.411200,-8.951334e+06,...,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098,2.093508e+06,1300.845825,4
4,4,75640,24383,OR,15,1,J,42.203405,-73.802723,-8.215682e+06,...,3.862760,6.794995,3.156417,4.080984,-1.000000,5.920526,3.969280,2.549651e+05,158.427951,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960402,960402,16202,52106,AL,5,1,M,41.305474,-71.274986,-7.934295e+06,...,6.394724,4.414434,-1.000000,5.280541,-1.000000,41.749099,9.639800,3.275372e+06,2035.221681,4
960403,960403,85007,53083,RI,6,1,B,40.467252,-90.764017,-1.010380e+07,...,2.169482,24.438503,-1.000000,37.424939,-1.000000,12.827024,12.809991,5.162534e+05,320.784963,4
960404,960404,64030,18524,AZ,7,0,I,36.134337,-97.876163,-1.089552e+07,...,5.315547,15.156209,20.793640,27.745481,2.048231,19.620172,15.113213,1.720569e+06,1069.111893,4
960405,960405,32071,12458,LA,1,1,J,37.978958,-106.804916,-1.188947e+07,...,0.937095,14.637362,-1.000000,11.956780,-1.000000,4.829955,5.393532,1.610221e+06,1000.544765,4


In [31]:
df = df.drop(['index_x', 'index_y', 'has_child', 'lat_h_4326_x', 'lon_h_4326_x', 'lat_t_3857', 'lon_t_3857'], axis = 1)
df = df.rename(columns = {'lat_h_4326_y': 'lat_h_4326', 'lon_h_4326_y': 'lon_h_4326'})
df

Unnamed: 0,zip,county_fips,state,child_num,age_code,lat_h_3857,lon_h_3857,lat_h_4326,lon_h_4326,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_m,distance_mi,distance_category
0,84606,37041,NH,12,H,-1.372560e+07,6.030085e+06,47.536477,-123.299151,23.971695,6.226360,-1.000000,11.394691,21.238291,6.071374,11.483735,7.411957e+04,46.055765,4
1,66723,23407,OR,3,A,-1.374498e+07,5.330436e+06,43.121187,-123.473248,2.593052,7.262484,-1.000000,9.914697,-1.000000,7.868933,4.606528,1.201416e+06,746.525231,4
2,59965,50536,NV,7,I,-1.235813e+07,4.645040e+06,38.462998,-111.014933,0.407726,8.066726,-1.000000,9.625419,-1.000000,6.357442,4.076219,1.334492e+06,829.214976,4
3,38676,38340,SD,3,B,-8.951334e+06,6.179301e+06,48.433631,-80.411200,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098,2.093508e+06,1300.845825,4
4,75640,24383,OR,15,J,-8.215682e+06,5.191497e+06,42.203405,-73.802723,3.862760,6.794995,3.156417,4.080984,-1.000000,5.920526,3.969280,2.549651e+05,158.427951,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960402,16202,52106,AL,5,M,-7.934295e+06,5.057504e+06,41.305474,-71.274986,6.394724,4.414434,-1.000000,5.280541,-1.000000,41.749099,9.639800,3.275372e+06,2035.221681,4
960403,85007,53083,RI,6,B,-1.010380e+07,4.934076e+06,40.467252,-90.764017,2.169482,24.438503,-1.000000,37.424939,-1.000000,12.827024,12.809991,5.162534e+05,320.784963,4
960404,64030,18524,AZ,7,I,-1.089552e+07,4.319122e+06,36.134337,-97.876163,5.315547,15.156209,20.793640,27.745481,2.048231,19.620172,15.113213,1.720569e+06,1069.111893,4
960405,32071,12458,LA,1,J,-1.188947e+07,4.576454e+06,37.978958,-106.804916,0.937095,14.637362,-1.000000,11.956780,-1.000000,4.829955,5.393532,1.610221e+06,1000.544765,4


### Exporting to parquet file
Finally, we export this dataframe as a parquet file. It will be used in our visualizations.

In [34]:
df.to_parquet(DATA_DIR + '/distances_all_hh.parquet')

In [35]:
df = pd.read_parquet(DATA_DIR + '/distances_all_hh.parquet')
df

Unnamed: 0,zip,county_fips,state,child_num,age_code,lat_h_3857,lon_h_3857,lat_h_4326,lon_h_4326,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_m,distance_mi,distance_category
0,84606,37041,NH,12,H,-1.372560e+07,6.030085e+06,47.536477,-123.299151,23.971695,6.226360,-1.000000,11.394691,21.238291,6.071374,11.483735,7.411957e+04,46.055765,4
1,66723,23407,OR,3,A,-1.374498e+07,5.330436e+06,43.121187,-123.473248,2.593052,7.262484,-1.000000,9.914697,-1.000000,7.868933,4.606528,1.201416e+06,746.525231,4
2,59965,50536,NV,7,I,-1.235813e+07,4.645040e+06,38.462998,-111.014933,0.407726,8.066726,-1.000000,9.625419,-1.000000,6.357442,4.076219,1.334492e+06,829.214976,4
3,38676,38340,SD,3,B,-8.951334e+06,6.179301e+06,48.433631,-80.411200,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098,2.093508e+06,1300.845825,4
4,75640,24383,OR,15,J,-8.215682e+06,5.191497e+06,42.203405,-73.802723,3.862760,6.794995,3.156417,4.080984,-1.000000,5.920526,3.969280,2.549651e+05,158.427951,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960402,16202,52106,AL,5,M,-7.934295e+06,5.057504e+06,41.305474,-71.274986,6.394724,4.414434,-1.000000,5.280541,-1.000000,41.749099,9.639800,3.275372e+06,2035.221681,4
960403,85007,53083,RI,6,B,-1.010380e+07,4.934076e+06,40.467252,-90.764017,2.169482,24.438503,-1.000000,37.424939,-1.000000,12.827024,12.809991,5.162534e+05,320.784963,4
960404,64030,18524,AZ,7,I,-1.089552e+07,4.319122e+06,36.134337,-97.876163,5.315547,15.156209,20.793640,27.745481,2.048231,19.620172,15.113213,1.720569e+06,1069.111893,4
960405,32071,12458,LA,1,J,-1.188947e+07,4.576454e+06,37.978958,-106.804916,0.937095,14.637362,-1.000000,11.956780,-1.000000,4.829955,5.393532,1.610221e+06,1000.544765,4
