# Calculating the shortest distances between households and storage tanks in Harris and Charleston County

### Import libraries

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import haversine as hs



### Reading in Harris County and Charleston County InfoUSA Data
This reads in the merged InfoUSA dataset, containing information for all zip codes provided.

In [3]:
df = pd.read_parquet('/hpc/group/codeplus22-vis/infousa_copy/zip_00_99_final.parquet')
df.head()

Unnamed: 0,zip,county,state,child_num,has_child,age_code,lat_h_4326,lon_h_4326,lat_h_3857,lon_h_3857
0,18833,113,PA,0,0,K,41.546738,-76.540436,-8520442.0,5093323.0
1,18833,15,PA,0,0,H,41.5908,-76.4242,-8507503.0,5099879.0
2,18833,15,PA,1,1,C,41.600392,-76.441724,-8509454.0,5101307.0
3,18833,15,PA,0,0,L,41.592483,-76.437832,-8509021.0,5100129.0
4,18833,15,PA,1,1,H,41.566196,-76.347977,-8499018.0,5096218.0


Since the InfoUSA dataframe above contains information from all zip codes, we filter by state and county to select only observations for Harris County, Texas. We then drop the columns that we will not be working with.

In [4]:
df_harris = df[(df['county'] == 201) & (df['state'] == 'TX')]
df_harris = df_harris.drop(['zip', 'county', 'state', 'child_num'], axis = 1)
df_harris

Unnamed: 0,zip,county,state,child_num,has_child,age_code,lat_h_4326,lon_h_4326,lat_h_3857,lon_h_3857
135007287,77244,201,TX,1,1,H,29.738,-95.606,-10642810.0,3469916.0
135007288,77244,201,TX,0,0,H,29.738,-95.606,-10642810.0,3469916.0
135007289,77244,201,TX,0,0,M,29.738,-95.606,-10642810.0,3469916.0
135007290,77244,201,TX,0,0,M,29.738,-95.606,-10642810.0,3469916.0
135007291,77244,201,TX,2,1,F,29.738,-95.606,-10642810.0,3469916.0


We do the same for Charleston County, South Carolina.

In [202]:
df_charleston = df[(df['county'] == 19) & (df['state'] == 'SC')]
df_charleston = df_charleston.drop(['zip', 'county', 'state', 'child_num'], axis = 1)
df_charleston

Unnamed: 0,zip,county,state,child_num,has_child,age_code,lat_h_4326,lon_h_4326,lat_h_3857,lon_h_3857
43688371,29402,19,SC,0,0,C,32.7765,-79.9308,-8.897856e+06,3.865676e+06
43688372,29402,19,SC,0,0,M,32.7765,-79.9308,-8.897856e+06,3.865676e+06
43688373,29402,19,SC,0,0,L,32.7765,-79.9308,-8.897856e+06,3.865676e+06
43688374,29402,19,SC,1,1,D,32.7765,-79.9308,-8.897856e+06,3.865676e+06
43688375,29402,19,SC,0,0,M,32.7765,-79.9308,-8.897856e+06,3.865676e+06
...,...,...,...,...,...,...,...,...,...,...
101215466,29417,19,SC,0,0,A,32.7870,-79.9917,-8.904635e+06,3.867066e+06
101215467,29417,19,SC,0,0,F,32.7870,-79.9917,-8.904635e+06,3.867066e+06
101215468,29417,19,SC,0,0,I,32.7870,-79.9917,-8.904635e+06,3.867066e+06
101215469,29417,19,SC,0,0,B,32.7870,-79.9917,-8.904635e+06,3.867066e+06


### Reading in AST data
To calculate the shortest distance between each household and tank, we must also read in the processed AST file. 

In [239]:
df_tanks = gpd.read_file('/hpc/group/codeplus22-vis/infousa_copy/ast_master.shp')
df_tanks

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,county,geometry
0,New York,closed_roof_tank,39.6,40.625572,-73.745231,-8.209282e+06,4.957270e+06,36059,"POLYGON ((-73.74547 40.62575, -73.74500 40.625..."
1,New York,closed_roof_tank,19.8,40.624761,-73.744420,-8.209191e+06,4.957151e+06,36059,"POLYGON ((-73.74465 40.62485, -73.74419 40.624..."
2,New York,closed_roof_tank,12.6,40.626086,-73.746257,-8.209396e+06,4.957345e+06,36059,"POLYGON ((-73.74633 40.62615, -73.74618 40.626..."
3,New York,closed_roof_tank,30.6,40.625786,-73.746203,-8.209390e+06,4.957301e+06,36059,"POLYGON ((-73.74639 40.62593, -73.74601 40.625..."
4,New York,closed_roof_tank,24.0,40.625781,-73.745813,-8.209346e+06,4.957300e+06,36059,"POLYGON ((-73.74595 40.62590, -73.74567 40.625..."
...,...,...,...,...,...,...,...,...,...
98164,Colorado,narrow_closed_roof_tank,5.4,39.777431,-104.920718,-1.167972e+07,4.833652e+06,08031,"POLYGON ((-104.92075 39.77746, -104.92069 39.7..."
98165,Colorado,narrow_closed_roof_tank,4.8,39.777301,-104.920631,-1.167971e+07,4.833633e+06,08031,"POLYGON ((-104.92066 39.77732, -104.92060 39.7..."
98166,Colorado,narrow_closed_roof_tank,3.6,39.777701,-104.920609,-1.167971e+07,4.833691e+06,08031,"POLYGON ((-104.92064 39.77772, -104.92058 39.7..."
98167,Colorado,narrow_closed_roof_tank,4.8,39.776628,-104.920617,-1.167971e+07,4.833535e+06,08031,"POLYGON ((-104.92065 39.77665, -104.92059 39.7..."


Since this dataframe contains information for tanks across the US, we filtered for tanks only in Harris County and Charleston County, then dropping all unrelevant columns. The tanks dataframes for Harris and Charleston County will be used at the end of our data processing.

In [240]:
df_tanks_harris = df_tanks[df_tanks['county'] == '48201']
df_tanks_charleston = df_tanks[df_tanks['county'] == '45019']
df_tanks = df_tanks.drop(['state', 'county'], axis = 1)

### Processing county data separately
Next, we will process each county's distances separately, as they will be saved in separate files for our visualizations. 

#### Harris County:

##### Finding the distance between each household and the nearest tank
The first step in finding the shortest distance between each household and a tank is converting the Harris households dataframe, ```df_harris``` into a GeoDataFrame. The code we run to find the distances rely on geometries, which are a property of GeoDataFrames. To do this, specify the name of the pandas dataframe to convert, then specify which columns to use for the ```POINT``` geometry. In this case, we use ```lon_h_4326``` and ```lat_h_4326```, which are the latitude and longitude coordinates of the household in EPSG 4326.

In [220]:
gdf_harris = gpd.GeoDataFrame(
    df_harris, geometry=gpd.points_from_xy(df_harris.lon_h_4326, df_harris.lat_h_4326))
gdf_harris = gdf_harris[['geometry']]
gdf_harris

Unnamed: 0,geometry
135007287,POINT (-95.60600 29.73800)
135007288,POINT (-95.60600 29.73800)
135007289,POINT (-95.60600 29.73800)
135007290,POINT (-95.60600 29.73800)
135007291,POINT (-95.60600 29.73800)
...,...
190664559,POINT (-95.56519 29.85317)
190664560,POINT (-95.53390 29.83656)
190664561,POINT (-95.59907 29.86504)
190664562,POINT (-95.55565 29.83541)


We then convert ```df_tanks``` to a GeoDataFrame. Here, we use ```df_tanks``` instead of ```df_tanks_harris``` because in edge cases, a household may be closest to a tank in another county. We will use ```df_tanks_harris``` later.

In [221]:
gdf_tanks = gpd.GeoDataFrame(
    df_tanks, geometry=gpd.points_from_xy(df_tanks.lon_t_4326, df_tanks.lat_t_4326))
gdf_tanks

Unnamed: 0,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,geometry
0,closed_roof_tank,39.6,40.625572,-73.745231,-8.209282e+06,4.957270e+06,POINT (-73.74523 40.62557)
1,closed_roof_tank,19.8,40.624761,-73.744420,-8.209191e+06,4.957151e+06,POINT (-73.74442 40.62476)
2,closed_roof_tank,12.6,40.626086,-73.746257,-8.209396e+06,4.957345e+06,POINT (-73.74626 40.62609)
3,closed_roof_tank,30.6,40.625786,-73.746203,-8.209390e+06,4.957301e+06,POINT (-73.74620 40.62579)
4,closed_roof_tank,24.0,40.625781,-73.745813,-8.209346e+06,4.957300e+06,POINT (-73.74581 40.62578)
...,...,...,...,...,...,...,...
98164,narrow_closed_roof_tank,5.4,39.777431,-104.920718,-1.167972e+07,4.833652e+06,POINT (-104.92072 39.77743)
98165,narrow_closed_roof_tank,4.8,39.777301,-104.920631,-1.167971e+07,4.833633e+06,POINT (-104.92063 39.77730)
98166,narrow_closed_roof_tank,3.6,39.777701,-104.920609,-1.167971e+07,4.833691e+06,POINT (-104.92061 39.77770)
98167,narrow_closed_roof_tank,4.8,39.776628,-104.920617,-1.167971e+07,4.833535e+06,POINT (-104.92062 39.77663)


To find the tanks nearest to each household, we use an algorithm developed by the University of Helsinki. This code is copyrighted and licensed under the Creative Commons Attribution-ShareAlike 4.0 International licence and is available to the public to share and adapt, as long as it is attributed correctly and re-shared if edits are made. The material can be found [here](https://automating-gis-processes.github.io/site/notebooks/L3/nearest-neighbor-faster.html). From this algorithm, we removed the code that calculates the distance between the two points. The reasoning for this is explained in further detail below.

These functions use the sklearn neighbors module, specifically the ```BallTree``` method, to use machine learning to identify the closest tank to each household. It returns a GeoDataFrame with the same number of indices inputted households GeoDataFrame, where each row corresponds to the row with the same index in the households GeoDataFrame. It also retains all the original columns in the inputted tanks GeoDataFrame.

In [222]:
from sklearn.neighbors import BallTree
import numpy as np

def get_nearest(src_points, candidates, k_neighbors=1):
    """Find nearest neighbors for all source points from a set of candidate points"""

    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=15, metric = 'euclidean')

    # Find closest points and distances
    distances, indices = tree.query(src_points, k=k_neighbors)

    # Transpose to get distances and indices into arrays
    distances = distances.transpose()
    indices = indices.transpose()

    # Get closest indices and distances (i.e. array at index 0)
    # note: for the second closest points, you would take index 1, etc.
    closest = indices[0]
    closest_dist = distances[0]

    # Return indices and distances
    return (closest, closest_dist)


def nearest_neighbor(left_gdf, right_gdf):
    """
    For each point in left_gdf, find closest point in right GeoDataFrame and return them.

    NOTICE: Assumes that the input Points are in WGS84 projection (lat/lon).
    """

    left_geom_col = left_gdf.geometry.name
    right_geom_col = right_gdf.geometry.name

    # Ensure that index in right gdf is formed of sequential numbers
    right = right_gdf.copy().reset_index(drop=True)

    # Parse coordinates from points and insert them into a numpy array as RADIANS
    left_radians = np.array(left_gdf[left_geom_col].apply(lambda geom: (geom.x * (np.pi / 180), geom.y * (np.pi / 180))).to_list())
    right_radians = np.array(right[right_geom_col].apply(lambda geom: (geom.x * (np.pi / 180), geom.y * (np.pi / 180))).to_list())

    # Find the nearest points
    # -----------------------
    # closest ==> index in right_gdf that corresponds to the closest point
    # dist ==> distance between the nearest neighbors (in meters)

    closest, dist = get_nearest(src_points=left_radians, candidates=right_radians)

    # Return points from right GeoDataFrame that are closest to points in left GeoDataFrame
    closest_points = right.loc[closest]

    # Ensure that the index corresponds the one in left_gdf
    closest_points = closest_points.reset_index(drop=True)
    
    return closest_points

Here, you can see the outputted dataframe has 2,335,208 rows- the same number of rows as the inputted ```gdf_harris``` GeoDataFrame, and the same columns as the inputted ```df_tanks``` GeoDataFrame. Tank at index 0 in ```df_closest_tanks_harris``` is the tank nearest to household at index 0 in ```df_harris```, which is in the same order as ```gdf_harris``` and so on. 

In [223]:
%%time
df_closest_tanks_harris = nearest_neighbor(gdf_harris, gdf_tanks)
df_closest_tanks_harris.head()

CPU times: user 2min 41s, sys: 918 ms, total: 2min 42s
Wall time: 2min 42s


Unnamed: 0,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,geometry
0,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,POINT (-95.41283 29.65269)
1,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,POINT (-95.41283 29.65269)
2,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,POINT (-95.41283 29.65269)
3,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,POINT (-95.41283 29.65269)
4,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,POINT (-95.41283 29.65269)
...,...,...,...,...,...,...,...
2335203,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,POINT (-95.43760 29.86794)
2335204,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,POINT (-95.43760 29.86794)
2335205,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,POINT (-95.43760 29.86794)
2335206,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,POINT (-95.43760 29.86794)


Therefore, merging the two ```df_closest_tanks_harris``` and ```df_harris``` will create a new dataframe, ```df_harris_dist``` with the coordinates of each household corresponding to that of the tank nearest to it. This information is what we use to calculate distance.

In [224]:
df_closest_tanks_harris = df_closest_tanks_harris.reset_index(drop = True)
df_harris = df_harris.reset_index(drop = True)

Unnamed: 0,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,geometry
0,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,POINT (-95.41283 29.65269)
1,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,POINT (-95.41283 29.65269)
2,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,POINT (-95.41283 29.65269)
3,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,POINT (-95.41283 29.65269)
4,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,POINT (-95.41283 29.65269)
...,...,...,...,...,...,...,...
2335203,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,POINT (-95.43760 29.86794)
2335204,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,POINT (-95.43760 29.86794)
2335205,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,POINT (-95.43760 29.86794)
2335206,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,POINT (-95.43760 29.86794)


In [226]:
df_harris_dist = df_harris.merge(df_closest_tanks_harris, left_index=True, right_index = True)
df_harris_dist = df_harris_dist.drop(['geometry_x', 'geometry_y'], axis = 1)
df_harris_dist.head()

Unnamed: 0,has_child,age_code,lat_h_4326,lon_h_4326,lat_h_3857,lon_h_3857,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857
0,1,H,29.738000,-95.606000,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06
1,0,H,29.738000,-95.606000,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06
2,0,M,29.738000,-95.606000,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06
3,0,M,29.738000,-95.606000,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06
4,1,F,29.738000,-95.606000,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06
...,...,...,...,...,...,...,...,...,...,...,...,...
2335203,0,E,29.853170,-95.565193,-1.063827e+07,3.484690e+06,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06
2335204,0,C,29.836561,-95.533901,-1.063479e+07,3.482559e+06,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06
2335205,0,H,29.865044,-95.599070,-1.064204e+07,3.486214e+06,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06
2335206,0,B,29.835408,-95.555649,-1.063721e+07,3.482411e+06,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06


To compute the distance between the two sets of coordinates (the household ones and the ones of the nearest tank), we use the haversine library. This library calculates the distance between two coordinates in EPSG 4326 projection, in kilometers. We multiplied the value by 1,000 to find the distance in meters.

In [227]:
%%time

def distancer(row):
    coords_1 = (row['lat_h_4326'], row['lon_h_4326'])
    coords_2 = (row['lat_t_4326'], row['lon_t_4326'])
    return (hs.haversine(coords_1, coords_2) * 1000)

df_harris_dist['distance_m'] = df_harris_dist.apply(distancer, axis=1)
df_harris_dist

CPU times: user 36.2 s, sys: 573 ms, total: 36.8 s
Wall time: 36.9 s


Unnamed: 0,has_child,age_code,lat_h_4326,lon_h_4326,lat_h_3857,lon_h_3857,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,distance_m
0,1,H,29.738000,-95.606000,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,20931.486080
1,0,H,29.738000,-95.606000,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,20931.486080
2,0,M,29.738000,-95.606000,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,20931.486080
3,0,M,29.738000,-95.606000,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,20931.486080
4,1,F,29.738000,-95.606000,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,29.652685,-95.412834,-1.062131e+07,3.458983e+06,20931.486080
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2335203,0,E,29.853170,-95.565193,-1.063827e+07,3.484690e+06,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,12413.140255
2335204,0,C,29.836561,-95.533901,-1.063479e+07,3.482559e+06,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,9920.965430
2335205,0,H,29.865044,-95.599070,-1.064204e+07,3.486214e+06,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,15573.298866
2335206,0,B,29.835408,-95.555649,-1.063721e+07,3.482411e+06,closed_roof_tank,57.6,29.867938,-95.437601,-1.062407e+07,3.486586e+06,11945.509912


Dropping latitude and longitude coordinates in the 4326 projection not used in our GPU visualizations (that this data is processed for). Also dropping latitude and longitude for nearest tanks, because this is the data for plotting households. Then, calculating distance in miles, as stipulated by our researcher.

In [228]:
df_harris_dist = df_harris_dist.drop(['lat_h_4326', 'lon_h_4326', 'lat_t_4326', 'lon_t_4326', 'lat_t_3857', 'lon_t_3857'], axis = 1)

In [229]:
df_harris_dist['distance_mi']  = df_harris_dist['distance_m'] / 1609.344
df_harris_dist

Unnamed: 0,has_child,age_code,lat_h_3857,lon_h_3857,tank_type,diameter,distance_m,distance_mi
0,1,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222
1,0,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222
2,0,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222
3,0,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222
4,1,F,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222
...,...,...,...,...,...,...,...,...
2335203,0,E,-1.063827e+07,3.484690e+06,closed_roof_tank,57.6,12413.140255,7.713168
2335204,0,C,-1.063479e+07,3.482559e+06,closed_roof_tank,57.6,9920.965430,6.164602
2335205,0,H,-1.064204e+07,3.486214e+06,closed_roof_tank,57.6,15573.298866,9.676799
2335206,0,B,-1.063721e+07,3.482411e+06,closed_roof_tank,57.6,11945.509912,7.422596


Then, we categorize each household by its distances from the nearest tank. These boundaries were set by our researcher. Using the numpy library's ```.select()``` function, we can assign different values to each category. Households within 0.5 miles of a tank are marked as ```1```, households between 0.5 miles and one mile are marked as ```2``` and households between one and five miles from a tank are marked as ```3```. All other households are marked as ```4```.

In [230]:
conditions_harris = [(df_harris_dist['distance_mi'] <= 0.5),
              ((df_harris_dist['distance_mi'] > 0.5) & (df_harris_dist['distance_mi'] <= 1)),
              ((df_harris_dist['distance_mi'] > 1) & (df_harris_dist['distance_mi'] <= 5)),
              (df_harris_dist['distance_mi'] > 5)]

values_harris = [1, 2, 3, 4]

df_harris_dist['distance_category'] = np.select(conditions_harris, values_harris)
df_harris_dist

Unnamed: 0,has_child,age_code,lat_h_3857,lon_h_3857,tank_type,diameter,distance_m,distance_mi,distance_category
0,1,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4
1,0,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4
2,0,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4
3,0,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4
4,1,F,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4
...,...,...,...,...,...,...,...,...,...
2335203,0,E,-1.063827e+07,3.484690e+06,closed_roof_tank,57.6,12413.140255,7.713168,4
2335204,0,C,-1.063479e+07,3.482559e+06,closed_roof_tank,57.6,9920.965430,6.164602,4
2335205,0,H,-1.064204e+07,3.486214e+06,closed_roof_tank,57.6,15573.298866,9.676799,4
2335206,0,B,-1.063721e+07,3.482411e+06,closed_roof_tank,57.6,11945.509912,7.422596,4


##### Processing the data for GPU visualizations
Next, we process this data specifically for creating visualizations of it with the GPUs through the Cuxfilter library. 

The Datashader plotting library that Cuxfilter uses to create our visualization through the use of Graphical Processing Units (GPUs) is optimized for working with large dataframes. This comes with a couple restraints, however. One of these is that Datashader only takes numerical inputs when creating the custom charts the user can interact with, like the multiselect chart or the range slider. This means that instead of being able to categorize each household by whether or not its head of household is eldery by labelling it with ```strings``` as ```'Elderly'``` or ```'No elderly'```, we must label it numerically. Therefore, we must convert each age code to a number that indicates whether or not that household has an elderly head of household.

This is done with the numpy library's ```.where()``` function, which uses if-else conditions to assign values in a new column. In the code below, if the age_code is ```J```, ```K```, ```L``` or ```M```, the household is marked as ```1```, meaning elderly (this is based on the InfoUSA data dictionary), and marked as ```2```, not elderly, for all other values. 

In [232]:
df_harris_dist['is_elderly'] = np.where(((df_harris_dist['age_code'] == 'J') | (df_harris_dist['age_code'] == 'K') |
                                       (df_harris_dist['age_code'] == 'L') | (df_harris_dist['age_code'] == 'M')), 1, 2)
df_harris_dist

Unnamed: 0,has_child,age_code,lat_h_3857,lon_h_3857,tank_type,diameter,distance_m,distance_mi,distance_category,is_elderly
0,1,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4,2
1,0,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4,2
2,0,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4,1
3,0,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4,1
4,1,F,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4,2
...,...,...,...,...,...,...,...,...,...,...
2335203,0,E,-1.063827e+07,3.484690e+06,closed_roof_tank,57.6,12413.140255,7.713168,4,2
2335204,0,C,-1.063479e+07,3.482559e+06,closed_roof_tank,57.6,9920.965430,6.164602,4,2
2335205,0,H,-1.064204e+07,3.486214e+06,closed_roof_tank,57.6,15573.298866,9.676799,4,2
2335206,0,B,-1.063721e+07,3.482411e+06,closed_roof_tank,57.6,11945.509912,7.422596,4,2


To remain consistent the same structure as above, even though the ```has_child``` column is already numerical, we changed the values so that ```1``` indicates that the household has children, ```2``` indicates that the household has no children, and ```0``` indicates that the point is a tank. Previously, ```0``` indicated no children and ```1``` indicated children. In all our categorical variable columns, ```0``` indicates that the point is a tank, so we wanted to remain consistent.

In [233]:
df_harris_dist['has_child'] = np.where(df_harris_dist['has_child'] == 1, 1, 2)
df_harris_dist

Unnamed: 0,has_child,age_code,lat_h_3857,lon_h_3857,tank_type,diameter,distance_m,distance_mi,distance_category,is_elderly
0,1,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4,2
1,2,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4,2
2,2,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4,1
3,2,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4,1
4,1,F,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.486080,13.006222,4,2
...,...,...,...,...,...,...,...,...,...,...
2335203,2,E,-1.063827e+07,3.484690e+06,closed_roof_tank,57.6,12413.140255,7.713168,4,2
2335204,2,C,-1.063479e+07,3.482559e+06,closed_roof_tank,57.6,9920.965430,6.164602,4,2
2335205,2,H,-1.064204e+07,3.486214e+06,closed_roof_tank,57.6,15573.298866,9.676799,4,2
2335206,2,B,-1.063721e+07,3.482411e+06,closed_roof_tank,57.6,11945.509912,7.422596,4,2


In addition, the Cuxfilter library only pulls coordinates from two columns: on latitude and one longitude column. This means that all the points displayed in the dashboard must be in the same column. Therefore, to plot tanks and households on the same dashboard, we append the dataframe with the coordinates for each tank to the dataframe with the coordinates for each household. To do so, the columns must be the same across both columns. Therefore, we renamed the ```lat_h_3857``` and ```lon_h_3857``` columns in the ```df_harris_dist``` dataframe to ```lat_3857``` and ```lon_3857```. When the ```df_tanks_harris``` dataframe is appended to this one, we will have general latitude and longitude columns including coordinate information for all the households and tanks in Harris County.

In [234]:
df_harris_dist.rename(columns = {'lat_h_3857': 'lat_3857', 'lon_h_3857': 'lon_3857'}, inplace = True)

In order for the tanks to display on Cuxfilter when using the distance range slider, we set the distance to the maximum distance between a household and a tank. This is a limited solution potentially solveable by calculating the distance for each tank to the nearest household and including those values.

We add the ```has_child```, ```distance_category``` and ```is_elderly``` columns to the ```df_tanks_harris``` dataframe, setting all their values to ```0``` to indicate that the point is a tank when plotted on the dashboard.

In [235]:
df_harris_dist['distance_mi'].max()

32.552652838389264

In [242]:
df_tanks_harris = df_tanks_harris.drop(['state', 'county', 'lat_t_4326', 'lon_t_4326', 'geometry'], axis = 1)
df_tanks_harris['has_child'] = 0
df_tanks_harris['distance_category'] = 0
df_tanks_harris['is_elderly'] = 0
df_tanks_harris['distance_mi'] = 35
df_tanks_harris.rename(columns = {'lat_t_3857': 'lat_3857', 'lon_t_3857': 'lon_3857'}, inplace = True)
df_tanks_harris

Unnamed: 0,tank_type,diameter,lat_3857,lon_3857,has_child,distance_category,is_elderly,distance_mi
787,closed_roof_tank,35.4,-1.061876e+07,3.500643e+06,0,0,0,35
788,closed_roof_tank,22.2,-1.061869e+07,3.500631e+06,0,0,0,35
789,closed_roof_tank,16.8,-1.062179e+07,3.496787e+06,0,0,0,35
790,closed_roof_tank,21.6,-1.062392e+07,3.496593e+06,0,0,0,35
791,closed_roof_tank,18.0,-1.062221e+07,3.503704e+06,0,0,0,35
...,...,...,...,...,...,...,...,...
89848,narrow_closed_roof_tank,4.2,-1.062105e+07,3.488964e+06,0,0,0,35
89849,narrow_closed_roof_tank,4.2,-1.062105e+07,3.488936e+06,0,0,0,35
89850,narrow_closed_roof_tank,4.8,-1.062105e+07,3.488927e+06,0,0,0,35
89851,closed_roof_tank,20.4,-1.061700e+07,3.488957e+06,0,0,0,35


In [243]:
df_harris_merged = df_harris_dist.append(df_tanks_harris, ignore_index = True)
df_harris_merged

  df_harris_merged = df_harris_dist.append(df_tanks_harris, ignore_index = True)


Unnamed: 0,has_child,age_code,lat_3857,lon_3857,tank_type,diameter,distance_m,distance_mi,distance_category,is_elderly
0,1,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.48608,13.006222,4,2
1,2,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.48608,13.006222,4,2
2,2,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.48608,13.006222,4,1
3,2,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.48608,13.006222,4,1
4,1,F,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.48608,13.006222,4,2
...,...,...,...,...,...,...,...,...,...,...
2336530,0,,-1.062105e+07,3.488964e+06,narrow_closed_roof_tank,4.2,,35.000000,0,0
2336531,0,,-1.062105e+07,3.488936e+06,narrow_closed_roof_tank,4.2,,35.000000,0,0
2336532,0,,-1.062105e+07,3.488927e+06,narrow_closed_roof_tank,4.8,,35.000000,0,0
2336533,0,,-1.061700e+07,3.488957e+06,closed_roof_tank,20.4,,35.000000,0,0


Finally, we save this as a parquet file so we can use it in our visualizations.

In [None]:
df_harris_merged.to_parquet('/hpc/group/codeplus22-vis/infousa_copy/distances_harris_final.parquet')

In [245]:
df = pd.read_parquet('/hpc/group/codeplus22-vis/infousa_copy/distances_harris_final.parquet')
df

Unnamed: 0,has_child,age_code,lat_3857,lon_3857,tank_type,diameter,distance_m,distance_mi,distance_category,is_elderly
0,1,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.48608,13.006222,4,2
1,2,H,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.48608,13.006222,4,2
2,2,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.48608,13.006222,4,1
3,2,M,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.48608,13.006222,4,1
4,1,F,-1.064281e+07,3.469916e+06,closed_roof_tank,10.2,20931.48608,13.006222,4,2
...,...,...,...,...,...,...,...,...,...,...
2336530,0,,-1.062105e+07,3.488964e+06,narrow_closed_roof_tank,4.2,,35.000000,0,0
2336531,0,,-1.062105e+07,3.488936e+06,narrow_closed_roof_tank,4.2,,35.000000,0,0
2336532,0,,-1.062105e+07,3.488927e+06,narrow_closed_roof_tank,4.8,,35.000000,0,0
2336533,0,,-1.061700e+07,3.488957e+06,closed_roof_tank,20.4,,35.000000,0,0


#### Charleston County
The same process from above is repeated for Charleston County.

##### Finding the distance between each household and the nearest tank

In [246]:
gdf_charleston = gpd.GeoDataFrame(
    df_charleston, geometry=gpd.points_from_xy(df_charleston.lon_h_4326, df_charleston.lat_h_4326))
gdf_charleston = gdf_charleston[['geometry']]
gdf_charleston

Unnamed: 0,geometry
43688371,POINT (-79.93080 32.77650)
43688372,POINT (-79.93080 32.77650)
43688373,POINT (-79.93080 32.77650)
43688374,POINT (-79.93080 32.77650)
43688375,POINT (-79.93080 32.77650)
...,...
101215466,POINT (-79.99170 32.78700)
101215467,POINT (-79.99170 32.78700)
101215468,POINT (-79.99170 32.78700)
101215469,POINT (-79.99170 32.78700)


In [247]:
gdf_tanks = gpd.GeoDataFrame(
    df_tanks, geometry=gpd.points_from_xy(df_tanks.lon_t_4326, df_tanks.lat_t_4326))
gdf_tanks

Unnamed: 0,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,geometry
0,closed_roof_tank,39.6,40.625572,-73.745231,-8.209282e+06,4.957270e+06,POINT (-73.74523 40.62557)
1,closed_roof_tank,19.8,40.624761,-73.744420,-8.209191e+06,4.957151e+06,POINT (-73.74442 40.62476)
2,closed_roof_tank,12.6,40.626086,-73.746257,-8.209396e+06,4.957345e+06,POINT (-73.74626 40.62609)
3,closed_roof_tank,30.6,40.625786,-73.746203,-8.209390e+06,4.957301e+06,POINT (-73.74620 40.62579)
4,closed_roof_tank,24.0,40.625781,-73.745813,-8.209346e+06,4.957300e+06,POINT (-73.74581 40.62578)
...,...,...,...,...,...,...,...
98164,narrow_closed_roof_tank,5.4,39.777431,-104.920718,-1.167972e+07,4.833652e+06,POINT (-104.92072 39.77743)
98165,narrow_closed_roof_tank,4.8,39.777301,-104.920631,-1.167971e+07,4.833633e+06,POINT (-104.92063 39.77730)
98166,narrow_closed_roof_tank,3.6,39.777701,-104.920609,-1.167971e+07,4.833691e+06,POINT (-104.92061 39.77770)
98167,narrow_closed_roof_tank,4.8,39.776628,-104.920617,-1.167971e+07,4.833535e+06,POINT (-104.92062 39.77663)


In [248]:
%%time
df_closest_tanks_charleston = nearest_neighbor(gdf_charleston, gdf_tanks)
df_closest_tanks_charleston

CPU times: user 21.2 s, sys: 43.6 ms, total: 21.2 s
Wall time: 21.3 s


Unnamed: 0,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,geometry
0,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,POINT (-79.95022 32.76141)
1,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,POINT (-79.95022 32.76141)
2,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,POINT (-79.95022 32.76141)
3,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,POINT (-79.95022 32.76141)
4,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,POINT (-79.95022 32.76141)
...,...,...,...,...,...,...,...
245095,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,POINT (-79.96275 32.82609)
245096,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,POINT (-79.96275 32.82609)
245097,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,POINT (-79.96275 32.82609)
245098,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,POINT (-79.96275 32.82609)


In [249]:
df_closest_tanks_charleston = df_closest_tanks_charleston.reset_index(drop = True)
df_closest_tanks_charleston = df_closest_tanks_charleston.reset_index(drop = True)

Unnamed: 0,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,geometry
0,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,POINT (-79.95022 32.76141)
1,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,POINT (-79.95022 32.76141)
2,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,POINT (-79.95022 32.76141)
3,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,POINT (-79.95022 32.76141)
4,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,POINT (-79.95022 32.76141)
...,...,...,...,...,...,...,...
245095,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,POINT (-79.96275 32.82609)
245096,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,POINT (-79.96275 32.82609)
245097,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,POINT (-79.96275 32.82609)
245098,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,POINT (-79.96275 32.82609)


In [251]:
df_charleston_dist = df_charleston.merge(df_closest_tanks_charleston, left_index=True, right_index = True)
df_charleston_dist = df_charleston_dist.drop(['geometry_x', 'geometry_y'], axis = 1)
df_charleston_dist

Unnamed: 0,has_child,age_code,lat_h_4326,lon_h_4326,lat_h_3857,lon_h_3857,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857
0,0,C,32.7765,-79.9308,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06
1,0,M,32.7765,-79.9308,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06
2,0,L,32.7765,-79.9308,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06
3,1,D,32.7765,-79.9308,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06
4,0,M,32.7765,-79.9308,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06
...,...,...,...,...,...,...,...,...,...,...,...,...
245095,0,A,32.7870,-79.9917,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06
245096,0,F,32.7870,-79.9917,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06
245097,0,I,32.7870,-79.9917,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06
245098,0,B,32.7870,-79.9917,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06


In [252]:
%%time

def distancer(row):
    coords_1 = (row['lat_h_4326'], row['lon_h_4326'])
    coords_2 = (row['lat_t_4326'], row['lon_t_4326'])
    return (hs.haversine(coords_1, coords_2) * 1000)

df_charleston_dist['distance_m'] = df_charleston_dist.apply(distancer, axis=1)
df_charleston_dist

CPU times: user 4.06 s, sys: 22.8 ms, total: 4.09 s
Wall time: 4.1 s


Unnamed: 0,has_child,age_code,lat_h_4326,lon_h_4326,lat_h_3857,lon_h_3857,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,distance_m
0,0,C,32.7765,-79.9308,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,2472.401421
1,0,M,32.7765,-79.9308,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,2472.401421
2,0,L,32.7765,-79.9308,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,2472.401421
3,1,D,32.7765,-79.9308,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,2472.401421
4,0,M,32.7765,-79.9308,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,32.761407,-79.950218,-8.900018e+06,3.863677e+06,2472.401421
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245095,0,A,32.7870,-79.9917,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,5119.645368
245096,0,F,32.7870,-79.9917,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,5119.645368
245097,0,I,32.7870,-79.9917,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,5119.645368
245098,0,B,32.7870,-79.9917,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,32.826088,-79.962751,-8.901413e+06,3.872243e+06,5119.645368


In [253]:
df_charleston_dist = df_charleston_dist.drop(['lat_h_4326', 'lon_h_4326', 'lat_t_4326', 'lon_t_4326', 'lat_t_3857', 'lon_t_3857'], axis = 1)

In [254]:
df_charleston_dist['distance_mi']  = df_charleston_dist['distance_m'] / 1609.344
df_charleston_dist

Unnamed: 0,has_child,age_code,lat_h_3857,lon_h_3857,tank_type,diameter,distance_m,distance_mi
0,0,C,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279
1,0,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279
2,0,L,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279
3,1,D,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279
4,0,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279
...,...,...,...,...,...,...,...,...
245095,0,A,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200
245096,0,F,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200
245097,0,I,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200
245098,0,B,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200


In [255]:
conditions_charleston = [(df_charleston_dist['distance_mi'] <= 0.5),
              ((df_charleston_dist['distance_mi'] > 0.5) & (df_charleston_dist['distance_mi'] <= 1)),
              ((df_charleston_dist['distance_mi'] > 1) & (df_charleston_dist['distance_mi'] <= 5)),
              (df_charleston_dist['distance_mi'] > 5)]

values_charleston = [1, 2, 3, 4]

df_charleston_dist['distance_category'] = np.select(conditions_charleston, values_charleston)
df_charleston_dist

Unnamed: 0,has_child,age_code,lat_h_3857,lon_h_3857,tank_type,diameter,distance_m,distance_mi,distance_category
0,0,C,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3
1,0,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3
2,0,L,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3
3,1,D,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3
4,0,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3
...,...,...,...,...,...,...,...,...,...
245095,0,A,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3
245096,0,F,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3
245097,0,I,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3
245098,0,B,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3


##### Processing the data for GPU visualizations

In [257]:
df_charleston_dist['is_elderly'] = np.where(((df_charleston_dist['age_code'] == 'J') | (df_charleston_dist['age_code'] == 'K') |
                                       (df_charleston_dist['age_code'] == 'L') | (df_charleston_dist['age_code'] == 'M')), 1, 2)
df_charleston_dist

Unnamed: 0,has_child,age_code,lat_h_3857,lon_h_3857,tank_type,diameter,distance_m,distance_mi,distance_category,is_elderly
0,0,C,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,2
1,0,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
2,0,L,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
3,1,D,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,2
4,0,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
...,...,...,...,...,...,...,...,...,...,...
245095,0,A,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3,2
245096,0,F,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3,2
245097,0,I,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3,2
245098,0,B,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3,2


In [258]:
df_charleston_dist['has_child'] = np.where(df_charleston_dist['has_child'] == 1, 1, 2)
df_charleston_dist

Unnamed: 0,has_child,age_code,lat_h_3857,lon_h_3857,tank_type,diameter,distance_m,distance_mi,distance_category,is_elderly
0,2,C,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,2
1,2,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
2,2,L,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
3,1,D,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,2
4,2,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
...,...,...,...,...,...,...,...,...,...,...
245095,2,A,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3,2
245096,2,F,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3,2
245097,2,I,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3,2
245098,2,B,-8.904635e+06,3.867066e+06,closed_roof_tank,8.4,5119.645368,3.181200,3,2


In [259]:
df_charleston_dist.rename(columns = {'lat_h_3857': 'lat_3857', 'lon_h_3857': 'lon_3857'}, inplace = True)

In [260]:
df_charleston_dist['distance_mi'].max()

31.909051279452356

In [261]:
df_tanks_charleston = df_tanks_charleston.drop(['state', 'county', 'lat_t_4326', 'lon_t_4326', 'geometry'], axis = 1)
df_tanks_charleston['has_child'] = 0
df_tanks_charleston['distance_category'] = 0
df_tanks_charleston['is_elderly'] = 0
df_tanks_charleston['distance_mi'] = 35
df_tanks_charleston.rename(columns = {'lat_t_3857': 'lat_3857', 'lon_t_3857': 'lon_3857'}, inplace = True)
df_tanks_charleston

Unnamed: 0,tank_type,diameter,lat_3857,lon_3857,has_child,distance_category,is_elderly,distance_mi
26475,sedimentation_tank,42.0,-8.899485e+06,3.872865e+06,0,0,0,35
26476,sedimentation_tank,42.0,-8.899550e+06,3.872849e+06,0,0,0,35
26477,closed_roof_tank,22.8,-8.899416e+06,3.872105e+06,0,0,0,35
26478,closed_roof_tank,32.4,-8.899302e+06,3.872134e+06,0,0,0,35
26479,closed_roof_tank,31.2,-8.899021e+06,3.872302e+06,0,0,0,35
...,...,...,...,...,...,...,...,...
96278,spherical_tank,48.6,-8.898830e+06,3.872435e+06,0,0,0,35
96279,spherical_tank,51.6,-8.898897e+06,3.872415e+06,0,0,0,35
96280,closed_roof_tank,12.6,-8.898822e+06,3.872099e+06,0,0,0,35
96281,closed_roof_tank,12.6,-8.898841e+06,3.872089e+06,0,0,0,35


In [262]:
df_charleston_merged = df_charleston_dist.append(df_tanks_charleston, ignore_index = True)
df_charleston_merged

  df_charleston_merged = df_charleston_dist.append(df_tanks_charleston, ignore_index = True)


Unnamed: 0,has_child,age_code,lat_3857,lon_3857,tank_type,diameter,distance_m,distance_mi,distance_category,is_elderly
0,2,C,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,2
1,2,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
2,2,L,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
3,1,D,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,2
4,2,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
...,...,...,...,...,...,...,...,...,...,...
245345,0,,-8.898830e+06,3.872435e+06,spherical_tank,48.6,,35.000000,0,0
245346,0,,-8.898897e+06,3.872415e+06,spherical_tank,51.6,,35.000000,0,0
245347,0,,-8.898822e+06,3.872099e+06,closed_roof_tank,12.6,,35.000000,0,0
245348,0,,-8.898841e+06,3.872089e+06,closed_roof_tank,12.6,,35.000000,0,0


In [263]:
df_charleston_merged.to_parquet('/hpc/group/codeplus22-vis/infousa_copy/distances_charleston_final.parquet')

In [264]:
df = pd.read_parquet('/hpc/group/codeplus22-vis/infousa_copy/distances_charleston_final.parquet')
df

Unnamed: 0,has_child,age_code,lat_3857,lon_3857,tank_type,diameter,distance_m,distance_mi,distance_category,is_elderly
0,2,C,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,2
1,2,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
2,2,L,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
3,1,D,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,2
4,2,M,-8.897856e+06,3.865676e+06,external_floating_roof_tank,10.8,2472.401421,1.536279,3,1
...,...,...,...,...,...,...,...,...,...,...
245345,0,,-8.898830e+06,3.872435e+06,spherical_tank,48.6,,35.000000,0,0
245346,0,,-8.898897e+06,3.872415e+06,spherical_tank,51.6,,35.000000,0,0
245347,0,,-8.898822e+06,3.872099e+06,closed_roof_tank,12.6,,35.000000,0,0
245348,0,,-8.898841e+06,3.872089e+06,closed_roof_tank,12.6,,35.000000,0,0
