# Appending, Merging, and Processing Pandas Dataframes
### Processing tank risk and household data for GPU visualizations

### Import statements

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np



### Reading InfoUSA data

This is a pre-processed file including tank and household distances, transformed latitude longitude coordinates, as well as all of national risk index data for six natural hazards (earthquake, strong winds, hurricanes, tornadoes, cold floods, and riverine floods), as stipulated by our researcher. We then drop columns we will not be using in our visualizations.

In [3]:
df_hh = pd.read_parquet('/hpc/group/codeplus22-vis/infousa_copy/distances_all_hh_with_children_final.parquet')
df_hh

Unnamed: 0,zip,county,state,child_num,age_code,lat_h_3857,lon_h_3857,lat_h_4326,lon_h_4326,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_m,distance_mi,distance_category
0,18833,15,PA,1,C,-8.509454e+06,5.101307e+06,41.600392,-76.441724,2.050670,15.375901,5.380037,14.512438,-1.000000,17.062917,9.063660,53847.632898,33.459368,4
1,18833,15,PA,1,H,-8.499018e+06,5.096218e+06,41.566196,-76.347977,4.881886,15.876431,4.895073,24.892845,-1.000000,30.218719,13.460825,45869.438119,28.501947,4
2,18833,15,PA,0,E,-8.496356e+06,5.099448e+06,41.587904,-76.324061,4.881886,15.876431,4.895073,24.892845,-1.000000,30.218719,13.460825,46015.805516,28.592896,4
3,18833,15,PA,1,G,-8.509963e+06,5.103102e+06,41.612450,-76.446301,2.050670,15.375901,5.380037,14.512438,-1.000000,17.062917,9.063660,54518.419780,33.876175,4
4,18833,15,PA,1,G,-8.508370e+06,5.099066e+06,41.585339,-76.431989,2.050670,15.375901,5.380037,14.512438,-1.000000,17.062917,9.063660,53297.730315,33.117674,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53067356,92003,73,CA,0,H,-1.304989e+07,3.930304e+06,33.263291,-117.229201,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,53258.019576,33.092999,4
53067357,92003,73,CA,0,F,-1.304547e+07,3.934604e+06,33.295585,-117.189475,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,56199.475559,34.920735,4
53067358,92003,73,CA,2,L,-1.304803e+07,3.934243e+06,33.292877,-117.212471,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,56209.490501,34.926958,4
53067359,92003,73,CA,1,D,-1.304785e+07,3.933154e+06,33.284700,-117.210800,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,55287.090852,34.353806,4


In [4]:
df_hh = df_hh.drop(['zip', 'county', 'state', 'child_num', 'lat_h_4326','lon_h_4326', 'distance_m'], axis = 1)

### Using numpy's ```.where()``` to classify age codes

For our visualizations, we would like to have a tool allowing the user to select only points with elderly head of household (defining elderly as 65+, as used by the World Health Organization). Therefore, using the data dictionary provided by InfoUSA, age codes ```J```, ```K```, ```L```, or ```M``` will be classified as elderly.

Since we are processing this data to be used by Datashader through the Cuxfilter library, we need to be aware of some of the formatting of this dataframe so that it meets certain requirements. The Datashader plotting library that Cuxfilter uses to create our visualization through the use of Graphical Processing Units (GPUs) is optimized for working with large dataframes. This comes with a couple constraints, however. One of these is that Datashader only takes numerical inputs when creating the custom charts the user can interact with, like the multiselect chart or the range slider. This means that instead of being able to categorize each household by whether or not its head of household is eldery by labelling it with ```strings``` as ```'Elderly'``` or ```'No elderly'```, we must label it numerically. Therefore, we must convert each age code to a number that indicates whether or not that household has an elderly head of household.

This is done with the numpy library's ```.where()``` function, which uses if-else conditions to assign values in a new column. In the code below, if the age_code is ```J```, ```K```, ```L``` or ```M```, the household is marked as ```1```, meaning elderly, and marked as ```2```, not elderly, for all other values. 

In [5]:
df_hh['is_elderly'] = np.where((df_hh['age_code'] == 'J') | (df_hh['age_code'] == 'K') | (df_hh['age_code'] == 'L') | (df_hh['age_code'] == 'M'), 1, 2)
df_hh

Unnamed: 0,age_code,lat_h_3857,lon_h_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_mi,distance_category,is_elderly
0,C,-8.509454e+06,5.101307e+06,2.050670,15.375901,5.380037,14.512438,-1.000000,17.062917,9.063660,33.459368,4,2
1,H,-8.499018e+06,5.096218e+06,4.881886,15.876431,4.895073,24.892845,-1.000000,30.218719,13.460825,28.501947,4,2
2,E,-8.496356e+06,5.099448e+06,4.881886,15.876431,4.895073,24.892845,-1.000000,30.218719,13.460825,28.592896,4,2
3,G,-8.509963e+06,5.103102e+06,2.050670,15.375901,5.380037,14.512438,-1.000000,17.062917,9.063660,33.876175,4,2
4,G,-8.508370e+06,5.099066e+06,2.050670,15.375901,5.380037,14.512438,-1.000000,17.062917,9.063660,33.117674,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53067356,H,-1.304989e+07,3.930304e+06,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,33.092999,4,2
53067357,F,-1.304547e+07,3.934604e+06,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,34.920735,4,2
53067358,L,-1.304803e+07,3.934243e+06,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,34.926958,4,1
53067359,D,-1.304785e+07,3.933154e+06,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,34.353806,4,2


### Using ```.rename()``` to rename columns

In addition, the Cuxfilter library only pulls coordinates from two columns: on latitude and one longitude column. This means that all the points displayed in the dashboard must be in the same column. Therefore, to plot tanks and households on the same dashboard, we append the dataframe with the coordinates for each tank to the dataframe with the coordinates for each household. To do so, the columns must be the same across both columns. Therefore, we renamed the ```lat_h_3857``` and ```lon_h_3857``` columns in the ```df_hh``` dataframe to ```lat_3857``` and ```lon_3857```. When the ```df_tanks``` dataframe is appended to this one, we will have general latitude and longitude columns including coordinate information for all the households and tanks in the US.

In [7]:
df_hh.rename(columns = {'lat_h_3857': 'lat_3857'}, inplace = True)
df_hh.rename(columns = {'lon_h_3857': 'lon_3857'}, inplace = True)
df_hh

Unnamed: 0,age_code,lat_3857,lon_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_mi,distance_category,is_elderly
0,C,-8.509454e+06,5.101307e+06,2.050670,15.375901,5.380037,14.512438,-1.000000,17.062917,9.063660,33.459368,4,2
1,H,-8.499018e+06,5.096218e+06,4.881886,15.876431,4.895073,24.892845,-1.000000,30.218719,13.460825,28.501947,4,2
2,E,-8.496356e+06,5.099448e+06,4.881886,15.876431,4.895073,24.892845,-1.000000,30.218719,13.460825,28.592896,4,2
3,G,-8.509963e+06,5.103102e+06,2.050670,15.375901,5.380037,14.512438,-1.000000,17.062917,9.063660,33.876175,4,2
4,G,-8.508370e+06,5.099066e+06,2.050670,15.375901,5.380037,14.512438,-1.000000,17.062917,9.063660,33.117674,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53067356,H,-1.304989e+07,3.930304e+06,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,33.092999,4,2
53067357,F,-1.304547e+07,3.934604e+06,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,34.920735,4,2
53067358,L,-1.304803e+07,3.934243e+06,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,34.926958,4,1
53067359,D,-1.304785e+07,3.933154e+06,34.617855,11.334705,1.771182,19.203448,2.036342,18.929178,14.648785,34.353806,4,2


### Reading and Processing Tank Data

This is anothe pre-processed dataframe, which contains the coordinates of each tank and also each of the six national risk index values associated with each tank based on county. We then drop columns we will not need in our visualizations, and rename columns as explained above.

In [10]:
df_tanks = gpd.read_file('/hpc/group/codeplus22-vis/infousa_copy/tanks_risk_score_final.shp')
df_tanks = df_tanks.drop(['state', 'tank_type', 'diameter', 'county', 'on_floodpl'], axis = 1)
df_tanks.head(n=3)

Unnamed: 0,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,adj_risk,geometry
0,40.625572,-73.745231,-8209282.0,4957270.0,6.887656,14.447002,4.095282,13.081208,6.959016,14.834784,10.050825,10.050825,"POLYGON ((-73.74547 40.62575, -73.74500 40.625..."
1,40.624761,-73.74442,-8209191.0,4957151.0,6.887656,14.447002,4.095282,13.081208,6.959016,14.834784,10.050825,10.050825,"POLYGON ((-73.74465 40.62485, -73.74419 40.624..."
2,40.626086,-73.746257,-8209396.0,4957345.0,6.887656,14.447002,4.095282,13.081208,6.959016,14.834784,10.050825,10.050825,"POLYGON ((-73.74633 40.62615, -73.74618 40.626..."


In [11]:
df_tanks.rename(columns = {'lat_t_3857': 'lat_3857'}, inplace = True)
df_tanks.rename(columns = {'lon_t_3857': 'lon_3857'}, inplace = True)

In order for the tanks to display on Cuxfilter when using the distance range slider, we set the distance to the maximum distance between a household and a tank. This is because the distance column in the final merged dataframe used in our visualizations will represent the distance between a household and the tank nearest to it. However, for tanks, there is no associated distance, and when users play with the distance range slider, tanks will not appear on the visualization. We get around this by setting the distance to the maximum distance between a household and a tank. This is a limited solution potentially solveable by calculating the distance for each tank to the nearest household and including those values.

We add the ```distance_category``` and ```is_elderly``` columns to the ```df_tanks_harris``` dataframe, setting all their values to ```0``` to indicate that the point is a tank when plotted on the dashboard.

In [15]:
print(df_hh['distance_mi'].max())

213.4276172929592


In [17]:
df_tanks['distance_mi'] = 215
df_tanks['distance_category'] = 0
df_tanks['is_elderly'] = 0
df_tanks.head(n=3)

Unnamed: 0,lat_t_4326,lon_t_4326,lat_3857,lon_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,adj_risk,geometry,distance_category,is_elderly,distance_mi
0,40.625572,-73.745231,-8209282.0,4957270.0,6.887656,14.447002,4.095282,13.081208,6.959016,14.834784,10.050825,10.050825,"POLYGON ((-73.74547 40.62575, -73.74500 40.625...",0,0,215
1,40.624761,-73.74442,-8209191.0,4957151.0,6.887656,14.447002,4.095282,13.081208,6.959016,14.834784,10.050825,10.050825,"POLYGON ((-73.74465 40.62485, -73.74419 40.624...",0,0,215
2,40.626086,-73.746257,-8209396.0,4957345.0,6.887656,14.447002,4.095282,13.081208,6.959016,14.834784,10.050825,10.050825,"POLYGON ((-73.74633 40.62615, -73.74618 40.626...",0,0,215


### Using ```.append()``` to join the two dataframes

Here we are appending the ```df_tanks```, our tank data, to ```df_hh```, our household data. Once this data is appended, the```.drop()``` method to drop columns we will not use in our visualizations. In this method, ```axis``` is set to ```1``` to indicate that we are dropping columns, not indices.

We then save this as a ```.parquet()``` file.

In [18]:
df = df_hh.append(df_tanks, ignore_index=True)
df = df.drop(['lat_t_4326', 'lon_t_4326', 'adj_risk', 'geometry'], axis = 1)
df

  df = df_hh.append(df_tanks, ignore_index=True)


Unnamed: 0,age_code,lat_3857,lon_3857,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_mi,distance_category,is_elderly
0,C,-8.509454e+06,5.101307e+06,2.050670,15.375901,5.380037,14.512438,-1.0,17.062917,9.063660,33.459368,4,2
1,H,-8.499018e+06,5.096218e+06,4.881886,15.876431,4.895073,24.892845,-1.0,30.218719,13.460825,28.501947,4,2
2,E,-8.496356e+06,5.099448e+06,4.881886,15.876431,4.895073,24.892845,-1.0,30.218719,13.460825,28.592896,4,2
3,G,-8.509963e+06,5.103102e+06,2.050670,15.375901,5.380037,14.512438,-1.0,17.062917,9.063660,33.876175,4,2
4,G,-8.508370e+06,5.099066e+06,2.050670,15.375901,5.380037,14.512438,-1.0,17.062917,9.063660,33.117674,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53165525,,-1.167972e+07,4.833652e+06,7.743007,12.625942,-1.000000,45.758161,-1.0,6.179840,12.051158,215.000000,0,0
53165526,,-1.167971e+07,4.833633e+06,7.743007,12.625942,-1.000000,45.758161,-1.0,6.179840,12.051158,215.000000,0,0
53165527,,-1.167971e+07,4.833691e+06,7.743007,12.625942,-1.000000,45.758161,-1.0,6.179840,12.051158,215.000000,0,0
53165528,,-1.167971e+07,4.833535e+06,7.743007,12.625942,-1.000000,45.758161,-1.0,6.179840,12.051158,215.000000,0,0


In [20]:
df.to_parquet('/hpc/group/codeplus22-vis/infousa_copy/hh_tank_risk.parquet')

### Processing dataframes for individual natural hazards

Since we want to plot all of the risk dashboards separately, are now breaking down this dataframe, one for each type of natural hazard.

#### Processing the dataframe for earthquake risk

We are taking the macro dataframe we made above and dropping all of the other risks except for earthquake before exporting it to a parquet file to be used in visualizations.

In [13]:
df_erqk = df.drop(['swnd_risks', 'hrcn_risks', 'trnd_risks', 'cfld_risks', 'rfld_risks', 'avg_risk'], axis = 1)
df_erqk = df_erqk[['is_elderly', 'distance_mi', 'erqk_risks', 'lat_3857', 'lon_3857']]
df_erqk.rename(columns = {'erqk_risks': 'earthquake_risk'}, inplace = True)
df_erqk

Unnamed: 0,is_elderly,distance_mi,earthquake_risk,lat_3857,lon_3857
0,2,33.459368,2.050670,-8.509454e+06,5.101307e+06
1,2,28.501947,4.881886,-8.499018e+06,5.096218e+06
2,2,28.592896,4.881886,-8.496356e+06,5.099448e+06
3,2,33.876175,2.050670,-8.509963e+06,5.103102e+06
4,2,33.117674,2.050670,-8.508370e+06,5.099066e+06
...,...,...,...,...,...
53165525,0,215.000000,7.743007,-1.167972e+07,4.833652e+06
53165526,0,215.000000,7.743007,-1.167971e+07,4.833633e+06
53165527,0,215.000000,7.743007,-1.167971e+07,4.833691e+06
53165528,0,215.000000,7.743007,-1.167971e+07,4.833535e+06


In [5]:
df_erqk.to_parquet('/hpc/group/codeplus22-vis/infousa_copy/earthquake_risk_final.parquet')

#### Processing dataframe for strong wind risk
Following the same steps as above, this time dropping all of the other risks except for strong wind risk before exporting it to a parquet file to be used in visualizations.

In [16]:
df_swnd = df.drop(['erqk_risks', 'hrcn_risks', 'trnd_risks', 'cfld_risks', 'rfld_risks', 'avg_risk'], axis = 1)
df_swnd = df_swnd[['is_elderly', 'distance_mi', 'swnd_risks', 'lat_3857', 'lon_3857']]
df_swnd.rename(columns = {'swnd_risks': 'strong_wind_risk'}, inplace = True)
df_swnd

Unnamed: 0,is_elderly,distance_mi,strong_wind_risk,lat_3857,lon_3857
0,2,33.459368,15.375901,-8.509454e+06,5.101307e+06
1,2,28.501947,15.876431,-8.499018e+06,5.096218e+06
2,2,28.592896,15.876431,-8.496356e+06,5.099448e+06
3,2,33.876175,15.375901,-8.509963e+06,5.103102e+06
4,2,33.117674,15.375901,-8.508370e+06,5.099066e+06
...,...,...,...,...,...
53165525,0,215.000000,12.625942,-1.167972e+07,4.833652e+06
53165526,0,215.000000,12.625942,-1.167971e+07,4.833633e+06
53165527,0,215.000000,12.625942,-1.167971e+07,4.833691e+06
53165528,0,215.000000,12.625942,-1.167971e+07,4.833535e+06


In [17]:
df_swnd.to_parquet('/hpc/group/codeplus22-vis/infousa_copy/strong_wind_risk_final.parquet')

#### Processing dataframe for hurricane risk
Following the same steps as above, this time dropping all of the other risks except for hurricane risk before exporting it to a parquet file to be used in visualizations.

In [4]:
df_hrcn = df.drop(['erqk_risks','swnd_risks', 'trnd_risks', 'cfld_risks', 'rfld_risks', 'avg_risk'], axis = 1)
df_hrcn = df_hrcn[['is_elderly', 'distance_mi', 'hrcn_risks', 'lat_3857', 'lon_3857']]
df_hrcn.rename(columns = {'hrcn_risks': 'hurricane_risk'}, inplace = True)

df_hrcn

Unnamed: 0,is_elderly,distance_mi,hurricane_risk,lat_3857,lon_3857
0,2,33.459368,5.380037,-8.509454e+06,5.101307e+06
1,2,28.501947,4.895073,-8.499018e+06,5.096218e+06
2,2,28.592896,4.895073,-8.496356e+06,5.099448e+06
3,2,33.876175,5.380037,-8.509963e+06,5.103102e+06
4,2,33.117674,5.380037,-8.508370e+06,5.099066e+06
...,...,...,...,...,...
53165525,0,215.000000,-1.000000,-1.167972e+07,4.833652e+06
53165526,0,215.000000,-1.000000,-1.167971e+07,4.833633e+06
53165527,0,215.000000,-1.000000,-1.167971e+07,4.833691e+06
53165528,0,215.000000,-1.000000,-1.167971e+07,4.833535e+06


In [7]:
df_hrcn.to_parquet('/hpc/group/codeplus22-vis/infousa_copy/hurricane_risks_final.parquet')

#### Processing dataframe for tornado risk
Following the same steps as above, this time dropping all of the other risks except for tornado risk before exporting it to a parquet file to be used in visualizations.

In [5]:
df_trnd = df.drop(['erqk_risks','swnd_risks', 'hrcn_risks', 'cfld_risks', 'rfld_risks', 'avg_risk'], axis = 1)
df_trnd = df_trnd[['is_elderly', 'distance_mi','trnd_risks', 'lat_3857', 'lon_3857']]
df_trnd.rename(columns = {'trnd_risks': 'tornado_risk'}, inplace = True)
df_trnd

Unnamed: 0,is_elderly,distance_mi,tornado_risk,lat_3857,lon_3857
0,2,33.459368,14.512438,-8.509454e+06,5.101307e+06
1,2,28.501947,24.892845,-8.499018e+06,5.096218e+06
2,2,28.592896,24.892845,-8.496356e+06,5.099448e+06
3,2,33.876175,14.512438,-8.509963e+06,5.103102e+06
4,2,33.117674,14.512438,-8.508370e+06,5.099066e+06
...,...,...,...,...,...
53165525,0,215.000000,45.758161,-1.167972e+07,4.833652e+06
53165526,0,215.000000,45.758161,-1.167971e+07,4.833633e+06
53165527,0,215.000000,45.758161,-1.167971e+07,4.833691e+06
53165528,0,215.000000,45.758161,-1.167971e+07,4.833535e+06


In [6]:
df_trnd.to_parquet('/hpc/group/codeplus22-vis/infousa_copy/tornado_risks_final.parquet')

#### Processing dataframe for coastal flood risk
Following the same steps as above, this time dropping all of the other risks except for coastal flood risk before exporting it to a parquet file to be used in visualizations.

In [10]:
df_cfld = df.drop(['erqk_risks','swnd_risks', 'trnd_risks', 'hrcn_risks', 'rfld_risks', 'avg_risk'], axis = 1)
df_cfld = df_cfld[['is_elderly', 'distance_mi','cfld_risks', 'lat_3857', 'lon_3857']]
df_cfld.rename(columns = {'cfld_risks': 'coastal_flood_risk'}, inplace = True)

df_cfld

Unnamed: 0,is_elderly,distance_mi,coastal_flood_risk,lat_3857,lon_3857
0,2,33.459368,-1.0,-8.509454e+06,5.101307e+06
1,2,28.501947,-1.0,-8.499018e+06,5.096218e+06
2,2,28.592896,-1.0,-8.496356e+06,5.099448e+06
3,2,33.876175,-1.0,-8.509963e+06,5.103102e+06
4,2,33.117674,-1.0,-8.508370e+06,5.099066e+06
...,...,...,...,...,...
53165525,0,215.000000,-1.0,-1.167972e+07,4.833652e+06
53165526,0,215.000000,-1.0,-1.167971e+07,4.833633e+06
53165527,0,215.000000,-1.0,-1.167971e+07,4.833691e+06
53165528,0,215.000000,-1.0,-1.167971e+07,4.833535e+06


In [13]:
df_cfld.to_parquet('/hpc/group/codeplus22-vis/infousa_copy/coast_flood_risks_final.parquet')

#### Processing dataframe for riverine flood risk
Following the same steps as above, this time dropping all of the other risks except for riverine flood risk before exporting it to a parquet file to be used in visualizations.

In [12]:
df_rfld = df.drop(['erqk_risks','swnd_risks', 'trnd_risks', 'hrcn_risks', 'cfld_risks', 'avg_risk'], axis = 1)
df_rfld = df_rfld[[ 'is_elderly', 'distance_mi', 'rfld_risks', 'lat_3857', 'lon_3857']]
df_rfld.rename(columns = {'rfld_risks': 'riverine_flood_risk'}, inplace = True)

df_rfld

Unnamed: 0,is_elderly,distance_mi,riverine_flood_risk,lat_3857,lon_3857
0,2,33.459368,17.062917,-8.509454e+06,5.101307e+06
1,2,28.501947,30.218719,-8.499018e+06,5.096218e+06
2,2,28.592896,30.218719,-8.496356e+06,5.099448e+06
3,2,33.876175,17.062917,-8.509963e+06,5.103102e+06
4,2,33.117674,17.062917,-8.508370e+06,5.099066e+06
...,...,...,...,...,...
53165525,0,215.000000,6.179840,-1.167972e+07,4.833652e+06
53165526,0,215.000000,6.179840,-1.167971e+07,4.833633e+06
53165527,0,215.000000,6.179840,-1.167971e+07,4.833691e+06
53165528,0,215.000000,6.179840,-1.167971e+07,4.833535e+06


In [14]:
df_rfld.to_parquet('/hpc/group/codeplus22-vis/infousa_copy/riverine_flood_risks_final.parquet')

#### Processing dataframe for average risk
Following the same steps as above, this time dropping all of the other risks except for average risk before exporting it to a parquet file to be used in visualizations.

In [5]:
df_avg = df.drop(['erqk_risks','swnd_risks', 'trnd_risks', 'hrcn_risks', 'cfld_risks', 'rfld_risks'], axis = 1)
df_avg = df_avg[[ 'is_elderly', 'distance_mi', 'avg_risk', 'lat_3857', 'lon_3857']]
df_avg.rename(columns = {'avg_risk': 'average_risk'}, inplace = True)
df_avg

Unnamed: 0,is_elderly,distance_mi,average_risk,lat_3857,lon_3857
0,2,33.459368,9.063660,-8.509454e+06,5.101307e+06
1,2,28.501947,13.460825,-8.499018e+06,5.096218e+06
2,2,28.592896,13.460825,-8.496356e+06,5.099448e+06
3,2,33.876175,9.063660,-8.509963e+06,5.103102e+06
4,2,33.117674,9.063660,-8.508370e+06,5.099066e+06
...,...,...,...,...,...
53165525,0,215.000000,12.051158,-1.167972e+07,4.833652e+06
53165526,0,215.000000,12.051158,-1.167971e+07,4.833633e+06
53165527,0,215.000000,12.051158,-1.167971e+07,4.833691e+06
53165528,0,215.000000,12.051158,-1.167971e+07,4.833535e+06


In [7]:
df_avg.to_parquet('/hpc/group/codeplus22-vis/infousa_copy/average_risk_final.parquet')