# Appending, Merging, and Processing Pandas Dataframes
### Processing tank risk and household data for natural hazard GPU visualizations

### Import statements

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os



### Setting ```DATA_DIR```
In order to read in files from this repository, we must set ```DATA_DIR``` to be the data folder within this repository. This requires ```os.getcwd()``` to return the path to the processing folder of this repository, i.e. ```xxx/codeplus-celine-dcc-package/processing```, where ```xxx``` is the path to where you cloned this repository. If it does not, call ```os.chdir(path)``` with ```path``` set to ```xxx/codeplus-celine-dcc-package/processing``` before running the cell below.

In [2]:
# Derive the repository's data folder from the current working directory.
# Only the final path component is rewritten ('processing' -> 'data'), so a
# parent directory whose name happens to contain 'processing' is left intact
# (a plain str.replace over the whole path would rewrite it as well).
_cwd_parent, _cwd_leaf = os.path.split(os.getcwd())
DATA_DIR = os.path.join(_cwd_parent, _cwd_leaf.replace('processing', 'data'))
DATA_DIR

'/hpc/home/at341/ondemand/codeplus-celine-dcc-package/data'

### Reading InfoUSA data

This is a pre-processed file that includes tank and household distances, transformed latitude/longitude coordinates, and all of the National Risk Index data for six natural hazards (earthquakes, strong winds, hurricanes, tornadoes, coastal floods, and riverine floods), as stipulated by our researcher. To understand how it was processed in more detail, see processing notebook **05_all_us_dist_processing**. We then drop the columns we will not be using in our visualizations.

In [12]:
# Load the pre-processed household/tank table from the data folder and
# display it so its columns can be inspected.
households_path = DATA_DIR + '/distances_all_hh.parquet'
df = pd.read_parquet(households_path)
df

Unnamed: 0,child_num,age_code,lat_3857,lon_3857,lat_4326,lon_4326,erqk_risks,swnd_risks,hrcn_risks,trnd_risks,cfld_risks,rfld_risks,avg_risk,distance_mi,distance_category,is_elderly
0,3.0,C,-8.556472e+06,4.754685e+06,39.230097,-76.864096,7.975933,32.552888,23.825800,45.694335,3.543941,17.889439,21.913723,14.977140,4.0,2
1,5.0,C,-1.076073e+07,5.469166e+06,44.024061,-96.665285,6.897670,11.518197,-1.000000,4.252650,-1.000000,9.396389,5.344151,812.480090,4.0,2
2,1.0,I,-1.251261e+07,4.094840e+06,34.490381,-112.402712,0.509994,8.385226,5.089364,9.677147,-1.000000,10.415246,5.679496,934.400050,4.0,2
3,10.0,K,-9.857755e+06,4.129311e+06,34.745220,-88.553720,1.289903,15.693502,3.403131,19.224199,-1.000000,8.517747,8.021414,509.132686,4.0,1
4,1.0,C,-9.267351e+06,5.493176e+06,44.178941,-83.250028,2.205648,26.709422,5.171142,17.753031,0.000000,8.982879,10.137020,48.826977,4.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73086,,Z,-1.010035e+07,5.222881e+06,42.411899,-90.732966,1.575536,17.648163,4.544047,21.537919,-1.000000,12.580429,9.647682,2810.000000,0.0,0
73087,,Z,-1.183249e+07,5.291041e+06,42.862335,-106.293070,3.312025,2.867939,-1.000000,10.280441,-1.000000,6.010181,3.745098,2810.000000,0.0,0
73088,,Z,-9.971313e+06,4.384699e+06,36.608666,-89.573830,17.807754,23.810359,8.253384,24.042775,-1.000000,18.432187,15.391077,2810.000000,0.0,0
73089,,Z,-7.944992e+06,5.135812e+06,41.831766,-71.371080,9.400549,11.049468,5.819224,19.608082,7.130619,21.502062,12.418334,2810.000000,0.0,0


In [13]:
# Remove the columns the visualizations do not use. errors='ignore' keeps this
# cell safe to re-run: df is reassigned here, so on a second execution the
# columns are already gone and a strict drop would raise KeyError.
df = df.drop(
    columns=['child_num', 'lat_4326', 'lon_4326', 'distance_category', 'is_elderly'],
    errors='ignore',
)

### Processing dataframes for individual natural hazards

Since we want to plot each of the risk dashboards separately, we are now breaking this dataframe down into separate dataframes, one for each type of natural hazard.

#### Processing the dataframe for earthquake risk

We are taking the macro dataframe we made above and dropping all of the other risks except for earthquake before exporting it to a parquet file to be used in visualizations.

In [16]:
# Earthquake-only frame: remove every other hazard score (and the average),
# then give the remaining score column a readable name.
df_erqk = (
    df
    .drop(columns=['swnd_risks', 'hrcn_risks', 'trnd_risks', 'cfld_risks', 'rfld_risks', 'avg_risk'])
    .rename(columns={'erqk_risks': 'earthquake_risk'})
)
df_erqk

Unnamed: 0,age_code,lat_3857,lon_3857,earthquake_risk,distance_mi
0,C,-8.556472e+06,4.754685e+06,7.975933,14.977140
1,C,-1.076073e+07,5.469166e+06,6.897670,812.480090
2,I,-1.251261e+07,4.094840e+06,0.509994,934.400050
3,K,-9.857755e+06,4.129311e+06,1.289903,509.132686
4,C,-9.267351e+06,5.493176e+06,2.205648,48.826977
...,...,...,...,...,...
73086,Z,-1.010035e+07,5.222881e+06,1.575536,2810.000000
73087,Z,-1.183249e+07,5.291041e+06,3.312025,2810.000000
73088,Z,-9.971313e+06,4.384699e+06,17.807754,2810.000000
73089,Z,-7.944992e+06,5.135812e+06,9.400549,2810.000000


In [17]:
# Write the earthquake-only frame into the data folder for the visualizations.
erqk_path = DATA_DIR + '/earthquake_risk.parquet'
df_erqk.to_parquet(erqk_path)

#### Processing dataframe for strong wind risk
Following the same steps as above, this time dropping all of the other risks except for strong wind risk before exporting it to a parquet file to be used in visualizations.

In [18]:
# Strong-wind-only frame: remove every other hazard score (and the average),
# then give the remaining score column a readable name.
df_swnd = (
    df
    .drop(columns=['erqk_risks', 'hrcn_risks', 'trnd_risks', 'cfld_risks', 'rfld_risks', 'avg_risk'])
    .rename(columns={'swnd_risks': 'strong_wind_risk'})
)
df_swnd

Unnamed: 0,age_code,lat_3857,lon_3857,strong_wind_risk,distance_mi
0,C,-8.556472e+06,4.754685e+06,32.552888,14.977140
1,C,-1.076073e+07,5.469166e+06,11.518197,812.480090
2,I,-1.251261e+07,4.094840e+06,8.385226,934.400050
3,K,-9.857755e+06,4.129311e+06,15.693502,509.132686
4,C,-9.267351e+06,5.493176e+06,26.709422,48.826977
...,...,...,...,...,...
73086,Z,-1.010035e+07,5.222881e+06,17.648163,2810.000000
73087,Z,-1.183249e+07,5.291041e+06,2.867939,2810.000000
73088,Z,-9.971313e+06,4.384699e+06,23.810359,2810.000000
73089,Z,-7.944992e+06,5.135812e+06,11.049468,2810.000000


In [19]:
# Write the strong-wind-only frame into the data folder for the visualizations.
swnd_path = DATA_DIR + '/strong_wind_risk.parquet'
df_swnd.to_parquet(swnd_path)

#### Processing dataframe for hurricane risk
Following the same steps as above, this time dropping all of the other risks except for hurricane risk before exporting it to a parquet file to be used in visualizations.

In [20]:
# Hurricane-only frame: remove every other hazard score (and the average),
# then give the remaining score column a readable name.
df_hrcn = (
    df
    .drop(columns=['erqk_risks', 'swnd_risks', 'trnd_risks', 'cfld_risks', 'rfld_risks', 'avg_risk'])
    .rename(columns={'hrcn_risks': 'hurricane_risk'})
)
df_hrcn

Unnamed: 0,age_code,lat_3857,lon_3857,hurricane_risk,distance_mi
0,C,-8.556472e+06,4.754685e+06,23.825800,14.977140
1,C,-1.076073e+07,5.469166e+06,-1.000000,812.480090
2,I,-1.251261e+07,4.094840e+06,5.089364,934.400050
3,K,-9.857755e+06,4.129311e+06,3.403131,509.132686
4,C,-9.267351e+06,5.493176e+06,5.171142,48.826977
...,...,...,...,...,...
73086,Z,-1.010035e+07,5.222881e+06,4.544047,2810.000000
73087,Z,-1.183249e+07,5.291041e+06,-1.000000,2810.000000
73088,Z,-9.971313e+06,4.384699e+06,8.253384,2810.000000
73089,Z,-7.944992e+06,5.135812e+06,5.819224,2810.000000


In [21]:
# Write the hurricane-only frame into the data folder for the visualizations.
hrcn_path = DATA_DIR + '/hurricane_risk.parquet'
df_hrcn.to_parquet(hrcn_path)

#### Processing dataframe for tornado risk
Following the same steps as above, this time dropping all of the other risks except for tornado risk before exporting it to a parquet file to be used in visualizations.

In [22]:
# Tornado-only frame: remove every other hazard score (and the average),
# then give the remaining score column a readable name.
df_trnd = (
    df
    .drop(columns=['erqk_risks', 'swnd_risks', 'hrcn_risks', 'cfld_risks', 'rfld_risks', 'avg_risk'])
    .rename(columns={'trnd_risks': 'tornado_risk'})
)
df_trnd

Unnamed: 0,age_code,lat_3857,lon_3857,tornado_risk,distance_mi
0,C,-8.556472e+06,4.754685e+06,45.694335,14.977140
1,C,-1.076073e+07,5.469166e+06,4.252650,812.480090
2,I,-1.251261e+07,4.094840e+06,9.677147,934.400050
3,K,-9.857755e+06,4.129311e+06,19.224199,509.132686
4,C,-9.267351e+06,5.493176e+06,17.753031,48.826977
...,...,...,...,...,...
73086,Z,-1.010035e+07,5.222881e+06,21.537919,2810.000000
73087,Z,-1.183249e+07,5.291041e+06,10.280441,2810.000000
73088,Z,-9.971313e+06,4.384699e+06,24.042775,2810.000000
73089,Z,-7.944992e+06,5.135812e+06,19.608082,2810.000000


In [23]:
# Write the tornado-only frame into the data folder for the visualizations.
trnd_path = DATA_DIR + '/tornado_risk.parquet'
df_trnd.to_parquet(trnd_path)

#### Processing dataframe for coastal flood risk
Following the same steps as above, this time dropping all of the other risks except for coastal flood risk before exporting it to a parquet file to be used in visualizations.

In [24]:
# Coastal-flood-only frame: remove every other hazard score (and the average),
# then give the remaining score column a readable name.
df_cfld = (
    df
    .drop(columns=['erqk_risks', 'swnd_risks', 'trnd_risks', 'hrcn_risks', 'rfld_risks', 'avg_risk'])
    .rename(columns={'cfld_risks': 'coastal_flood_risk'})
)
df_cfld

Unnamed: 0,age_code,lat_3857,lon_3857,coastal_flood_risk,distance_mi
0,C,-8.556472e+06,4.754685e+06,3.543941,14.977140
1,C,-1.076073e+07,5.469166e+06,-1.000000,812.480090
2,I,-1.251261e+07,4.094840e+06,-1.000000,934.400050
3,K,-9.857755e+06,4.129311e+06,-1.000000,509.132686
4,C,-9.267351e+06,5.493176e+06,0.000000,48.826977
...,...,...,...,...,...
73086,Z,-1.010035e+07,5.222881e+06,-1.000000,2810.000000
73087,Z,-1.183249e+07,5.291041e+06,-1.000000,2810.000000
73088,Z,-9.971313e+06,4.384699e+06,-1.000000,2810.000000
73089,Z,-7.944992e+06,5.135812e+06,7.130619,2810.000000


In [25]:
# Write the coastal-flood-only frame into the data folder for the visualizations.
cfld_path = DATA_DIR + '/coastal_flood_risk.parquet'
df_cfld.to_parquet(cfld_path)

#### Processing dataframe for riverine flood risk
Following the same steps as above, this time dropping all of the other risks except for riverine flood risk before exporting it to a parquet file to be used in visualizations.

In [26]:
# Riverine-flood-only frame: remove every other hazard score (and the average),
# then give the remaining score column a readable name.
df_rfld = (
    df
    .drop(columns=['erqk_risks', 'swnd_risks', 'trnd_risks', 'hrcn_risks', 'cfld_risks', 'avg_risk'])
    .rename(columns={'rfld_risks': 'riverine_flood_risk'})
)
df_rfld

Unnamed: 0,age_code,lat_3857,lon_3857,riverine_flood_risk,distance_mi
0,C,-8.556472e+06,4.754685e+06,17.889439,14.977140
1,C,-1.076073e+07,5.469166e+06,9.396389,812.480090
2,I,-1.251261e+07,4.094840e+06,10.415246,934.400050
3,K,-9.857755e+06,4.129311e+06,8.517747,509.132686
4,C,-9.267351e+06,5.493176e+06,8.982879,48.826977
...,...,...,...,...,...
73086,Z,-1.010035e+07,5.222881e+06,12.580429,2810.000000
73087,Z,-1.183249e+07,5.291041e+06,6.010181,2810.000000
73088,Z,-9.971313e+06,4.384699e+06,18.432187,2810.000000
73089,Z,-7.944992e+06,5.135812e+06,21.502062,2810.000000


In [27]:
# Write the riverine-flood-only frame into the data folder for the visualizations.
rfld_path = DATA_DIR + '/riverine_flood_risk.parquet'
df_rfld.to_parquet(rfld_path)

#### Processing dataframe for average risk
Following the same steps as above, this time dropping all of the other risks except for average risk before exporting it to a parquet file to be used in visualizations.

In [28]:
# Average-risk frame: remove all six per-hazard scores, then give the
# remaining average column a readable name.
df_avg = (
    df
    .drop(columns=['erqk_risks', 'swnd_risks', 'trnd_risks', 'hrcn_risks', 'cfld_risks', 'rfld_risks'])
    .rename(columns={'avg_risk': 'average_risk'})
)
df_avg

Unnamed: 0,age_code,lat_3857,lon_3857,average_risk,distance_mi
0,C,-8.556472e+06,4.754685e+06,21.913723,14.977140
1,C,-1.076073e+07,5.469166e+06,5.344151,812.480090
2,I,-1.251261e+07,4.094840e+06,5.679496,934.400050
3,K,-9.857755e+06,4.129311e+06,8.021414,509.132686
4,C,-9.267351e+06,5.493176e+06,10.137020,48.826977
...,...,...,...,...,...
73086,Z,-1.010035e+07,5.222881e+06,9.647682,2810.000000
73087,Z,-1.183249e+07,5.291041e+06,3.745098,2810.000000
73088,Z,-9.971313e+06,4.384699e+06,15.391077,2810.000000
73089,Z,-7.944992e+06,5.135812e+06,12.418334,2810.000000


In [29]:
# Write the average-risk frame into the data folder for the visualizations.
avg_path = DATA_DIR + '/avg_risk.parquet'
df_avg.to_parquet(avg_path)