# Filtering Through Data, Converting Coordinate Systems, Using ```.sjoin()```
### Processing AST data for further use

### Import statements

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import geopandas as gpd
import os

### Reading AST data

In [3]:
DATA_DIR = os.getcwd()
DATA_DIR = DATA_DIR.replace('processing', 'data')
DATA_DIR

'/hpc/home/at341/ondemand/codeplus-celine-dcc-package/data'

In [4]:
df_tanks = gpd.read_file(DATA_DIR + '/ast_files/ast_synthetic.shp')
df_tanks.head(n=3)

Unnamed: 0,tile_name,minx_polyg,miny_polyg,maxx_polyg,maxy_polyg,nw_corner_,nw_corne_1,se_corner_,se_corne_1,object_cla,diameter (,merged_bbo,bbox_withi,Category1,Category2,Category3,Category4,Category5,state,geometry
0,m_4308962_se_16_060_20181004,2928,3816,2975,3862,43.044916,-89.296309,43.044675,-89.295954,closed_roof_tank,27.6,1,0,0.0,0.0,0.0,0.0,0.0,Wisconsin,"POLYGON ((-89.29631 43.04492, -89.29595 43.044..."
1,m_4409534_sw_15_060_20190808,8127,267,8162,298,44.440611,-95.820717,44.44045,-95.820446,closed_roof_tank,18.6,0,0,0.0,0.0,0.0,0.0,0.0,Minnesota,"POLYGON ((-95.82072 44.44061, -95.82045 44.440..."
2,m_3411762_sw_11_060_20180822,5945,3665,6016,3733,34.044154,-117.338215,34.043787,-117.337752,closed_roof_tank,40.8,1,0,0.0,0.0,0.0,0.0,0.0,California,"POLYGON ((-117.33821 34.04415, -117.33775 34.0..."


### Filtering through the data

This original dataset provided to us by our research has columns we will not use for the purposes of our visualizations. To minimize memory consumption and maximize runtime efficiency, we only keep the columns necessary for our visualizations.

In [6]:
df_tanks = df_tanks[['nw_corner_', 'nw_corne_1', 'se_corner_', 'se_corne_1', 'object_cla', 'diameter (', 'state', 'geometry']]
df_tanks.head(n=3)

Unnamed: 0,nw_corner_,nw_corne_1,se_corner_,se_corne_1,object_cla,diameter (,state,geometry
0,43.044916,-89.296309,43.044675,-89.295954,closed_roof_tank,27.6,Wisconsin,"POLYGON ((-89.29631 43.04492, -89.29595 43.044..."
1,44.440611,-95.820717,44.44045,-95.820446,closed_roof_tank,18.6,Minnesota,"POLYGON ((-95.82072 44.44061, -95.82045 44.440..."
2,34.044154,-117.338215,34.043787,-117.337752,closed_roof_tank,40.8,California,"POLYGON ((-117.33821 34.04415, -117.33775 34.0..."


### Computing average latitude and longitude coordinates for each tank
The orignal tank locations came in polygon geometries; however, since we are plotting the tanks across the US, plotting all ~98,000 of them as Polygon geometries through GeoViews is a time-consuming and unfeasible process. Thus, we use the four corners of the tank geometries, ```nw_corner_```, ```nw_corne_1```, ```se_corner_``` and ```se_corne_1``` to calculate the center latitude and longitude coordinates for each tank. Like this, we can create a Point geometry for each tank to replace the Polygon geometry and plot all points through GeoViews without running into time issues.

In [7]:
df_tanks['avg_lat'] = (df_tanks['nw_corner_'] + df_tanks['se_corner_'])/2
df_tanks['avg_long'] = (df_tanks['nw_corne_1'] + df_tanks['se_corne_1'])/2
df_tanks.head(n=3)

Unnamed: 0,nw_corner_,nw_corne_1,se_corner_,se_corne_1,object_cla,diameter (,state,geometry,avg_lat,avg_long
0,43.044916,-89.296309,43.044675,-89.295954,closed_roof_tank,27.6,Wisconsin,"POLYGON ((-89.29631 43.04492, -89.29595 43.044...",43.044796,-89.296132
1,44.440611,-95.820717,44.44045,-95.820446,closed_roof_tank,18.6,Minnesota,"POLYGON ((-95.82072 44.44061, -95.82045 44.440...",44.44053,-95.820582
2,34.044154,-117.338215,34.043787,-117.337752,closed_roof_tank,40.8,California,"POLYGON ((-117.33821 34.04415, -117.33775 34.0...",34.043971,-117.337983


We then filter again for only relevant columns. We also rename each column name so that they are standardized moving forward. The average latitude and longitude are named ```lat_t_4326``` and ```lon_t_4326```, respectively, to indicate that they are the latitude and longitude coordinates for a tank, in EPSG 4326 projection. This will be important moving forward, when we convert coordinate systems for our visualizations.

In [8]:
df_tanks = df_tanks[['state', 'object_cla', 'diameter (', 'avg_lat', 'avg_long', 'geometry']]

In [9]:
df_tanks.rename(columns = {'avg_lat':'lat_t_4326'}, inplace = True)
df_tanks.rename(columns = {'avg_long':'lon_t_4326'}, inplace = True)
df_tanks.rename(columns = {'object_cla':'tank_type'}, inplace = True)
df_tanks.rename(columns = {'diameter (':'diameter'}, inplace = True)
df_tanks.head()

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,geometry
0,Wisconsin,closed_roof_tank,27.6,43.044796,-89.296132,"POLYGON ((-89.29631 43.04492, -89.29595 43.044..."
1,Minnesota,closed_roof_tank,18.6,44.44053,-95.820582,"POLYGON ((-95.82072 44.44061, -95.82045 44.440..."
2,California,closed_roof_tank,40.8,34.043971,-117.337983,"POLYGON ((-117.33821 34.04415, -117.33775 34.0..."
3,Nebraska,closed_roof_tank,14.4,40.987509,-100.815133,"POLYGON ((-100.81523 40.98757, -100.81503 40.9..."
4,Colorado,narrow_closed_roof_tank,3.0,39.876695,-104.883375,"POLYGON ((-104.88339 39.87671, -104.88336 39.8..."


### Using pyproj and PROJ's transformer to convert from EPSG 4326 to EPSG 3857
A lot of our visualizations need coordinates in EPSG 3857, however these coordinates are in EPSG 4326. Therefore, we use the pyproj interface, which allows us to use the PROJ coordinate transformation software to transform our EPSG 4326 coordinates to EPSG 3857. This creates two new columns in our original dataset with the transformed coordinates.

In [10]:
from pyproj import Proj, Transformer

transform_4326_to_3857 = Transformer.from_crs('epsg:4326', 'epsg:3857')
df_tanks['lat_t_3857'], df_tanks['lon_t_3857'] = transform_4326_to_3857.transform(
                                                df_tanks['lat_t_4326'], df_tanks['lon_t_4326'])
df_tanks.head()

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,geometry,lat_t_3857,lon_t_3857
0,Wisconsin,closed_roof_tank,27.6,43.044796,-89.296132,"POLYGON ((-89.29631 43.04492, -89.29595 43.044...",-9940400.0,5318793.0
1,Minnesota,closed_roof_tank,18.6,44.44053,-95.820582,"POLYGON ((-95.82072 44.44061, -95.82045 44.440...",-10666700.0,5533870.0
2,California,closed_roof_tank,40.8,34.043971,-117.337983,"POLYGON ((-117.33821 34.04415, -117.33775 34.0...",-13062000.0,4034708.0
3,Nebraska,closed_roof_tank,14.4,40.987509,-100.815133,"POLYGON ((-100.81523 40.98757, -100.81503 40.9...",-11222690.0,5010499.0
4,Colorado,narrow_closed_roof_tank,3.0,39.876695,-104.883375,"POLYGON ((-104.88339 39.87671, -104.88336 39.8...",-11675560.0,4848040.0


### Using ```.sjoin()``` to classify tanks by county

#### Converting ```df_tanks``` to a GeoDataFrame
For some of our further processing of data, we need to classify each tank by county. To do so, we will use GeoPandas' ```.sjoin()``` method to identify which county each tank belongs to. Since the ```.sjoin()``` method takes in two GeoDataFrames, we must convert ```df_tanks``` to a GeoDataFrame by using the ```lat_t_4326``` and ```lon_t_4326``` columns to create Point geometries.

To do so, we use GeoPandas' ```.GeoDataFrame``` method. We first pass in ```df_tanks``` (the dataframe we will convert to a GeoDataFrame), then specify which columns to use for the ```POINT``` geometry. In this case, we use ```lon_t_4326``` and ```lat_t_4326```.

In [11]:
df_tanks = gpd.GeoDataFrame(
    df_tanks, geometry=gpd.points_from_xy(df_tanks.lon_t_4326, df_tanks.lat_t_4326))
df_tanks.head()

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,geometry,lat_t_3857,lon_t_3857
0,Wisconsin,closed_roof_tank,27.6,43.044796,-89.296132,POINT (-89.29613 43.04480),-9940400.0,5318793.0
1,Minnesota,closed_roof_tank,18.6,44.44053,-95.820582,POINT (-95.82058 44.44053),-10666700.0,5533870.0
2,California,closed_roof_tank,40.8,34.043971,-117.337983,POINT (-117.33798 34.04397),-13062000.0,4034708.0
3,Nebraska,closed_roof_tank,14.4,40.987509,-100.815133,POINT (-100.81513 40.98751),-11222690.0,5010499.0
4,Colorado,narrow_closed_roof_tank,3.0,39.876695,-104.883375,POINT (-104.88337 39.87669),-11675560.0,4848040.0


#### Reading in county shapefile
To find which tanks are in each county, we use GeoPandas' ```.sjoin()``` method. Using this method, we will perform a spatial join between each county's geometry and the dataframe including Point geometries for each tank in the US. For this, we need a dataframe with geometries for all counties in the US- which we took from the United States Census Bureau's Cartographic Boundary Files (available [here](https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html)). Then, we filter to exclude counties from Alaska, Hawaii, Puerto Rico, Virgin Islands, American Samoa, Guam, Northern Marian Islands, as there are no tanks in those regions in the AST dataset. We also drop unnecessary columns and rename the column names in a standardized way.

In [12]:
DATA_DIR

'/hpc/home/at341/ondemand/codeplus-celine-dcc-package/data'

In [14]:
df_counties = gpd.read_file(DATA_DIR + '/county_shapefiles/counties.shp')
df_counties = df_counties[((df_counties['STATEFP'] != '02') & (df_counties['STATEFP'] != '15') &
                          (df_counties['STATEFP'] != '72') & (df_counties['STATEFP'] != '78') &
                          (df_counties['STATEFP'] != '60') & (df_counties['STATEFP'] != '66') &
                          (df_counties['STATEFP'] != '69'))]
df_counties = df_counties[['NAME', 'GEOID', 'geometry']]
df_counties.rename(columns = {'NAME': 'county', 'GEOID': 'geoid'}, inplace = True)
df_counties.head()

Unnamed: 0,county,geoid,geometry
0,Riley,20161,"POLYGON ((-96.96095 39.28670, -96.96106 39.288..."
1,Ringgold,19159,"POLYGON ((-94.47167 40.81255, -94.47166 40.819..."
2,Carbon,30009,"POLYGON ((-109.79867 45.16734, -109.68779 45.1..."
3,Bear Lake,16007,"POLYGON ((-111.63452 42.57034, -111.63010 42.5..."
4,Buffalo,55011,"POLYGON ((-92.08384 44.41200, -92.08310 44.414..."


#### Iterating through each county in and finding the tanks in that county
Next, since we must find which tanks are in every county in the US, we must iterate through every county in ```df_counties```. For each county, we perform a spatial join between that county GeoDataFrame and the ```df_tanks``` GeoDataFrame. The ```.sjoin()``` function returns a new GeoDataFrame that only includes the geometries that are the intersections of the two original GeoDataFrames. In this case, passing in a GeoDataFrame with the geometry for Harris County and a GeoDataFrame with all the tanks to the ```.sjoin()``` method returns a new GeoDataFrame with all the tanks in Harris County, as it returns all the Point geometries that intersects the Harris County Polygon geometry. This new GeoDataFrame keeps the index for each tank as it was in the original ```df_tanks``` dataframe. This is key- it allows us to take a list of these indices, and then loop over all of them to change the value of the ```county``` column in ```df_tanks``` at each of those indices to ```Harris County```.

However, we need to do this for all counties in the US, so we use for loop. This loop iterates every row of ```df_counties``` finds the intersection between that county and the tanks GeoDataFrame (```df_tanks```), creates a list of the indices of those tanks, mutates the ```county``` column in ```df_tanks``` to label each of those tanks with that county name. 

We intentially labelled each tank with the ```geo_id``` column, because in future processing, we will merge this dataframe with another dataframe based on county FIPS numbers. 

This takes around three minutes, since we are looping through 3,000 counties.

In [15]:
%%time

df_tanks['county'] = ''

for i in range(0, len(df_counties)):
    county = df_counties.iloc[i] ## finding county
    frame = county.to_frame() ## making county to a dataframe, as .iloc[i] returns a series
    row = gpd.GeoDataFrame(frame.T) ## transforming pandas df to geodataframe
    df_intersect = gpd.sjoin(df_tanks, row, how='inner', predicate='intersects') ## finding tanks in that county
    idx = list(df_intersect.index.values) ## finding indices of those tanks
    for num in idx: ## looping over those indices 
        df_tanks['county'].iloc[idx] = row.iloc[0]['geoid']

df_tanks.head()

CPU times: user 37.7 s, sys: 11.9 ms, total: 37.7 s
Wall time: 38 s


Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,geometry,lat_t_3857,lon_t_3857,county
0,Wisconsin,closed_roof_tank,27.6,43.044796,-89.296132,POINT (-89.29613 43.04480),-9940400.0,5318793.0,55025
1,Minnesota,closed_roof_tank,18.6,44.44053,-95.820582,POINT (-95.82058 44.44053),-10666700.0,5533870.0,27083
2,California,closed_roof_tank,40.8,34.043971,-117.337983,POINT (-117.33798 34.04397),-13062000.0,4034708.0,6071
3,Nebraska,closed_roof_tank,14.4,40.987509,-100.815133,POINT (-100.81513 40.98751),-11222690.0,5010499.0,31111
4,Colorado,narrow_closed_roof_tank,3.0,39.876695,-104.883375,POINT (-104.88337 39.87669),-11675560.0,4848040.0,8001


### Exporting dataframe to shapefile

Now, since this code has been processed, we are exporting the new tank data into a shapefile that will be used later on. The input to the ```to_file``` function is the path where you want the dataframe to be exported to and the name of the file it will be saved in.

In [16]:
DATA_DIR

'/hpc/home/at341/ondemand/codeplus-celine-dcc-package/data'

In [17]:
df_tanks.to_file(DATA_DIR + '/ast_master_TEST.shp')

In [18]:
df = gpd.read_file(DATA_DIR + '/ast_master_TEST.shp')
df

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,county,geometry
0,Wisconsin,closed_roof_tank,27.6,43.044796,-89.296132,-9.940400e+06,5.318793e+06,55025,POINT (-89.29613 43.04480)
1,Minnesota,closed_roof_tank,18.6,44.440530,-95.820582,-1.066670e+07,5.533870e+06,27083,POINT (-95.82058 44.44053)
2,California,closed_roof_tank,40.8,34.043971,-117.337983,-1.306200e+07,4.034708e+06,06071,POINT (-117.33798 34.04397)
3,Nebraska,closed_roof_tank,14.4,40.987509,-100.815133,-1.122269e+07,5.010499e+06,31111,POINT (-100.81513 40.98751)
4,Colorado,narrow_closed_roof_tank,3.0,39.876695,-104.883375,-1.167556e+07,4.848040e+06,08001,POINT (-104.88337 39.87669)
...,...,...,...,...,...,...,...,...,...
977,Illinois,closed_roof_tank,42.6,39.819945,-90.565439,-1.008170e+07,4.839812e+06,17137,POINT (-90.56544 39.81994)
978,Virginia,closed_roof_tank,40.8,38.811118,-77.463824,-8.623233e+06,4.694652e+06,51059,POINT (-77.46382 38.81112)
979,Texas,closed_roof_tank,12.0,28.862296,-96.021863,-1.068910e+07,3.358131e+06,48321,POINT (-96.02186 28.86230)
980,Alabama,closed_roof_tank,31.8,33.459705,-86.876591,-9.671058e+06,3.956482e+06,01073,POINT (-86.87659 33.45971)
