# Processing AST data for further use, using ```.sjoin()```

### Import statements

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import geopandas as gpd

### Reading AST data

In [3]:
df_tanks = gpd.read_file('/hpc/group/codeplus22-vis/celine_data/source_files/ast_dataset/tile_level_annotations.shp')
df_tanks.head(n=3)

Unnamed: 0,tile_name,minx_polyg,miny_polyg,maxx_polyg,maxy_polyg,nw_corner_,nw_corne_1,se_corner_,se_corne_1,object_cla,diameter (,merged_bbo,bbox_withi,Category1,Category2,Category3,Category4,Category5,state,geometry
0,m_4007327_nw_18_060_20190809,974,314,1041,380,40.625753,-73.745466,40.625392,-73.744997,closed_roof_tank,39.6,1,0,0.0,0.0,0.0,0.0,0.0,New York,"POLYGON ((-73.74547 40.62575, -73.74500 40.625..."
1,m_4007327_nw_18_060_20190809,1091,479,1157,512,40.624853,-73.744652,40.624669,-73.744188,closed_roof_tank,19.8,0,0,0.0,0.0,0.0,0.0,0.0,New York,"POLYGON ((-73.74465 40.62485, -73.74419 40.624..."
2,m_4007327_nw_18_060_20190809,851,243,872,265,40.626147,-73.746331,40.626026,-73.746184,closed_roof_tank,12.6,0,0,0.0,0.0,0.0,0.0,0.0,New York,"POLYGON ((-73.74633 40.62615, -73.74618 40.626..."


### Filtering through the data

This original dataset provided to us by our research has columns we will not use for the purposes of our visualizations. To minimize memory consumption and maximize runtime efficiency, we only keep the columns necessary for our visualizations.

In [28]:
df_tanks = df_tanks[['nw_corner_', 'nw_corne_1', 'se_corner_', 'se_corne_1', 'object_cla', 'diameter (', 'state', 'geometry']]
df_tanks.head(n=3)

Unnamed: 0,nw_corner_,nw_corne_1,se_corner_,se_corne_1,object_cla,diameter (,state,geometry
0,40.625753,-73.745466,40.625392,-73.744997,closed_roof_tank,39.6,New York,"POLYGON ((-73.74547 40.62575, -73.74500 40.625..."
1,40.624853,-73.744652,40.624669,-73.744188,closed_roof_tank,19.8,New York,"POLYGON ((-73.74465 40.62485, -73.74419 40.624..."
2,40.626147,-73.746331,40.626026,-73.746184,closed_roof_tank,12.6,New York,"POLYGON ((-73.74633 40.62615, -73.74618 40.626..."


### Computing average latitude and longitude coordinates for each tank
The orignal tank locations came in polygon geometries; however, since we are plotting the tanks across the US, plotting all ~98,000 of them as Polygon geometries through GeoViews is a time-consuming and unfeasible process. Thus, we use the four corners of the tank geometries, ```nw_corner_```, ```nw_corne_1```, ```se_corner_``` and ```se_corne_1``` to calculate the center latitude and longitude coordinates for each tank. Like this, we can create a Point geometry for each tank to replace the Polygon geometry and plot all points through GeoViews without running into time issues.

In [29]:
df_tanks['avg_lat'] = (df_tanks['nw_corner_'] + df_tanks['se_corner_'])/2
df_tanks['avg_long'] = (df_tanks['nw_corne_1'] + df_tanks['se_corne_1'])/2
df_tanks.head(n=3)

Unnamed: 0,nw_corner_,nw_corne_1,se_corner_,se_corne_1,object_cla,diameter (,state,geometry,avg_lat,avg_long
0,40.625753,-73.745466,40.625392,-73.744997,closed_roof_tank,39.6,New York,"POLYGON ((-73.74547 40.62575, -73.74500 40.625...",40.625572,-73.745231
1,40.624853,-73.744652,40.624669,-73.744188,closed_roof_tank,19.8,New York,"POLYGON ((-73.74465 40.62485, -73.74419 40.624...",40.624761,-73.74442
2,40.626147,-73.746331,40.626026,-73.746184,closed_roof_tank,12.6,New York,"POLYGON ((-73.74633 40.62615, -73.74618 40.626...",40.626086,-73.746257


We then filter again for only relevant columns. We also rename each column name so that they are standardized moving forward. The average latitude and longitude are named ```lat_t_4326``` and ```lon_t_4326```, respectively, to indicate that they are the latitude and longitude coordinates for a tank, in EPSG 4326 projection. This will be important moving forward, when we convert coordinate systems for our visualizations.

In [30]:
df_tanks = df_tanks[['state', 'object_cla', 'diameter (', 'avg_lat', 'avg_long', 'geometry']]

In [31]:
df_tanks.rename(columns = {'avg_lat':'lat_t_4326'}, inplace = True)
df_tanks.rename(columns = {'avg_long':'lon_t_4326'}, inplace = True)
df_tanks.rename(columns = {'object_cla':'tank_type'}, inplace = True)
df_tanks.rename(columns = {'diameter (':'diameter'}, inplace = True)
df_tanks.head()

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,geometry
0,New York,closed_roof_tank,39.6,40.625572,-73.745231,"POLYGON ((-73.74547 40.62575, -73.74500 40.625..."
1,New York,closed_roof_tank,19.8,40.624761,-73.74442,"POLYGON ((-73.74465 40.62485, -73.74419 40.624..."
2,New York,closed_roof_tank,12.6,40.626086,-73.746257,"POLYGON ((-73.74633 40.62615, -73.74618 40.626..."
3,New York,closed_roof_tank,30.6,40.625786,-73.746203,"POLYGON ((-73.74639 40.62593, -73.74601 40.625..."
4,New York,closed_roof_tank,24.0,40.625781,-73.745813,"POLYGON ((-73.74595 40.62590, -73.74567 40.625..."


### Using pyproj and PROJ's transformer to convert from EPSG 4326 to EPSG 3857
A lot of our visualizations need coordinates in EPSG 3857, however these coordinates are in EPSG 4326. Therefore, we use the pyproj interface, which allows us to use the PROJ coordinate transformation software to transform our EPSG 4326 coordinates to EPSG 3857. This creates two new columns in our original dataset with the transformed coordinates.

In [32]:
from pyproj import Proj, Transformer

transform_4326_to_3857 = Transformer.from_crs('epsg:4326', 'epsg:3857')
df_tanks['lat_t_3857'], df_tanks['lon_t_3857'] = transform_4326_to_3857.transform(
                                                df_tanks['lat_t_4326'], df_tanks['lon_t_4326'])
df_tanks.head()

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,geometry,lat_t_3857,lon_t_3857
0,New York,closed_roof_tank,39.6,40.625572,-73.745231,"POLYGON ((-73.74547 40.62575, -73.74500 40.625...",-8209282.0,4957270.0
1,New York,closed_roof_tank,19.8,40.624761,-73.74442,"POLYGON ((-73.74465 40.62485, -73.74419 40.624...",-8209191.0,4957151.0
2,New York,closed_roof_tank,12.6,40.626086,-73.746257,"POLYGON ((-73.74633 40.62615, -73.74618 40.626...",-8209396.0,4957345.0
3,New York,closed_roof_tank,30.6,40.625786,-73.746203,"POLYGON ((-73.74639 40.62593, -73.74601 40.625...",-8209390.0,4957301.0
4,New York,closed_roof_tank,24.0,40.625781,-73.745813,"POLYGON ((-73.74595 40.62590, -73.74567 40.625...",-8209346.0,4957300.0


### Using ```.sjoin()``` to classify tanks by county

#### Converting ```df_tanks``` to a GeoDataFrame
For some of our further processing of data, we need to classify each tank by county. To do so, we will use GeoPandas' ```.sjoin()``` method to identify which county each tank belongs to. Since the ```.sjoin()``` method takes in two GeoDataFrames, we must convert ```df_tanks``` to a GeoDataFrame by using the ```lat_t_4326``` and ```lon_t_4326``` columns to create Point geometries.

To do so, we use GeoPandas' ```.GeoDataFrame``` method. We first pass in ```df_tanks``` (the dataframe we will convert to a GeoDataFrame), then specify which columns to use for the ```POINT``` geometry. In this case, we use ```lon_t_4326``` and ```lat_t_4326```.

In [33]:
df_tanks = gpd.GeoDataFrame(
    df_tanks, geometry=gpd.points_from_xy(df_tanks.lon_t_4326, df_tanks.lat_t_4326))
df_tanks.head()

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,geometry,lat_t_3857,lon_t_3857
0,New York,closed_roof_tank,39.6,40.625572,-73.745231,POINT (-73.74523 40.62557),-8209282.0,4957270.0
1,New York,closed_roof_tank,19.8,40.624761,-73.74442,POINT (-73.74442 40.62476),-8209191.0,4957151.0
2,New York,closed_roof_tank,12.6,40.626086,-73.746257,POINT (-73.74626 40.62609),-8209396.0,4957345.0
3,New York,closed_roof_tank,30.6,40.625786,-73.746203,POINT (-73.74620 40.62579),-8209390.0,4957301.0
4,New York,closed_roof_tank,24.0,40.625781,-73.745813,POINT (-73.74581 40.62578),-8209346.0,4957300.0


#### Reading in county shapefile
To find which tanks are in each county, we use GeoPandas' ```.sjoin()``` method. Using this method, we will perform a spatial join between each county's geometry and the dataframe including Point geometries for each tank in the US. For this, we need a dataframe with geometries for all counties in the US- which we took from the United States Census Bureau's Cartographic Boundary Files (available [here](https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html)). Then, we filter to exclude counties from Alaska, Hawaii, Puerto Rico, Virgin Islands, American Samoa, Guam, Northern Marian Islands, as there are no tanks in those regions in the AST dataset. We also drop unnecessary columns and rename the column names in a standardized way.

In [5]:
df_counties = gpd.read_file('/hpc/group/codeplus22-vis/celine_data/source_files/county_shapefiles/counties.shp')
df_counties = df_counties[((df_counties['STATEFP'] != '02') & (df_counties['STATEFP'] != '15') &
                          (df_counties['STATEFP'] != '72') & (df_counties['STATEFP'] != '78') &
                          (df_counties['STATEFP'] != '60') & (df_counties['STATEFP'] != '66') &
                          (df_counties['STATEFP'] != '69'))]
df_counties = df_counties[['NAME', 'GEOID', 'geometry']]
df_counties.rename(columns = {'NAME': 'county', 'GEOID': 'geoid'}, inplace = True)
df_counties.head()

Unnamed: 0,county,geoid,geometry
0,Riley,20161,"POLYGON ((-96.96095 39.28670, -96.96106 39.288..."
1,Ringgold,19159,"POLYGON ((-94.47167 40.81255, -94.47166 40.819..."
2,Carbon,30009,"POLYGON ((-109.79867 45.16734, -109.68779 45.1..."
3,Bear Lake,16007,"POLYGON ((-111.63452 42.57034, -111.63010 42.5..."
4,Buffalo,55011,"POLYGON ((-92.08384 44.41200, -92.08310 44.414..."


#### Iterating through each county in and finding the tanks in that county
Next, since we must find which tanks are in every county in the US, we must iterate through every county in ```df_counties```. For each county, we perform a spatial join between that county GeoDataFrame and the ```df_tanks``` GeoDataFrame. The ```.sjoin()``` function returns a new GeoDataFrame that only includes the geometries that are the intersections of the two original GeoDataFrames. In this case, passing in a GeoDataFrame with the geometry for Harris County and a GeoDataFrame with all the tanks to the ```.sjoin()``` method returns a new GeoDataFrame with all the tanks in Harris County, as it returns all the Point geometries that intersects the Harris County Polygon geometry. This new GeoDataFrame keeps the index for each tank as it was in the original ```df_tanks``` dataframe. This is key- it allows us to take a list of these indices, and then loop over all of them to change the value of the ```county``` column in ```df_tanks``` at each of those indices to ```Harris County```.

However, we need to do this for all counties in the US, so we use for loop. This loop iterates every row of ```df_counties``` finds the intersection between that county and the tanks GeoDataFrame (```df_tanks```), creates a list of the indices of those tanks, mutates the ```county``` column in ```df_tanks``` to label each of those tanks with that county name. 

We intentially labelled each tank with the ```geo_id``` column, because in future processing, we will merge this dataframe with another dataframe based on county FIPS numbers. 

This takes around three minutes, since we are looping through 3,000 counties.

In [None]:
%%time

df_tanks['county'] = ''

for i in range(0, len(df_counties)):
    county = df_counties.iloc[i] ## finding county
    frame = county.to_frame() ## making county to a dataframe, as .iloc[i] returns a series
    row = gpd.GeoDataFrame(frame.T) ## transforming pandas df to geodataframe
    df_intersect = gpd.sjoin(df_tanks, row, how='inner', predicate='intersects') ## finding tanks in that county
    idx = list(df_intersect.index.values) ## finding indices of those tanks
    for num in idx: ## looping over those indices 
        df_tanks['county'].iloc[idx] = row.iloc[0]['geoid']

df_tanks.head()

### Exporting dataframe to shapefile

Now, since this code has been processed, we are exporting the new tank data into a shapefile that will be used later on. The input to the ```to_file``` function is the path where you want the dataframe to be exported to and the name of the file it will be saved in.

In [24]:
df_tanks.to_file('/hpc/group/codeplus22-vis/celine_data/ast_master.shp')

In [7]:
df = gpd.read_file('/hpc/group/codeplus22-vis/celine_data/ast_master.shp')
df

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,lat_t_3857,lon_t_3857,county,geometry
0,New York,closed_roof_tank,39.6,40.625572,-73.745231,-8.209282e+06,4.957270e+06,36059,POINT (-73.74523 40.62557)
1,New York,closed_roof_tank,19.8,40.624761,-73.744420,-8.209191e+06,4.957151e+06,36059,POINT (-73.74442 40.62476)
2,New York,closed_roof_tank,12.6,40.626086,-73.746257,-8.209396e+06,4.957345e+06,36059,POINT (-73.74626 40.62609)
3,New York,closed_roof_tank,30.6,40.625786,-73.746203,-8.209390e+06,4.957301e+06,36059,POINT (-73.74620 40.62579)
4,New York,closed_roof_tank,24.0,40.625781,-73.745813,-8.209346e+06,4.957300e+06,36059,POINT (-73.74581 40.62578)
...,...,...,...,...,...,...,...,...,...
98164,Colorado,narrow_closed_roof_tank,5.4,39.777431,-104.920718,-1.167972e+07,4.833652e+06,08031,POINT (-104.92072 39.77743)
98165,Colorado,narrow_closed_roof_tank,4.8,39.777301,-104.920631,-1.167971e+07,4.833633e+06,08031,POINT (-104.92063 39.77730)
98166,Colorado,narrow_closed_roof_tank,3.6,39.777701,-104.920609,-1.167971e+07,4.833691e+06,08031,POINT (-104.92061 39.77770)
98167,Colorado,narrow_closed_roof_tank,4.8,39.776628,-104.920617,-1.167971e+07,4.833535e+06,08031,POINT (-104.92062 39.77663)
