# Ast Data Master File

### Importing packages and libraries

In [2]:
import pandas as pd
import geopandas as gpd

### Importing tank data

In [11]:
df_tanks = gpd.read_file('/hpc/group/codeplus22-vis/ast_dataset/tile_level_annotations.shp')

df_tanks.head()

Unnamed: 0,tile_name,minx_polyg,miny_polyg,maxx_polyg,maxy_polyg,nw_corner_,nw_corne_1,se_corner_,se_corne_1,object_cla,diameter (,merged_bbo,bbox_withi,Category1,Category2,Category3,Category4,Category5,state,geometry
0,m_4007327_nw_18_060_20190809,974,314,1041,380,40.625753,-73.745466,40.625392,-73.744997,closed_roof_tank,39.6,1,0,0.0,0.0,0.0,0.0,0.0,New York,"POLYGON ((-73.74547 40.62575, -73.74500 40.625..."
1,m_4007327_nw_18_060_20190809,1091,479,1157,512,40.624853,-73.744652,40.624669,-73.744188,closed_roof_tank,19.8,0,0,0.0,0.0,0.0,0.0,0.0,New York,"POLYGON ((-73.74465 40.62485, -73.74419 40.624..."
2,m_4007327_nw_18_060_20190809,851,243,872,265,40.626147,-73.746331,40.626026,-73.746184,closed_roof_tank,12.6,0,0,0.0,0.0,0.0,0.0,0.0,New York,"POLYGON ((-73.74633 40.62615, -73.74618 40.626..."
3,m_4007327_nw_18_060_20190809,843,284,897,335,40.625926,-73.746392,40.625646,-73.746014,closed_roof_tank,30.6,0,0,0.0,0.0,0.0,0.0,0.0,New York,"POLYGON ((-73.74639 40.62593, -73.74601 40.625..."
4,m_4007327_nw_18_060_20190809,905,288,945,331,40.625899,-73.745952,40.625664,-73.745673,closed_roof_tank,24.0,0,0,0.0,0.0,0.0,0.0,0.0,New York,"POLYGON ((-73.74595 40.62590, -73.74567 40.625..."


### Filtering out tank data

Here, we are filtering for certain columns in the ast dataset. The original dataframe has columns that we will not use for the purposes of our visualizations. To minimize memory consumption and maximize runtime, we are only keeping the columns necessary for our visualizations

In [12]:
df_tanks = df_tanks[['nw_corner_', 'nw_corne_1', 'se_corner_', 'se_corne_1', 'object_cla', 'diameter (', 'state', 'geometry']]
df_tanks

Unnamed: 0,nw_corner_,nw_corne_1,se_corner_,se_corne_1,object_cla,diameter (,state,geometry
0,40.625753,-73.745466,40.625392,-73.744997,closed_roof_tank,39.6,New York,"POLYGON ((-73.74547 40.62575, -73.74500 40.625..."
1,40.624853,-73.744652,40.624669,-73.744188,closed_roof_tank,19.8,New York,"POLYGON ((-73.74465 40.62485, -73.74419 40.624..."
2,40.626147,-73.746331,40.626026,-73.746184,closed_roof_tank,12.6,New York,"POLYGON ((-73.74633 40.62615, -73.74618 40.626..."
3,40.625926,-73.746392,40.625646,-73.746014,closed_roof_tank,30.6,New York,"POLYGON ((-73.74639 40.62593, -73.74601 40.625..."
4,40.625899,-73.745952,40.625664,-73.745673,closed_roof_tank,24.0,New York,"POLYGON ((-73.74595 40.62590, -73.74567 40.625..."
...,...,...,...,...,...,...,...,...
98164,39.777458,-104.920750,39.777404,-104.920687,narrow_closed_roof_tank,5.4,Colorado,"POLYGON ((-104.92075 39.77746, -104.92069 39.7..."
98165,39.777323,-104.920659,39.777279,-104.920603,narrow_closed_roof_tank,4.8,Colorado,"POLYGON ((-104.92066 39.77732, -104.92060 39.7..."
98166,39.777717,-104.920637,39.777685,-104.920581,narrow_closed_roof_tank,3.6,Colorado,"POLYGON ((-104.92064 39.77772, -104.92058 39.7..."
98167,39.776652,-104.920645,39.776604,-104.920589,narrow_closed_roof_tank,4.8,Colorado,"POLYGON ((-104.92065 39.77665, -104.92059 39.7..."


### Finding average latitude and longitude coordinates of tanks

We are calculating the average longitude and latitudes of the positions of our tanks. The orignal tank locations came in polygon geometries; however, since we are plotting the tanks across the US, plotting all ~98,000 of them in shapes will be too slow. Thus, we are finding the average longitude and latitudes using the 4 corners of the tank geometries so that we can plot the tanks in points.

In [13]:
df_tanks['avg_lat'] = (df_tanks['nw_corner_'] + df_tanks['se_corner_'])/2
df_tanks['avg_long'] = (df_tanks['nw_corne_1'] + df_tanks['se_corne_1'])/2

df_tanks

Unnamed: 0,nw_corner_,nw_corne_1,se_corner_,se_corne_1,object_cla,diameter (,state,geometry,avg_lat,avg_long
0,40.625753,-73.745466,40.625392,-73.744997,closed_roof_tank,39.6,New York,"POLYGON ((-73.74547 40.62575, -73.74500 40.625...",40.625572,-73.745231
1,40.624853,-73.744652,40.624669,-73.744188,closed_roof_tank,19.8,New York,"POLYGON ((-73.74465 40.62485, -73.74419 40.624...",40.624761,-73.744420
2,40.626147,-73.746331,40.626026,-73.746184,closed_roof_tank,12.6,New York,"POLYGON ((-73.74633 40.62615, -73.74618 40.626...",40.626086,-73.746257
3,40.625926,-73.746392,40.625646,-73.746014,closed_roof_tank,30.6,New York,"POLYGON ((-73.74639 40.62593, -73.74601 40.625...",40.625786,-73.746203
4,40.625899,-73.745952,40.625664,-73.745673,closed_roof_tank,24.0,New York,"POLYGON ((-73.74595 40.62590, -73.74567 40.625...",40.625781,-73.745813
...,...,...,...,...,...,...,...,...,...,...
98164,39.777458,-104.920750,39.777404,-104.920687,narrow_closed_roof_tank,5.4,Colorado,"POLYGON ((-104.92075 39.77746, -104.92069 39.7...",39.777431,-104.920718
98165,39.777323,-104.920659,39.777279,-104.920603,narrow_closed_roof_tank,4.8,Colorado,"POLYGON ((-104.92066 39.77732, -104.92060 39.7...",39.777301,-104.920631
98166,39.777717,-104.920637,39.777685,-104.920581,narrow_closed_roof_tank,3.6,Colorado,"POLYGON ((-104.92064 39.77772, -104.92058 39.7...",39.777701,-104.920609
98167,39.776652,-104.920645,39.776604,-104.920589,narrow_closed_roof_tank,4.8,Colorado,"POLYGON ((-104.92065 39.77665, -104.92059 39.7...",39.776628,-104.920617


### More tank data processing

We are now only keeping the new average longitude and latitude coordinates along with the other neccessary columns in the dataframe.

Below, we are also renaming the variables so that they are standardized moving forward. The average latitude and longitude are named with 4326 at the end because they currently are in the 4326 projection. 

In [1]:
df_tanks = df_tanks[['state', 'object_cla', 'diameter (', 'avg_lat', 'avg_long', 'geometry']]
df_tanks

NameError: name 'df_tanks' is not defined

In [15]:
df_tanks.rename(columns = {'avg_lat':'lat_t_4326'}, inplace = True)
df_tanks.rename(columns = {'avg_long':'lon_t_4326'}, inplace = True)
df_tanks.rename(columns = {'object_cla':'tank_type'}, inplace = True)
df_tanks.rename(columns = {'diameter (':'diameter'}, inplace = True)

df_tanks

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tanks.rename(columns = {'avg_lat':'lat_t_4326'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tanks.rename(columns = {'avg_long':'lon_t_4326'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tanks.rename(columns = {'object_cla':'tank_type'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,geometry
0,New York,closed_roof_tank,39.6,40.625572,-73.745231,"POLYGON ((-73.74547 40.62575, -73.74500 40.625..."
1,New York,closed_roof_tank,19.8,40.624761,-73.744420,"POLYGON ((-73.74465 40.62485, -73.74419 40.624..."
2,New York,closed_roof_tank,12.6,40.626086,-73.746257,"POLYGON ((-73.74633 40.62615, -73.74618 40.626..."
3,New York,closed_roof_tank,30.6,40.625786,-73.746203,"POLYGON ((-73.74639 40.62593, -73.74601 40.625..."
4,New York,closed_roof_tank,24.0,40.625781,-73.745813,"POLYGON ((-73.74595 40.62590, -73.74567 40.625..."
...,...,...,...,...,...,...
98164,Colorado,narrow_closed_roof_tank,5.4,39.777431,-104.920718,"POLYGON ((-104.92075 39.77746, -104.92069 39.7..."
98165,Colorado,narrow_closed_roof_tank,4.8,39.777301,-104.920631,"POLYGON ((-104.92066 39.77732, -104.92060 39.7..."
98166,Colorado,narrow_closed_roof_tank,3.6,39.777701,-104.920609,"POLYGON ((-104.92064 39.77772, -104.92058 39.7..."
98167,Colorado,narrow_closed_roof_tank,4.8,39.776628,-104.920617,"POLYGON ((-104.92065 39.77665, -104.92059 39.7..."


### Adding the transformed latitude and longitude coordinates

Next, in order for the coordinates to be plotted on visualizations, they must be transformed from 4326 projections into the 3857 coordinate projection. 

In [16]:
from pyproj import Proj, Transformer

# Apply transformation
transform_4326_to_3857 = Transformer.from_crs('epsg:4326', 'epsg:3857')
df_tanks['lat_t_3857'], df_tanks['lon_t_3857'] = transform_4326_to_3857.transform(
                                                df_tanks['lat_t_4326'], df_tanks['lon_t_4326'])
df_tanks

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


Unnamed: 0,state,tank_type,diameter,lat_t_4326,lon_t_4326,geometry,lat_t_3857,lon_t_3857
0,New York,closed_roof_tank,39.6,40.625572,-73.745231,"POLYGON ((-73.74547 40.62575, -73.74500 40.625...",-8.209282e+06,4.957270e+06
1,New York,closed_roof_tank,19.8,40.624761,-73.744420,"POLYGON ((-73.74465 40.62485, -73.74419 40.624...",-8.209191e+06,4.957151e+06
2,New York,closed_roof_tank,12.6,40.626086,-73.746257,"POLYGON ((-73.74633 40.62615, -73.74618 40.626...",-8.209396e+06,4.957345e+06
3,New York,closed_roof_tank,30.6,40.625786,-73.746203,"POLYGON ((-73.74639 40.62593, -73.74601 40.625...",-8.209390e+06,4.957301e+06
4,New York,closed_roof_tank,24.0,40.625781,-73.745813,"POLYGON ((-73.74595 40.62590, -73.74567 40.625...",-8.209346e+06,4.957300e+06
...,...,...,...,...,...,...,...,...
98164,Colorado,narrow_closed_roof_tank,5.4,39.777431,-104.920718,"POLYGON ((-104.92075 39.77746, -104.92069 39.7...",-1.167972e+07,4.833652e+06
98165,Colorado,narrow_closed_roof_tank,4.8,39.777301,-104.920631,"POLYGON ((-104.92066 39.77732, -104.92060 39.7...",-1.167971e+07,4.833633e+06
98166,Colorado,narrow_closed_roof_tank,3.6,39.777701,-104.920609,"POLYGON ((-104.92064 39.77772, -104.92058 39.7...",-1.167971e+07,4.833691e+06
98167,Colorado,narrow_closed_roof_tank,4.8,39.776628,-104.920617,"POLYGON ((-104.92065 39.77665, -104.92059 39.7...",-1.167971e+07,4.833535e+06


### Exporting dataframe to shapefile

Now, since this code has been processed, we are exporting the new tank data into a shapefile that will be used later on.

In [18]:
df_tanks.to_file('/hpc/group/codeplus22-vis/infousa_copy/ast_master.shp')

  pd.Int64Index,
