# TRI Modeling Validation

gl
<br>
09.29.20

- Only three sensors are in reasonable proximity for 1990-99, and all of them measure lead
- Want to check 2000-2018 for potential matches

In [329]:
#Libraries
import pandas as pd
import geopandas as gpd 
import contextily as ctx
import matplotlib.pyplot as plt
from math import radians, cos, sin, asin, sqrt

import glob
import click
import glob
import pandas as pd
import numpy as np
import xarray as xr 
from shapely.geometry import Point
import os 


In [330]:
#Functions
def haversine(lon1, lat1, lon2, lat2, radius=6371):

    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)

    Input:
    lon1, lat1 - longitude / latitude of the first point, in decimal degrees
    lon2, lat2 - longitude / latitude of the second point, in decimal degrees
    radius - radius of the sphere; defaults to 6371 (kilometers). Use 3956 for miles

    Output:
    distance along the surface of the sphere, in the same unit as radius
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise ValueError
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a)) 
    return c * radius

## Pulling the sensor data

In [331]:
#Load in TRI data from 1990 - 2018 to look for relevant sensors (run with makefile commands in processed/data_origin.txt)
TRI_base_process_90_18_nopubchem_df = (
    pd.read_csv('/home/boogie2/Hanson_Lab/TRI_STILT/data/processed/TRI_base_process_90_18_nopubchem.csv')
    .drop(columns = ['Unnamed: 0'])
    #While there may be duplicates in the data, we don't need them for this analysis
    .drop_duplicates()
)

#Load in EPA monitors data
EPA_mon = pd.read_csv('/home/boogie2/Hanson_Lab/TRI_STILT/data/validation/TRIChemicals_Monitors.csv')

#Keep the monitors whose first recorded year is 1990 or later.
#.copy() makes this an independent frame, so later column assignments on
#valid_monitors do not write into a slice of EPA_mon (SettingWithCopyWarning)
valid_monitors = EPA_mon[(EPA_mon['first_year']>=1990)].copy()

print('Total TRI releases 1990-2018: {0}'.format(TRI_base_process_90_18_nopubchem_df.shape[0]))
print('\nTotal number of EPA monitors recording after 1990: {0}'.format(valid_monitors.shape[0]))
print('\nEPA tracked chemicals: ')
print(*valid_monitors['chemicalname'].drop_duplicates().values, sep = ", ")
print('\nUnique EPA Sensor Locations: {}'.format(valid_monitors.drop_duplicates(subset= ['latitude','longitude']).shape[0]))

Total TRI releases 1990-2018: 2231

Total number of EPA monitors recording after 1990: 88

EPA tracked chemicals: 
ETHYLBENZENE, STYRENE, 1,2-DIBROMOETHANE, 1,3-BUTADIENE, 1,2-DICHLOROETHANE, METHYL ISOBUTYL KETONE, TETRACHLOROETHYLENE, FORMALDEHYDE, CHLOROFORM, BENZENE, LEAD, NICKEL, CADMIUM, COBALT, DICHLOROMETHANE, ETHYLENE OXIDE, TRICHLOROETHYLENE, NAPHTHALENE, CUMENE

Unique EPA Sensor Locations: 12


In [332]:
#First need to group the EPA sensors by location to see which chemicals are tracked at which site
#.assign builds a new frame instead of writing a column into the filtered valid_monitors slice;
#regex=False makes the dash removal a plain literal replacement
valid_monitors = valid_monitors.assign(
    casnumber=valid_monitors['casnumber'].str.replace('-', '', regex=False)
)
epa_sensors_locs = (
    valid_monitors.groupby(['latitude','longitude'])['casnumber']
    .apply(list)
    .reset_index()
)

#Gather the CAS numbers for the unique chemicals (EPA sensors).
#dict.fromkeys de-duplicates while keeping first-seen order
unique_cas = list(dict.fromkeys(
    cas for cas_list in epa_sensors_locs['casnumber'] for cas in cas_list
))

In [333]:
#For every EPA monitor location, collect the TRI releases that (a) report a chemical
#tracked at that monitor and (b) lie within 50 km of it
a= []

for idx in range(epa_sensors_locs.shape[0]):
    # NOTE: no copy is taken here, so `locs` is the TRI frame itself and the distance
    # column below is written onto TRI_base_process_90_18_nopubchem_df on every iteration
    locs = TRI_base_process_90_18_nopubchem_df
    temp =epa_sensors_locs.iloc[idx] # This is the EPA monitor location (latitude, longitude, list of CAS numbers)

    #Use the haversine (great-circle) distance because the earth is curved;
    #computed row by row between each TRI release and the current monitor
    locs['haversine_distance_km']=locs.apply(lambda row : haversine(row['LONGITUDE'],row['LATITUDE'],temp['longitude'],temp['latitude']), axis = 1)

    #Keep every TRI row whose CAS id is in this monitor's CAS list, then keep those closer than 50 km
    #(a boolean distance filter, so several releases can be retained per monitor)
    matches = locs[locs['CAS#/COMPOUNDID'].isin(epa_sensors_locs['casnumber'].iloc[idx])]
    matches = matches[matches.haversine_distance_km<50]
    a.append(matches)

# NOTE(review): pd.DataFrame(a) is built from a list of DataFrames; the code below reads
# column 0 and expects it to hold one DataFrame per monitor - verify on the pandas version in use
nearest_ls = pd.concat([epa_sensors_locs.reset_index(),pd.DataFrame(a).reset_index()],axis=1)
nearest_ls = nearest_ls.drop(columns=['index'])

#Remove any sensors which have no matching TRI releases nearby (empty match frame)
nearest_ls = nearest_ls[nearest_ls[0].apply(lambda x: x.empty)==False]

In [334]:
#Converting the list of dataframes into one large dataframe
#.assign returns a new frame per monitor, so the filtered match frames stored in
#nearest_ls are not written to (avoids SettingWithCopyWarning)
temp_list = [
    nearest_ls[0].iloc[rows].assign(
        EPA_lat=nearest_ls['latitude'].iloc[rows],
        EPA_long=nearest_ls['longitude'].iloc[rows],
        # the monitor's list of CAS numbers is stored as its string representation
        casnumber=str(nearest_ls['casnumber'].iloc[rows]),
    )
    for rows in range(nearest_ls.shape[0])
]

#Stack the per-monitor frames and drop rows with no value in the Group column
EPA_TRI_merge_by_nearest_sensor = pd.concat(temp_list).dropna(subset=['Group'])

In [335]:
#Keep only the columns needed to describe each monitor / release pairing
keep_columns = [
    'FRSID', 'YEAR', 'TRIFD', 'CAS#/COMPOUNDID', 'CHEMICAL',
    'LATITUDE', 'LONGITUDE',
    'EPA_lat', 'EPA_long', 'casnumber',
    'haversine_distance_km',
]
EPA_TRI_merge_by_nearest_sensor = EPA_TRI_merge_by_nearest_sensor[keep_columns]


#One row per (EPA location, TRI location, chemical, facility, distance),
#with the reporting years of that release collected into a list
group_keys = ['EPA_lat','EPA_long','LATITUDE','LONGITUDE','CHEMICAL','TRIFD','haversine_distance_km']
years_by_release = EPA_TRI_merge_by_nearest_sensor.groupby(group_keys)['YEAR'].apply(list)
EPA_TRI_merge_by_nearest_sensor_loc_agg = pd.DataFrame(years_by_release.reset_index())
EPA_TRI_merge_by_nearest_sensor_loc_agg

Unnamed: 0,EPA_lat,EPA_long,LATITUDE,LONGITUDE,CHEMICAL,TRIFD,haversine_distance_km,YEAR
0,37.198299,-113.1506,37.037627,-113.544195,LEAD,84770STKRP1843E,39.205851,"[2012, 2013, 2014, 2015, 2016, 2017, 2018]"
1,37.198299,-113.1506,37.043001,-113.532888,LEAD,8479WSNRCC1825E,38.040146,[2018]
2,37.198299,-113.1506,37.120370,-113.556790,NICKEL,84770STGRG1301E,37.023942,"[2011, 2012, 2013, 2014, 2015]"
3,37.198299,-113.1506,37.169211,-113.423103,LEAD,8473WSNRCC155NR,24.356513,[2018]
4,37.459080,-113.2251,37.120370,-113.556790,NICKEL,84770STGRG1301E,47.743947,"[2011, 2012, 2013, 2014, 2015]"
...,...,...,...,...,...,...,...,...
593,41.842648,-111.8522,41.763580,-111.860270,LEAD,84321NVRNC1073W,8.817370,"[2010, 2011, 2012, 2013, 2014, 2015, 2016, 201..."
594,41.842648,-111.8522,41.771330,-111.848860,LEAD,8432WGNVRC2151N,7.935030,[2018]
595,41.842648,-111.8522,41.882500,-112.196400,CADMIUM,84330NCRST7285W,28.846324,"[1990, 1991]"
596,41.842648,-111.8522,41.882500,-112.196400,LEAD,84330NCRST7285W,28.846324,"[1990, 1991, 1992, 1993]"


In [336]:
#Saving the Data: 
#No index=False is passed, so the row index is written as an extra leading column.
#The YEAR column holds Python lists, which end up in the CSV as their text representation.
EPA_TRI_merge_by_nearest_sensor_loc_agg.to_csv('/home/boogie2/Hanson_Lab/TRI_STILT/data/validation/EPA_validation_100.csv')

In [337]:
#ATTEMPTING TO FIGURE OUT WHY THE JOIN BETWEEN THE ORIGINAL TRI DATA DOES NOT FULLY MATCH
test = pd.read_csv('/home/boogie2/Hanson_Lab/TRI_STILT/data/validation/EPA_validation_100.csv')[['TRIFD','CHEMICAL','LATITUDE','LONGITUDE']].drop_duplicates()

In [338]:
test=test.reset_index()

In [353]:
#So it seems there may be an error here. It is partially due to the rounding of the exported values,
#but even after rounding not every row is found by the merge, which is unexpected.
#Round the coordinates of the exported sites to 5 decimals so they can be used as merge keys
test['LATITUDE']=test['LATITUDE'].round(5)
test['LONGITUDE']=test['LONGITUDE'].round(5)


# NOTE(review): in the visible notebook tri_9_18_df is first assigned in a cell further down
# (the read_csv of TRI_base_process_90_18_nopubchem.csv), so this cell relies on that cell having been run first.
# These assignments also overwrite the coordinate columns of tri_9_18_df in place.
tri_9_18_df['LATITUDE']=tri_9_18_df['LATITUDE'].round(5)
tri_9_18_df['LONGITUDE']=tri_9_18_df['LONGITUDE'].round(5)

# SO YOU MUST BE VERY CAREFUL WITH THE LAT AND LONG BECAUSE THEY CAN BE UNSTANDARDIZED!

In [360]:
#Join the exported sites onto the distinct (facility, chemical, coordinates) rows of the TRI data;
#with no `on=` given, merge uses the columns the two frames have in common
tri_unique_sites = tri_9_18_df[['TRIFD','CHEMICAL','LATITUDE','LONGITUDE']].drop_duplicates()
lat_lon_test = test.merge(tri_unique_sites)
lat_lon_test = lat_lon_test.sort_values(by='index')


#ARE THE SHAPES THE SAME? 
for label, frame in (('THE SHAPE OF EXPORTED DATA: ', test),
                     ('THE SHAPE OF MERGED DATA: ', lat_lon_test)):
    print(label + str(frame.shape))


Unnamed: 0,index,TRIFD,CHEMICAL,LATITUDE,LONGITUDE
32,34,8408WWSTJR5792W,LEAD,40.57516,-112.02899


In [365]:
#Which is causing the issue
test[~test['index'].isin(lat_lon_test['index'])]

Unnamed: 0,index,TRIFD,CHEMICAL,LATITUDE,LONGITUDE
32,34,8408WWSTJR5792W,LEAD,40.57516,-112.02899


In [356]:
#Yes. But why are these being rounded differently???
tri_9_18_df[(tri_9_18_df['TRIFD']=='8408WWSTJR5792W')][['YEAR','CHEMICAL','LATITUDE','LONGITUDE']]

Unnamed: 0,YEAR,CHEMICAL,LATITUDE,LONGITUDE
1151,2016,LEAD,40.57517,-112.02899
1179,2017,LEAD,40.57517,-112.02899


# Modeling EPA Validation Sensors

Joemy merged all EPA sensor data with TRI releases so we can start to validate the model. 

09.30.20 
<br>

Herein, I explore the possibility of taking the TRI validation set, running a sample of 100 STILT models, then rejoining the EPA validation data to see how the concentrations differ. Conversations with Ben and Derick suggested that it is probably best to focus on a specific chemical, most likely lead, as it is relatively inert. This code is an **exploration** of results and should not be considered refined or tested. 

THE ISSUE ABOVE WILL NEED TO BE FIXED FOR THIS TO WORK PROPERLY

In [20]:
#Load the data and parse the sample date strings (month-day-year) into timestamps
TRI_validation_df = pd.read_csv('/home/boogie2/Hanson_Lab/TRI_STILT/data/validation/TRI_ValidationSet.csv')
parsed_sample_dates = pd.to_datetime(TRI_validation_df['sample_dt'],format='%m-%d-%Y')
TRI_validation_df['sample_dt'] = parsed_sample_dates

#Collecting only those simulations through 2014 because that is where I have NARR data through
through_2014 = TRI_validation_df['year'] <= 2014
tri_valid_2014 = TRI_validation_df[through_2014]

In [273]:
tri_9_18_df = pd.read_csv('/home/boogie2/Hanson_Lab/TRI_STILT/data/processed/TRI_base_process_90_18_nopubchem.csv').drop(columns=['Unnamed: 0'])

In [286]:
#There is a weird discrepancy between the original TRI data and this validation set: when the two are merged, only 73 of the 624 tri_validation rows overlap with those found in the original TRI dataset
a = tri_9_18_df[['YEAR','TRIFD','CHEMICAL','LATITUDE','LONGITUDE']].drop_duplicates()
a.shape

(2214, 5)

In [289]:
tri_valid_2014.columns

Index([&#39;monitor_group&#39;, &#39;parametername&#39;, &#39;monitorid&#39;, &#39;year&#39;, &#39;sampleduration&#39;,
       &#39;latitude&#39;, &#39;longitude&#39;, &#39;casnumber&#39;, &#39;trifd&#39;, &#39;cas_no&#39;, &#39;haps_unit&#39;,
       &#39;haps_conc&#39;, &#39;chemical&#39;, &#39;group&#39;, &#39;frsid&#39;, &#39;facilityname&#39;, &#39;city&#39;,
       &#39;county&#39;, &#39;st&#39;, &#39;zip&#39;, &#39;tri_lat&#39;, &#39;tri_lon&#39;, &#39;cascompoundid&#39;, &#39;metal&#39;,
       &#39;carcinogen&#39;, &#39;stackheight&#39;, &#39;stackvelocity&#39;, &#39;stackdiameter&#39;,
       &#39;stackheightsource&#39;, &#39;stackvelocitysource&#39;, &#39;stackdiametersource&#39;,
       &#39;sample_dt&#39;],
      dtype=&#39;object&#39;)

In [287]:
b = tri_valid_2014[['year','trifd','chemical','tri_lat','tri_lon']].drop_duplicates()
b.shape

(624, 5)

In [285]:
a.merge(b, how='inner',right_on= ['year','trifd','chemical','tri_lat','tri_lon'],left_on=['YEAR','TRIFD','CHEMICAL','LATITUDE','LONGITUDE'])

Unnamed: 0,YEAR,TRIFD,CHEMICAL,LATITUDE,LONGITUDE,year,trifd,chemical,tri_lat,tri_lon
0,1996,84087CRYSN2355S,CUMENE,40.86631,-111.91187,1996,84087CRYSN2355S,CUMENE,40.86631,-111.91187
1,2010,84087CRYSN2355S,CUMENE,40.86631,-111.91187,2010,84087CRYSN2355S,CUMENE,40.86631,-111.91187
2,2010,84087CRYSN2355S,NAPHTHALENE,40.86631,-111.91187,2010,84087CRYSN2355S,NAPHTHALENE,40.86631,-111.91187
3,2011,84087CRYSN2355S,NAPHTHALENE,40.86631,-111.91187,2011,84087CRYSN2355S,NAPHTHALENE,40.86631,-111.91187
4,2012,84087CRYSN2355S,NAPHTHALENE,40.86631,-111.91187,2012,84087CRYSN2355S,NAPHTHALENE,40.86631,-111.91187
...,...,...,...,...,...,...,...,...,...,...
68,2011,84087CRYSN2355S,ETHYLBENZENE,40.86631,-111.91187,2011,84087CRYSN2355S,ETHYLBENZENE,40.86631,-111.91187
69,2012,84087CRYSN2355S,ETHYLBENZENE,40.86631,-111.91187,2012,84087CRYSN2355S,ETHYLBENZENE,40.86631,-111.91187
70,2013,84660TLFLX1800N,ETHYLBENZENE,40.13290,-111.66150,2013,84660TLFLX1800N,ETHYLBENZENE,40.13290,-111.66150
71,2013,84087CRYSN2355S,ETHYLBENZENE,40.86631,-111.91187,2013,84087CRYSN2355S,ETHYLBENZENE,40.86631,-111.91187


In [24]:
#Processing for STILT Simulation: one row per distinct (source location, stack height, sample date),
#renamed to the column names used for the simulation input file
stilt_input_names = {
    'tri_lat': 'lati',
    'tri_lon': 'long',
    'stackheight': 'zagl',
    'sample_dt': 'run_times',
}
stilt_df = (
    tri_valid_2014[list(stilt_input_names)]
    .drop_duplicates()
    .rename(columns=stilt_input_names)
)

#Save to csv file
stilt_df.to_csv('/home/boogie2/Hanson_Lab/TRI_STILT/data/validation/092920_epa_valid_2014.csv',index=False)

#Ran a subsample of the original (select within the following R file)
    #Edited ./src/validation/092920_epa_validation.r to only sample 100 releases (CHPC disk issues)


### Processing the Subsample of Validation Data (no script built yet)

In [25]:
def nc_open(path):
    """
    Open a netcdf4 file holding a single data variable and flatten it to a table. Requires xarray
    ===
    Input:
    path - path to the netcdf file

    Output: 
    a pandas DataFrame built with DataArray.to_dataframe(), with the resulting
    index reset into ordinary columns
    """
    # The context manager closes the underlying file once the values have been
    # copied into the DataFrame; this function is called once per footprint file,
    # so leaving every file open would accumulate file handles
    with xr.open_dataarray(path) as data_array:
        return data_array.to_dataframe().reset_index()

def stilt_netcdf_to_gdf(stilt_df, threshold):
    '''Takes a stilt footprint, optionally filters it by a threshold and averages it per grid cell

    Input:
    ----------  
    stilt_df - footprint table with 'lat', 'lon' and 'foot' columns (e.g. the output of nc_open)
    threshold - rows with foot <= threshold are dropped before averaging; pass None to skip the filtering

    Returns:
    sim_avg: a pandas DataFrame with one row per (lat, lon) pair holding the mean 'foot' of that pair
    '''  
    # Identity check: `!= None` would be evaluated element-wise for array-like thresholds
    if threshold is not None:
        stilt_df = stilt_df[stilt_df['foot'] > threshold]

    sim_avg = stilt_df.groupby(['lat','lon']).agg({'foot':'mean'}).reset_index()
    return sim_avg


    

In [None]:
#First need to create a csv file of all the epa validation info

#Then need to merge on the TRI information retaining only those trifd's which are present within tri_valid_2014

#Then calculate the release per the year

#Check to see if there is a grid cell match 

In [100]:
epa_locations_df = tri_valid_2014[['latitude','longitude']].drop_duplicates()
epa_locations_df = epa_locations_df.rename(columns={'latitude':'epa_latitude','longitude':'epa_longitude'})

In [83]:
#Find the grid cells which are closest to each EPA monitor location
# NOTE(review): stilt_sim_gdf is only assigned inside the footprint-loading loop further down
# the notebook, so that cell has to be run before this one
lat_ls = []
lon_ls = []

for rows in range(epa_locations_df.shape[0]):
    # epa_locations_df carries the renamed columns epa_latitude / epa_longitude
    temp_lat = epa_locations_df['epa_latitude'].iloc[rows]
    temp_lon = epa_locations_df['epa_longitude'].iloc[rows]

    # Distances are kept in a separate Series so the footprint frame is left untouched
    dist = stilt_sim_gdf.apply(lambda row : haversine(row['lon'],row['lat'],temp_lon,temp_lat), axis = 1)

    # idxmin returns an index label, so look the row up with .loc (label-based), and only once
    nearest_cell = stilt_sim_gdf.loc[dist.idxmin()]
    lat_ls.append(nearest_cell.lat)
    lon_ls.append(nearest_cell.lon)

In [101]:
epa_locations_df['nearest_stilt_grid_lat'] = lat_ls
epa_locations_df['nearest_stilt_grid_lon'] = lon_ls

In [160]:

stilt_filepath= '/home/boogie2/Hanson_Lab/TRI_STILT/data/processed/stilt_output/netcdf/092920_epa_valid_2014'

#Collect, per footprint file, the footprint value at the grid cell nearest to each EPA monitor
temp_data_list = []
# sorted() gives a reproducible file order; the order returned by glob depends on the file system
for files in sorted(glob.glob(stilt_filepath + '/*.nc')):

    #Extract information from the file name, which is read as <timestamp>_<lon>_<lat>_<zagl>_...
    filename = os.path.splitext(os.path.basename(files))[0]
    name_parts = filename.split('_')
    # only the first 8 characters (YYYYMMDD) of the timestamp are used
    date = pd.to_datetime(name_parts[0][0:8],format='%Y%m%d')
    longi = float(name_parts[1])
    lati = float(name_parts[2])
    zagl = float(name_parts[3])

    stilt_sim_gdf = nc_open(files)
    stilt_sim_gdf = stilt_netcdf_to_gdf(stilt_sim_gdf,None)

    # Keep only the grid cells that were identified as nearest to an EPA monitor
    points_of_interest = stilt_sim_gdf.merge(epa_locations_df, left_on=['lat','lon'],how='inner', right_on=['nearest_stilt_grid_lat','nearest_stilt_grid_lon']).drop(columns = ['nearest_stilt_grid_lat','nearest_stilt_grid_lon'])

    points_of_interest['filename'] = filename
    points_of_interest['date'] = date
    points_of_interest['zagl'] = zagl
    # every row of this file shares the same date, so the year is a single scalar
    points_of_interest['year'] = date.year
    points_of_interest['tri_source_lat'] = lati
    points_of_interest['tri_source_lon'] = longi

    #62/100 have a value over zero at the grid cell closest to the EPA monitor. Let's collect all of those and put them into a dataframe
    if points_of_interest['foot'].sum()>0:
        temp_data_list.append(points_of_interest)


#For those that have positive values we need to add to a list then merge with the original TRI data
stilt_epa_df = pd.concat(temp_data_list)


In [201]:
#load the TRI info
tri_9_18_df = pd.read_csv('/home/boogie2/Hanson_Lab/TRI_STILT/data/processed/TRI_base_process_90_18_nopubchem.csv').drop(columns=['Unnamed: 0'])

#This separates fugitive and stack releases - setting the stack height of the release for fugitive releases to 0
#.assign returns a new frame, so StackHeight is not written into a filtered slice of tri_9_18_df
fug = (
    tri_9_18_df[tri_9_18_df['51-FUGITIVEAIR']>0]
    .assign(StackHeight=0)
    .rename(columns = {'51-FUGITIVEAIR':'Release (lbs/year)'})
    .drop(columns = ['52-STACKAIR'])
)

stack = (
    tri_9_18_df[tri_9_18_df['52-STACKAIR']>0]
    .rename(columns = {'52-STACKAIR':'Release (lbs/year)'})
    .drop(columns = ['51-FUGITIVEAIR'])
)

#Concatenate the results together (a facility row with both release types appears once in each part)
tri_9_18_df = pd.concat([stack,fug])

In [202]:
#There is an issue here: not all stilt sites are being matched.
#Round the float merge keys on both sides to the same precision before joining
for column, decimals in (('StackHeight', 2), ('LATITUDE', 6), ('LONGITUDE', 6)):
    tri_9_18_df[column] = tri_9_18_df[column].round(decimals)

for column, decimals in (('zagl', 2), ('tri_source_lat', 6), ('tri_source_lon', 6)):
    stilt_epa_df[column] = stilt_epa_df[column].round(decimals)


In [209]:
stilt_tri_df =stilt_epa_df.merge(tri_9_18_df,how='inner',left_on =['tri_source_lat','tri_source_lon','zagl','year'],right_on=['LATITUDE','LONGITUDE','StackHeight','YEAR'])

In [210]:
validation_stilt_tri_df = tri_valid_2014.merge(stilt_tri_df,how='inner', left_on=['trifd','cas_no','sample_dt','latitude','longitude'],right_on=['TRIFD','CAS#/COMPOUNDID','date','epa_latitude','epa_longitude'])

## Modeling EPA Validation Sensors: Proof of Concept

In [24]:
#So just for sake of efficiency - let's examine just a subset of the data (i choose styrene in 2010)
styrene = TRI_validation_df[(TRI_validation_df.parametername == 'STYRENE') & (TRI_validation_df.year == 2010)]
styrene.describe()

Unnamed: 0,monitorid,year,latitude,longitude,cas_no,haps_conc,frsid,zip,tri_lat,tri_lon,cascompoundid,stackheight,stackvelocity,stackdiameter
count,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0
mean,490110004.0,2010.0,40.9029,-111.8845,100425.0,0.233781,110008100000.0,140263800.0,40.850004,-112.117043,100425.0,14.766666,9.866667,0.766667
std,0.0,0.0,7.115838e-15,1.423168e-14,0.0,0.672109,14074970.0,313910600.0,0.126271,0.383589,0.0,9.667929,4.818032,0.335487
min,490110004.0,2010.0,40.9029,-111.8845,100425.0,0.007,110000500000.0,84016.0,40.734402,-112.9681,100425.0,7.7,2.0,0.3
25%,490110004.0,2010.0,40.9029,-111.8845,100425.0,0.013,110000500000.0,84029.0,40.742111,-112.0336,100425.0,9.4,6.0,0.6
50%,490110004.0,2010.0,40.9029,-111.8845,100425.0,0.079,110000800000.0,84070.5,40.816246,-111.94232,100425.0,10.05,10.2,0.7
75%,490110004.0,2010.0,40.9029,-111.8845,100425.0,0.1395,110006900000.0,84104.0,40.886021,-111.91116,100425.0,15.8,15.2,0.9
max,490110004.0,2010.0,40.9029,-111.8845,100425.0,4.63,110039100000.0,841162300.0,41.105,-111.90476,100425.0,35.599998,15.6,1.4


Conclusions:

1. Styrene data is available in the years 1996, 1999, 2010-17
2. There is only a single EPA monitor (40.9029, -111.8845)  but four nearby releasing TRI sites
3. Data is available on 57 dates 

In [25]:
#Distinct facilities and chemicals present in the styrene subset
trifd_of_interest = styrene['trifd'].drop_duplicates().to_list()
chems_of_interest = styrene['cas_no'].drop_duplicates().to_list()

#Let's filter the data for these trifd's of interest
df = pd.read_csv('/home/boogie2/Hanson_Lab/TRI_STILT/data/processed/TRI_valid_2010_2010.csv').drop(columns = 'Unnamed: 0' )

df['TRIFD'] = df['TRIFD'].astype('string')
is_facility_of_interest = df['TRIFD'].isin(trifd_of_interest)
is_chemical_of_interest = df['CAS_No'].isin(chems_of_interest)
entries_of_interest = df[is_facility_of_interest & is_chemical_of_interest]

#Now we should be ready to go
entries_of_interest.to_csv('/home/boogie2/Hanson_Lab/TRI_STILT/data/processed/STYRENE_DEMO.csv')

In [41]:
#Get the TRI locations
temp = pd.read_csv('/home/boogie2/Desktop/styrene_stilt_RUN.csv')

In [42]:
#Which dates do we want to model? (the current setup models on a everyday per year basis)
unique_sample_dates = styrene['sample_dt'].drop_duplicates()
dates_df = pd.DataFrame(sorted(unique_sample_dates))

#Give both frames the same constant key so that merging on it pairs every row of one with every row of the other
dates_df['merge'] = 0
temp['merge'] = 0

In [49]:
#Pair every TRI location with every sample date, then tidy the columns for the STILT run file
location_by_date = temp.merge(dates_df)
location_by_date = location_by_date.drop(columns =['merge','YEAR','id'])
styrene_run = location_by_date.rename(columns={0:'run_times'})
styrene_run.to_csv('styrene_run.csv',index=False)

# Run Simulations

In [2]:
#Let's take a look: 
styrene_gdf = gpd.read_file('/home/boogie2/Hanson_Lab/TRI_STILT/data/processed/stilt_output/shapefile/092520_styrene')
styrene_gdf['ss_date'] = pd.to_datetime(styrene_gdf['ss_date'])

In [3]:
styrene_gdf.shape

(38832, 15)

In [42]:
#Footprint rows for a single simulation date
on_20100106 = styrene_gdf['ss_date'] == '2010-01-06'
temp_20100106 = styrene_gdf[on_20100106]

#Plot only the cells above 0.1 lbsperday on top of a basemap
fig, ax = plt.subplots(figsize=(15,15))
above_cutoff = temp_20100106[temp_20100106['lbsperday'] > 0.1]
above_cutoff.plot(column='lbsperday', ax=ax, alpha=0.5, markersize=5)
ctx.add_basemap(ax=ax)

plt.close()

In [38]:
styrene_gdf[['TRI_source','TRI_sour_1']].drop_duplicates()

Unnamed: 0,TRI_source,TRI_sour_1
0,40.74211,-111.95802
14992,40.786392,-111.911163
25782,41.105,-112.0336
28400,40.7344,-112.9681
32116,40.886022,-111.904759
32168,40.8461,-111.92662


In [37]:
temp_20100106

Unnamed: 0,lat,lon,foot,lbsperday,id,TRI_source,TRI_sour_1,zagl,Chemical,Release (l,YEAR,ss_name,ss_path,ss_date,geometry
0,40.745,-111.955,0.084898,9.180184,0,40.74211,-111.95802,9.4,STYRENE,39468.0,2010,201001060000_-111.95802_40.74211_9.4_foot,data/processed/stilt_output/netcdf/092520_styr...,2010-01-06,POINT (-12462773.592 4974801.670)
4,40.755,-111.955,0.073125,7.907112,0,40.74211,-111.95802,9.4,STYRENE,39468.0,2010,201001060000_-111.95802_40.74211_9.4_foot,data/processed/stilt_output/netcdf/092520_styr...,2010-01-06,POINT (-12462773.592 4976271.108)
8,40.755,-111.945,0.069022,7.463464,0,40.74211,-111.95802,9.4,STYRENE,39468.0,2010,201001060000_-111.95802_40.74211_9.4_foot,data/processed/stilt_output/netcdf/092520_styr...,2010-01-06,POINT (-12461660.397 4976271.108)
12,40.765,-111.945,0.147847,15.986922,0,40.74211,-111.95802,9.4,STYRENE,39468.0,2010,201001060000_-111.95802_40.74211_9.4_foot,data/processed/stilt_output/netcdf/092520_styr...,2010-01-06,POINT (-12461660.397 4977740.767)
16,40.765,-111.935,0.003443,0.372280,0,40.74211,-111.95802,9.4,STYRENE,39468.0,2010,201001060000_-111.95802_40.74211_9.4_foot,data/processed/stilt_output/netcdf/092520_styr...,2010-01-06,POINT (-12460547.202 4977740.767)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14972,40.945,-112.365,0.001034,0.111794,0,40.74211,-111.95802,9.4,STYRENE,39468.0,2010,201001060000_-111.95802_40.74211_9.4_foot,data/processed/stilt_output/netcdf/092520_styr...,2010-01-06,POINT (-12508414.583 5004232.558)
14976,40.945,-112.355,0.001060,0.114596,0,40.74211,-111.95802,9.4,STYRENE,39468.0,2010,201001060000_-111.95802_40.74211_9.4_foot,data/processed/stilt_output/netcdf/092520_styr...,2010-01-06,POINT (-12507301.388 5004232.558)
14980,40.945,-112.345,0.001065,0.115107,0,40.74211,-111.95802,9.4,STYRENE,39468.0,2010,201001060000_-111.95802_40.74211_9.4_foot,data/processed/stilt_output/netcdf/092520_styr...,2010-01-06,POINT (-12506188.193 5004232.558)
14984,40.945,-112.335,0.001046,0.113152,0,40.74211,-111.95802,9.4,STYRENE,39468.0,2010,201001060000_-111.95802_40.74211_9.4_foot,data/processed/stilt_output/netcdf/092520_styr...,2010-01-06,POINT (-12505074.998 5004232.558)
