## Evaluate different sets of 10,000 Monte Carlo spills

Package requirements are included in ../environment.yaml.  Install in the upper-level directory using:
```
$ conda env create -f ./environment.yaml
```
                                                                               
To activate this environment, use 
```                  
$ conda activate midoss_env
```
 To deactivate an active environment, use
```
$ conda deactivate
```

Files are loaded from the dataset archive, with both the dataset archive and this code archive in the same folder, i.e.:

- \path\to\root\folder\
   - MuellerEtAl_MIDOSS_datasets
   - MuellerEtAl_MIDOSS_code [this repository]

In [1]:
import os
from glob import glob
import pandas as pd
import numpy as np
from pathlib import Path

### Load spill files

In [2]:
data_directory = Path('../../MuellerEtAl_MIDOSS_datasets/spill_files')
# create list of spill files to plot/evaluate
filenames = sorted(glob(os.path.join(data_directory, "*.csv")))
# Remove MIDOSS_MonteCarlo_spills.csv from the list of 10,000-spill files.
# The file is located by name instead of assuming it sorts first: if it is
# missing, this raises ValueError rather than silently dropping one of the
# SalishSea_oil_spills_*.csv files (which would shift the positional indices
# that later cells rely on).
monte_carlo_index = [os.path.basename(fn) for fn in filenames].index(
    'MIDOSS_MonteCarlo_spills.csv'
)
# Last expression of the cell: the removed path is shown as the cell output.
filenames.pop(monte_carlo_index)

'../../MuellerEtAl_MIDOSS_datasets/spill_files/MIDOSS_MonteCarlo_spills.csv'

In [3]:
# Lagrangian template file name -> oil-type label (for plotting)
oil_type_labels = {
    'Lagrangian_akns.dat': 'ANS',
    'Lagrangian_bunker.dat': 'Bunker-C',
    'Lagrangian_diesel.dat': 'Diesel',
    'Lagrangian_gas.dat': 'Gasoline',
    'Lagrangian_jet.dat': 'Jet Fuel',
    'Lagrangian_dilbit.dat': 'Dilbit',
    'Lagrangian_other.dat': 'Other (Bunker-C)',
}

# df maps the position of each file in `filenames` to its DataFrame
df = {}
for index, fn in enumerate(filenames):
    print(index, fn)
    spills = pd.read_csv(fn)
    # rename lagrangian files as oil types (for plotting)
    spills['Lagrangian_template'] = spills['Lagrangian_template'].replace(
        oil_type_labels
    )
    df[index] = spills

# Stack all spill files into one frame with a single concat after the loop
# (concatenating inside the loop re-copies the growing frame on every pass).
# Row labels of the individual files are kept as-is, i.e. the index is not reset.
# Raises ValueError if `filenames` is empty.
df_combined = pd.concat(list(df.values()))

0 ../../MuellerEtAl_MIDOSS_datasets/spill_files/SalishSea_oil_spills_1.csv
1 ../../MuellerEtAl_MIDOSS_datasets/spill_files/SalishSea_oil_spills_2.csv
2 ../../MuellerEtAl_MIDOSS_datasets/spill_files/SalishSea_oil_spills_3.csv
3 ../../MuellerEtAl_MIDOSS_datasets/spill_files/SalishSea_oil_spills_4.csv
4 ../../MuellerEtAl_MIDOSS_datasets/spill_files/SalishSea_oil_spills_5.csv
5 ../../MuellerEtAl_MIDOSS_datasets/spill_files/SalishSea_oil_spills_6.csv
6 ../../MuellerEtAl_MIDOSS_datasets/spill_files/SalishSea_oil_spills_7.csv
7 ../../MuellerEtAl_MIDOSS_datasets/spill_files/SalishSea_oil_spills_8.csv
8 ../../MuellerEtAl_MIDOSS_datasets/spill_files/SalishSea_oil_spills_9.csv


### Count spills by oil type 
Compare the counts across all spill files to the counts in the spill file used for this study, `SalishSea_oil_spills_4.csv`.

In [4]:
# Spill file used for this study; position 3 in `df` is the fourth file loaded
# (SalishSea_oil_spills_4.csv according to the load log printed above).
this_study_df = df[3]

# Number of spills (non-null spill_date_hour entries) per oil type, all files combined
count_df = (
    df_combined.groupby('Lagrangian_template')['spill_date_hour']
    .count()
    .to_frame('count')
)
# Share of each oil type, in percent of all counted spills
count_df['percent'] = 100 * count_df['count'] / count_df['count'].sum()

# Same statistics for this study's file only; assignment aligns on the
# oil-type index, so an oil type absent from this file shows up as NaN.
count_df['this_study_count'] = (
    this_study_df.groupby('Lagrangian_template')['spill_date_hour'].count()
)
count_df['this_study_percent'] = (
    100 * count_df['this_study_count'] / count_df['this_study_count'].sum()
)

In [5]:
# Display counts and percentages by oil type: all spill files vs. this study's file
count_df

Unnamed: 0_level_0,count,percent,this_study_count,this_study_percent
Lagrangian_template,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ANS,520,0.577778,66,0.66
Bunker-C,30403,33.781111,3409,34.09
Diesel,57491,63.878889,6353,63.53
Dilbit,39,0.043333,1,0.01
Gasoline,671,0.745556,76,0.76
Jet Fuel,212,0.235556,26,0.26
Other (Bunker-C),664,0.737778,69,0.69


### Repeat the above, but only include spills > 1 L

In [6]:
## All 90,000 spills
# Keep only spills with spill_volume > 1 (litres, per the section heading)
spills_gt1 = df_combined.loc[df_combined['spill_volume'] > 1]
# Same filter applied to the file used for this study; position 3 in `df` is
# the fourth file loaded (SalishSea_oil_spills_4.csv per the load log above).
this_study_gt1 = df[3].loc[df[3]['spill_volume'] > 1]

# Number of spills (non-null spill_date_hour entries) per oil type
count_gt1_df = (
    spills_gt1.groupby('Lagrangian_template')['spill_date_hour']
    .count()
    .to_frame('count')
)
# Share of each oil type, in percent of all counted spills
count_gt1_df['percent'] = (
    100 * count_gt1_df['count'] / count_gt1_df['count'].sum()
)

# Assignment aligns on the oil-type index, so an oil type with no spills > 1
# in this study's file shows up as NaN.
count_gt1_df['this_study_count'] = (
    this_study_gt1.groupby('Lagrangian_template')['spill_date_hour'].count()
)
count_gt1_df['this_study_percent'] = (
    100 * count_gt1_df['this_study_count'] / count_gt1_df['this_study_count'].sum()
)

In [7]:
# Display counts and percentages by oil type for spills with spill_volume > 1
count_gt1_df

Unnamed: 0_level_0,count,percent,this_study_count,this_study_percent
Lagrangian_template,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ANS,520,0.583202,66,0.66539
Bunker-C,30154,33.818961,3378,34.055852
Diesel,56903,63.819073,6303,63.544712
Dilbit,39,0.04374,1,0.010082
Gasoline,671,0.752554,76,0.766206
Jet Fuel,212,0.237767,26,0.262123
Other (Bunker-C),664,0.744704,69,0.695635
