---
# Create ordered list of SOILED netcdf output from monte-carlo 
# Identify missing/incomplete runs
---

The Monte Carlo runs were completed on Compute Canada's supercomputer, `Graham`, and this code is intended to be used on that system.  It requires a Python virtual environment (`VENV`) to be initialized first.  See `/home/rmueller/projects/def-allen/rmueller/graham-jupyter-env.txt` for the list of required packages.

First, request an interactive compute node (no heavy lifting in this example) with:
```
salloc --time=1:00:00 --ntasks=1 --cpus-per-task=1 --mem-per-cpu=1024M --account=rrg-allen
```
Activate `VENV` with:
```
module load python/3.8.2
source ~/venvs/jupyter/bin/activate
```
Deactivate `VENV` with:
```
deactivate
```
If the `jupyter` `VENV` is not yet set up, install it with:
```
module load python/3.8.2
python3 -m virtualenv --no-download ~/venvs/jupyter
source ~/venvs/jupyter/bin/activate
python3 -m pip install --no-index --upgrade pip
python3 -m pip install -r /home/rmueller/projects/def-allen/rmueller/graham-jupyter-env.txt
```

This environment is set up to allow the user to start a remote JupyterLab session using:
```
jupyter lab --no-browser --ip $(hostname -f)
```

In [1]:
from glob import glob
import os
import sys
import time
from datetime import datetime
import pandas
import numpy
import yaml
import xarray
import h5netcdf

In [2]:
def get_SOILED_netcdf_filenames_byMonth(
    results_dir='/scratch/dlatorne/MIDOSS/runs/monte-carlo',
    output_dir ='/scratch/rmueller/MIDOSS/Results',
    runset_tag="*_near-BP_try3*"):
    """Collect the Lagrangian netcdf output files of the monte-carlo runs,
    grouped by spill month, and save the grouping to a time-stamped yaml file.

    A run is included only if its directory contains a ``MassBalance*.sro``
    file.  The spill month is taken as the integer value of the second
    ``.``-separated field on the third line of that file.  Runs without a
    readable ``.sro`` file or month are reported with ``print`` and skipped.

    :param str results_dir: File path for root directory of run sets.
    On Graham, the filepath is `/scratch/dlatorne/MIDOSS/runs/monte-carlo`

    :param str output_dir: Directory in which
    ``MOHID_results_locationsByMonth_try3_{timestamp}.yaml`` is written.
    A failure to write the file is reported but does not raise.

    :param str runset_tag: Glob pattern used to select both the run-set
    directories in ``results_dir`` and the run directories inside each
    run set's ``results`` sub-directory.

    :return: ``(files_byMonth, runs)`` where ``files_byMonth`` maps each of
    ``'Jan'`` ... ``'Dec'`` and ``'all'`` to a list of netcdf file paths, and
    ``runs`` is the sorted list of all run directories that were examined.
    :rtype: tuple(dict, list)
    """
    month_names = {
        1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
        7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec',
    }
    # get list of runsets
    # for newer runs, use: "*_near-BP_*"
    runsets = sorted(glob(os.path.join(results_dir, runset_tag)))
    # get list of runs within each runset
    runs = []
    for runset in runsets:
        runs.extend(sorted(glob(os.path.join(runset, 'results', runset_tag))))

    files_byMonth = {name: [] for name in month_names.values()}
    files_byMonth['all'] = []
    for run in runs:
        sro_paths = glob(os.path.join(run, 'MassBalance*.sro'))
        if not sro_paths:
            print(f'No MassBalance*.sro: {run}')
            continue
        dat_file_path = sro_paths[0]
        # Only the third line is needed, so stop reading as soon as it is
        # found; the context manager guarantees the handle is closed.
        spill_dateTime = None
        try:
            with open(dat_file_path, 'r') as dat_file:
                for position, line in enumerate(dat_file):
                    if position == 2:
                        spill_dateTime = line
                        break
        except OSError:
            print(dat_file_path)
            continue
        if spill_dateTime is None:
            print(f'No spill date line in: {dat_file_path}')
            continue
        try:
            month_name = month_names[int(spill_dateTime.split('.')[1])]
        except (IndexError, ValueError, KeyError):
            # A truncated/garbled file should not abort the whole scan
            print(f'Unreadable spill date in: {dat_file_path}')
            continue
        # Glob once and reuse the result for both the month and 'all' lists
        run_files = sorted(glob(os.path.join(run, 'Lagrangian*.nc')))
        files_byMonth[month_name].extend(run_files)
        files_byMonth['all'].extend(run_files)

    # write filenames to .yaml with timestamp in filename
    dt_string = datetime.now().strftime("%d%m%Y_%H:%M:%S")
    out_f = os.path.join(
        output_dir, f'MOHID_results_locationsByMonth_try3_{dt_string}.yaml'
    )
    try:
        with open(out_f, 'w') as output_yaml:
            yaml.safe_dump(files_byMonth, output_yaml)
    except (OSError, yaml.YAMLError) as err:
        # Saving is best-effort: the in-memory result is still returned
        print(f"Save to yaml didn't work: {err}")
    return files_byMonth, runs

In [3]:
%%time
results_dir='/scratch/dlatorne/MIDOSS/runs/monte-carlo'
output_dir ='/scratch/rmueller/MIDOSS/Results'
files_byMonth,runs = get_SOILED_netcdf_filenames_byMonth()

CPU times: user 13.9 s, sys: 25.5 s, total: 39.4 s
Wall time: 14min 55s


In [10]:
# Constants kept from the original cell (not used in the tally below)
time_per_file = 0.575
minutes_to_hours = 1/60

# Report the number of netcdf files found per month.  The 'all' entry is
# printed too, but left out of the sum because it repeats the monthly files.
total = 0
for month, month_files in files_byMonth.items():
    number_of_files = len(month_files)
    print(f'{month}: {number_of_files} runs')
    if month == 'all':
        continue
    total += number_of_files
print(f'TOTAL RUNS: {total}')

Jan: 717 runs
Feb: 627 runs
Mar: 798 runs
Apr: 760 runs
May: 847 runs
Jun: 870 runs
Jul: 928 runs
Aug: 876 runs
Sep: 886 runs
Oct: 885 runs
Nov: 838 runs
Dec: 801 runs
all: 9833 runs
TOTAL RUNS: 9833


### Find missing files

In [11]:
# Glob pattern selecting the run-set directories to check
runset_tag = "*_near-BP_try3*"
# get list of runsets (alphabetical order)
runsets = glob(os.path.join(results_dir, runset_tag))
runsets.sort()
# One-column table holding the path of every netcdf file that was found
finished = pandas.DataFrame({'filenames': files_byMonth['all']})

In [12]:
# Compare, for every runset, the number of netcdf files found with the number
# of runs encoded in the runset directory name (`{id}-{nruns}_...`).
# Runsets with no finished run at all are neither printed nor counted.
list_of_incomplete = []
n_missing = 0
for runset in runsets:
    # Parse the directory name only, so hyphens elsewhere in the path
    # (e.g. a different results_dir) cannot shift the field that is read.
    runset_name = os.path.basename(runset)
    nruns = int(runset_name.split('-')[1].split('_')[0])
    # regex=False: the runset path is a literal substring, not a pattern
    finished_runset = finished[
        finished['filenames'].str.contains(runset, regex=False)
    ]
    nruns_finished = len(finished_runset)
    if nruns != nruns_finished and nruns_finished > 0:
        print(runset_name, f': {nruns_finished} of {nruns}')
        list_of_incomplete.append(runset_name)
        n_missing += nruns - nruns_finished


31-200_near-BP_try3_2022-05-20T133826 : 197 of 200
41-200_near-BP_try3_2022-05-22T142401 : 199 of 200
42-200_near-BP_try3_2022-05-22T142453 : 195 of 200
50-200_near-BP_try3_2022-05-23T130812 : 32 of 200


In [13]:
# For each incomplete runset, list the run numbers that fall in gaps between
# consecutive completed run numbers.  The run number is read from the last
# '-'-separated token before the first '.' of each netcdf file path.
missing_runs = {}
completed_runs = {}
for runset in list_of_incomplete:
    # regex=False: the runset name is a literal substring, not a pattern
    finished_runset = finished[
        finished['filenames'].str.contains(runset, regex=False)
    ]
    completed_runs[runset] = sorted(
        int(run.split('.')[0].split('-')[-1])
        for run in finished_runset['filenames']
    )
    gaps = [
        numpy.arange(previous + 1, following)
        for previous, following in zip(
            completed_runs[runset], completed_runs[runset][1:]
        )
        if following - previous > 1
    ]
    if gaps:
        # Kept as a numpy array: the next cell converts it with .tolist()
        missing_runs[runset] = numpy.concatenate(gaps)
    # Gap detection cannot see runs missing before the first or after the
    # last completed run number.  Report how many are unaccounted for
    # instead of silently dropping them.
    # NOTE(review): the first run index (0 or 1) is not known here, so the
    # unlisted run numbers themselves are not guessed — confirm and extend.
    n_expected = int(runset.split('-')[1].split('_')[0])
    n_listed = sum(len(gap) for gap in gaps)
    n_unlisted = n_expected - len(completed_runs[runset]) - n_listed
    if n_unlisted > 0:
        print(
            f'{runset}: {n_unlisted} missing run(s) lie outside the range of '
            'completed run numbers and are NOT listed in missing_runs'
        )

In [14]:
# Convert the numpy arrays to plain lists of ints so yaml.safe_dump can
# serialize them.  numpy.asarray makes the cell safe to re-run: values that
# are already lists are converted again instead of raising AttributeError.
missing_runs = {
    runset: numpy.asarray(run_numbers).tolist()
    for runset, run_numbers in missing_runs.items()
}

In [15]:
# Save the missing run numbers alongside the other results.  `yaml` is already
# imported in the first cell; `output_dir` is the results directory set above
# (/scratch/rmueller/MIDOSS/Results).
with open(os.path.join(output_dir, 'missing_runs.yaml'), 'w') as outfile:
    yaml.safe_dump(missing_runs, outfile)

In [16]:
# Display the mapping of runset name -> list of missing run numbers
missing_runs

{'31-200_near-BP_try3_2022-05-20T133826': [9, 25, 120],
 '41-200_near-BP_try3_2022-05-22T142401': [116],
 '42-200_near-BP_try3_2022-05-22T142453': [21, 74, 79, 83, 94],
 '50-200_near-BP_try3_2022-05-23T130812': [18]}

## plot locations

In [None]:
import h5py

In [None]:
import h5py
# Open the 2D mesh mask through ERDDAP's griddap OPeNDAP endpoint: the dataset
# URL without a file-type suffix.  The `.html` URL is the web data-access form,
# and the h5netcdf engine reads HDF5/netCDF4 files rather than OPeNDAP, so the
# engine is left for xarray to choose.
# NOTE(review): needs an OPeNDAP-capable backend (netCDF4 or pydap) in the
# environment — confirm it is installed.
mesh2d = xarray.open_dataset('https://salishsea.eos.ubc.ca/erddap/griddap/ubcSSn2DMeshMaskV17-02')

In [None]:
filename = 'SalishSea_1d_20151227_20151227_ptrc_T.nc'
grid_g = nc.Dataset(filename)
conc = grid_g.variables[field]

#Prepare surface values
conc_ma = np.ma.masked_values(conc[0, 0, :, :], 0)
# use tmask (meshmask file) instead
vmin = np.min(conc_ma)
vmax = np.max(conc_ma)

#Prepare thalweg values
npconc = conc[:]
conc_t = npconc[0, :, thalweg[0], thalweg[1]]
conc_t_ma = np.ma.masked_values(conc_t, 0)
vmin_t = np.min(conc_t_ma)
vmax_t = np.max(conc_t_ma)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
land_colour = 'burlywood'
for ax in (ax2, ax1):
    ax.set_axis_bgcolor(land_colour)
ax1.set_position((0.125, 0.125, 0.5, 0.775))
#axcb.set_position((0.73, 0.125, 0.02, 0.775))
ax2.set_position((0.8, 0.125, 0.2, 0.775))

set_aspect(ax2)
cmap = plt.get_cmap('Greens')
cmap.set_bad('burlywood')

#Surface plot
mesh = ax2.pcolormesh(conc_ma, cmap=cmap, vmin=vmin, vmax=vmax)
cbar = fig.colorbar(mesh, ax=ax2)
#plt.axis(0, conc_ma.shape[1], 0, conc_ma.shape[0])
ax2.set_title('Surface {label}'.format(label=conc.long_name.title()), fontsize=16)
ax2.set_xlabel('x Index')
ax2.set_ylabel('y Index')
cbar.set_label('{label} [{units}]'.format(label=conc.long_name.title(), units=conc.units))