---
# Create ordered list of SOILED .sro output files from the Monte Carlo runs
# Identify missing/incomplete runs
---

The Monte Carlo runs were completed on Compute Canada's supercomputer, `Graham`, and this code is intended to be used on that system.  It requires initialization of a virtual environment (`VENV`).  See `/home/rmueller/projects/def-allen/rmueller/graham-jupyter-env.txt`.

First, request an interactive compute node (no heavy lifting in this example) with:
```
salloc --time=1:00:00 --ntasks=1 --cpus-per-task=1 --mem-per-cpu=1024M --account=rrg-allen
```
Activate `VENV` with:
```
module load python/3.8.2
source ~/venvs/jupyter/bin/activate
```
Deactivate `VENV` with:
```
deactivate
```
If the `jupyter` `VENV` is not yet set up, install it with:
```
module load python/3.8.2
python3 -m virtualenv --no-download ~/venvs/jupyter
source ~/venvs/jupyter/bin/activate
python3 -m pip install --no-index --upgrade pip
python3 -m pip install -r /home/rmueller/projects/def-allen/rmueller/graham-jupyter-env.txt
```

This environment is set up to allow the user to start a remote JupyterLab session using:
```
jupyter lab --no-browser --ip $(hostname -f)
```

In [1]:
from glob import glob
import os
from os.path import exists
import sys
import time
from datetime import datetime
import pandas
import numpy
import yaml
import xarray
import h5netcdf


In [9]:
# Oil-type labels used to group the output files
oil_types = ['akns', 'bunker', 'dilbit', 'jet', 'diesel', 'gas', 'other']
# Directory that receives the generated .yaml file lists
output_dir = '/scratch/rmueller/MIDOSS/Results/try3'

In [4]:
%%time
results_dir = '/scratch/dlatorne/MIDOSS/runs/monte-carlo'
# specify directory search tags
runset_tag = "*_near-BP_try3*"

# get list of runsets
runsets = sorted(glob(os.path.join(results_dir, runset_tag)))
# get list of runs within each runset; run directories are searched for
# under <runset>/results/ using the same tag as the runsets
runs = []
for runset in runsets:
    runs.extend(sorted(glob(os.path.join(runset, "results", runset_tag))))

# Paired lists: sro_files[i] and netcdf_files[i] come from the same run
sro_files = []
netcdf_files = []
# Glob patterns of the runs that were skipped because no file matched
missing_netcdf = []
missing_sro = []
for run in runs:
    # There are 9833 netcdf and 9841 sro files.  Only runs that have a
    # netcdf file are kept.
    # Each pattern is globbed once; the file system is slow (this cell
    # takes minutes), and an empty match list already says "missing".
    netcdf_pattern = os.path.join(run, '*.nc')
    netcdf_matches = sorted(glob(netcdf_pattern))
    if not netcdf_matches:
        missing_netcdf.append(netcdf_pattern)
        continue
    sro_pattern = os.path.join(run, '*.sro')
    sro_matches = sorted(glob(sro_pattern))
    if not sro_matches:
        # A run with a netcdf file but no .sro is skipped (and recorded)
        # rather than aborting the whole scan with an IndexError; skipping
        # both appends keeps the two lists index-aligned.
        missing_sro.append(sro_pattern)
        continue
    sro_files.append(sro_matches[0])
    netcdf_files.append(netcdf_matches[0])

CPU times: user 4.11 s, sys: 8 s, total: 12.1 s
Wall time: 6min 5s


In [10]:
# Group the .sro paths by oil type and save the grouping to a .yaml file.
file_boolean = {}
files = {'all': []}
for oil in oil_types:
    # The oil label is looked up in the netcdf (lagrangian) path; the .sro
    # path stored at the same index is then assigned to that oil type.
    file_boolean[oil] = [oil in netcdf_path for netcdf_path in netcdf_files]
    is_this_oil = file_boolean[oil]
    files[oil] = [
        sro_path
        for index, sro_path in enumerate(sro_files)
        if is_this_oil[index]
    ]
    files['all'] += files[oil]
files['all'].sort()

# The output name carries a day-month-year_time stamp so that each run of
# this cell writes a new file.
now = datetime.now()
dt_string = now.strftime("%d%m%Y_%H:%M:%S")
out_f = f'{output_dir}/MOHID_massbalance_try3_{dt_string}.yaml'
with open(out_f, 'w') as output_yaml:
    documents = yaml.safe_dump(files, output_yaml)

In [7]:
# Number of .sro paths collected (one per run for which a netcdf file was found)
len(sro_files)

9833

In [11]:
def get_SOILED_sro_filenames_byMonth(
    results_dir='/scratch/dlatorne/MIDOSS/runs/monte-carlo',
    output_dir ='/scratch/rmueller/MIDOSS/Results/try3',
    runset_tag="*_near-BP_try3*"):
    """Group the MassBalance*.sro files of the monte-carlo runs by spill month.

    Run directories are located with
    ``<results_dir>/<runset_tag>/results/<runset_tag>``.  For each run, the
    third line of its first (sorted) ``MassBalance*.sro`` file is read and
    the text between the first and second ``.`` on that line is taken as the
    month number.  Runs without a ``MassBalance*.sro`` file, or whose file
    cannot be opened or parsed, are reported with ``print`` and skipped.

    The grouping is also written to
    ``<output_dir>/MOHID_sro_ByMonth_try3_{timestamp}.yaml``; a failure to
    write that file is reported with ``print`` and does not affect the
    returned values.

    Note kept from the original documentation: jet and gas are run as
    diesel; other is run as bunker.

    :param str results_dir: File path for root directory of run sets.
    On Graham, the filepath is `/scratch/dlatorne/MIDOSS/runs/monte-carlo`

    :param str output_dir: Directory in which the .yaml file is written.

    :param str runset_tag: Glob pattern matching both the run-set
    directories and the run directories inside their ``results`` folder.
    For newer runs, use: "*_near-BP_*"

    :return: ``(files_byMonth, runs)`` where ``files_byMonth`` maps
    'Jan' ... 'Dec' and 'all' to lists of .sro file paths, and ``runs`` is
    the sorted list of run directories that were examined.
    :rtype: tuple(dict, list)
    """
    month_names = {
        1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
        7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
    }
    # get list of runsets
    runsets = sorted(glob(os.path.join(results_dir, runset_tag)))
    # get list of runs within each runset
    runs = []
    for runset in runsets:
        runs.extend(sorted(glob(os.path.join(runset, 'results', runset_tag))))

    files_byMonth = {name: [] for name in month_names.values()}
    files_byMonth['all'] = []
    for run in runs:
        # Glob once per run and reuse the result for both the file that is
        # opened and the paths that are recorded.
        sro_paths = sorted(glob(os.path.join(run, 'MassBalance*.sro')))
        if not sro_paths:
            print(f'No MassBalance*.sro: {run}')
            continue
        dat_file_path = sro_paths[0]
        spill_dateTime = None
        try:
            # `with` closes the handle; one file is opened per run, so
            # leaving them open can exhaust file descriptors.
            with open(dat_file_path, 'r') as dat_file:
                for position, line in enumerate(dat_file):
                    if position == 2:
                        spill_dateTime = line
                        # only the third line is needed
                        break
        except (OSError, UnicodeDecodeError):
            print(dat_file_path)
            continue
        if spill_dateTime is None:
            print(f'Fewer than 3 lines in: {dat_file_path}')
            continue
        try:
            MM = spill_dateTime.split('.')[1]
            month = month_names[int(MM)]
        except (IndexError, ValueError, KeyError):
            # One malformed file should not abort the scan of all runs.
            print(f'Cannot read spill month from line 3 of: {dat_file_path}')
            continue
        files_byMonth[month].extend(sro_paths)
        files_byMonth['all'].extend(sro_paths)
    # write filenames to .yaml with timestamp in filename
    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H:%M:%S")
    out_f = output_dir+f'/MOHID_sro_ByMonth_try3_{dt_string}.yaml'
    try:
        with open(out_f, 'w') as output_yaml:
            yaml.safe_dump(files_byMonth, output_yaml)
    except (OSError, yaml.YAMLError) as error:
        # Best effort: the lists are still returned if the save fails.
        print(f"Save to yaml didn't work: {error}")
    return files_byMonth, runs

In [13]:
# Run with the default paths and tag.  Returns the month-keyed lists of
# MassBalance*.sro paths and the list of run directories (rebinds `runs`).
files_byMonth, runs=get_SOILED_sro_filenames_byMonth()