---
# Create ordered list of SOILED netcdf output from monte-carlo 
# Identify missing/incomplete runs
---

The monte carlo runs were completed on Compute Canada's supercomputer, `Graham` and this code is intended to be used on that system.  It requires initialization of a Virtual Environment.  See `/home/rmueller/projects/def-allen/rmueller/graham-jupyter-env.txt`.

First initiate a compute node (no heavy-lifting in this example) with: 
```
salloc --time=1:00:00 --ntasks=1 --cpus-per-task=2 --mem-per-cpu=1024M --account=rrg-allen
```
Activate `VENV` with:
```
module load python/3.8.2
source ~/venvs/jupyter/bin/activate
```
Deactivate `VENV` with:
```
deactivate
```
If the `jupyter` `VENV` is not yet setup, install it with:
```
module load python/3.8.2
python3 -m virtualenv --no-download ~/venvs/jupyter
source ~/venvs/jupyter/bin/activate
python3 -m pip install --no-index --upgrade pip
python3 -m pip install -r /home/rmueller/projects/def-allen/rmueller/graham-jupyter-env.txt
```

This environment is setup to allow user to initiate a remote window using:
```
jupyter lab --no-browser --ip $(hostname -f)
```

In [1]:
from glob import glob
import os
import sys
import time
from datetime import datetime
import pandas
import numpy
import yaml

In [2]:
def get_MOHID_netcdf_filenames(results_dir, output_dir, oil_types):
    """Get lists of filepaths and filenames for netcdf files of model output, 
    grouped by oil types. NOTE: jet and gas are run as diesel; other is run 
    as bunker.  
    
    :param str results_dir: File path for root directory of run sets. 
    On Graham, the filepath is `/scratch/dlatorne/MIDOSS/runs/monte-carlo`
    
    :param str output_dir: File path for storing MOHID_results_locations_{date}.yaml,
    which contains file paths for completed runs, sorted by oil type.  
    
    :return: Dataframe of file paths and names, sorted by oil types, namely: 
    akns, bunker, dilbit, jet, diesel, gas and other.  Note: jet and gas are 
    run as diesel; other is run as bunker.  
    :rtype: :py:class:`pandas.DataFrame`
    """
    # get list of runsets
    runsets = sorted(glob(os.path.join(results_dir,"near-BP_*")))
    # get list of runs within each runset
    runs = []
    for runset in runsets:
        runs.extend(sorted(
            glob(os.path.join(runset,'results','near-BP_*')))[:])        
    # get complete list of netcdf files
    netcdf_files = []
    for run in runs:
        netcdf_files.extend(sorted(
            glob(os.path.join(run,'Lagrangian*.nc')))[:])
    # sort filenames by oil type.  
    file_boolean = {}
    files = {}
    files['all'] = []
    for oil in oil_types:
        file_boolean[oil] = [oil in file for file in netcdf_files]
        files[oil]=[file for i,file in enumerate(netcdf_files) \
            if file_boolean[oil][i]]
        files['all'].extend(files[oil])
    files['all'].sort()
    # write filenames to .yaml with timestamp in filename
    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H:%M:%S")
    out_f = output_dir+f'/MOHID_results_locations_{dt_string}.yaml'
    with open(out_f, 'w') as output_yaml:
        documents = yaml.safe_dump(files, output_yaml)
    
    return files

In [3]:
oil_types = [
    'akns', 
    'bunker', 
    'dilbit', 
    'jet', 
    'diesel', 
    'gas', 
    'other'
]
results_dir='/scratch/dlatorne/MIDOSS/runs/monte-carlo'
output_dir ='/scratch/rmueller/MIDOSS/Results'
files = get_MOHID_netcdf_filenames(results_dir, output_dir, oil_types)

In [4]:
total = 0
time_per_file = 0.575
minutes_to_hours = 1/60
for oil in oil_types:
    number_of_files = len(files[oil])
    time_to_complete = time_per_file * number_of_files * minutes_to_hours
    print(f'{oil}: {number_of_files} runs, approx {time_to_complete:.2f} hours to complete')
    total+=len(files[oil])
print(f'TOTAL RUNS: {total}')

akns: 64 runs, approx 0.61 hours to complete
bunker: 3323 runs, approx 31.85 hours to complete
dilbit: 1 runs, approx 0.01 hours to complete
jet: 26 runs, approx 0.25 hours to complete
diesel: 6192 runs, approx 59.34 hours to complete
gas: 73 runs, approx 0.70 hours to complete
other: 68 runs, approx 0.65 hours to complete
TOTAL RUNS: 9747


### Find missing files

In [5]:
# get list of runsets
runsets = sorted(glob(os.path.join(results_dir,"near-BP_*")))
finished = pandas.DataFrame({'filenames':files['all']})

In [6]:
# get list of runsets
runsets = sorted(glob(os.path.join(results_dir,"near-BP_*")))
finished = pandas.DataFrame({'filenames':files['all']})

In [9]:
list_of_incomplete = []
n_missing = 0
for runset in runsets: 
    finished_runset = finished[finished['filenames'].str.contains(runset)]
    nruns = f'{runset}'.split('-')[3:][0].split('_')[0]
    nruns_finished = len(finished_runset)
    if int(nruns)!=nruns_finished:
        if nruns_finished>0:
            print(f'{runset}'.split('/')[-1],f': {nruns_finished} of {nruns}')
            list_of_incomplete.append(f'{runset}'.split('/')[-1])
            n_missing+=int(nruns)-nruns_finished


near-BP_2000th-100_2021-10-02T144815 : 76 of 100
near-BP_2004th-100_2021-10-02T144852 : 95 of 100
near-BP_46th-100_2021-10-02T142422 : 96 of 100
near-BP_49th-100_2021-10-03T140133 : 86 of 100
near-BP_4th-100_2021-09-24T125848 : 96 of 100
near-BP_53th-100_2021-10-03T140217 : 80 of 100
near-BP_55th-100_2021-10-03T140235 : 99 of 100
near-BP_79th-100_2021-10-07T121854 : 90 of 100
near-BP_88th-100_2021-10-11T190627 : 98 of 100
near-BP_94th-103_2021-10-13T133218 : 101 of 103


In [10]:
missing_runs={}
completed_runs={}
for runset in list_of_incomplete: 
    completed_runs[runset]=[]
    finished_runset = finished[finished['filenames'].str.contains(runset)]
    for run in finished_runset['filenames']:
        completed_runs[runset].append(int(run.split('.')[0].split('-')[-1]))
    completed_runs[runset].sort()
    for i in range(len(completed_runs[runset]) - 1):
        #print(completed_runs[runset][i],completed_runs[runset][i+1])
        if (completed_runs[runset][i+1] - completed_runs[runset][i])>1:
            run_list = numpy.arange(
                completed_runs[runset][i]+1,completed_runs[runset][i+1]
            )
            if runset in missing_runs:
                missing_runs[runset]=numpy.append(missing_runs[runset],run_list)
            else:
                missing_runs[runset]=run_list

In [11]:
for runset in [*missing_runs]:
     missing_runs[runset]=missing_runs[runset].tolist()

In [13]:
import yaml
with open('/scratch/rmueller/MIDOSS/Results/'+'missing_runs.yaml', 'w') as outfile:
    yaml.safe_dump(missing_runs, outfile)

In [14]:
missing_runs

{'near-BP_2000th-100_2021-10-02T144815': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  10,
  11,
  13,
  14,
  17,
  19,
  20,
  21,
  23,
  24,
  25,
  26,
  27,
  29,
  30,
  35],
 'near-BP_2004th-100_2021-10-02T144852': [10, 15, 17, 19, 24],
 'near-BP_46th-100_2021-10-02T142422': [4, 21, 26, 28],
 'near-BP_49th-100_2021-10-03T140133': [42,
  43,
  52,
  57,
  59,
  70,
  72,
  90,
  91,
  94,
  95,
  96,
  97],
 'near-BP_4th-100_2021-09-24T125848': [92, 94, 96],
 'near-BP_53th-100_2021-10-03T140217': [6,
  7,
  8,
  11,
  12,
  14,
  16,
  17,
  19,
  21,
  23,
  25,
  26,
  28,
  29,
  48,
  49,
  55],
 'near-BP_55th-100_2021-10-03T140235': [24],
 'near-BP_79th-100_2021-10-07T121854': [9, 45, 50, 52, 54, 56, 75, 77, 78, 90],
 'near-BP_88th-100_2021-10-11T190627': [64, 70]}