In [2]:
import sys 
sys.path.append('/home/samuel.varga/python_packages/wofs_ml_severe')
sys.path.append('/home/samuel.varga/python_packages/WoF_post')
sys.path.append('/home/samuel.varga/projects/2to6_hr_severe_wx/')
sys.path.append('/home/samuel.varga/python_packages/MontePython/')
import datetime as dt
import numpy as np
import os
import xarray as xr
from glob import glob
from experiments.ml_2to6_data_pipeline import (GridPointExtracter,
                                                       subsampler, 
                                                       )
from os.path import join
# Import packages 
import pandas as pd
import numpy as np

# Plotting code imports 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

# We add the github package to our system path so we can import python scripts for that repo. 
import sys

from main.io import load_ml_data, load_bl_data
from bayeshist import bayesian_histogram, plot_bayesian_histogram
from wofs_ml_severe.data_pipeline.storm_report_loader import StormReportLoader
from wofs.plotting.util import decompose_file_path

lookup_file: /home/samuel.varga/python_packages/WoF_post/wofs/data/psadilookup.dat


## Function Definitions

These are the functions responsible for finding the forecast time window in the ML pipeline and the Storm Report Loader. `get_files()` and `load_dataset()` return the list of files to use when producing the data for any initialization. These files are passed to a GridPointExtracter, which then loads the storm reports for the prediction window. The prediction window is defined by `get_time_window()` inside of the Storm Report Loader. `get_start_end_time()` and `get_lead_time_df()` are specific to this notebook. They automatically collect all of the file paths used for the 0to3 hour predictions and the 2to6 hour predictions, and return a dataframe containing info about the forecast initialization time and prediction window. Most of these functions were designed as part of a class. Within this notebook, I have tried to retain the original code in comments for clarity. To reduce compute time and memory requirements, the files are not actually loaded, as all of the relevant info is taken from the path and filename.

In [3]:
def get_files(path, TIMESCALE):
    """Get the ENS, ENV, and SVR file paths for the 0-3 || 2-6 hr forecasts.

    Parameters
    ----------
    path : str
        Directory that is searched for ``wofs_ENS_*`` summary files.
    TIMESCALE : str
        Either '0to3' or '2to6'.

    Returns
    -------
    ens_files, env_files, svr_files : list of str
        The sorted, trimmed ENS paths, and the same paths with 'ENS' replaced
        by 'ENV' and 'SVR' respectively. The ENV/SVR paths are derived by
        string substitution only; their existence is not checked.

    Raises
    ------
    ValueError
        If TIMESCALE is not one of the supported values. (Previously this
        surfaced as an UnboundLocalError on ``ens_files``.)
    """
    # Load summary files between time step 00-36 || 24-72.
    if TIMESCALE=='0to3':
        # The pattern matches time steps 00-39; keeping the first 37 sorted
        # files leaves 00-36 when the full, zero-padded set is present.
        ens_files = sorted(glob(join(path,'wofs_ENS_[0-3]*')))[:37]
    elif TIMESCALE=='2to6':
        # The pattern matches time steps 20 and up; dropping the first 4
        # sorted files leaves 24-72 instead of 20-72.
        ens_files = sorted(glob(join(path,'wofs_ENS_[2-7]*')))[4:]
    else:
        raise ValueError(f"Unsupported TIMESCALE {TIMESCALE!r}; expected '0to3' or '2to6'")

    # NOTE: str.replace swaps every 'ENS' in the full path string, not only the
    # one in the file name.
    svr_files = [f.replace('ENS', 'SVR') for f in ens_files]
    env_files = [f.replace('ENS', 'ENV') for f in ens_files]

    return ens_files, env_files, svr_files

In [4]:
def load_dataset(path, TIMESCALE):
    """Stand-in for the pipeline loader of the 0-3 || 2-6 hr forecasts.

    Only the file list is resolved (via get_files); no netCDF data is read.

    Returns
    -------
    tuple
        ``(None, None, first_ens_file, None)``. The ``None`` entries occupy the
        slots of X_env, X_strm and ll_grid from the original loader, and
        ``first_ens_file`` is the first ENS path returned by get_files.
    """
    ens_files, _env_files, _svr_files = get_files(path, TIMESCALE)

    # --- Original pipeline code, kept for reference only ---
    #coord_vars = ["xlat", "xlon", "hgt"]
    #
    #X_strm, coords, _, _  = load_multiple_nc_files(
    #            ens_files, concat_dim="time", coord_vars=coord_vars,  load_vars=ml_config['ENS_VARS'])
    #
    #X_env, _, _, _  = load_multiple_nc_files(
    #            env_files, concat_dim="time", coord_vars=coord_vars,  load_vars=ml_config['ENV_VARS'])
    #
    #X_svr, _, _, _ = load_multiple_nc_files(
    #            svr_files, concat_dim="time", coord_vars=coord_vars,  load_vars=ml_config['SVR_VARS'])
    #
    #X_env = {**X_env, **X_svr}
    #
    #X_env = {v : X_env[v][1] for v in X_env.keys()}
    #X_strm = {v : X_strm[v][1] for v in X_strm.keys()}
    #
    #ll_grid = (coords['xlat'][1].values, coords['xlon'][1].values)

    # For the forecast window, the first ENS file is used to determine the
    # start date and init time.
    first_ens_file = ens_files[0]

    # The data slots are returned as None so the return signature matches the
    # original loader.
    return None, None, first_ens_file, None


In [5]:
def get_time_window(initial_time, forecast_length, err_window=15): #From StormReportLoader -- used to get time window of reports
    '''
    Get beginning and ending of the time window to search for LSRs

    Parameters
    ----------
    initial_time : str
        Start of the prediction window, formatted as '%Y%m%d%H%M'.
    forecast_length : int
        Length of the prediction window in minutes.
    err_window : int
        Minutes of padding added before the start and after the end
        (in case reports came in late).

    Returns
    -------
    (start, end, duration) : (str, str, float)
        Padded start and end formatted as '%Y%m%d%H%M', and the padded
        window length in hours.
    '''
    # Convert the datetime string to a datetime object
    window_open = dt.datetime.strptime(initial_time, '%Y%m%d%H%M')
    padding = dt.timedelta(minutes=err_window)

    end_date = window_open + dt.timedelta(minutes=forecast_length) + padding
    # Move the start time back (in case reports came in late)
    start_date = window_open - padding

    # In the original class these are stored as self.start_date / self.end_date
    # and used to build the report mask:
    #self.time_mask = (self.df.date >= self.start_date) & (self.df.date <= self.end_date)
    #df = self.df[self.time_mask==True]

    # total_seconds() includes whole days; timedelta.seconds alone wraps
    # around for windows of 24 h or longer.
    forecast_duration = (end_date - start_date).total_seconds() / 3600 #Convert from seconds to hours

    return start_date.strftime('%Y%m%d%H%M'), end_date.strftime('%Y%m%d%H%M'), forecast_duration

In [6]:
def get_start_end_time(ncfile, TIMESCALE, deltat=5):
    """Derive the prediction start time and storm-report window from a file path.

    Parameters
    ----------
    ncfile : str
        Path of a summary file; it is parsed with decompose_file_path, and the
        'VALID_DATE', 'INIT_TIME' and 'TIME_INDEX' components are used.
    TIMESCALE : str
        '0to3' selects a 180-minute window; any other value selects 240 minutes.
    deltat : int
        Minutes per time index.

    Returns
    -------
    data_time : dict
        {'Data_date': VALID_DATE, 'Data_init': INIT_TIME} of the file.
    init_date : str
        Start of the prediction window ('%Y%m%d%H%M'), without the error window.
    start_date, end_date : str
        Report window bounds including the 15-minute error window.
    forecast_duration : float
        Length of the padded report window in hours.
    """
    comps = decompose_file_path(ncfile)
    data_time = {'Data_date': comps['VALID_DATE'], 'Data_init': comps['INIT_TIME']}

    # Adding deltat * TIME_INDEX minutes to the initialization gives the valid
    # time of the file. TIME_INDEX will be 0 in 0to3 (start_time == data time);
    # in 2to6 it is 24, i.e. 24 * 5 min = 2 hours after initialization.
    initialization = pd.to_datetime(comps['VALID_DATE'] + comps['INIT_TIME'])
    lead = dt.timedelta(minutes=int(comps['TIME_INDEX']) * deltat)
    start_time = (initialization + lead).strftime('%Y%m%d%H%M')

    if TIMESCALE == '0to3':
        forecast_length = 180
    else:
        forecast_length = 240

    # In the pipeline, the window comes from the report loader:
    #    report = StormReportLoader(
    #                reports_path = '/work/mflora/LSRS/StormEvents_2017-2022.csv',
    #                report_type='NOAA',
    #                initial_time=start_time,
    #                forecast_length=forecast_length,
    #                err_window=15,
    #            )
    start_date, end_date, forecast_duration = get_time_window(
        start_time, forecast_length, err_window=15
    )
    return data_time, start_time, start_date, end_date, forecast_duration

In [46]:
def get_lead_time_df(TIMESCALE, base_path='/work/mflora/SummaryFiles'):
    """Tabulate the forecast initialization and report window for every complete May case.

    Parameters
    ----------
    TIMESCALE : str
        Either '0to3' or '2to6'.
    base_path : str
        Directory whose entries are date directories (names with month digits
        at positions 4-5), each containing one directory per init time.

    Returns
    -------
    lt_df : pandas.DataFrame
        One row per retained init directory with the columns 'Data Init Date',
        'Data Init Time', 'Start Time', 'Start Window', 'End Window' and
        'Window Duration'.
    paths : list of str
        The init directories that were retained, in the same order as the rows.

    Raises
    ------
    ValueError
        If TIMESCALE is not one of the supported values. (Previously this
        surfaced as an UnboundLocalError on ``files``, or silently produced an
        empty frame when no May case existed.)
    """
    # TIMESCALE -> (ENS glob pattern, number of files a complete run must have).
    # 0to3: time steps 00-39 (trimmed to 00-36 in get_files).
    # 2to6: time steps 20-72 (trimmed to 24-72 in get_files).
    requirements = {
        '0to3': ('wofs_ENS_[0-3]*', 40),
        '2to6': ('wofs_ENS_[2-7]*', 53),
    }
    if TIMESCALE not in requirements:
        raise ValueError(f"Unsupported TIMESCALE {TIMESCALE!r}; expected '0to3' or '2to6'")
    pattern, n_expected = requirements[TIMESCALE]

    paths=[]
    dates = [d for d in os.listdir(base_path) if '.txt' not in d]
    for d in dates:
        if d[4:6] != '05': #Skips all months other than May
            continue

        times = [t for t in os.listdir(join(base_path,d)) if 'basemap' not in t] #initialization time

        for t in times: #For every init time on that day
            path = join(base_path,d,t)
            files = glob(join(path, pattern))
            # Keep the init time only if every matched ENS file is a .nc file
            # and files are available for all expected time steps.
            if len(files) == n_expected and all(f.endswith('.nc') for f in files):
                paths.append(path)

    records=[]
    for path in paths:
        # Only the first ENS file path is needed; the data slots are unused here.
        _, _, ncfile, _ = load_dataset(path, TIMESCALE=TIMESCALE)
        data_time, init_time, start_window, end_window, window_duration = get_start_end_time(ncfile, TIMESCALE)
        records.append({
            'Data Init Date': data_time['Data_date'],  #Date of the data being used
            'Data Init Time': data_time['Data_init'],  #Init time of the data being used
            'Start Time': init_time,                   #Time that the forecast is being made for
            'Start Window': start_window,              #Beginning of window (includes error window)
            'End Window': end_window,                  #End of window (includes error window)
            'Window Duration': window_duration,        #Duration of forecast window
        })

    #Put data into DataFrame; explicit columns keep the layout stable even with no rows
    columns=['Data Init Date', 'Data Init Time', 'Start Time',
             'Start Window', 'End Window', 'Window Duration']
    lt_df=pd.DataFrame(records, columns=columns)
    return lt_df, paths

## 0-3 HR Lead Times

We start with the 0-3 HR lead times, as they are simpler. The ML predictions are valid starting at the forecast initialization time and extend for three hours. This three-hour window is the same period over which storm reports are loaded. A 15-minute error window is added to both the beginning and the end of the storm report window.

In [47]:
# Build the lead-time table for the 0-3 hr predictions from the file paths that
# get_lead_time_df retains, then report how many init times were found.
Oto3df, paths=get_lead_time_df(TIMESCALE='0to3')
print(f'Length of DF: {len(Oto3df)}')
Oto3df

Length of DF: 710


Unnamed: 0,Data Init Date,Data Init Time,Start Time,Start Window,End Window,Window Duration
0,20190518,2200,201905182200,201905182145,201905190115,3.5
1,20190519,0300,201905190300,201905190245,201905190615,3.5
2,20190518,1900,201905181900,201905181845,201905182215,3.5
3,20190519,0200,201905190200,201905190145,201905190515,3.5
4,20190518,2300,201905182300,201905182245,201905190215,3.5
...,...,...,...,...,...,...
705,20210521,0300,202105210300,202105210245,202105210615,3.5
706,20210520,1900,202105201900,202105201845,202105202215,3.5
707,20210520,1800,202105201800,202105201745,202105202115,3.5
708,20210521,0200,202105210200,202105210145,202105210515,3.5


Data Init Date and Data Init Time identify the forecast initialization being used. Start Time is when the storm report window opens, not including the error window. Start Window and End Window are when the storm report window opens and closes, respectively, including the 15-minute error window on both ends. Window Duration is how long the storm report window is open. In this case, the duration is 3.5 hr: the 3 hr prediction window plus two 15-minute error windows.

Because the start time is always the same as the init time for this prediction window, the start window can only cross into the previous day for init times of 0000, where the 15-minute error window extends back before midnight. We also want to make sure that the end window correctly switches to the next date when the forecast extends into the next day. This can occur for init times of 2100-2300. We first select only the rows with these init times. We then choose 2 random examples from each init time to display.

In [48]:
# Init times whose report window (including the 15 minute error) can cross midnight.
concerns=['2100','2200','2300','0000']
Oto3df=Oto3df[Oto3df['Data Init Time'].isin(concerns)]
# Draw 2 distinct rows per init time. Concatenating the per-init-time draws keeps
# the index labels as integers; np.append onto an empty np.array([]) upcasts
# them to float before they are handed to .loc.
choices=np.concatenate([
    np.random.choice(Oto3df.index[Oto3df['Data Init Time']==init_time], 2, replace=False)
    for init_time in concerns
])
print(choices)
Oto3df.loc[choices]

[553. 492. 226. 251. 207. 148. 574. 231.]


Unnamed: 0,Data Init Date,Data Init Time,Start Time,Start Window,End Window,Window Duration
553,20200505,2100,202005052100,202005052045,202005060015,3.5
492,20180510,2100,201805102100,201805102045,201805110015,3.5
226,20190523,2200,201905232200,201905232145,201905240115,3.5
251,20200528,2200,202005282200,202005282145,202005290115,3.5
207,20210519,2300,202105192300,202105192245,202105200215,3.5
148,20200507,2300,202005072300,202005072245,202005080215,3.5
574,20190526,0,201905260000,201905252345,201905260315,3.5
231,20190524,0,201905240000,201905232345,201905240315,3.5


We can see that everything looks good!

## 2-6 HR Lead Times

We now repeat this for the 2-6 hr lead times. Here, the prediction window is offset from the actual initialization time by 2 hours, as seen by comparing Data Init Time and Start Time. The forecast duration is also an hour longer. We now have to worry about the report windows for init times between 1800 and 0000, which is most of them! Furthermore, we now have to make sure that the start window rolls over to the next day correctly!

In [53]:
# Build the lead-time table for the 2-6 hr predictions from the file paths that
# get_lead_time_df retains, then preview the first rows.
Zto6df, paths=get_lead_time_df(TIMESCALE='2to6')
print(f'Length of DF: {len(Zto6df)}')
Zto6df.head()

Length of DF: 644


Unnamed: 0,Data Init Date,Data Init Time,Start Time,Start Window,End Window,Window Duration
0,20190518,2200,201905190000,201905182345,201905190415,4.5
1,20190519,300,201905190500,201905190445,201905190915,4.5
2,20190518,1900,201905182100,201905182045,201905190115,4.5
3,20190519,200,201905190400,201905190345,201905190815,4.5
4,20190518,2300,201905190100,201905190045,201905190515,4.5


In [34]:
# List the distinct init times present, to decide which ones need a midnight-crossing check.
Zto6df['Data Init Time'].unique()

array(['2200', '0300', '1900', '0200', '2300', '2100', '0000', '0100',
       '2000', '1700', '1800'], dtype=object)

In [52]:
# Init times whose report window (including the 15 minute error) can cross midnight.
concerns=['1800','1900','2000','2100','2200','2300','0000']
Zto6df=Zto6df[Zto6df['Data Init Time'].isin(concerns)]
# Draw 2 distinct rows per init time. Concatenating the per-init-time draws keeps
# the index labels as integers; np.append onto an empty np.array([]) upcasts
# them to float before they are handed to .loc.
choices=np.concatenate([
    np.random.choice(Zto6df.index[Zto6df['Data Init Time']==init_time], 2, replace=False)
    for init_time in concerns
])
print(choices)
Zto6df.loc[choices]

[431. 130. 214. 160. 315. 147. 457.   5.  11.  79. 249. 388. 525. 183.]


Unnamed: 0,Data Init Date,Data Init Time,Start Time,Start Window,End Window,Window Duration
431,20200520,1800,202005202000,202005201945,202005210015,4.5
130,20200507,1800,202005072000,202005071945,202005080015,4.5
214,20190529,1900,201905292100,201905292045,201905300115,4.5
160,20210517,1900,202105172100,202105172045,202105180115,4.5
315,20200504,2000,202005042200,202005042145,202005050215,4.5
147,20210525,2000,202105252200,202105252145,202105260215,4.5
457,20190513,2100,201905132300,201905132245,201905140315,4.5
5,20190518,2100,201905182300,201905182245,201905190315,4.5
11,20210510,2200,202105110000,202105102345,202105110415,4.5
79,20200513,2200,202005140000,202005132345,202005140415,4.5


Once again, we can see that everything is behaving properly! When the forecast starts in or carries over into the next day, datetime will automatically change the date in both the start and end windows. 