# Import Modules

## Standard Packages

In [1]:
import os
import sys
import os.path as path
import glob
import random
import numpy as np
import pandas as pd
import xarray as xr
import pickle
#from matplotlib import pyplot as plt
#plt.style.use('seaborn-white')
from datetime import date, datetime, timedelta, time
from timeit import default_timer as timer



## User-Defined Functions

In [2]:
from helper_extract_wrf import generate_seed, init_random_generator
from helper_extract_wrf import get_data_file_names, downsample_data_files
from helper_extract_wrf import downsample_grid_indices
from helper_extract_wrf import create_df_at_gp

# Global Start Time

In [3]:
# Reference timestamp for the whole notebook run; the "Global End Time" cell
# subtracts it from a second timer() reading to report the total computing time.
global_start_time = timer()

# Variables to be used for extracting WRF data

In [4]:
# Input directory holding the WRF data files, and the output directory that
# receives the extracted pickle / CSV files.
# NOTE(review): both are absolute, site-specific paths; edit them when running elsewhere.
data_files_location = '/p/vast1/climres/DFM_reanalysis'
extracted_data_loc = '/p/lustre2/jha3/Wildfire/Wildfire_LDRD_SI/SJSU/01_WRF_Nelson_Data_Extracted'

# Parameters of the data set produced by this run
data_set_count = 0                  # numeric suffix of the output name ('extracted_data_%02d')
percent_files_to_use = 0.02         # f1 = percent of available files to use (0.02 means 0.02 %, per the helper's log output)
percent_grid_points_to_use = 0.005  # f2 = percent of grid points to use (presumably the same percent units as f1 -- confirm)
max_history_to_consider = 5 # n_history in hours
history_interval        = 2 # presumably the spacing (in hours) between history samples -- confirm against helper_extract_wrf

# Field lists kept for reference only. The block below is a bare string literal,
# so none of these names is actually assigned; because it is the last expression
# in the cell, its value is echoed as the cell's output.
'''
identity_fields = ['latitude', 'longitude', 'YYYY', 'MM', 'DD', 'HH']

label_fields = ['mean_wtd_moisture_1hr', 'mean_wtd_moisture_10hr',\
                'mean_wtd_moisture_100hr', 'mean_wtd_moisture_1000hr']

feature_fields = ['eastward_10m_wind', 'northward_10m_wind',\
                  'air_temperature_2m', \
                  'accumulated_precipitation_amount', \
                  'air_relative_humidity_2m', \
                  'surface_downwelling_shortwave_flux'] 
'''

"\nidentity_fields = ['latitude', 'longitude', 'YYYY', 'MM', 'DD', 'HH']\n\nlabel_fields = ['mean_wtd_moisture_1hr', 'mean_wtd_moisture_10hr',                'mean_wtd_moisture_100hr', 'mean_wtd_moisture_1000hr']\n\nfeature_fields = ['eastward_10m_wind', 'northward_10m_wind',                  'air_temperature_2m',                   'accumulated_precipitation_amount',                   'air_relative_humidity_2m',                   'surface_downwelling_shortwave_flux'] \n"

# Generate seed for the random number generator

In [5]:
# Obtain a seed from the project helper and use it to initialise the random
# generator that drives the sampling steps.
# NOTE(review): the seed is neither printed nor included in the saved
# `extracted_data` dict; if generate_seed() is non-deterministic the sampled
# files cannot be reproduced later -- confirm and consider recording it.
seed = generate_seed()
# NOTE(review): `random_state` is not passed to any later call in the visible
# cells; presumably the helper seeds global RNG state -- verify in helper_extract_wrf.
random_state = init_random_generator(seed)

# File Names

In [6]:
# Collect the names of all data files under `data_files_location`, and report
# how long the helper call took.
module_start_time = timer()
data_files_list = get_data_file_names(data_files_location)
module_end_time = timer()
print(f'Module "get_data_file_names" computing time: {module_end_time - module_start_time} s')


Getting the names of the data files at the dir : 
 /p/vast1/climres/DFM_reanalysis 

years_list: ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020'] 

Getting the names of the data files for the year : 2000
... Found 8784 files for this year
Getting the names of the data files for the year : 2001
... Found 8760 files for this year
Getting the names of the data files for the year : 2002
... Found 8760 files for this year
Getting the names of the data files for the year : 2003
... Found 8760 files for this year
Getting the names of the data files for the year : 2004
... Found 8784 files for this year
Getting the names of the data files for the year : 2005
... Found 8760 files for this year
Getting the names of the data files for the year : 2006
... Found 8760 files for this year
Getting the names of the data files for the year : 2007
... Found 8760 files for this year
Get

# Downsample Files

In [7]:
# Randomly pick a subset of the data files (fraction given by
# `percent_files_to_use`) and report how long the helper call took.
module_start_time = timer()
sampled_file_indices, sampled_data_files = downsample_data_files(
    data_files_list, percent_files_to_use
)
module_end_time = timer()
print(f'Module "downsample_data_files" computing time: {module_end_time - module_start_time} s')


Randomly selecting approx 0.02 % of the data files
Selected 37 data files out of 184103
Indices of the randomly selected files: 
 [112379, 131207, 134488, 63266, 144535, 154953, 102091, 62994, 99309, 129072, 114855, 27362, 43842, 173591, 6255, 136922, 84356, 11649, 84688, 294, 146031, 61194, 41708, 85863, 150674, 58399, 131677, 182062, 77552, 22259, 94242, 132352, 166030, 137872, 173209, 59883, 141851]
Names of the randomly selected files: 
 ['wrf_2012-10-26_12.nc', 'wrf_2014-12-20_00.nc', 'wrf_2015-05-05_17.nc', 'wrf_2007-03-21_02.nc', 'wrf_2016-06-27_08.nc', 'wrf_2017-09-04_10.nc', 'wrf_2011-08-24_20.nc', 'wrf_2007-03-09_18.nc', 'wrf_2011-04-30_22.nc', 'wrf_2014-09-22_01.nc', 'wrf_2013-02-06_16.nc', 'wrf_2003-02-14_02.nc', 'wrf_2004-12-31_18.nc', 'wrf_2019-10-21_00.nc', 'wrf_2000-09-17_15.nc', 'wrf_2015-08-15_03.nc', 'wrf_2009-08-15_20.nc', 'wrf_2001-04-30_09.nc', 'wrf_2009-08-29_16.nc', 'wrf_2000-01-13_06.nc', 'wrf_2016-08-28_16.nc', 'wrf_2006-12-24_18.nc', 'wrf_2004-10-03_20.nc', 

# Global End Time

In [8]:
# Second timer reading; the difference from `global_start_time` is the elapsed
# time between the two cells.
global_end_time = timer()
print(f'Total computing time: {global_end_time - global_start_time} s')

Total computing time: 0.3706265729852021 s


# Legacy cells below (not executed in this run; kept for reference only)

# Grid Dimensions, Downsample Grid Points

In [None]:
# Build one DataFrame holding the sampled grid-point records of every sampled file.
#
# NOTE(review): `frames_in_file` is not defined in any visible cell of this
# notebook; it must be defined before this cell can run -- confirm where it
# is supposed to come from.
per_file_frames = []
for file_count, data_file_name in enumerate(sampled_data_files):
    print ('\nReading data from file # {}, with name :- {}'.format(file_count, data_file_name))
    print('-----------------------------------------------------------------------')
    # Open each file in a context manager so its handle is released before the
    # next file is opened.
    # NOTE(review): assumes downsample_grid_indices returns a fully materialised
    # DataFrame that no longer needs the open dataset -- confirm.
    with xr.open_dataset(path.join(data_files_location, data_file_name)) as dfm_file_data:
        df_for_single_file = downsample_grid_indices (data_file_name,dfm_file_data, percent_grid_points_to_use, 
                                                      max_history_to_consider, history_interval, frames_in_file)
    per_file_frames.append(df_for_single_file)

# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame on
# every iteration; concatenate once instead. pd.concat raises on an empty list,
# so keep the original "empty DataFrame" result when no file was sampled.
if per_file_frames:
    df_for_all_files = pd.concat(per_file_frames).reset_index(drop = True)
else:
    df_for_all_files = pd.DataFrame()

In [None]:
#df_for_all_files.head(10)

# Save the extracted data

In [None]:
# Base name of this data set (e.g. 'extracted_data_00' when data_set_count is 0)
# and the pickle file derived from it.
data_set_name = 'extracted_data_%02d'%(data_set_count)
extracted_data_file_name = '{}.pkl'.format(data_set_name)
# Disabled alternative: a descriptive file name that encodes the run parameters.
# extracted_data_file_name = '{}_files_{}pc_grid_points_{}pc_max_history_{}_hist_interval_{}.pkl'.format(
#                             data_set_name, # name of data set
#                             percent_files_to_use, # f1 = what percent of available files to use
#                             percent_grid_points_to_use, # f2 = what percent of grid points to use
#                             max_history_to_consider, # n_history in hours
#                             history_interval)

# Bundle the run parameters, the resulting counts and the extracted DataFrame.
# The scalar entries are wrapped in one-element lists; a later cell turns them
# into a DataFrame after removing 'df_for_all_files'.
extracted_data = {'percent_files_to_use': [percent_files_to_use],
                 'percent_grid_points_to_use': [percent_grid_points_to_use],
                 'max_history_to_consider': [max_history_to_consider],
                 'history_interval': [history_interval],
                 'number_of_files_used' : [len(sampled_data_files)],
                 'number_of_data_points' : [len(df_for_all_files)],
                 'df_for_all_files': df_for_all_files}

# Write the bundle inside a context manager so the file is closed even if
# pickling raises part-way through.
with open(os.path.join(
        extracted_data_loc, extracted_data_file_name), 'wb') as extracted_data_file_handle:
    pickle.dump(extracted_data, extracted_data_file_handle)

In [None]:
# Remove the DataFrame entry so that only the one-element-list entries remain
# in `extracted_data` for the tabulation cell that follows.
# The membership check keeps this cell re-runnable: a bare `del` raises
# KeyError the second time the cell is executed.
if 'df_for_all_files' in extracted_data:
    del extracted_data['df_for_all_files']
#extracted_data['index'] = 0

In [None]:
# Tabulate whatever entries `extracted_data` currently holds and save them as
# '<data_set_name>.csv' in the extracted-data directory.
tab_data_file_name = os.path.join(extracted_data_loc, data_set_name) + '.csv'
tabulated_data = pd.DataFrame.from_dict(extracted_data)
tabulated_data = tabulated_data.reset_index(drop = True)
tabulated_data.to_csv(tab_data_file_name, index = False)

In [None]:
#tabulated_data

# Load extracted data from pickle file

In [None]:
# Read back the pickle written by the save cell.
# The context manager closes the file as soon as loading finishes; the original
# `pickle.load(open(...))` left the handle open until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only point this at files
# produced by this notebook, never at untrusted input.
with open(os.path.join(
        extracted_data_loc, extracted_data_file_name), 'rb') as loaded_data_file_handle:
    loaded_data = pickle.load(loaded_data_file_handle)

In [None]:
#loaded_data['df_for_all_files'][5:15]