# Import Modules

In [1]:
import os
import sys
import os.path as path
import glob
import numpy as np
import pandas as pd
import xarray as xr
import pickle
#from matplotlib import pyplot as plt
#plt.style.use('seaborn-white')
from datetime import date, datetime, timedelta
import time
import random



In [2]:
from helper_functions import generate_seed, init_random_generator
from helper_functions import get_data_file_names, downsample_data_files
from helper_functions import downsample_grid_indices
from helper_functions import create_df_at_gp

# Variables to be used for preparing training data

In [3]:
# --- Where the input data files live -----------------------------------------
data_files_location = '/p/lustre1/mirocha2/SJSU_DATA/akochanski/PGnE_climo/dfm'

# --- Sampling fractions (both expressed in percent) --------------------------
# f1: share of the available data files to draw from.
percent_files_to_use = 50.0
# f2: share of grid points to draw.
percent_grid_points_to_use = 0.005

# --- History window ----------------------------------------------------------
# n_history, in hours.
max_history_to_consider = 5
# Spacing between history samples (presumably the same unit as
# max_history_to_consider -- TODO confirm against helper_functions).
history_interval = 2
# Presumably the number of time frames stored in each data file -- TODO confirm.
frames_in_file = 153

# --- Field names, grouped by role --------------------------------------------
label_fields = [
    'mean_wtd_moisture_1hr',
    'mean_wtd_moisture_10hr',
]
identity_fields = [
    'latitude',
    'longitude',
]
# Brackets already continue the line, so no trailing backslashes are needed.
feature_fields = [
    'eastward_10m_wind',
    'northward_10m_wind',
    'air_temperature_2m',
    'accumulated_precipitation_amount',
    'air_relative_humidity_2m',
    'surface_downwelling_shortwave_flux',
]

# Generate seed for the random number generator

In [4]:
# Obtain a seed from the project helper and use it to initialise the random
# generator that the sampling steps are expected to draw from.
# NOTE(review): the seed value is neither displayed nor stored with the saved
# training data, so a given run cannot be reproduced later -- consider recording it.
seed = generate_seed()
# NOTE(review): random_state is not referenced again in the visible cells;
# presumably the helper also seeds global RNG state -- confirm in helper_functions.
random_state = init_random_generator(seed)

# Paths, File Names, Downsample Files

In [5]:
# List the data files found under data_files_location.
data_files_list = get_data_file_names(data_files_location)
# Keep roughly percent_files_to_use percent of them; the helper returns both the
# chosen positions within data_files_list and the corresponding file names.
sampled_file_indices, sampled_data_files = downsample_data_files (data_files_list, percent_files_to_use)


Getting the names of data files at the dir : 
 /p/lustre1/mirocha2/SJSU_DATA/akochanski/PGnE_climo/dfm
Found 99 files

Randomly selecting approx 50.0 % of the data files
Selected 50 data files
Indices of the randomly selected files: 
 [45, 51, 1, 72, 66, 10, 90, 47, 70, 56, 85, 23, 93, 7, 28, 76, 8, 19, 33, 4, 5, 46, 53, 25, 92, 59, 15, 74, 18, 86, 69, 98, 71, 73, 57, 78, 11, 62, 31, 29, 37, 40, 60, 61, 16, 48, 9, 12, 77, 65]
Names of the randomly selected files: 
 ['wrfout_d03_1990-04-09_00:00:00_dfm.nc', 'wrfout_d03_1990-05-09_00:00:00_dfm.nc', 'wrfout_d03_1989-09-01_00:00:00_dfm.nc', 'wrfout_d03_1990-08-22_00:00:00_dfm.nc', 'wrfout_d03_1990-07-23_00:00:00_dfm.nc', 'wrfout_d03_1989-10-16_00:00:00_dfm.nc', 'wrfout_d03_1990-11-20_00:00:00_dfm.nc', 'wrfout_d03_1990-04-19_00:00:00_dfm.nc', 'wrfout_d03_1990-08-12_00:00:00_dfm.nc', 'wrfout_d03_1990-06-03_00:00:00_dfm.nc', 'wrfout_d03_1990-10-26_00:00:00_dfm.nc', 'wrfout_d03_1989-12-20_00:00:00_dfm.nc', 'wrfout_d03_1990-12-05_00:00:00_dfm.

# Read Sampled Files, Downsample Grid Points, Assemble Training Data

In [6]:
# Build one training DataFrame by sampling grid points from every selected file.
#
# Per-file frames are gathered in a list and concatenated once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# growing a frame inside the loop re-copies all accumulated rows on every
# iteration (quadratic in the number of files).
per_file_frames = []
for file_count, data_file_name in enumerate(sampled_data_files):
    print ('\nReading data from file # {}, with name :- {}'.format(file_count, data_file_name))
    print('-----------------------------------------------------------------------')

    # The context manager closes the underlying file handle as soon as this file
    # has been sampled, instead of leaving one open handle per file.
    # NOTE(review): assumes downsample_grid_indices returns a fully materialised
    # DataFrame (no lazy references into the dataset) -- confirm in helper_functions.
    with xr.open_dataset(path.join(data_files_location, data_file_name)) as dfm_file_data:
        df_for_single_file = downsample_grid_indices (dfm_file_data, percent_grid_points_to_use,
                                                      max_history_to_consider, history_interval, frames_in_file)

    per_file_frames.append(df_for_single_file)

# ignore_index=True renumbers rows 0..N-1, matching the old reset_index(drop=True).
# pd.concat raises on an empty list, so keep the original "empty frame" result
# when no files were selected.
if per_file_frames:
    df_for_all_files = pd.concat(per_file_frames, ignore_index=True)
else:
    df_for_all_files = pd.DataFrame()


Reading data from file # 0, with name :- wrfout_d03_1990-04-09_00:00:00_dfm.nc
-----------------------------------------------------------------------

Reading data from file # 1, with name :- wrfout_d03_1990-05-09_00:00:00_dfm.nc
-----------------------------------------------------------------------

Reading data from file # 2, with name :- wrfout_d03_1989-09-01_00:00:00_dfm.nc
-----------------------------------------------------------------------

Reading data from file # 3, with name :- wrfout_d03_1990-08-22_00:00:00_dfm.nc
-----------------------------------------------------------------------

Reading data from file # 4, with name :- wrfout_d03_1990-07-23_00:00:00_dfm.nc
-----------------------------------------------------------------------

Reading data from file # 5, with name :- wrfout_d03_1989-10-16_00:00:00_dfm.nc
-----------------------------------------------------------------------

Reading data from file # 6, with name :- wrfout_d03_1990-11-20_00:00:00_dfm.nc
--------

In [7]:
# Preview the first 10 rows of the combined training frame.
df_for_all_files.head(10)

Unnamed: 0,lat,lon,FM_10hr,FM_1hr,U10[-4],V10[-4],T2[-4],Precip[-4],RH2[-4],SDSF[-4],U10[-2],V10[-2],T2[-2],Precip[-2],RH2[-2],SDSF[-2]
0,41.705215,-117.078568,0.119069,0.142763,0.920528,-0.460592,5.718597,0.0001,0.596931,0.0,-0.049356,1.451619,4.868042,0.0001,0.6155,0.0
1,42.184338,-123.301788,0.146209,0.175756,1.354522,0.919016,7.145874,0.0,0.846082,0.0,0.876572,0.220149,6.748352,0.0,0.877705,32.738674
2,41.909821,-122.174225,0.114832,0.150049,-0.096242,3.746594,9.993744,0.0,0.579384,0.0,-0.670532,3.467349,9.936157,0.0,0.601483,0.0
3,39.434334,-118.192505,0.095024,0.079315,-1.374921,-2.958729,15.483276,0.0055,0.216658,34.243313,0.718075,-2.464024,12.708557,0.0055,0.261151,0.0
4,40.600117,-122.702667,0.098316,0.109041,2.041034,0.641563,7.242401,2.7e-05,0.447608,0.0,2.236888,0.569929,6.916107,2.7e-05,0.372894,0.0
5,41.754669,-123.020996,0.107141,0.14126,-0.05767,0.608179,14.108887,0.0,0.591198,0.0,-0.249225,1.41358,13.737274,0.0,0.605511,0.0
6,36.558159,-119.545654,0.094246,0.102968,1.81896,0.611185,14.256897,0.0,0.673592,241.920319,0.588389,0.07657,18.442291,0.0,0.538737,637.195862
7,41.348297,-122.213593,0.366856,0.849947,2.067078,-1.117344,0.504364,0.0,0.622075,0.0,1.83574,-0.859852,1.952698,0.0,0.523121,191.538284
8,39.577171,-116.397736,0.119322,0.145762,-0.318048,-0.258749,-0.689423,0.0,0.811283,0.0,0.299222,-0.29727,0.637024,0.0,0.759536,324.473907
9,38.435047,-118.973358,0.052226,0.047166,-0.951465,-0.288855,22.105011,0.0,0.152896,895.251038,3.930423,2.396085,22.686859,0.0,0.141999,697.917297


# Save the training data

In [8]:
# The output file name encodes the sampling settings used for this run, so files
# produced with different settings do not overwrite each other.
training_data_file_name = 'training_data_files_{}pc_grid_points_{}pc_max_history_{}_hist_interval_{}'.format(
                            percent_files_to_use, # f1 = what percent of available files to use
                            percent_grid_points_to_use, # f2 = what percent of grid points to use
                            max_history_to_consider, # n_history in hours
                            history_interval)

# Store the settings alongside the data so the pickle is self-describing.
training_data = {'percent_files_to_use': percent_files_to_use,
                 'percent_grid_points_to_use': percent_grid_points_to_use,
                 'max_history_to_consider': max_history_to_consider,
                 'history_interval': history_interval,
                 'df_for_all_files': df_for_all_files}

# `with` guarantees the file is flushed and closed even if pickle.dump raises;
# the previous manual open()/close() pair leaked the handle on error.
with open(training_data_file_name, 'wb') as training_data_file_handle:
    pickle.dump(training_data, training_data_file_handle)

# Load training data from pickle file

In [9]:
# Read the saved dictionary back from disk.
# `with` closes the file deterministically; pickle.load(open(...)) left the
# handle open until garbage collection.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files written by this notebook or another trusted source.
with open(training_data_file_name, 'rb') as saved_training_data_file:
    loaded_data = pickle.load(saved_training_data_file)

In [10]:
# Preview the first 10 rows of the frame read back from the pickle file, for
# visual comparison with the frame displayed before saving.
loaded_data['df_for_all_files'].head(10)

Unnamed: 0,lat,lon,FM_10hr,FM_1hr,U10[-4],V10[-4],T2[-4],Precip[-4],RH2[-4],SDSF[-4],U10[-2],V10[-2],T2[-2],Precip[-2],RH2[-2],SDSF[-2]
0,41.705215,-117.078568,0.119069,0.142763,0.920528,-0.460592,5.718597,0.0001,0.596931,0.0,-0.049356,1.451619,4.868042,0.0001,0.6155,0.0
1,42.184338,-123.301788,0.146209,0.175756,1.354522,0.919016,7.145874,0.0,0.846082,0.0,0.876572,0.220149,6.748352,0.0,0.877705,32.738674
2,41.909821,-122.174225,0.114832,0.150049,-0.096242,3.746594,9.993744,0.0,0.579384,0.0,-0.670532,3.467349,9.936157,0.0,0.601483,0.0
3,39.434334,-118.192505,0.095024,0.079315,-1.374921,-2.958729,15.483276,0.0055,0.216658,34.243313,0.718075,-2.464024,12.708557,0.0055,0.261151,0.0
4,40.600117,-122.702667,0.098316,0.109041,2.041034,0.641563,7.242401,2.7e-05,0.447608,0.0,2.236888,0.569929,6.916107,2.7e-05,0.372894,0.0
5,41.754669,-123.020996,0.107141,0.14126,-0.05767,0.608179,14.108887,0.0,0.591198,0.0,-0.249225,1.41358,13.737274,0.0,0.605511,0.0
6,36.558159,-119.545654,0.094246,0.102968,1.81896,0.611185,14.256897,0.0,0.673592,241.920319,0.588389,0.07657,18.442291,0.0,0.538737,637.195862
7,41.348297,-122.213593,0.366856,0.849947,2.067078,-1.117344,0.504364,0.0,0.622075,0.0,1.83574,-0.859852,1.952698,0.0,0.523121,191.538284
8,39.577171,-116.397736,0.119322,0.145762,-0.318048,-0.258749,-0.689423,0.0,0.811283,0.0,0.299222,-0.29727,0.637024,0.0,0.759536,324.473907
9,38.435047,-118.973358,0.052226,0.047166,-0.951465,-0.288855,22.105011,0.0,0.152896,895.251038,3.930423,2.396085,22.686859,0.0,0.141999,697.917297
