# Import Modules

In [1]:
import os
import sys
import os.path as path
import glob
import numpy as np
import pandas as pd
import xarray as xr
import pickle
#from matplotlib import pyplot as plt
#plt.style.use('seaborn-white')
from datetime import date, datetime, timedelta
import time
import random



In [2]:
from helper_functions import generate_seed, init_random_generator
from helper_functions import get_data_file_names, downsample_data_files
from helper_functions import downsample_grid_indices
from helper_functions import create_df_at_gp

# Variables to be used for preparing training data

In [3]:
# --- Where the input data lives -------------------------------------------
data_files_location = '/p/lustre1/mirocha2/SJSU_DATA/akochanski/PGnE_climo/dfm'

# --- Down-sampling fractions (both expressed in percent) -------------------
# f1: share of the available data files to use
percent_files_to_use = 5.0
# f2: share of the grid points in each file to use
percent_grid_points_to_use = 0.005

# --- History window passed to create_df_at_gp ------------------------------
# n_history, in hours
max_history_to_consider = 10
# presumably the step between history samples -- TODO confirm units
history_interval = 1
# presumably the number of time frames stored per file -- TODO confirm
frames_in_file = 153

# --- Field names, grouped by role ------------------------------------------
label_fields = [
    'mean_wtd_moisture_1hr',
    'mean_wtd_moisture_10hr',
]
identity_fields = [
    'latitude',
    'longitude',
]
feature_fields = [
    'eastward_10m_wind',
    'northward_10m_wind',
    'air_temperature_2m',
    'accumulated_precipitation_amount',
    'air_relative_humidity_2m',
    'surface_downwelling_shortwave_flux',
]

# Generate seed for the random number generator

In [4]:
# Obtain a seed from the helper module and hand it to the helper that
# initialises the random generator. Both helpers live in helper_functions,
# so how the seed is produced is not visible in this notebook.
# NOTE(review): `random_state` is not referenced by any cell shown here;
# presumably the helper seeds a global generator that the down-sampling
# helpers rely on -- confirm, and log `seed` if runs must be reproducible.
seed = generate_seed()
random_state = init_random_generator(seed)

# Paths, File Names, Downsample Files

In [5]:
# Enumerate the data files found under the configured directory.
data_files_list = get_data_file_names(data_files_location)

# Randomly keep roughly `percent_files_to_use` percent of those files; the
# helper hands back the chosen positions in the list and the matching names.
sampled_file_indices, sampled_data_files = downsample_data_files(
    data_files_list,
    percent_files_to_use,
)


Getting the names of data files at the dir : 
 /p/lustre1/mirocha2/SJSU_DATA/akochanski/PGnE_climo/dfm
Found 99 files

Randomly selecting approx 5.0 % of the data files
Selected 5 data files
Indices of the randomly selected files 
 [66, 30, 29, 0, 78]
Names of the randomly selected files 
 ['wrfout_d03_1990-07-23_00:00:00_dfm.nc', 'wrfout_d03_1990-01-24_00:00:00_dfm.nc', 'wrfout_d03_1990-01-19_00:00:00_dfm.nc', 'wrfout_d03_1989-08-04_00:00:00_dfm.nc', 'wrfout_d03_1990-09-21_00:00:00_dfm.nc']


# Grid Dimensions, Downsample Grid Points

In [6]:
# Build the combined training table.
#
# For every sampled data file: open it, randomly sample grid points, and ask
# create_df_at_gp for a DataFrame at each sampled (i, j) location. Frames that
# contain any NaN are skipped. The surviving frames are gathered in a list and
# concatenated once at the end; growing a DataFrame inside the loop copies all
# accumulated rows on every iteration, and DataFrame.append no longer exists
# in pandas >= 2.0.
frames_at_grid_points = []
for file_count, data_file_name in enumerate(sampled_data_files):
    print ('\nReading data from file # {}, with name :- {}'.format(file_count, data_file_name))
    # The context manager closes the file as soon as its grid points have been
    # extracted, so handles do not pile up across the sampled files.
    with xr.open_dataset(path.join(data_files_location, data_file_name)) as dfm_file_data:
        i_indices, j_indices, sampled_grid_indices = downsample_grid_indices(
            dfm_file_data, percent_grid_points_to_use)

        for gp_count, (i_ind, j_ind) in enumerate(zip(i_indices, j_indices)):
            print('Grip point # {} : i = {}, j = {}'.format(gp_count, i_ind, j_ind))
            FM_time_index, AtmData_time_indices, df_at_gp = create_df_at_gp(
                dfm_file_data, i_ind, j_ind,
                max_history_to_consider, history_interval, frames_in_file)
            # Keep only frames with no missing values.
            if not df_at_gp.isna().values.any():
                frames_at_grid_points.append(df_at_gp)
    print('------------------------------------------------------------')

# pd.concat raises on an empty list, so fall back to an empty frame when every
# sampled grid point was rejected (or nothing was sampled at all).
if frames_at_grid_points:
    df_combined = pd.concat(frames_at_grid_points, ignore_index=True)
else:
    df_combined = pd.DataFrame()


Reading data from file # 0, with name :- wrfout_d03_1990-07-23_00:00:00_dfm.nc

Randomly selecting 0.005 % of the grid points
Dimensions: 396 X 480, Num of grid points: 190080
Selected 10 grid points
Sampled grid indices: 
[12153, 103350, 74430, 5455, 77387, 25973, 106859, 143938, 188695, 181002]
Sampled i-indices: 
[273 390 378 307 167 233 335 190 199  30]
Sampled j-indices: 
[ 30 260 187  13 195  65 269 363 476 457]
Grip point # 0 : i = 273, j = 30
Grip point # 1 : i = 390, j = 260
Grip point # 2 : i = 378, j = 187
Grip point # 3 : i = 307, j = 13
Grip point # 4 : i = 167, j = 195
Grip point # 5 : i = 233, j = 65
Grip point # 6 : i = 335, j = 269
Grip point # 7 : i = 190, j = 363
Grip point # 8 : i = 199, j = 476
Grip point # 9 : i = 30, j = 457
------------------------------------------------------------

Reading data from file # 1, with name :- wrfout_d03_1990-01-24_00:00:00_dfm.nc

Randomly selecting 0.005 % of the grid points
Dimensions: 396 X 480, Num of grid points: 190080
Sel

In [7]:
# Preview the first 10 rows of the combined training table.
df_combined.head(10)

Unnamed: 0,lat,lon,FM_10hr,FM_1hr,U10[-9],V10[-9],T2[-9],Precip[-9],RH2[-9],SDSF[-9],...,T2[-2],Precip[-2],RH2[-2],SDSF[-2],U10[-1],V10[-1],T2[-1],Precip[-1],RH2[-1],SDSF[-1]
0,38.295979,-116.596176,0.037678,0.04304,1.179513,2.209197,29.042542,0.0,0.093996,1014.491333,...,23.746796,0.0,0.123198,0.0,2.530737,-2.018502,22.592316,0.0,0.126817,0.0
1,36.984035,-116.948502,0.04249,0.059947,-2.014308,5.684172,20.878113,0.0,0.166365,0.0,...,18.96933,0.0,0.232457,205.058151,-1.279327,-0.675308,20.776825,0.0,0.225292,425.853699
2,37.200932,-121.740021,0.150343,0.149058,-2.128006,1.950207,14.534149,0.0,0.814076,0.0,...,14.963348,0.0,0.769306,284.33667,-4.697626,2.985459,16.60733,0.0,0.694228,519.356018
3,34.844704,-120.264008,0.163846,0.301983,1.773816,-1.616036,14.13559,6.6e-05,0.91845,0.0,...,12.691681,0.000109,0.991602,0.0,1.031104,-0.959487,12.441498,0.000113,0.991782,0.0
4,38.50058,-117.858887,0.033154,0.034298,2.027003,-3.133134,24.712311,0.0,0.15106,792.799805,...,28.550049,0.0,0.104091,594.679199,0.835772,0.412687,28.106293,0.0,0.111033,384.034729
5,40.248459,-121.222443,0.056709,0.054493,-0.63894,-0.209227,15.533661,0.0,0.519774,351.784637,...,23.104462,0.0,0.211342,935.859863,4.230122,-0.121537,23.354248,0.0,0.220706,804.925537
6,42.294544,-121.006622,0.093149,0.142444,3.264637,-2.464468,19.136047,0.0,0.269744,792.157959,...,12.378326,0.0,0.588358,0.0,1.527022,-1.589275,11.184021,0.0,0.646677,0.0
7,38.506275,-116.884674,0.070429,0.082326,-1.444979,-2.081428,0.155518,0.0,0.211907,473.651367,...,-1.690094,0.0,0.250153,0.0,-0.797542,0.997377,-1.231506,0.0,0.248202,0.0
8,38.367168,-119.252914,0.259043,0.843899,5.082563,-3.119682,-2.600372,0.0,0.189301,0.0,...,-2.971466,0.0,0.227623,0.0,8.596319,-1.004557,-1.552399,0.0,0.212761,0.0
9,39.196712,-120.72908,0.121286,0.071809,-3.16184,-0.540487,0.15976,0.059895,0.488645,0.0,...,9.691711,0.059895,0.213349,556.64209,1.331005,1.107436,10.770264,0.059895,0.203431,602.742859


# Save the training data

In [24]:
# Bundle the sampling settings together with the combined DataFrame so the
# saved file records how its contents were produced.
training_data = dict(
    percent_files_to_use=percent_files_to_use,
    percent_grid_points_to_use=percent_grid_points_to_use,
    max_history_to_consider=max_history_to_consider,
    history_interval=history_interval,
    df_combined=df_combined,
)

# The output file name encodes the same four settings, in this order:
# f1 (percent of files), f2 (percent of grid points), n_history (hours),
# and the history interval.
training_data_file_name = (
    'training_data_files_{}pc_grid_points_{}pc_max_history_{}_hist_interval_{}'
    .format(
        training_data['percent_files_to_use'],
        training_data['percent_grid_points_to_use'],
        training_data['max_history_to_consider'],
        training_data['history_interval'],
    )
)

# Write the bundle to disk. The handle is deliberately left open at the end
# of this cell; a later cell is responsible for closing it.
training_data_file_handle = open(training_data_file_name, 'wb')
pickle.dump(training_data, training_data_file_handle)

In [26]:
# The payload was already written by the cell that opened the file. Dumping it
# again here would append a second, identical pickle to the same file, which
# doubles its size while pickle.load only ever reads the first object back.
# So this cell only flushes and releases the handle (close() is harmless if
# the file is already closed).
training_data_file_handle.close()

In [28]:
# Read the saved training data back from disk.
# The file is opened in a `with` block so the handle is closed as soon as the
# object has been loaded, instead of lingering until garbage collection.
# NOTE: unpickling can execute arbitrary code -- only load files written by
# this notebook or obtained from a trusted source.
with open(training_data_file_name, 'rb') as saved_training_data_file:
    loaded_data = pickle.load(saved_training_data_file)

In [31]:
# Display the DataFrame stored under the 'df_combined' key of the reloaded
# dictionary, to compare against the table previewed before saving.
loaded_data['df_combined']

Unnamed: 0,lat,lon,FM_10hr,FM_1hr,U10[-9],V10[-9],T2[-9],Precip[-9],RH2[-9],SDSF[-9],...,T2[-2],Precip[-2],RH2[-2],SDSF[-2],U10[-1],V10[-1],T2[-1],Precip[-1],RH2[-1],SDSF[-1]
0,38.295979,-116.596176,0.037678,0.04304,1.179513,2.209197,29.042542,0.0,0.093996,1014.491333,...,23.746796,0.0,0.123198,0.0,2.530737,-2.018502,22.592316,0.0,0.126817,0.0
1,36.984035,-116.948502,0.04249,0.059947,-2.014308,5.684172,20.878113,0.0,0.166365,0.0,...,18.96933,0.0,0.232457,205.058151,-1.279327,-0.675308,20.776825,0.0,0.225292,425.853699
2,37.200932,-121.740021,0.150343,0.149058,-2.128006,1.950207,14.534149,0.0,0.814076,0.0,...,14.963348,0.0,0.769306,284.33667,-4.697626,2.985459,16.60733,0.0,0.694228,519.356018
3,34.844704,-120.264008,0.163846,0.301983,1.773816,-1.616036,14.13559,6.6e-05,0.91845,0.0,...,12.691681,0.000109,0.991602,0.0,1.031104,-0.959487,12.441498,0.000113,0.991782,0.0
4,38.50058,-117.858887,0.033154,0.034298,2.027003,-3.133134,24.712311,0.0,0.15106,792.799805,...,28.550049,0.0,0.104091,594.679199,0.835772,0.412687,28.106293,0.0,0.111033,384.034729
5,40.248459,-121.222443,0.056709,0.054493,-0.63894,-0.209227,15.533661,0.0,0.519774,351.784637,...,23.104462,0.0,0.211342,935.859863,4.230122,-0.121537,23.354248,0.0,0.220706,804.925537
6,42.294544,-121.006622,0.093149,0.142444,3.264637,-2.464468,19.136047,0.0,0.269744,792.157959,...,12.378326,0.0,0.588358,0.0,1.527022,-1.589275,11.184021,0.0,0.646677,0.0
7,38.506275,-116.884674,0.070429,0.082326,-1.444979,-2.081428,0.155518,0.0,0.211907,473.651367,...,-1.690094,0.0,0.250153,0.0,-0.797542,0.997377,-1.231506,0.0,0.248202,0.0
8,38.367168,-119.252914,0.259043,0.843899,5.082563,-3.119682,-2.600372,0.0,0.189301,0.0,...,-2.971466,0.0,0.227623,0.0,8.596319,-1.004557,-1.552399,0.0,0.212761,0.0
9,39.196712,-120.72908,0.121286,0.071809,-3.16184,-0.540487,0.15976,0.059895,0.488645,0.0,...,9.691711,0.059895,0.213349,556.64209,1.331005,1.107436,10.770264,0.059895,0.203431,602.742859
