# Import Modules

In [1]:
import os
import sys
import os.path as path
import glob
import numpy as np
import pandas as pd
import xarray as xr
import pickle
#from matplotlib import pyplot as plt
#plt.style.use('seaborn-white')
from datetime import date, datetime, timedelta
import time
import random



In [2]:
from helper_functions import generate_seed, init_random_generator
from helper_functions import get_data_file_names, downsample_data_files
from helper_functions import downsample_grid_indices
from helper_functions import create_df_at_gp

# Variables to be used for preparing training data

In [3]:
# --- Locations ---------------------------------------------------------------
# Directory holding the WRF data set, and the directory that receives the
# extracted data set.
data_files_location = '/p/lustre1/mirocha2/SJSU_DATA/akochanski/PGnE_climo/dfm'
extracted_data_loc = '/p/lustre2/jha3/Wildfire/Wildfire_SJSU/data_extracted'

# --- Parameters of the current data set --------------------------------------
data_set_count = 0
percent_files_to_use = 5.0           # f1: percent of the available files to use
percent_grid_points_to_use = 0.005   # f2: percent of the grid points to use
max_history_to_consider = 5          # n_history, in hours
history_interval = 2

# --- Fixed settings ----------------------------------------------------------
frames_in_file = 153
identity_fields = ['latitude', 'longitude']
label_fields = ['mean_wtd_moisture_1hr', 'mean_wtd_moisture_10hr']
feature_fields = [
    'eastward_10m_wind',
    'northward_10m_wind',
    'air_temperature_2m',
    'accumulated_precipitation_amount',
    'air_relative_humidity_2m',
    'surface_downwelling_shortwave_flux',
]

# Generate seed for the random number generator

In [4]:
# Obtain a seed from the project helper and hand it to init_random_generator.
# Both helpers are imported from helper_functions; their internals are not
# visible in this notebook.
# NOTE(review): neither `seed` nor `random_state` is referenced again in the
# cells shown here - presumably the helper seeds a generator that the
# downsampling helpers use implicitly; confirm, and consider saving `seed`
# with the extracted data so a run can be reproduced.
seed = generate_seed()
random_state = init_random_generator(seed)

# Paths, File Names, Downsample Files

In [5]:
# List the data files found under data_files_location, then select a subset of
# them, passing percent_files_to_use as the sampling percentage. The second
# helper returns both the indices and the names of the selected files.
data_files_list = get_data_file_names(data_files_location)
sampled_file_indices, sampled_data_files = downsample_data_files (data_files_list, percent_files_to_use)


Getting the names of data files at the dir : 
 /p/lustre1/mirocha2/SJSU_DATA/akochanski/PGnE_climo/dfm
Found 99 files

Randomly selecting approx 5.0 % of the data files
Selected 5 data files
Indices of the randomly selected files: 
 [10, 63, 40, 5, 85]
Names of the randomly selected files: 
 ['wrfout_d03_1989-10-16_00:00:00_dfm.nc', 'wrfout_d03_1990-07-08_00:00:00_dfm.nc', 'wrfout_d03_1990-03-15_00:00:00_dfm.nc', 'wrfout_d03_1989-09-21_00:00:00_dfm.nc', 'wrfout_d03_1990-10-26_00:00:00_dfm.nc']


# Grid Dimensions, Downsample Grid Points

In [6]:
# Extract one DataFrame per sampled file and combine them with a single concat.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# growing a frame inside the loop re-copies every earlier row on each pass.
per_file_frames = []
for file_count, data_file_name in enumerate(sampled_data_files):
    print ('\nReading data from file # {}, with name :- {}'.format(file_count, data_file_name))
    print('-----------------------------------------------------------------------')

    # The context manager closes the file once this file's rows are extracted,
    # so handles do not pile up when many files are sampled.
    # NOTE(review): assumes downsample_grid_indices returns a fully
    # materialised DataFrame that no longer needs the open dataset - confirm.
    with xr.open_dataset(path.join(data_files_location, data_file_name)) as dfm_file_data:
        df_for_single_file = downsample_grid_indices(dfm_file_data, percent_grid_points_to_use,
                                                     max_history_to_consider, history_interval, frames_in_file)

    per_file_frames.append(df_for_single_file)

# pd.concat raises ValueError on an empty list; fall back to an empty frame so
# the result matches the old behaviour when no files were sampled.
# ignore_index=True renumbers rows 0..n-1, as reset_index(drop=True) did.
df_for_all_files = (pd.concat(per_file_frames, ignore_index=True)
                    if per_file_frames else pd.DataFrame())


Reading data from file # 0, with name :- wrfout_d03_1989-10-16_00:00:00_dfm.nc
-----------------------------------------------------------------------

Reading data from file # 1, with name :- wrfout_d03_1990-07-08_00:00:00_dfm.nc
-----------------------------------------------------------------------

Reading data from file # 2, with name :- wrfout_d03_1990-03-15_00:00:00_dfm.nc
-----------------------------------------------------------------------

Reading data from file # 3, with name :- wrfout_d03_1989-09-21_00:00:00_dfm.nc
-----------------------------------------------------------------------

Reading data from file # 4, with name :- wrfout_d03_1990-10-26_00:00:00_dfm.nc
-----------------------------------------------------------------------


In [7]:
# Preview the first 10 rows of the combined DataFrame.
df_for_all_files.head(10)

Unnamed: 0,lat,lon,FM_10hr,FM_1hr,U10[-4],V10[-4],T2[-4],Precip[-4],RH2[-4],SDSF[-4],U10[-2],V10[-2],T2[-2],Precip[-2],RH2[-2],SDSF[-2]
0,37.067947,-116.761871,0.048587,0.035532,-0.795185,4.990386,23.199615,0.0,0.075627,733.132812,-1.076751,4.96994,21.66687,0.0,0.082003,272.727783
1,39.944962,-123.846375,0.175257,0.13424,-0.43194,-1.854014,10.66217,0.0,0.920093,28.408361,-0.623511,-1.72703,14.767853,0.0,0.723286,248.973297
2,37.565315,-121.217224,0.07388,0.083137,1.310323,-0.19043,23.095062,0.0,0.256115,0.0,0.723292,-0.137491,17.988831,0.0,0.353656,0.0
3,41.260529,-117.996124,0.062456,0.061018,0.825007,3.000624,11.564117,0.0,0.279496,113.269577,2.329639,4.335286,15.66687,0.0,0.202196,474.808167
4,34.184284,-117.230988,0.097825,0.129074,-0.095459,3.491995,21.744385,0.0,0.379457,565.216736,2.07446,2.941873,19.891785,0.0,0.506765,181.94191
5,41.477886,-118.83252,0.070463,0.072663,1.029048,0.952107,7.36319,0.0,0.29725,116.215126,0.136073,1.802363,9.715179,0.0,0.260757,340.666077
6,37.202755,-120.966187,0.087938,0.189768,2.409945,-1.007455,18.331299,0.000187,0.574864,0.0,1.827873,-2.022197,17.713837,0.006232,0.618808,0.0
7,40.555824,-121.509216,0.066355,0.058348,-0.976924,1.15871,16.30896,0.0,0.224829,591.712646,-0.189628,-0.537572,17.710968,0.0,0.214257,696.478882
8,37.605591,-118.333313,0.061923,0.070372,-1.190264,0.74645,10.212921,0.0,0.218191,0.0,-0.528279,1.161369,9.401001,0.0,0.216037,0.0
9,41.0299,-118.198486,0.11912,0.132528,3.846643,0.686766,11.734833,0.223081,0.490797,0.0,0.605367,3.468621,10.260834,0.223081,0.563415,0.0


# Save the extracted data

In [8]:
# Build the output file name from the data-set index and the sampling
# parameters, so the settings behind a file can be read from its name.
data_set_name = 'extracted_data_%02d'%(data_set_count)
extracted_data_file_name = '{}_files_{}pc_grid_points_{}pc_max_history_{}_hist_interval_{}'.format(
                            data_set_name, # name of data set
                            percent_files_to_use, # f1 = what percent of available files to use
                            percent_grid_points_to_use, # f2 = what percent of grid points to use
                            max_history_to_consider, # n_history in hours
                            history_interval)

# Store the sampling parameters alongside the extracted DataFrame.
extracted_data = {'percent_files_to_use': percent_files_to_use,
                 'percent_grid_points_to_use': percent_grid_points_to_use,
                 'max_history_to_consider': max_history_to_consider,
                 'history_interval': history_interval,
                 'df_for_all_files': df_for_all_files}

# The with-block closes the file even if pickle.dump raises, which the manual
# open()/close() pair did not guarantee.
with open(os.path.join(extracted_data_loc, extracted_data_file_name), 'wb') as extracted_data_file_handle:
    pickle.dump(extracted_data, extracted_data_file_handle)

# Load extracted data from pickle file

In [9]:
# Read back the file written by the save cell as a round-trip check.
# The with-block closes the handle; pickle.load(open(...)) left it open.
# NOTE: pickle.load can execute arbitrary code while unpickling - only load
# files that this notebook (or another trusted source) produced.
with open(os.path.join(extracted_data_loc, extracted_data_file_name), 'rb') as loaded_data_file_handle:
    loaded_data = pickle.load(loaded_data_file_handle)

In [10]:
# Preview the first 10 rows of the DataFrame stored under 'df_for_all_files'
# in the reloaded dict, for comparison with the preview shown before saving.
loaded_data['df_for_all_files'].head(10)

Unnamed: 0,lat,lon,FM_10hr,FM_1hr,U10[-4],V10[-4],T2[-4],Precip[-4],RH2[-4],SDSF[-4],U10[-2],V10[-2],T2[-2],Precip[-2],RH2[-2],SDSF[-2]
0,37.067947,-116.761871,0.048587,0.035532,-0.795185,4.990386,23.199615,0.0,0.075627,733.132812,-1.076751,4.96994,21.66687,0.0,0.082003,272.727783
1,39.944962,-123.846375,0.175257,0.13424,-0.43194,-1.854014,10.66217,0.0,0.920093,28.408361,-0.623511,-1.72703,14.767853,0.0,0.723286,248.973297
2,37.565315,-121.217224,0.07388,0.083137,1.310323,-0.19043,23.095062,0.0,0.256115,0.0,0.723292,-0.137491,17.988831,0.0,0.353656,0.0
3,41.260529,-117.996124,0.062456,0.061018,0.825007,3.000624,11.564117,0.0,0.279496,113.269577,2.329639,4.335286,15.66687,0.0,0.202196,474.808167
4,34.184284,-117.230988,0.097825,0.129074,-0.095459,3.491995,21.744385,0.0,0.379457,565.216736,2.07446,2.941873,19.891785,0.0,0.506765,181.94191
5,41.477886,-118.83252,0.070463,0.072663,1.029048,0.952107,7.36319,0.0,0.29725,116.215126,0.136073,1.802363,9.715179,0.0,0.260757,340.666077
6,37.202755,-120.966187,0.087938,0.189768,2.409945,-1.007455,18.331299,0.000187,0.574864,0.0,1.827873,-2.022197,17.713837,0.006232,0.618808,0.0
7,40.555824,-121.509216,0.066355,0.058348,-0.976924,1.15871,16.30896,0.0,0.224829,591.712646,-0.189628,-0.537572,17.710968,0.0,0.214257,696.478882
8,37.605591,-118.333313,0.061923,0.070372,-1.190264,0.74645,10.212921,0.0,0.218191,0.0,-0.528279,1.161369,9.401001,0.0,0.216037,0.0
9,41.0299,-118.198486,0.11912,0.132528,3.846643,0.686766,11.734833,0.223081,0.490797,0.0,0.605367,3.468621,10.260834,0.223081,0.563415,0.0
