## Convert this notebook to an executable Python script using:

- jupyter nbconvert --to python Prepare_TrainTest_Data.ipynb

# Import Modules

## Standard Packages

In [1]:
import os
import sys
import os.path as path
import psutil
import glob
import random
import numpy as np
import pandas as pd
import xarray as xr
import pickle
import json
from matplotlib import pyplot as plt
from mpl_toolkits import mplot3d
from datetime import date, datetime, timedelta, time
from timeit import default_timer as timer

## User-Defined Functions

In [2]:
# Directory of the running notebook/script, as recorded by Python in sys.path[0].
current_running_file_dir = sys.path[0]
# Parent directory: everything before the last '/' (empty string if there is none).
current_running_file_par = current_running_file_dir.rpartition('/')[0]
# Make the sibling 'Step1_ExtractData' directory importable ahead of everything else.
step1_extract_dir = os.path.join(current_running_file_par, 'Step1_ExtractData')
sys.path.insert(0, step1_extract_dir)

In [3]:
#print('current_running_file_dir:', current_running_file_dir)
#print('current_running_file_par:', current_running_file_par)
#print('PATH: ', sys.path)

In [4]:
from Extract_DFM_Data_Helper import *
from Prepare_TrainTest_Data_Helper import *

# Global Start Time and Memory

In [5]:
# Reference points for the end-of-run report: wall-clock start time and the
# resident set size (RSS) of this Python process before any data is loaded.
global_start_time = timer()
process = psutil.Process(os.getpid())
global_initial_memory = process.memory_info().rss

# Read the Input JSON File

### Input file name when using jupyter notebook

In [6]:
# Hard-coded input JSON files for interactive (notebook) use.
# NOTE(review): these are absolute, site-specific paths; when the converted
# script is run from the command line, the sys.argv assignments in the next
# cell are meant to replace them.
json_file_extract_data = '/p/lustre2/jha3/Wildfire/Wildfire_LDRD_SI/InputJson/Extract/json_extract_data_000.json'
json_file_prep_data    = '/p/lustre2/jha3/Wildfire/Wildfire_LDRD_SI/InputJson/Prep/json_prep_data_label_007.json'

### Input file name when using python script on command line

In [7]:
#json_file_extract_data = sys.argv[1]
#json_file_prep_data = sys.argv[2]

### Load the JSON file for extracting data

In [8]:
# Log which extraction-config file is about to be read.
print('Loading the JSON file for extracting data: \n {}'.format(json_file_extract_data))

Loading the JSON file for extracting data: 
 /p/lustre2/jha3/Wildfire/Wildfire_LDRD_SI/InputJson/Extract/json_extract_data_000.json


In [9]:
# Parse the extraction configuration into a plain Python object (dict-like,
# judging by the key lookups performed on it later in this notebook).
with open(json_file_extract_data) as json_file_handle:
    json_content_extract_data = json.load(json_file_handle)

In [10]:
#json_content_extract_data

### Load the JSON file for preparing data

In [11]:
# Log which data-preparation config file is about to be read.
print('Loading the JSON file for preparing data: \n {}'.format(json_file_prep_data))

Loading the JSON file for preparing data: 
 /p/lustre2/jha3/Wildfire/Wildfire_LDRD_SI/InputJson/Prep/json_prep_data_label_007.json


In [12]:
# Parse the data-preparation configuration (labels, features, pruning, paths).
with open(json_file_prep_data) as json_file_handle:
    json_content_prep_data = json.load(json_file_handle)

In [13]:
#json_content_prep_data

# Variables to be Used for Preparing Train and Test Data

## DataSet Definition

In [14]:
# The current data set params
# data_set_count is the numeric index of the extracted data set; it is
# formatted with %03d into directory and file names further down.
data_set_count = json_content_extract_data['data_set_defn']['data_set_count']

In [15]:
# Which quantities were read during extraction; used below to decide whether
# the wind magnitude ('UMag10') still has to be computed from its components.
features_labels = json_content_extract_data['features_labels']
features_to_read = features_labels['features_to_read']

## Define Label, FM Threshold etc.

In [16]:
# Numeric index of the label definition; becomes part of the prepared data set name.
label_count = json_content_prep_data['label_defn']['label_count']

In [17]:
# Label configuration section: holds 'label_type' plus the type-specific
# settings read in the next cell.
FM_labels = json_content_prep_data['FM_labels']

In [18]:
# Kind of label to build from the FM columns.
FM_label_type = FM_labels['label_type']

# Read only the setting that belongs to the selected label type. For any other
# label type neither FM_binary_threshold nor FM_MC_levels is defined.
if FM_label_type == 'Binary':
    FM_binary_threshold = FM_labels['FM_binary_threshold']
elif FM_label_type == 'MultiClass':
    FM_MC_levels = FM_labels['FM_MC_levels']

In [19]:
# qois_to_use: quantities of interest passed to define_features() below.
# qois_derived: derived quantities to add; checked for 'VPD' further down.
qois_to_use = json_content_prep_data['features']['qois_to_use']
qois_derived = json_content_prep_data['features']['qois_derived']

In [20]:
# Pruning settings handed unchanged to prune_desired_data() after loading.
prune_data = json_content_prep_data['prune_data']

## Paths and File Names

#### Global

In [21]:
# WRF data set location and the extracted data set location
# Input base directory comes from the extraction config, output base directory
# from the preparation config.
extracted_data_base_loc = json_content_extract_data['paths']['extracted_data_base_loc']
prepared_data_base_loc  = json_content_prep_data['paths']['prepared_data_base_loc']

#### DataSet Specific (Train and Test Data Extracted from WRF)

In [22]:
# Directory and pickle file of the extracted data, both derived from the
# zero-padded (3-digit) data set index.
data_set_name = 'data_train_test_extracted_%03d'%(data_set_count)
extracted_data_loc = os.path.join(extracted_data_base_loc, data_set_name)
extracted_data_file_name = '{}_df.pkl'.format(data_set_name)

#### DataSet and Label Specific (Train and Test Data Prepared)

In [23]:
# The prepared data set name encodes the extracted data set index, the label
# definition index, and the label type.
prepared_data_set_name = 'dataset_%03d_label_%03d_%s'%(data_set_count, label_count, FM_label_type)

prepared_data_loc = os.path.join(prepared_data_base_loc, prepared_data_set_name)
# Create the output directory (and any missing parents) directly from Python.
# Unlike shelling out to `mkdir -p`, this is safe for paths containing spaces
# or shell metacharacters and raises an exception if creation fails instead of
# silently continuing.
os.makedirs(prepared_data_loc, exist_ok=True)

prepared_data_file_name = '{}.pkl'.format(prepared_data_set_name)

# Generate seed for the random number generator

In [24]:
# generate_seed / init_random_generator are not defined in this notebook; they
# presumably come from the star-imported helper modules (confirm there how the
# seed is chosen and which random generators are initialised).
seed = generate_seed()
random_state = init_random_generator(seed)

# Load The Pickled Extracted Data (Train, Test) from WRF 

## Load The Train/Test Data Saved in Pickle File

In [25]:
# Load the extracted train/test data from its pickle file. Per the recorded
# output, the helper also reports its own memory use and run time.
df_tt_prep = load_pickled_data (extracted_data_loc, extracted_data_file_name)

MODULE Name: "load_pickled_data"

Process in the module(): psutil.Process(pid=184263, name='python3', status='running', started='14:16:09')

Loading data from file:
 ... data_train_test_extracted_000_df.pkl 
 ... at: /p/lustre2/jha3/Wildfire/Wildfire_LDRD_SI/01_WRF_Nelson_Data_Extracted/data_train_test_extracted_000

Module memory consumed: 14.906 MB
Module computing time: 0.059 s


In [26]:
#df_tt_prep.keys()

In [27]:
#df_tt_prep['PRECIP[-8hr]']

In [28]:
#len(df_tt_prep)

## Prune Extracted Data

In [29]:
# Apply the pruning settings from the preparation config. The helper's exact
# rules are not visible here; note that df_tt_prep is rebound to the result,
# so the unpruned frame is no longer reachable afterwards.
df_tt_prep = prune_desired_data (df_tt_prep, prune_data)

## Reduce The Size of Extracted Train/Test Data

In [30]:
# Downcast numeric columns to save memory. The helper's own message says
# float64 -> float16 and int64 -> int32.
# NOTE(review): float16 has very limited precision/range; confirm it is
# adequate for every feature that is derived from these columns later.
df_tt_prep = reduce_data_size (df_tt_prep)

Reducing data size (float64 to float16, int64 to int32)


In [31]:
#df_tt_prep.dtypes

# Get Column Names in the Train and Test Data

In [32]:
# Column-name groups of the extracted frame, in the order returned by the
# helper: identity, FM, wind (U10, V10, UMag10), T2, RH, PREC, SW, HGT.
(
    keys_identity, keys_FM,
    keys_U10, keys_V10, keys_UMag10,
    keys_T2, keys_RH, keys_PREC, keys_SW,
    keys_HGT,
) = get_keys_from_extracted_data(df_tt_prep)

In [33]:
#keys_U10, keys_V10, keys_UMag10

In [34]:
# Column names for the binary and multi-class label variants, derived from the
# FM column names (naming scheme is defined inside the helper).
keys_FM_Binary, keys_FM_MC = define_binary_and_MC_FM_labels (keys_FM)

### Define Groups of Keys

In [35]:
# Pick the label columns to use; the helper receives the label type together
# with all three candidate key lists.
keys_labels = define_labels(FM_label_type, keys_FM, keys_FM_Binary, keys_FM_MC)

In [36]:
#keys_FM_MC

In [37]:
# Assemble the feature column list from the per-quantity key groups, as
# selected by the 'qois_to_use' setting.
keys_features = define_features(
    keys_HGT, keys_UMag10, keys_T2, keys_RH, keys_PREC, keys_SW,
    qois_to_use,
)

In [38]:
#keys_features


# Compute New Columns or Remove Some

## Compute Wind Magnitude 

In [39]:
#features_to_read

In [40]:
# The wind magnitude only needs to be derived here when it was not already
# read during extraction.
if 'UMag10' not in features_to_read:
    df_tt_prep = compute_wind_mag(df_tt_prep, keys_U10, keys_V10, keys_UMag10)

In [41]:
#df_tt_prep[keys_U10 + keys_V10 + keys_UMag10]
#df_tt_prep

## Drop Wind Components

In [42]:
# Same condition as the magnitude computation: once UMag10 has been derived,
# the U10/V10 component columns are handed to the helper to be dropped.
if 'UMag10' not in features_to_read:
    df_tt_prep = drop_wind_components(df_tt_prep, keys_U10, keys_V10)

In [43]:
#df_tt_prep[keys_UMag10]

## Compute VPD

In [None]:
# Add VPD columns only when requested in the preparation config.
# `+=` extends the existing keys_features list in place with the new VPD keys.
# NOTE(review): keys_VPD is defined only inside this branch; later cells that
# reference it depend on this branch (or the scratch cell below) having run.
if 'VPD' in qois_derived:
    df_tt_prep, keys_VPD = compute_VPD (df_tt_prep, keys_T2, keys_RH)
    keys_features += keys_VPD

In [49]:
# Development scaffold for the VPD computation: builds the VPD / VPD_s key
# names from the T2 keys and makes sure a column exists for every VPD key.
#
# The placeholder (all-zero) column is created ONLY when the column is not
# already present. Unconditional assignment would overwrite any VPD values
# already stored under these column names (for example by the compute_VPD()
# cell above) whenever the notebook is run top to bottom or converted to a
# script with nbconvert.
print ('Computing Vapor Pressure Deficit (VPD) from T2 and RH')
keys_VPD_s = []
keys_VPD = []
for T2_key, RH_key in zip(keys_T2, keys_RH):
    # T2 and RH keys must refer to the same time offset (same suffix after the
    # two-character quantity prefix).
    assert T2_key[2:] == RH_key[2:]
    VPD_s_key = 'VPD_s{}'.format(T2_key[2:])
    VPD_key = 'VPD{}'.format(T2_key[2:])
    keys_VPD_s.append(VPD_s_key)
    keys_VPD.append(VPD_key)

    # Do not clobber values that were already computed.
    if VPD_key not in df_tt_prep.columns:
        df_tt_prep[VPD_key] = df_tt_prep[T2_key]*0.0

Computing Vapor Pressure Deficit (VPD) from T2 and RH


In [50]:
# Inspect the generated key names (display-only; no effect in a converted script).
keys_VPD, keys_VPD_s

(['VPD[-8hr]', 'VPD[-6hr]', 'VPD[-4hr]', 'VPD[-2hr]'],
 ['VPD_s[-8hr]', 'VPD_s[-6hr]', 'VPD_s[-4hr]', 'VPD_s[-2hr]'])

In [51]:
# Display-only check of the VPD columns next to their inputs.
# NOTE(review): the recorded output below shows all-zero VPD values, i.e. it
# was produced from the placeholder columns, not from compute_VPD().
df_tt_prep[keys_HGT + keys_VPD + keys_T2 + keys_RH]

Unnamed: 0,HGT,VPD[-8hr],VPD[-6hr],VPD[-4hr],VPD[-2hr],T2[-8hr],T2[-6hr],T2[-4hr],T2[-2hr],RH[-8hr],RH[-6hr],RH[-4hr],RH[-2hr]
0,677.500,0.0,0.0,0.0,0.0,280.25,283.50,286.00,287.25,61.90625,43.93750,38.593750,32.031250
1,78.625,0.0,0.0,0.0,0.0,285.25,287.75,290.50,292.50,65.75000,53.46875,36.468750,24.078125
2,1382.000,0.0,0.0,0.0,0.0,275.00,277.75,279.25,280.50,71.56250,52.84375,47.625000,35.875000
3,1732.000,0.0,0.0,0.0,0.0,273.50,275.50,277.50,278.75,60.15625,52.09375,39.968750,30.140625
4,1540.000,0.0,0.0,0.0,0.0,276.50,278.75,281.00,282.50,52.34375,39.50000,25.984375,20.640625
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,860.500,0.0,0.0,0.0,0.0,285.00,280.25,279.25,280.50,35.96875,64.93750,58.218750,49.031250
7496,1441.000,0.0,0.0,0.0,0.0,273.75,271.75,270.00,271.00,82.68750,81.25000,85.750000,81.812500
7497,1713.000,0.0,0.0,0.0,0.0,278.75,272.75,272.00,273.00,60.71875,91.56250,91.812500,76.000000
7498,170.875,0.0,0.0,0.0,0.0,287.00,283.00,282.25,280.50,64.31250,81.50000,80.000000,89.312500


## Compute Binary FM Labels

In [None]:
# Binary labels are derived from the FM columns using the configured threshold;
# FM_binary_threshold exists only when the label type is 'Binary'.
if FM_label_type == 'Binary':
    df_tt_prep = compute_binary_FM_labels(
        df_tt_prep, keys_FM, keys_FM_Binary, FM_binary_threshold
    )

In [None]:
#len(df_tt_prep.keys())
#df_tt_prep[keys_FM + keys_FM_Binary].dtypes

In [None]:
#df_tt_prep[keys_FM + keys_FM_Binary][985:995]

## Compute MC FM Labels

In [None]:
# Multi-class labels are derived from the FM columns using the configured
# levels; FM_MC_levels exists only when the label type is 'MultiClass'.
if FM_label_type == 'MultiClass':
    df_tt_prep = compute_MC_FM_labels(
        df_tt_prep, keys_FM, keys_FM_MC, FM_MC_levels
    )

In [None]:
#df_tt_prep[keys_FM + keys_FM_MC].dtypes
#df_tt_prep[keys_FM + keys_FM_MC]

# Plot FM Labels

In [None]:
# Which FM quantity to plot, as configured under 'qoi_to_plot' in the prep JSON.
FM_hr = json_content_prep_data['qoi_to_plot']['FM_hr']

In [None]:
# Plot the labels for the selected FM quantity. The data set name and output
# directory are passed along, presumably so the helper can save the figure
# there (confirm in the helper).
plot_FM_labels(
    df_tt_prep, FM_label_type, FM_hr,
    prepared_data_set_name, prepared_data_loc,
)

# Split Data into Identity, Features, and Labels

In [None]:
# Split the prepared frame into its identity, label and feature parts using
# the key lists assembled above.
data_tt_prep = split_data_into_groups(
    df_tt_prep, keys_identity, keys_labels, keys_features
)

# Save The Prepared Data

In [None]:
# Persist the grouped data as a single pickle in the prepared-data directory.
# The `with` block closes the file even if pickling fails.
prepared_data = data_tt_prep
with open(os.path.join(prepared_data_loc, prepared_data_file_name), 'wb') as file_handle:
    pickle.dump(prepared_data, file_handle)
print('Wrote prepared data in "{}" at "{}"'.format(prepared_data_file_name, prepared_data_loc))

# Load and Test The Prepared Data Saved in Pickle File

In [None]:
# Round-trip check: read back the pickle that was just written.
prepared_data_read = load_pickled_data (prepared_data_loc, prepared_data_file_name)

In [None]:
# Manual alternative to load_pickled_data(), kept for reference:
#with open(os.path.join(prepared_data_loc, prepared_data_file_name), 'rb') as file_handle:
 #   prepared_data_read = pickle.load(file_handle)
print('Read prepared data from "{}" at "{}"'.format(prepared_data_file_name, prepared_data_loc))

In [None]:
#prepared_data_read['identity'].

In [None]:
#prepared_data_read['labels'].head(5)

In [None]:
#prepared_data_read['labels'][prepared_data_read['labels']['FM_10hr_bin'] == 1]

In [None]:
#prepared_data_read['features'].dtypes
#prepared_data_read['features'].head(5)

In [None]:
#prepared_data_read['features']['UMag10[-8hr]']

# Global End Time and Memory

In [None]:
# End-of-run report. Memory is sampled before the timer so that the reported
# time also covers the memory query, matching the start-of-run order in reverse.
global_final_memory = process.memory_info().rss
global_end_time = timer()
# RSS growth of this process since the start cell, in bytes (can be negative
# if memory was released in between).
global_memory_consumed = global_final_memory - global_initial_memory
print('Total memory consumed: {:.3f} MB'.format(global_memory_consumed/(1024*1024)))
print('Total computing time: {:.3f} s'.format(global_end_time - global_start_time))
print('=========================================================================')
print("SUCCESS: Done Preparation of Data")