# Preliminaries

In [1]:
import os
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense, Flatten, Bidirectional
from keras.optimizers import Adam, SGD, RMSprop
from keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import array
import pandas as pd
import string
import random
import math

2023-12-06 22:33:57.753102: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [2]:
# Ask TensorFlow's native (C++) logger to suppress its messages (level '3').
# NOTE(review): this runs after `import tensorflow` in the first cell, whose
# captured output already shows a cpu_feature_guard log line; this variable is
# normally set before the import to take effect — confirm and move it if so.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [3]:
# Print NumPy arrays in full: never summarise with "..." (threshold) and never
# wrap rows (linewidth). Large arrays will therefore produce very long output.
np.set_printoptions(threshold=np.inf,linewidth=np.inf)

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

In [5]:
# Root directory that contains the ERA5/ data tree used throughout the notebook.
# Set the ERA5_HOMEDIR environment variable to run on another machine; when it
# is unset the original hard-coded location is used, so existing runs are unchanged.
homedir = os.environ.get('ERA5_HOMEDIR', '/home/bcavna/Documents')

# Reread and reshape the numpy files that were output during preprocessing

For model phase 1 - Limit the data to days that had hail in at least one location cell in Indiana. Other cells can be introduced later.

In [6]:
# Directory holding the preprocessed per-cell files for hail samples.
hailDir = f'{homedir}/ERA5/model_input/hail/'

In [7]:
# List every hail sample file; the first 8 characters of each file name are
# its date stamp (duplicates are kept, one entry per file).
hailDayCells = os.listdir(hailDir)
haildays = sorted(name[:8] for name in hailDayCells)
print(haildays)
# Sort the file names in place so later cells see a deterministic order.
hailDayCells.sort()
print(hailDayCells)

['20210325', '20210325', '20210325', '20210325', '20210326', '20210326', '20210328', '20210328', '20210328', '20210328', '20210507', '20210612', '20210613', '20210613', '20210618', '20210618', '20210618', '20210618', '20210618', '20210618', '20210618', '20210618', '20210618', '20210618', '20210618', '20210618', '20210618', '20210618', '20210619', '20210619', '20210619', '20210619', '20210619', '20210619', '20210620', '20210620', '20210629', '20210708', '20210914', '20211011', '20211206']
['20210325142.npy', '20210325200.npy', '20210325201.npy', '20210325221.npy', '20210326239.npy', '20210326260.npy', '20210328218.npy', '20210328239.npy', '20210328260.npy', '20210328280.npy', '20210507327.npy', '20210612325.npy', '20210613120.npy', '20210613261.npy', '20210618199.npy', '20210618220.npy', '20210618239.npy', '20210618242.npy', '20210618260.npy', '20210618261.npy', '20210618281.npy', '20210618282.npy', '20210618302.npy', '20210618319.npy', '20210618322.npy', '20210618343.npy', '20210618349

In [8]:
# Directory holding the preprocessed per-cell files for no-hail samples.
noHailDir = f'{homedir}/ERA5/model_input/nohail/'

In [9]:
# Keep only the no-hail files whose date stamp (first 8 characters of the
# name) also appears among the hail days, in sorted order.
noHailFiles = os.listdir(noHailDir)
noHailCells = sorted(name for name in noHailFiles if name[:8] in haildays)
print(noHailCells)

['20210325099.npy', '20210325100.npy', '20210325101.npy', '20210325116.npy', '20210325117.npy', '20210325118.npy', '20210325119.npy', '20210325120.npy', '20210325121.npy', '20210325127.npy', '20210325128.npy', '20210325129.npy', '20210325130.npy', '20210325131.npy', '20210325132.npy', '20210325133.npy', '20210325134.npy', '20210325135.npy', '20210325136.npy', '20210325137.npy', '20210325138.npy', '20210325139.npy', '20210325140.npy', '20210325141.npy', '20210325143.npy', '20210325148.npy', '20210325149.npy', '20210325150.npy', '20210325151.npy', '20210325152.npy', '20210325153.npy', '20210325154.npy', '20210325155.npy', '20210325156.npy', '20210325157.npy', '20210325158.npy', '20210325159.npy', '20210325160.npy', '20210325161.npy', '20210325162.npy', '20210325163.npy', '20210325164.npy', '20210325169.npy', '20210325170.npy', '20210325171.npy', '20210325172.npy', '20210325173.npy', '20210325174.npy', '20210325175.npy', '20210325176.npy', '20210325177.npy', '20210325178.npy', '2021032517

Change to use 2021 for validation instead of 80% split

In [None]:
# Array versions of the two file-name lists (hail cells, no-hail cells).
hdc, nhc = np.array(hailDayCells), np.array(noHailCells)

In [11]:
#hail_train, hail_test = train_test_split(hdc, test_size = 0.2, random_state = 123)

In [13]:
# Show how many hail and no-hail samples are available.
for name_array in (hdc, nhc):
  print(name_array.shape)

(41,)
(2675,)


In [None]:
#noHail_train, noHail_test = train_test_split(nhc, test_size = 0.2, random_state = 123)

In [None]:
#print(noHail_train.shape)
#print(noHail_test.shape)

In [None]:
#hTrain, hVal  = train_test_split(hail_train, test_size=163, random_state=123)

In [None]:
#print(hTrain.shape)
#print(hVal.shape)

In [None]:
# Split the no-hail file names into training (80%) and validation (20%)
# subsets with a fixed random seed.
# NOTE(review): `noHail_train` is only assigned in a commented-out cell above,
# so this raises NameError on a fresh kernel — confirm which array should be
# split here now that the 80% split has been disabled.
nhTrain, nhVal  = train_test_split(noHail_train, test_size=.2, random_state=123)

In [None]:
# Show the sizes of the no-hail training and validation subsets.
for subset in (nhTrain, nhVal):
  print(subset.shape)

In [None]:
# Output directory for reshaped hail training samples.
hailTrainTfds = f'{homedir}/ERA5/tfds/train/hail/'

In [None]:
# Convert each hail training file into a (24, 92) array and save it in the
# hail training directory.
# NOTE(review): `hTrain` is only assigned in a commented-out cell above —
# confirm where the hail training split is meant to come from.
for f in hTrain:
  try:
      # allow_pickle=True unpickles object arrays; only safe because these
      # files are expected to be locally produced, trusted preprocessing output.
      daily = np.load(hailDir + f,allow_pickle=True)
      # Take entries 1..24 of daily[0]; entry 0 is not used.
      seq = [daily[0][h] for h in range(1,25)]
      seq3 = np.reshape(np.array(seq),(24,92))
      # Save under the first 11 characters of the name; np.save adds '.npy'.
      np.save(hailTrainTfds + f[:11],seq3)
  except Exception as err:
      # Best-effort: report the failing file and the reason, then move on.
      # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
      # SystemExit still stop the loop.
      print("error in file:",f,"-",repr(err))

In [None]:
# Output directory for reshaped hail validation samples.
hailValTfds = f'{homedir}/ERA5/tfds/val/hail/'

In [None]:
# Convert each hail validation file into a (24, 92) array and save it in the
# hail validation directory.
# NOTE(review): `hVal` is only assigned in a commented-out cell above —
# confirm where the hail validation split is meant to come from.
for f in hVal:
  try:
      # allow_pickle=True unpickles object arrays; only safe because these
      # files are expected to be locally produced, trusted preprocessing output.
      daily = np.load(hailDir + f,allow_pickle=True)
      # Take entries 1..24 of daily[0]; entry 0 is not used.
      seq = [daily[0][h] for h in range(1,25)]
      seq3 = np.reshape(np.array(seq),(24,92))
      # Save under the first 11 characters of the name; np.save adds '.npy'.
      np.save(hailValTfds + f[:11],seq3)
  except Exception as err:
      # Best-effort: report the failing file and the reason, then move on.
      # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
      # SystemExit still stop the loop.
      print("error in file:",f,"-",repr(err))

In [None]:
# Output directory for reshaped hail test samples.
hailTestTfds = f'{homedir}/ERA5/tfds/test/hail/'

In [None]:
# Reshape every hail test file to (24, 92) and write it to the hail test directory.
# NOTE(review): `hail_test` is only assigned in a commented-out cell above — confirm its source.
for fname in hail_test:
  day = np.load(hailDir + fname, allow_pickle=True)
  # Entries 1..24 of day[0] become the 24 rows; entry 0 is not used.
  hours = np.array([day[0][hr] for hr in range(1, 25)])
  # First 11 characters of the name are kept; np.save adds '.npy'.
  np.save(hailTestTfds + fname[:11], np.reshape(hours, (24, 92)))

In [None]:
# Output directory for reshaped no-hail training samples.
noHailTrainTfds = f'{homedir}/ERA5/tfds/train/nohail/'

In [None]:
# Reshape every no-hail training file to (24, 92) and write it to the
# no-hail training directory.
for fname in nhTrain:
  day = np.load(noHailDir + fname, allow_pickle=True)
  # Entries 1..24 of day[0] become the 24 rows; entry 0 is not used.
  hours = np.array([day[0][hr] for hr in range(1, 25)])
  # First 11 characters of the name are kept; np.save adds '.npy'.
  np.save(noHailTrainTfds + fname[:11], np.reshape(hours, (24, 92)))

In [None]:
# Output directory for reshaped no-hail validation samples.
noHailValTfds = f'{homedir}/ERA5/tfds/val/nohail/'

In [None]:
# Reshape every no-hail validation file to (24, 92) and write it to the
# no-hail validation directory.
for fname in nhVal:
  day = np.load(noHailDir + fname, allow_pickle=True)
  # Entries 1..24 of day[0] become the 24 rows; entry 0 is not used.
  hours = np.array([day[0][hr] for hr in range(1, 25)])
  # First 11 characters of the name are kept; np.save adds '.npy'.
  np.save(noHailValTfds + fname[:11], np.reshape(hours, (24, 92)))

In [None]:
# Output directory for reshaped no-hail test samples.
noHailTestTfds = f'{homedir}/ERA5/tfds/test/nohail/'

In [None]:
# Reshape every no-hail test file to (24, 92) and write it to the no-hail test directory.
# NOTE(review): `noHail_test` is only assigned in a commented-out cell above — confirm its source.
for fname in noHail_test:
  day = np.load(noHailDir + fname, allow_pickle=True)
  # Entries 1..24 of day[0] become the 24 rows; entry 0 is not used.
  hours = np.array([day[0][hr] for hr in range(1, 25)])
  # First 11 characters of the name are kept; np.save adds '.npy'.
  np.save(noHailTestTfds + fname[:11], np.reshape(hours, (24, 92)))

In [None]:
# List the files written to each split/class directory. Despite the "Count"
# suffix these variables hold the file-name lists; the counts are printed below.
hTrainCount = os.listdir(hailTrainTfds)
hValCount = os.listdir(hailValTfds)
hTestCount = os.listdir(hailTestTfds)
nhTrainCount = os.listdir(noHailTrainTfds)
nhValCount = os.listdir(noHailValTfds)
nhTestCount = os.listdir(noHailTestTfds)

for tag, listing in (
    ("hTrainCount:", hTrainCount),
    ("hValCount:", hValCount),
    ("hTestCount:", hTestCount),
    ("nhTrainCount:", nhTrainCount),
    ("nhValCount:", nhValCount),
    ("nhTestCount:", nhTestCount),
):
  print(tag, len(listing))

In [None]:
# Pick the first listed file from each split/class directory, show its name,
# and load it back as a spot check of what was written.
f1, f2, f3 = hTrainCount[0], hValCount[0], hTestCount[0]
f4, f5, f6 = nhTrainCount[0], nhValCount[0], nhTestCount[0]
for sample_name in (f1, f2, f3, f4, f5, f6):
  print(sample_name)

checkf1 = np.load(hailTrainTfds + f1, allow_pickle=True)
checkf2 = np.load(hailValTfds + f2, allow_pickle=True)
checkf3 = np.load(hailTestTfds + f3, allow_pickle=True)
checkf4 = np.load(noHailTrainTfds + f4, allow_pickle=True)
checkf5 = np.load(noHailValTfds + f5, allow_pickle=True)
checkf6 = np.load(noHailTestTfds + f6, allow_pickle=True)

In [None]:
# Each reloaded sample should have the shape it was saved with.
for loaded in (checkf1, checkf2, checkf3, checkf4, checkf5, checkf6):
  print(loaded.shape)

In [None]:
# File-path datasets for the train / val / test splits. The glob matches every
# file one directory level below the split directory (the class folders).
# shuffle=True is used without a seed, so file order differs between runs.
pipelineTF = tf.data.Dataset.list_files(f'{homedir}/ERA5/tfds/train/*/*', shuffle=True)
pipelineTF2 = tf.data.Dataset.list_files(f'{homedir}/ERA5/tfds/val/*/*', shuffle=True)
pipelineTF3 = tf.data.Dataset.list_files(f'{homedir}/ERA5/tfds/test/*/*', shuffle=True)

In [None]:
# Peek at five file paths from the training pipeline.
for path_tensor in pipelineTF.take(5):
  print(path_tensor)

In [None]:
# Peek at five file paths from the validation pipeline.
for path_tensor in pipelineTF2.take(5):
  print(path_tensor)

In [None]:
# Peek at five file paths from the test pipeline.
for path_tensor in pipelineTF3.take(5):
  print(path_tensor)

In [None]:
# Derive the class label of a sample from the directory it is stored in.
def get_label(path):
  """Return 1 if the file's parent directory is named 'hail', otherwise 0.

  The path is split on os.sep and the second-to-last component (the folder
  that directly contains the file) is compared with 'hail'.
  """
  parent_dir = tf.strings.split(path,os.sep)[-2]
  return 1 if parent_dir == 'hail' else 0

In [None]:
# Load the features and target for a single file in the pipeline.
# Using examples from: https://stackoverflow.com/questions/71970277/loading-a-numpy-array-into-tensorflow-input-pipeline
def get_data(path):
  """Return (label, features) for the .npy file at `path`.

  label    -- np.uint8 produced from get_label(path).
  features -- the array stored in the file, converted to a float32 tensor.
  """
  target = np.uint8(get_label(path))
  tensor = tf.convert_to_tensor(np.load(path), dtype=tf.float32)
  return target, tensor

def _attach_samples(path_ds):
  """Map a dataset of file paths to (uint8 label, float32 features) pairs via get_data."""
  return path_ds.map(lambda item: tf.numpy_function(get_data, [item], (tf.uint8,tf.float32)))

# Optional follow-ups left disabled by the author: .prefetch(AUTOTUNE) .cache()
train_ds = _attach_samples(pipelineTF)
val_ds   = _attach_samples(pipelineTF2)
test_ds  = _attach_samples(pipelineTF3)

In [None]:
#Get x and y test vectors
# Gather every (label, features) pair from the test dataset in a single pass so
# that labels and features stay aligned even though the file order is shuffled.
ytst = []
Xtst = []
for label, featuresTF  in test_ds:
  ytst.append(label)
  Xtst.append(featuresTF)
# Build the arrays once, after the loop. Doing it inside the loop rebuilt both
# arrays on every iteration and left y_tst / X_tst undefined for an empty dataset.
y_tst = np.array(ytst)
X_tst = np.array(Xtst)

In [None]:
# Locations where the collected test features / labels are cached on disk.
X_test_path = f'{homedir}/ERA5/model_input/X_test.npy'
y_test_path = f'{homedir}/ERA5/model_input/y_test.npy'

In [None]:
#save the xtest and ytest vectors for reuse later
# Features go to X_test_path and labels to y_test_path; both paths already end
# in '.npy', so the files are written under exactly those names.
np.save(X_test_path,X_tst)
np.save(y_test_path,y_tst)

In [None]:
# Reload the cached test vectors. Features come from X_test_path and labels
# from y_test_path (the two were previously loaded from each other's file,
# which left the labels in X_test and the features in y_test).
X_test = np.load(X_test_path)
y_test = np.load(y_test_path)

In [None]:
# Tensor version of the reloaded y_test array.
y_testTF = tf.convert_to_tensor(y_test)

In [None]:
# Display the shape of the y_test tensor as the cell output.
y_testTF.shape

In [None]:
# Tensor version of the reloaded X_test array.
X_testTF = tf.convert_to_tensor(X_test)

In [None]:
# Show the tensor type and shape of X_testTF as a sanity check.
print(type(X_testTF),X_testTF.shape)
# Disabled: printing the first sample in full.
#print(X_testTF[0][:])