#Preliminaries

In [1]:
#pip install tensorflow_addons 

In [2]:
#pip install -U scikit-learn

In [3]:
#pip install seaborn

In [4]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense, Flatten, Bidirectional
from keras.optimizers import Adam, SGD, RMSprop
from keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import array
import pandas as pd
import string
import random
import math


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [5]:
np.set_printoptions(threshold=np.inf,linewidth=np.inf)

In [6]:
#from google.colab import drive
#drive.mount('/content/drive')

In [7]:
homedir =  '/home/bcavna/Documents'

In [8]:
completed = os.listdir( homedir + '/ERA5/model_eval2_input/nohail')

In [9]:
completedPD = pd.DataFrame(completed,columns=['filename'])
completedPD.head()

Unnamed: 0,filename
0,20220814184.npy
1,20221020267.npy
2,20220128193.npy
3,20220126199.npy
4,20221008183.npy


In [10]:
completedPD.to_csv(homedir + '/ERA5/check2022v5.csv')

#Reread and reshape the numpy files that were output during preprocessing

In [11]:
hailDir = homedir + '/ERA5/model_eval2_input/hail/'

In [12]:
haildays = []
hailDayCells = os.listdir(hailDir)
def hdc():
    for f in hailDayCells:
      haildays.append(f[:8])
    haildays.sort()
    print(haildays)
    hailDayCells.sort()
    print(hailDayCells)
hdc()

['20220306', '20220307', '20220307', '20220323', '20220323', '20220323', '20220514', '20220514', '20220514', '20220514', '20220514', '20220519', '20220519', '20220519', '20220519', '20220519', '20220519', '20220519', '20220519', '20220519', '20220519', '20220519', '20220519', '20220519', '20220519', '20220519', '20220520', '20220601', '20220601', '20220606', '20220608', '20220612', '20220613', '20220613', '20220613', '20220613', '20220614', '20220614', '20220614', '20220614', '20220614', '20220709', '20220723', '20220723', '20220723', '20220723', '20220723', '20220723', '20220724', '20220820', '20220820', '20220829']
['20220306340.npy', '20220307120.npy', '20220307142.npy', '20220323305.npy', '20220323339.npy', '20220323340.npy', '20220514100.npy', '20220514200.npy', '20220514220.npy', '20220514276.npy', '20220514339.npy', '20220519120.npy', '20220519121.npy', '20220519139.npy', '20220519141.npy', '20220519159.npy', '20220519160.npy', '20220519180.npy', '20220519181.npy', '20220519204.

In [13]:
noHailDir = homedir +  '/ERA5/model_eval2_input/nohail/'

In [14]:
noHailFiles = os.listdir(noHailDir)
print(noHailFiles[:10])
def nhf():
    noHailCells = [noHailFiles[i] for i in range(len(noHailFiles)) if noHailFiles[i][:8] in haildays]
    noHailCells.sort()
    print(noHailCells)
    return (noHailCells)
noHailCells = nhf()

['20220814184.npy', '20221020267.npy', '20220128193.npy', '20220126199.npy', '20221008183.npy', '20220103226.npy', '20220917194.npy', '20220910191.npy', '20220626219.npy', '20220315322.npy']
['20220306099.npy', '20220306100.npy', '20220306101.npy', '20220306116.npy', '20220306117.npy', '20220306118.npy', '20220306119.npy', '20220306120.npy', '20220306121.npy', '20220306127.npy', '20220306128.npy', '20220306129.npy', '20220306130.npy', '20220306131.npy', '20220306132.npy', '20220306133.npy', '20220306134.npy', '20220306135.npy', '20220306136.npy', '20220306137.npy', '20220306138.npy', '20220306139.npy', '20220306140.npy', '20220306141.npy', '20220306142.npy', '20220306143.npy', '20220306148.npy', '20220306149.npy', '20220306150.npy', '20220306151.npy', '20220306152.npy', '20220306153.npy', '20220306154.npy', '20220306155.npy', '20220306156.npy', '20220306157.npy', '20220306158.npy', '20220306159.npy', '20220306160.npy', '20220306161.npy', '20220306162.npy', '20220306163.npy', '202203061

Split the files into test/train split folders

In [15]:
hdc = np.array(hailDayCells)
nhf = np.array(noHailFiles)
#nhc = np.array(noHailCells)

In [16]:
hail_train, hail_test = train_test_split(hdc, test_size = 0.2, random_state = 123)

In [17]:
print(hail_train.shape)
print(hail_test.shape)

(41,)
(11,)


In [18]:
noHail_train, noHail_test = train_test_split(nhf, test_size = 0.2, random_state = 123)

In [19]:
print(noHail_train.shape)
print(noHail_test.shape)

(56606,)
(14152,)


Send all evaluation data to the same folder rather than using separate folders for train/val/test splits

In [20]:
hailTrainTfds = homedir + '/ERA5/tfds3/eval2/hail/'

In [21]:
hTrainCount = os.listdir(hailTrainTfds)

In [22]:
def tfds1():
    for f in hail_train:
      try:
          daily = np.load(hailDir + f,allow_pickle=True)
          seq = []
          for h in range(1,25):
            hourly = daily[0][h]
            seq.append(hourly)
          seq2 = np.array(seq)
          seq3 = np.reshape(seq2,(24,92))
          np.save(hailTrainTfds + f[:11],seq3)
          print(hailTrainTfds + f[:11])
          print(seq3)
      except:
          print("error in file:",f)
tfds1()

/home/bcavna/Documents/ERA5/tfds3/eval2/hail/20220514339
[[-2.73  0.    0.3  -0.6  -2.73  0.    0.09 -0.46 -2.73  0.    0.21 -0.46 -2.73  0.    0.34 -1.38 -2.73  0.    0.34 -1.02 -2.73  0.01  0.3  -0.35 -2.73  0.01  0.34  0.67 -2.73  0.01  0.41  1.02 -2.73  0.01  0.3   1.14 -2.73  0.01  0.09  1.83 -2.73  0.01  0.02  2.91 -2.73  0.    0.05  3.61 -2.73  0.    0.13  3.37 -2.73  0.    0.13  3.02 -2.73  0.    0.13  2.62 -2.73  0.    0.13  2.25 -2.73  0.    0.13  1.74 -2.73  0.    0.17  1.22 -2.73  0.    0.17  0.79 -2.73  0.    0.17  0.37 -2.73  0.    0.17 -0.06 -2.73  0.    0.13 -0.36 -2.73  0.    0.05 -0.25]
 [-2.73  0.    0.13  0.15 -2.73  0.    0.09  0.32 -2.73  0.    0.13  0.07 -2.73  0.    0.26 -1.3  -2.73  0.    0.3  -1.83 -2.73  0.01  0.3  -1.61 -2.73  0.01  0.3  -0.57 -2.73  0.01  0.34 -0.09 -2.73  0.01  0.26 -0.03 -2.73  0.01  0.05  0.26 -2.73  0.01  0.02  1.02 -2.73  0.    0.02  2.21 -2.73  0.    0.17  2.49 -2.73  0.    0.17  2.23 -2.73  0.    0.21  1.85 -2.73  0.    0.26  1.47 -2

In [23]:
hailTestTfds = homedir + '/ERA5/tfds3/eval2/hail/'

In [24]:
def tfds2():
    for f in hail_test:
      daily = np.load(hailDir + f,allow_pickle=True)
      seq = []
      for h in range(1,25):
        hourly = daily[0][h]
        seq.append(hourly)
      seq2 = np.array(seq)
      seq3 = np.reshape(seq2,(24,92))
      np.save(hailTestTfds + f[:11],seq3)
tfds2()

In [25]:
noHailTrainTfds = homedir + '/ERA5/tfds3/eval2/nohail/'

In [26]:
def tfds3():
    for f in noHail_train:
      daily = np.load(noHailDir + f,allow_pickle=True)
      seq = []
      for h in range(1,25):
        hourly = daily[0][h]
        seq.append(hourly)
      seq2 = np.array(seq)
      seq3 = np.reshape(seq2,(24,92))
      np.save(noHailTrainTfds + f[:11],seq3)
tfds3()

In [27]:
noHailTestTfds = homedir + '/ERA5/tfds3/eval2/nohail/'

In [28]:
def tfds4():
    for f in noHail_test:
      daily = np.load(noHailDir + f,allow_pickle=True)
      seq = []
      for h in range(1,25):
        hourly = daily[0][h]
        seq.append(hourly)
      seq2 = np.array(seq)
      seq3 = np.reshape(seq2,(24,92))
      np.save(noHailTestTfds + f[:11],seq3)
tfds4()

In [29]:
hTrainCount = os.listdir(hailTrainTfds)
hTestCount = os.listdir(hailTestTfds)
nhTrainCount = os.listdir(noHailTrainTfds)
nhTestCount = os.listdir(noHailTestTfds)
print("hTrainCount:",len(hTrainCount))
print("hTestCount:",len(hTestCount))
print("nhTrainCount:",len(nhTrainCount))
print("nhTestCount:",len(nhTestCount))

hTrainCount: 52
hTestCount: 52
nhTrainCount: 70758
nhTestCount: 70758


In [30]:
f1 = hTrainCount[0]
f2 = hTestCount[0]
f3 = nhTrainCount[0]
f4 = nhTestCount[0]
print(f1)
print(f2)
print(f3)
print(f4)
checkf1 = np.load(hailTrainTfds + f1,allow_pickle=True)
checkf2 = np.load(hailTestTfds + f2,allow_pickle=True)
checkf3 = np.load(noHailTrainTfds + f3,allow_pickle=True)
checkf4 = np.load(noHailTestTfds + f4,allow_pickle=True)

20220614232.npy
20220614232.npy
20220814184.npy
20220814184.npy


In [31]:
print(checkf1.shape)
print(checkf2.shape)
print(checkf3.shape)
print(checkf4.shape)

(24, 92)
(24, 92)
(24, 92)
(24, 92)


#Create the Tensor Flow pipeline

In [32]:
batch_size = 5000
AUTOTUNE = tf.data.AUTOTUNE

In [33]:
pipelineTF = tf.data.Dataset.list_files(homedir + '/ERA5/tfds3/eval2/*/*',shuffle=False)

In [34]:
#check the pipeline file list
for file in pipelineTF.take(5):
  print(file)

tf.Tensor(b'/home/bcavna/Documents/ERA5/tfds3/eval2/hail/20220306340.npy', shape=(), dtype=string)
tf.Tensor(b'/home/bcavna/Documents/ERA5/tfds3/eval2/hail/20220307120.npy', shape=(), dtype=string)
tf.Tensor(b'/home/bcavna/Documents/ERA5/tfds3/eval2/hail/20220307142.npy', shape=(), dtype=string)
tf.Tensor(b'/home/bcavna/Documents/ERA5/tfds3/eval2/hail/20220323305.npy', shape=(), dtype=string)
tf.Tensor(b'/home/bcavna/Documents/ERA5/tfds3/eval2/hail/20220323339.npy', shape=(), dtype=string)


In [133]:
#create a function to map target from filepath
def get_label(path):
  if tf.strings.split(path,os.sep)[-2] == 'hail':
    label = 1
  else:
    label = 0
  return label

In [282]:
def get_cell(path):
    f = tf.strings.split(path,os.sep)[-1]
    f = np.string_(f)[-15:-4]
    f = np.int64(f)
    return f

In [283]:
#check the pipeline file list
for i in pipelineTF.take(5):
    print(get_cell(i))

20220306340
20220307120
20220307142
20220323305
20220323339


In [286]:
#create a function to get the keys, features and tartgets from a given file in the pipeline
#Using examples from: https://stackoverflow.com/questions/71970277/loading-a-numpy-array-into-tensorflow-input-pipeline
def get_data(path):
  cell = get_cell(path)
  label = get_label(path)
  label = np.uint8(label)
  features = np.load(path)
  featuresTF = tf.convert_to_tensor(features, dtype=tf.float32)
  return cell, label, featuresTF

train_ds =  pipelineTF.map(lambda item: tf.numpy_function(get_data, [item], (tf.int64,tf.uint8,tf.float32))) #.prefetch(AUTOTUNE) .cache()

In [287]:
#point the x and y training arrays to the pipeline
k_train, y_train, X_train = next(iter(train_ds.batch(batch_size)))

In [288]:
#check the k, x and y training arrays
X_train.shape

TensorShape([5000, 24, 92])

In [289]:
y_train.shape

TensorShape([5000])

In [290]:
k_train.shape

TensorShape([5000])

In [291]:
k_train[0]

<tf.Tensor: shape=(), dtype=int64, numpy=20220306340>

In [292]:
print(type(k_train[0]))

<class 'tensorflow.python.framework.ops.EagerTensor'>


In [293]:
X_test_path  = homedir + '/ERA5/model_eval2_input/X_eval2.npy'
y_test_path  = homedir + '/ERA5/model_eval2_input/y_eval2.npy'
k_test_path  = homedir + '/ERA5/model_eval2_input/k_eval2.npy'

In [294]:
#Get k, x and y test vectors
def getVecs():
    ktst = []
    ytst = []
    Xtst = []
    for k, label, featuresTF  in train_ds:
      ktst.append(k)
      ytst.append(label)
      Xtst.append(featuresTF)
    k_tst = np.array(ktst)
    y_tst = np.array(ytst)
    X_tst = np.array(Xtst)
    np.save(k_test_path,k_tst)
    np.save(X_test_path,X_tst)
    np.save(y_test_path,y_tst)
getVecs()

Check the vectors

In [295]:
k_test = np.load(k_test_path)
y_test = np.load(y_test_path)
X_test = np.load(X_test_path)

In [296]:
k_testTF = tf.convert_to_tensor(k_test)

In [297]:
k_testTF.shape

TensorShape([70810])

In [298]:
y_testTF = tf.convert_to_tensor(y_test)

In [299]:
y_testTF.shape

TensorShape([70810])