# Preliminaries

In [None]:
#!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
#!tar xf spark-3.2.0-bin-hadoop3.2.tgz
#!pip install -q findspark

In [None]:
import warnings

In [None]:
# Silence every Python warning for the rest of the session (applies globally).
warnings.filterwarnings('ignore')

In [None]:
import os
#os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
#os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

In [None]:
import findspark
# Initialise findspark before pyspark is imported in the following cell.
# NOTE(review): the JAVA_HOME / SPARK_HOME assignments above are commented out,
# so this presumably relies on the environment already providing them — confirm.
findspark.init()

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
# Keep a handle on the underlying SparkContext; leaving it as the last
# expression displays it in the notebook output.
sc = spark.sparkContext
sc

In [None]:
from pyspark.sql import Row
from pyspark.sql.types import *
import json
from pyspark.sql.types import *
from pyspark.sql.types import StructField, StructType, StringType, LongType, ArrayType, FloatType, IntegerType

In [None]:
import numpy as np
# Print NumPy arrays in full: no summarisation threshold and no line wrapping.
np.set_printoptions(threshold=np.inf,linewidth=np.inf)
import string
import math
import statistics
import pandas as pd
# Do not truncate long cell contents when pandas displays a frame.
pd.set_option('display.max_colwidth', None)

In [None]:
#from metpy.calc import dewpoint_from_relative_humidity, saturation_mixing_ratio, relative_humidity_from_specific_humidity
#from metpy.units import units

# Input Files

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Root directory that every input and output path below is built from.
# Defaults to the original workstation path; set the ERA5_HOMEDIR environment
# variable to run the notebook on another machine without editing this cell.
homedir = os.environ.get('ERA5_HOMEDIR', '/home/bcavna/Documents')

In [None]:
# Month and year of the ERA5 extract to process.
m = 8
year = 2015
# Numeric YYYYMM key (201508 for the values above) used in the file names below.
yearmo = 100 * year + m

In [None]:
# Glob pattern covering every processed ERA5 file whose name starts with the
# selected YYYYMM key.
ERA5_path = f"{homedir}/ERA5/processed/smaller/ERA5_{yearmo}*"
print(ERA5_path)

In [None]:
# CSV holding the processed hail targets that are joined on as labels later.
labels = f"{homedir}/ERA5/processed/processed_targets_20231118.csv"
print(labels)

In [None]:
# Wind-speed normalisation factors (read as a CSV further down).
# NOTE(review): the extension is ".cvs", not ".csv" — confirm this matches the
# file name on disk before changing it.
wsPath = f"{homedir}/ERA5/processed/normvals/ws_facs.cvs"
print(wsPath)

In [None]:
# Vertical-velocity normalisation factors (read as a CSV further down).
# NOTE(review): the extension is ".cvs", not ".csv" — confirm this matches the
# file name on disk before changing it.
vvPath = f"{homedir}/ERA5/processed/normvals/vv_facs.cvs"
print(vvPath)

In [None]:
# Destination CSV name for the formatted model input of the selected month.
# NOTE(review): this path is not referenced again in the visible cells, which
# write .npy files instead — confirm whether it is still needed.
final_output = f"{homedir}/ERA5/formatted/Model_Input_{yearmo}.csv"
print(final_output)

In [None]:
# CSV listing the grid cells that the hail model covers.
IN_Cells = f"{homedir}/ERA5/Hail_Model_Cells_20231126.csv"
print(IN_Cells)

In [None]:
# Load the model cell list, treating the first CSV line as the header.
InCellsDF = spark.read.option("header", "true").csv(IN_Cells)
# Exposed to SQL as INCELLS so the raw data can be restricted to these cells.
InCellsDF.createOrReplaceTempView("INCELLS")
#InCellsDF.show(5)

In [None]:
# Load every ERA5 file matched by the monthly glob, first line as header.
rawdataDF = spark.read.option("header", "true").csv(ERA5_path)
# Exposed to SQL under the name "processed".
rawdataDF.createOrReplaceTempView("processed")

In [None]:
# Keep only the ERA5 rows whose cell_id also appears in INCELLS (inner join on
# cell_id); all columns of the ERA5 side are retained.
filteredDF = spark.sql("SELECT a.* FROM processed a JOIN INCELLS b ON a.cell_id = b.cell_id")
# Exposed to SQL as "filtered" for the feature query further down.
filteredDF.createOrReplaceTempView("filtered")
#filteredDF.show(4)

In [None]:
#checkCellsDF = spark.sql("SELECT COUNT(DISTINCT cell_id) AS cell_count FROM filtered ")
#checkCellsDF.show(4)

In [None]:
# Load the wind-speed factor table, first line as header.
wsFacDF = spark.read.option("header", "true").csv(wsPath)
# Exposed to SQL as "wsFacs"; it is joined on its ws column later on.
wsFacDF.createOrReplaceTempView("wsFacs")
#wsFacDF.show(3)

In [None]:
# Load the vertical-velocity factor table, first line as header.
vvFacDF = spark.read.option("header", "true").csv(vvPath)
# Exposed to SQL as "vvFacs".
vvFacDF.createOrReplaceTempView("vvFacs")
#vvFacDF.show(3)

In [None]:
# Second column (index 1) of every row in the factor table, converted to float.
# NOTE(review): assumed to be the mean, going by the variable name — confirm
# against the file layout. Only element 0 is used by the feature calculation.
vvMean = [float(row[1]) for row in vvFacDF.collect()]

In [None]:
# Third column (index 2) of every row in the factor table, converted to float.
# NOTE(review): assumed to be the standard deviation, going by the variable
# name — confirm against the file layout. Only element 0 is used later.
vvStdv = [float(row[2]) for row in vvFacDF.collect()]

In [None]:
#print(vvMean)
#print(vvStdv)

In [None]:
def normalize(x,mean,std):
  """Return the standard score of ``x``.

  ``x`` is converted with ``float()`` first, so numeric strings are accepted.
  The result is ``(float(x) - mean) / std``.
  """
  centered = float(x) - mean
  return centered / std

# Add calculated weather metrics, normalization, and labels to features

Dew point calc based on examples from: https://gist.github.com/sourceperl/45587ea99ff123745428

(replace with ERA5 RH Later)

In [None]:
def get_dew_point_c(t_air_c, rel_humidity):
  """Compute the dew point in degrees Celsius.

  :param t_air_c: current ambient temperature in degrees Celsius
  :type t_air_c: float
  :param rel_humidity: relative humidity as a fraction in (0, 1], i.e. the
      percentage divided by 100 (the natural log is taken of it directly)
  :type rel_humidity: float
  :return: the dew point in degrees Celsius, or ``t_air_c - 10`` as a fallback
      when the formula cannot be evaluated for the given inputs
  :rtype: float
  """
  A = 17.27
  B = 237.7
  try:
    alpha = ((A * t_air_c) / (B + t_air_c)) + math.log(rel_humidity)
    return (B * alpha) / (A - alpha)
  except (ValueError, ZeroDivisionError, TypeError, OverflowError):
    # math.log raises ValueError for rel_humidity <= 0, and either division can
    # hit a zero denominator. Only these arithmetic failures fall back to the
    # rough estimate; anything else (e.g. KeyboardInterrupt) now propagates.
    return t_air_c - 10

In [None]:
#dew point example
#get_dew_point_c(-12.901999999999987,56.984917/100)

In [None]:
# Load the hail targets, first line as header.
labelsDF = spark.read.option("header", "true").csv(labels)
# Exposed to SQL as "labels" for the join that derives TARGET.
labelsDF.createOrReplaceTempView("labels")
#labelsDF.show(10)

In [None]:
# Left-join every filtered ERA5 row to the hail labels on cell / year / month /
# day: TARGET is 1 when a label row matches and 0 otherwise. Only rows with
# level >= 200 are kept, and a NULL q is replaced by 0.
# The column order is a contract: the next cell reads these fields by position
# (0 _c0, 1 YEAR, 2 MONTH, 3 DAY, 4 cell_id, 5 TARGET, 6 level, 7 HOUR,
#  8 t, 9 q, 10 u, 11 v, 12 w, 13 r).
calcInputDF = spark.sql("SELECT a._c0, a.YEAR, a.MONTH, a.DAY,  a.cell_id, CASE WHEN b.cell IS NULL THEN 0 ELSE 1 END AS TARGET, a.level, a.HOUR, a.t, CASE WHEN a.q IS NULL THEN 0 ELSE float(a.q) END as q, u, v, w, r FROM filtered a LEFT JOIN labels b ON a.cell_id = b.cell AND a.YEAR = b.YEAR AND a.MONTH = b.MONTH AND a.DAY = b.DAY WHERE level*1 >= 200 ")
#calcInputDF.take(2)

In [None]:
def _calc_features(x):
  """Derive the per-row feature list from one calcInputDF row.

  ``x`` is indexed by position (1 YEAR, 2 MONTH, 3 DAY, 4 cell_id, 5 TARGET,
  6 level, 7 HOUR, 8 t, 10 u, 11 v, 12 w, 13 r). Returns, in order:
  YYYYMMDD date key, cell, target, level, hour, temperature, dew point,
  wind speed, wind direction and vertical velocity.
  """
  date_key = int(x[1])*10000 + int(x[2])*100 + int(x[3])
  # t has 273.15 subtracted (presumably Kelvin -> Celsius — confirm units);
  # r is divided by 100 before being passed on as the humidity fraction.
  temp_c = float(x[8]) - 273.15
  rh_fraction = float(x[13]) / 100
  u_wind = float(x[10])
  v_wind = float(x[11])
  # NOTE(review): arctan2(v, u) is the mathematical angle of the wind vector;
  # the meteorological "direction from" convention is usually 180 + atan2(u, v).
  # Confirm which one is intended before this feature is used in the model.
  wind_dir = (180 + (np.rad2deg(np.arctan2(v_wind, u_wind)))) / 360
  return [
    date_key,
    int(x[4]),
    int(x[5]),
    int(x[6]),
    int(x[7]),
    round(temp_c / 100, 2),                                  # temperature / 100
    round(get_dew_point_c(temp_c, rh_fraction) / 100, 2),    # dew point / 100
    round(np.linalg.norm([u_wind, v_wind]), 0),              # wind speed, whole units
    round(wind_dir, 2),                                      # fraction of a full turn
    round((float(x[12]) - vvMean[0]) / vvStdv[0], 2),        # standardised w
  ]

calcInputR = calcInputDF.rdd.map(_calc_features)
#calcInputR.take(5)

In [None]:
#test metpy function for relative humidity
#relative_humidity_from_specific_humidity(1013.25 * units.hPa, 30 * units.degC, 18/1000).to('percent')

Format the values above and add them into the raw data

In [None]:
def _as_float_features(x):
  """Keep the five key fields and cast the features to plain floats.

  Position 6 becomes the rounded difference between temperature (x[5]) and
  dew point (x[6]); positions 5, 7, 8 and 9 are passed through as floats.
  """
  keys = list(x[:5])
  spread = float(round(x[5] - x[6], 2))
  return keys + [float(x[5]), spread] + [float(value) for value in x[7:10]]

def _as_row(p):
  """Name the positional fields; note that hour is p[4] and level is p[3]."""
  return Row(
    date=p[0], cell=p[1], label=p[2], hour=p[4], level=p[3],
    t=p[5], tvdp=p[6], ws=p[7], wd=p[8], vv=p[9],
  )

calcsR = calcInputR.map(_as_float_features)
labeledR = calcsR.map(_as_row)
#labeledR.take(5)

In [None]:
# Build a DataFrame from the RDD of Rows and expose it to SQL as "keyfeatures".
calcsDF = spark.createDataFrame(labeledR)
calcsDF.createOrReplaceTempView("keyfeatures")
#calcsDF.show(3,truncate=False)

In [None]:
#check the wind direction (using a percentage of 360 degree rotation here to match units with other percentiles)
#wdDF = spark.sql("SELECT DISTINCT wd FROM keyfeatures ORDER BY 1")
#wdDF.show(500)

Bring in normalization factors for wind speed

In [None]:
# Swap the raw wind speed for its looked-up value: rows are left-joined to
# wsFacs on ws and take the matching cdf as wsn, or the string '0' when no
# lookup row matches. vv is passed through unchanged under the name vvn.
featuresDF = spark.sql("SELECT a.date, a.cell, a.label, a.hour, a.level, a.t, a.tvdp, CASE WHEN b.cdf IS NOT NULL THEN b.cdf ELSE '0' END AS wsn, a.wd, a.vv AS vvn FROM keyfeatures a LEFT JOIN wsFacs b ON a.ws = b.ws")
# Exposed to SQL as "features".
featuresDF.createOrReplaceTempView("features")
#featuresDF.show(23)

Group features into lists (leave wind direction out for now)

In [None]:
def _pack_features(x):
  """Return the five key fields followed by one list of model features.

  The feature list is [t, tvdp, wsn as float, vvn]; wind direction (x[8]) is
  intentionally not included.
  """
  keys = [x[0], x[1], x[2], x[3], x[4]]
  feature_values = [x[5], x[6], float(x[7]), x[9]]
  return keys + [feature_values]

def _as_feature_row(p):
  """Name the packed fields so a DataFrame can be built from them."""
  return Row(date=p[0], cell=p[1], label=p[2], hour=p[3], level=p[4], features=p[5])

featuresR = featuresDF.rdd.map(_pack_features)
relabeledR = featuresR.map(_as_feature_row)
#relabeledR.take(5)

In [None]:
# Build a DataFrame whose "features" column holds the per-row feature list and
# expose it to SQL as "finalfeat".
finalfeatDF = spark.createDataFrame(relabeledR)
finalfeatDF.createOrReplaceTempView("finalfeat")
#finalfeatDF.show(23,truncate=False)

# Format Features for Input into Model

In [None]:
# Levels in the order they map onto the pivot columns L01..L23.
_pivot_levels = [
    200, 225, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
    750, 775, 800, 825, 850, 875, 900, 925, 950, 975, 1000,
]

# One CASE expression per level: the features list lands in that level's column
# and every other level column is NULL for the row.
# NOTE(review): date_cell is date*1000+cell, which is only unique while cell
# ids stay below 1000 — confirm against the cell list.
q = (
    "SELECT date*1000+cell as date_cell,hour, level, label, "
    + ", ".join(
        "CASE WHEN level  = {} THEN features else NULL END AS L{:02d}".format(level, position)
        for position, level in enumerate(_pivot_levels, start=1)
    )
    + " FROM finalfeat ORDER BY cell, level"
)


In [None]:
# Run the pivot query (one sparse row per source row, with the features placed
# in the column of its level) and expose the result to SQL as "pivot".
pivotCalcs = spark.sql(q)
pivotCalcs.createOrReplaceTempView("pivot")
#pivotCalcs.show(23,truncate=False)

In [None]:
#get each cell/date cohort with features at each level to try and preserve the relationship between levels
def cohort(flist):
  """Convert flattened rows into ``(cohort, label, features)`` tuples.

  Every item of ``flist`` is read by position: element 0 is the cohort key,
  element 1 is the label, and all remaining elements are stacked into a single
  NumPy array. The result list keeps the order of ``flist``.
  """
  return [(row[0], row[1], np.array(row[2:])) for row in flist]

In [None]:
# Pivot column names L01..L23.
_level_columns = ["L{:02d}".format(position) for position in range(1, 24)]

# First half of the per-hour query: MAX() over each level column collapses the
# sparse pivot rows of one (date_cell, label) group into a single row. The hour
# number is appended between q2a and q2b.
q2a = (
    "SELECT date_cell, label, "
    + ", ".join("MAX({0}) as {0}".format(column) for column in _level_columns)
    + " FROM pivot WHERE hour = "
)
q2b = " GROUP BY date_cell, label ORDER BY date_cell"

Map Hour 00

In [None]:
# Maps every hour 00-23 (not only hour 00): a single function and loop take the
# place of one hand-copied pair of cells per hour.
def hourly_cohort(hour):
  """Return the cohort tuples for one hour of the day.

  Runs ``q2a + hour + q2b`` so the pivot rows of that hour collapse into one
  row per (date_cell, label), registers the result as temp view ``flatHH``
  (e.g. ``flat07``), and hands the collected rows to ``cohort()``.

  :param hour: hour of the day, 0-23
  :type hour: int
  :return: list of ``(date_cell, label, features)`` tuples for that hour
  :rtype: list
  """
  hour_df = spark.sql(q2a + str(hour) + q2b)
  # The per-hour views are kept in case other cells query them by name.
  hour_df.createOrReplaceTempView("flat%02d" % hour)
  return cohort(hour_df.collect())

# Index h of this list holds the cohorts for hour h.
datecells_by_hour = [hourly_cohort(hour) for hour in range(24)]

# Per-hour names consumed by the cells that build the pandas frames.
(datecells00, datecells01, datecells02, datecells03, datecells04, datecells05,
 datecells06, datecells07, datecells08, datecells09, datecells10, datecells11,
 datecells12, datecells13, datecells14, datecells15, datecells16, datecells17,
 datecells18, datecells19, datecells20, datecells21, datecells22, datecells23) = datecells_by_hour

# Combine the features for each day into 24-hour sequences

In [None]:
# creating the dataframe
# One frame per hour: the cohort key, its label, and that hour's feature matrix
# in a column named after the hour (features00 .. features23). Distinct column
# names keep the later merges from adding _x/_y suffixes.
PD00 = pd.DataFrame(datecells00, columns = ['date_cell', 'target', 'features00'])
PD01 = pd.DataFrame(datecells01, columns = ['date_cell', 'target', 'features01'])
PD02 = pd.DataFrame(datecells02, columns = ['date_cell', 'target', 'features02'])
PD03 = pd.DataFrame(datecells03, columns = ['date_cell', 'target', 'features03'])
PD04 = pd.DataFrame(datecells04, columns = ['date_cell', 'target', 'features04'])
PD05 = pd.DataFrame(datecells05, columns = ['date_cell', 'target', 'features05'])
PD06 = pd.DataFrame(datecells06, columns = ['date_cell', 'target', 'features06'])
PD07 = pd.DataFrame(datecells07, columns = ['date_cell', 'target', 'features07'])
PD08 = pd.DataFrame(datecells08, columns = ['date_cell', 'target', 'features08'])
PD09 = pd.DataFrame(datecells09, columns = ['date_cell', 'target', 'features09'])
PD10 = pd.DataFrame(datecells10, columns = ['date_cell', 'target', 'features10'])
PD11 = pd.DataFrame(datecells11, columns = ['date_cell', 'target', 'features11'])
PD12 = pd.DataFrame(datecells12, columns = ['date_cell', 'target', 'features12'])
PD13 = pd.DataFrame(datecells13, columns = ['date_cell', 'target', 'features13'])
PD14 = pd.DataFrame(datecells14, columns = ['date_cell', 'target', 'features14'])
PD15 = pd.DataFrame(datecells15, columns = ['date_cell', 'target', 'features15'])
PD16 = pd.DataFrame(datecells16, columns = ['date_cell', 'target', 'features16'])
PD17 = pd.DataFrame(datecells17, columns = ['date_cell', 'target', 'features17'])
PD18 = pd.DataFrame(datecells18, columns = ['date_cell', 'target', 'features18'])
PD19 = pd.DataFrame(datecells19, columns = ['date_cell', 'target', 'features19'])
PD20 = pd.DataFrame(datecells20, columns = ['date_cell', 'target', 'features20'])
PD21 = pd.DataFrame(datecells21, columns = ['date_cell', 'target', 'features21'])
PD22 = pd.DataFrame(datecells22, columns = ['date_cell', 'target', 'features22'])
# Hour 23 gets its own column name; it must not reuse 'features22'.
PD23 = pd.DataFrame(datecells23, columns = ['date_cell', 'target', 'features23'])

In [None]:
# Join the 24 hourly frames, in hour order, on the cohort key and its label.
# pd.merge defaults to an inner join, so a (date_cell, target) pair survives
# only when it is present in every one of the 24 hours.
# Accumulating into a single name avoids keeping 22 partially merged frames
# alive; only the fully merged frame is needed afterwards.
allPD23 = PD00
for hourly_frame in (
    PD01, PD02, PD03, PD04, PD05, PD06, PD07, PD08, PD09, PD10, PD11, PD12,
    PD13, PD14, PD15, PD16, PD17, PD18, PD19, PD20, PD21, PD22, PD23,
):
    allPD23 = pd.merge(allPD23, hourly_frame, on=["date_cell","target"])
allPD23.head(3)

In [None]:
# Every distinct (date_cell, label) pair present in the pivot view; used below
# to split the cohort keys by label.
finalKeysDF = spark.sql("SELECT DISTINCT date_cell,label FROM pivot")

In [None]:
#finalKeysDF.show(5)

In [None]:
# date_cell keys whose label equals 1 (the hail cohorts).
finalHailKeys = [row[0] for row in finalKeysDF.collect() if row[1] == 1]

In [None]:
# Report the size and a short preview instead of dumping the whole key list
# into the notebook output.
print(len(finalHailKeys), "hail keys; first 10:", finalHailKeys[:10])

In [None]:
# date_cell keys whose label equals 0 (the no-hail cohorts).
finalNoHailKeys = [row[0] for row in finalKeysDF.collect() if row[1] == 0]

In [None]:
# Report the size and a short preview instead of dumping the whole key list
# into the notebook output.
print(len(finalNoHailKeys), "no-hail keys; first 10:", finalNoHailKeys[:10])

In [None]:
def save_cohorts(frame, keys, out_dir):
    """Write one ``<date_cell>.npy`` file per key into ``out_dir``.

    :param frame: merged frame with a ``date_cell`` column, the target, and the
        per-hour feature columns
    :param keys: date_cell values to export
    :param out_dir: destination directory path, ending with a path separator;
        created when it does not exist yet
    :return: the keys that were skipped because ``frame`` has no row for them
    :rtype: list
    """
    os.makedirs(out_dir, exist_ok=True)
    # Index once by date_cell so each key is a lookup rather than a full scan.
    # With date_cell as the index, to_numpy() yields the remaining columns only
    # (target followed by the feature columns).
    by_key = frame.set_index('date_cell')
    skipped = []
    for k in keys:
        if k not in by_key.index:
            # The merged frame is built with inner joins across all 24 hours, so
            # a key lacking any hour has no row; skip it rather than save an
            # empty array.
            skipped.append(k)
            continue
        outpath = out_dir + str(k) + '.npy'
        print("outputting file to:",outpath)
        # The rows hold array-valued cells, so the result is an object array.
        np.save(outpath, by_key.loc[[k]].to_numpy())
    if skipped:
        print("skipped", len(skipped), "keys with no row in the merged frame; first 10:", skipped[:10])
    return skipped

# Hail and no-hail cohorts go to separate directories under model_input.
hail_skipped = save_cohorts(allPD23, finalHailKeys, homedir + '/ERA5/model_input/hail/')
nohail_skipped = save_cohorts(allPD23, finalNoHailKeys, homedir + '/ERA5/model_input/nohail/')