In [1]:
from datetime import datetime

# Record when this notebook was run (local clock, one-second resolution)
run_started = datetime.now()
date_accessed = run_started.strftime("%Y-%m-%d %H:%M:%S")
print(f"Date accessed: {date_accessed}")

Date accessed: 2025-03-10 17:11:56


- Prior to this notebook, the ERA5 input variables at the specific locations are extracted using the following scripts:
    -   Extrating_ERA5_variables.py
    -   combining_yearly_ERA5.py, and  
    -   run_all_ERA5_extraction.sh 

In [2]:
import xarray as xr
import dask
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import os, sys, glob, re, time, math, calendar

- Execute the following scripts only if you intend to begin from scratch. 
- If you intend to add/compute new features, start from [add new features](#adding-new-features)

# Compute ML inputs derived from ERA

In [7]:
def compute_wind_speed(par_name1,par_name2, par_name, data_dir='data/ERA5_variables', verbose=True):
    """Compute the wind-speed magnitude from two component files.

    Reads ``{data_dir}/{par_name1}.nc`` and ``{data_dir}/{par_name2}.nc``,
    takes the variable named after each file, and returns
    ``sqrt(par_name1**2 + par_name2**2)``.

    Parameters
    ----------
    par_name1, par_name2 : str
        Names of the two component variables; each is both the file stem and
        the variable name inside that file.
    par_name : str
        Name given to the returned DataArray.
    data_dir : str, optional
        Directory holding the per-variable netCDF files.
    verbose : bool, optional
        When True (default), print each dataset after it is loaded.

    Returns
    -------
    xarray.DataArray
        The magnitude, named ``par_name``.
    """
    def _load_component(name):
        # load_dataset reads the whole file into memory and closes it, so no
        # file handle is left open behind the returned data.
        # The former chunks={'lat', 'lon', 'time'} argument is dropped: the
        # dataset printed by this function has dims (location, valid_time), so
        # none of those keys applied -- NOTE(review): confirm for other inputs.
        component_ds = xr.load_dataset(f'{data_dir}/{name}.nc')
        if verbose:
            print(component_ds)
        return component_ds[name]

    first = _load_component(par_name1)
    second = _load_component(par_name2)
    magnitude = (first**2 + second**2)**0.5
    return magnitude.rename(par_name)
def compute_alpha(dataset, par_name1,par_name2, par_name, height1=10, height2=100):
    """Compute the exponent ``alpha`` in ``v2 / v1 == (height2 / height1) ** alpha``.

    Evaluates ``log(dataset[par_name2] / dataset[par_name1]) / log(height2 / height1)``.

    Parameters
    ----------
    dataset : mapping of name -> array-like
        Container indexed by variable name (e.g. an xarray Dataset).
    par_name1 : str
        Variable at ``height1`` (the denominator of the ratio).
    par_name2 : str
        Variable at ``height2`` (the numerator of the ratio).
    par_name : str
        Name given to the result.
    height1, height2 : float, optional
        The two heights, in the same unit. The defaults (10 and 100) reproduce
        the previously hard-coded ``log(100/10)`` denominator.

    Returns
    -------
    Same array type as ``dataset[par_name2]``, renamed to ``par_name``.
    A zero in ``dataset[par_name1]`` or ``dataset[par_name2]`` yields a
    non-finite value, as before.
    """
    ratio = dataset[par_name2]/dataset[par_name1]
    alpha = np.log(ratio)/np.log(height2/height1)
    return alpha.rename(par_name)
def compute_gradient(dataset, par_name1,par_name2, par_name):
    """Return ``dataset[par_name2] - dataset[par_name1]``, renamed to ``par_name``.

    Note the operand order: the *second* name is the minuend, the *first*
    name is subtracted from it.
    """
    minuend, subtrahend = dataset[par_name2], dataset[par_name1]
    return (minuend - subtrahend).rename(par_name)

def compute_second_derivative(dataset, par_name1,par_name2,par_name3, par_name):
    """Return the three-point second difference, renamed to ``par_name``.

    The value is ``dataset[par_name3] - 2*dataset[par_name2] + dataset[par_name1]``,
    i.e. the outer two variables minus twice the middle one.
    """
    first, middle, last = (dataset[key] for key in (par_name1, par_name2, par_name3))
    second_difference = last - 2*middle + first
    return second_difference.rename(par_name)


- Previously, the wind components u and v were not added to the dataset; only their magnitudes were present.
- Now, in addition to the magnitudes, the u and v components are also added, in case they need to be provided as inputs.

In [8]:
combined_dataset = xr.Dataset()

par_names = ['u10','v10','u100','v100','u_1000','v_1000','u_975','v_975','u_950','v_950',
             'zust','i10fg',
            't2m','skt','stl1','d2m','msl','blh','cbh',
            'ishf','ie','tcc','lcc','cape','cin','bld','t_1000','t_975','t_950']

# Read every per-variable file fully into memory, then merge once at the end.
# Merging inside the loop re-copies the growing dataset on every iteration.
loaded_datasets = []
for par_name in par_names:
    file_path = f'data/ERA5_variables/{par_name}.nc'
    # The context manager closes the file as soon as its data is in memory,
    # so no handles stay open after this cell.
    with xr.open_dataset(file_path) as ds:
        loaded_datasets.append(ds.load())
    print(par_name)

# The empty dataset stays first so the merged result's global attributes are
# taken from it, exactly as when it seeded the incremental merge.
combined_dataset = xr.merge([combined_dataset, *loaded_datasets])

u10
v10
u100
v100
u_1000
v_1000
u_975
v_975
u_950
v_950
zust
i10fg
t2m
skt
stl1
d2m
msl
blh
cbh
ishf
ie
tcc
lcc
cape
cin
bld
t_1000
t_975
t_950


In [10]:
# === derived parameters === #

# One row per derived feature: (builder, argument names). The last name in each
# row is the name of the new variable. Rows are processed strictly in order,
# because later rows read variables that earlier rows add to combined_dataset.
#
# compute_gradient(ds, a, b, ...) yields b - a, so as written:
#   1000wsgrad = 100ws - 1000ws   and   975wsgrad = 975ws - 100ws
# NOTE(review): this presumably treats the 100 m level as lying between the
# 1000 and 975 levels -- confirm this ordering is intended.
derived_feature_specs = [
    # --- wind speed magnitudes (10m, 100m, 1000, 975, 950) ---#
    (compute_wind_speed, ('u10', 'v10', '10ws')),
    (compute_wind_speed, ('u100', 'v100', '100ws')),
    (compute_wind_speed, ('u_1000', 'v_1000', '1000ws')),
    (compute_wind_speed, ('u_975', 'v_975', '975ws')),
    (compute_wind_speed, ('u_950', 'v_950', '950ws')),
    # --- 100 alpha ---#
    (compute_alpha, ('10ws', '100ws', '100alpha')),
    # --- wind gradients (1000, 975, 950, 1000 to 950) ---#
    (compute_gradient, ('1000ws', '100ws', '1000wsgrad')),
    (compute_gradient, ('100ws', '975ws', '975wsgrad')),
    (compute_gradient, ('975ws', '950ws', '950wsgrad')),
    (compute_gradient, ('1000ws', '950ws', '1000to950wsgrad')),
    # --- 1000 to 950 wind second derivative ---#
    (compute_second_derivative, ('1000ws', '975ws', '950ws', '1000to950wssecondgrad')),
    # --- skin temperature gradient ---#
    (compute_gradient, ('stl1', 'skt', 'sktempgrad')),
    # --- 2m temperature gradient ---#
    (compute_gradient, ('skt', 't2m', '2mtempgrad')),
    # --- Temperature dewpoint spread ---#
    (compute_gradient, ('d2m', 't2m', 'dewtempsprd')),
    # --- temperature gradients (1000, 975, 950, 1000 to 950) ---#
    (compute_gradient, ('t2m', 't_1000', '1000tempgrad')),
    (compute_gradient, ('t_1000', 't_975', '975tempgrad')),
    (compute_gradient, ('t_975', 't_950', '950tempgrad')),
    (compute_gradient, ('t_1000', 't_950', '1000to950tempgrad')),
    # --- 1000 to 950 temperature second derivative ---#
    (compute_second_derivative, ('t_1000', 't_975', 't_950', '1000to950tempsecondgrad')),
]

for feature_builder, feature_args in derived_feature_specs:
    if feature_builder is compute_wind_speed:
        # Wind speed is computed from the per-variable files, not from combined_dataset.
        ds = feature_builder(*feature_args)
    else:
        ds = feature_builder(combined_dataset, *feature_args)
    combined_dataset = xr.merge([combined_dataset, ds])

<xarray.Dataset> Size: 5MB
Dimensions:     (location: 18, valid_time: 52584)
Coordinates:
  * valid_time  (valid_time) datetime64[ns] 421kB 2018-01-01 ... 2023-12-31T2...
    latitude    (location) float64 144B 42.75 42.75 43.75 ... 44.25 40.75 43.25
    longitude   (location) float64 144B -73.75 -73.75 -76.0 ... -73.5 -77.5
  * location    (location) <U9 648B 'PROF_ALB2' 'PROF_ALBA' ... 'PROF_WEBS'
    year        (valid_time) int64 421kB 2018 2018 2018 2018 ... 2023 2023 2023
Data variables:
    u10         (location, valid_time) float32 4MB 3.169 3.012 ... -2.568 -2.74
<xarray.Dataset> Size: 5MB
Dimensions:     (location: 18, valid_time: 52584)
Coordinates:
  * valid_time  (valid_time) datetime64[ns] 421kB 2018-01-01 ... 2023-12-31T2...
    latitude    (location) float64 144B 42.75 42.75 43.75 ... 44.25 40.75 43.25
    longitude   (location) float64 144B -73.75 -73.75 -76.0 ... -73.5 -77.5
  * location    (location) <U9 648B 'PROF_ALB2' 'PROF_ALBA' ... 'PROF_WEBS'
    year        (v

In [11]:
# === save file ===#
file_path = 'data/ERA5.nc'
# Delete any previous copy first so the write always starts from a fresh file.
stale_copy_exists = os.path.exists(file_path)
if stale_copy_exists:
    os.remove(file_path)
combined_dataset.to_netcdf(path=file_path)

# Adding new features

In [12]:
# This cell overwrites data/ERA5.nc at the end, so the file must not still be
# lazily open when that happens: load_dataset reads everything into memory and
# closes the file. Removing/rewriting a file that a lazily opened dataset still
# reads from fails on platforms that lock open files and is fragile elsewhere.
ERA5 = xr.load_dataset('data/ERA5.nc')
combined_dataset = ERA5

# Hour of day and day of year for every timestamp on the valid_time axis
HR =  pd.to_datetime(ERA5.valid_time).hour
JDAY  = pd.to_datetime(ERA5.valid_time).dayofyear

# Cyclical encodings: trig(2*pi / period * value).
# (feature name, trig function, period, values)
# The day-of-year period is fixed at 366 for every year, leap or not.
cyclical_feature_specs = [
    ('sinHR', np.sin, 24, HR),        # --- sin converted hour ---#
    ('cosHR', np.cos, 24, HR),        # --- cosine converted hour ---#
    ('sinJDAY', np.sin, 366, JDAY),   # --- sin converted day ---#
    ('cosJDAY', np.cos, 366, JDAY),   # --- cosine converted day ---#
]
for feature_name, trig, period, time_values in cyclical_feature_specs:
    ds = xr.DataArray(trig(2 * np.pi / period * time_values), coords=[ERA5.valid_time], dims=['valid_time'], name=feature_name)
    combined_dataset = xr.merge([combined_dataset, ds])

# === save file ===#
file_path = 'data/ERA5.nc'
if os.path.exists(file_path):
    os.remove(file_path)
combined_dataset.to_netcdf(file_path)

In [13]:
# Re-open the saved file to inspect the final set of variables.
# Both names are bound to the same opened dataset.
combined_dataset = ERA5 = xr.open_dataset('data/ERA5.nc')
combined_dataset