In [1]:
from datetime import datetime

# Record the moment this notebook was executed (local clock, second resolution)
# and echo it so the saved output documents when the data were accessed.
date_accessed = format(datetime.now(), "%Y-%m-%d %H:%M:%S")
print(f"Date accessed: {date_accessed}")

Date accessed: 2024-10-12 22:40:28


In [2]:
import xarray as xr
import dask
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import os, sys, glob, re, time, math, calendar

In [30]:
# Shut down the Dask client and cluster if they exist.
# The names are only defined once the cluster-start cell has been executed, so
# each close is guarded: on a fresh kernel (Restart & Run All) this cell is a
# no-op instead of raising NameError.
if 'client' in globals():
    client.close()
if 'cluster' in globals():
    cluster.close()

In [29]:
import dask
import dask.distributed as dd
# (Re)start the local Dask cluster. When the cell is re-run, the client and
# cluster from the previous run are closed first.
if 'client' in vars():
    client.close()
    cluster.close()

# 12 worker processes; dashboard served on port 8787.
cluster = dd.LocalCluster(
    n_workers=12,
    dashboard_address=8787,
)
client = dd.Client(cluster)

In [None]:
# Preview: open every pressure-level (PRES*) file of one year as a single
# dataset, concatenated along 'valid_time', and display its structure.
year = 2020
files = glob.glob(f'/data/harish/Estimation-of-lidar-wind-speed-profiles-from-ERA5-inputs-using-TabNet/data/{year}/PRES*')
files.sort()
ds = xr.open_mfdataset(
    files,
    combine='nested',
    concat_dim='valid_time',
    parallel=True,
    # -1 requests a single chunk along each listed dimension
    chunks=dict(pressure_level=-1, latitude=-1, longitude=-1, valid_time=-1),
)
ds

In [None]:
# Preview: open every surface (SFC*) file of one year as a single dataset,
# concatenated along 'valid_time', and display its structure.
year = 2020
files = glob.glob(f'/data/harish/Estimation-of-lidar-wind-speed-profiles-from-ERA5-inputs-using-TabNet/data/{year}/SFC*')
files.sort()
ds = xr.open_mfdataset(
    files,
    combine='nested',
    concat_dim='valid_time',
    parallel=True,
    # -1 requests a single chunk along each listed dimension
    chunks=dict(pressure_level=-1, latitude=-1, longitude=-1, valid_time=-1),
)
ds

In [24]:
# Function to find the closest index in a 1D array
def find_nearest(array, value):
    """Return the index of the entry in ``array`` that lies closest to ``value``.

    ``array`` is converted with ``np.asarray``; the index is taken with
    ``argmin`` over the absolute differences, so ties resolve to the first
    occurrence and multi-dimensional input yields a flattened index.
    """
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()

def era5_pres_hourly(year, par, level, target_lat,target_lon,location,
                     data_dir='/data/harish/Estimation-of-lidar-wind-speed-profiles-from-ERA5-inputs-using-TabNet/data'):
    """Select one ERA5 pressure-level variable at the grid point nearest a site.

    Parameters
    ----------
    year : int
        Name of the sub-directory of ``data_dir`` whose ``PRES*`` files are opened.
    par : str
        Variable to read from the files (e.g. ``'u'``).
    level : int or float
        Value selected (exact match) along the ``pressure_level`` dimension.
    target_lat, target_lon : float
        Site coordinates; the nearest ``latitude``/``longitude`` grid point is used.
    location : str
        Label stored in a scalar ``location`` coordinate on the result.
    data_dir : str, optional
        Root directory containing one sub-directory per year. Defaults to the
        path this notebook was developed against.

    Returns
    -------
    xarray.DataArray
        The selected series, renamed to ``f'{par}_{level}'``. It is returned
        without being computed; callers call ``.compute()`` themselves.

    Raises
    ------
    FileNotFoundError
        If no ``PRES*`` file exists for ``year`` under ``data_dir``.
    """
    pattern = f'{data_dir}/{year}/PRES*'
    files = sorted(glob.glob(pattern))
    if not files:
        # Fail with the offending pattern instead of a generic "no files to open".
        raise FileNotFoundError(f'No ERA5 pressure-level files match {pattern}')

    hr_data = xr.open_mfdataset(files,combine='nested', concat_dim='valid_time', parallel=True,
                                chunks={'pressure_level': -1,'latitude': -1, 'longitude': -1, 'valid_time': -1})
    # Remove the 'number' and 'expver' variables when present; files without
    # them are accepted as-is.
    hr_data = hr_data.drop_vars(['number','expver'], errors='ignore')

    # Exact match on the pressure level, nearest-neighbour match on the grid.
    hr_cor_data = hr_data[par].sel(pressure_level=level).sel(latitude=target_lat,
                      longitude=target_lon, method='nearest').drop_vars('pressure_level')
    # Tag the series with the station id so results can later be concatenated
    # along 'location'.
    hr_cor_data['location'] = location
    hr_cor_data = hr_cor_data.rename(f'{par}_{level}')
    return hr_cor_data

def era5_sfc_hourly(year,par,target_lat,target_lon,location,
                    data_dir='/data/harish/Estimation-of-lidar-wind-speed-profiles-from-ERA5-inputs-using-TabNet/data'):
    """Select one ERA5 surface variable at the grid point nearest a site.

    Parameters
    ----------
    year : int
        Name of the sub-directory of ``data_dir`` whose ``SFC*`` files are opened.
    par : str
        Variable to read from the files (e.g. ``'t2m'``).
    target_lat, target_lon : float
        Site coordinates; the nearest ``latitude``/``longitude`` grid point is used.
    location : str
        Label stored in a scalar ``location`` coordinate on the result.
    data_dir : str, optional
        Root directory containing one sub-directory per year. Defaults to the
        path this notebook was developed against.

    Returns
    -------
    xarray.DataArray
        The selected series, keeping the variable's original name. It is
        returned without being computed; callers call ``.compute()`` themselves.

    Raises
    ------
    FileNotFoundError
        If no ``SFC*`` file exists for ``year`` under ``data_dir``.
    """
    pattern = f'{data_dir}/{year}/SFC*'
    files = sorted(glob.glob(pattern))
    if not files:
        # Fail with the offending pattern instead of a generic "no files to open".
        raise FileNotFoundError(f'No ERA5 surface files match {pattern}')

    hr_data = xr.open_mfdataset(files,combine='nested', concat_dim='valid_time', parallel=True,
                                chunks={'pressure_level': -1,'latitude': -1, 'longitude': -1, 'valid_time': -1})
    # Remove the 'number' and 'expver' variables when present; files without
    # them are accepted as-is.
    hr_data = hr_data.drop_vars(['number','expver'], errors='ignore')
    # Nearest-neighbour match on the latitude/longitude grid.
    hr_cor_data = hr_data[par].sel(latitude=target_lat,
                      longitude=target_lon, method='nearest')
    # Tag the series with the station id so results can later be concatenated
    # along 'location'.
    hr_cor_data['location'] = location
    return hr_cor_data

In [None]:
# Station table: only the 1st, 4th and 5th CSV columns are read. The extraction
# cells below access them as 'stid', 'lat [degrees]' and 'lon [degrees]'.
profiler_stations = pd.read_csv(
    'data/profiler_locations.csv',
    usecols=[0, 3, 4],
)
profiler_stations

# Extracting surface variables

In [None]:
# Surface variables to extract. For each one, a per-year file holding the
# series of every station is written to data/ERA5_variables/<par>/<year>.nc,
# then all yearly files in that folder are combined into
# data/ERA5_variables/<par>.nc.
par_names = ['u10', 'v10', 'u100', 'v100','zust','i10fg',
            't2m','skt','stl1','d2m','msl','blh','cbh',
            'ishf','ie','tcc','lcc','cape','cin','bld']
for par_name in par_names:
    # one folder per variable for the yearly files
    par_dir = f'data/ERA5_variables/{par_name}'
    os.makedirs(par_dir, exist_ok=True)
    # --- extract data at each year ---#
    for year in np.arange(2018,2019+1):
        datasets = []
        for loc in range(len(profiler_stations)):
            ds = era5_sfc_hourly(year, par_name,
                                 profiler_stations['lat [degrees]'][loc],
                                 profiler_stations['lon [degrees]'][loc],
                                 profiler_stations['stid'][loc])
            # materialise the station series before moving on to the next one
            datasets.append(ds.compute())
        # Concatenate the station series along the 'location' coordinate
        combined_dataset = xr.concat(datasets, dim='location')
        combined_dataset['year'] = year
        file_path = f'{par_dir}/{year}.nc'
        # remove any file left by a previous run before writing
        if os.path.exists(file_path):
            os.remove(file_path)
        combined_dataset.to_netcdf(file_path)
        del combined_dataset
        print(par_name,year)
    # --- combining all years data ---#
    # The context manager closes the yearly files once the combined file is
    # written, so their handles do not stay open across iterations and re-runs.
    with xr.open_mfdataset(f'{par_dir}/*.nc',
                           parallel=True) as ds:
        file_path = f'data/ERA5_variables/{par_name}.nc'
        if os.path.exists(file_path):
            os.remove(file_path)
        ds.to_netcdf(file_path)
    print(par_name)

# Extracting pressure level variables

In [None]:
# Pressure levels and variables to extract. For each (variable, level) pair, a
# per-year file holding the series of every station is written to
# data/ERA5_variables/<par>_<level>/<year>.nc, then all yearly files in that
# folder are combined into data/ERA5_variables/<par>_<level>.nc.
lvls = [1000,975,950]
# Only 't' is extracted by this cell.
# NOTE(review): the merge step later in the notebook also reads u_975, v_975,
# u_950 and v_950; set pars = ['u', 'v', 't'] if those files must be regenerated.
pars = ['t']
for par in pars:
    for level in lvls:
        # one folder per variable/level pair for the yearly files
        par_dir = f'data/ERA5_variables/{par}_{level}'
        os.makedirs(par_dir, exist_ok=True)
        # --- extract data at each year ---#
        for year in np.arange(2020,2023+1):
            datasets = []
            for loc in range(len(profiler_stations)):
                ds = era5_pres_hourly(year, par,level,
                                      profiler_stations['lat [degrees]'][loc],
                                      profiler_stations['lon [degrees]'][loc],
                                      profiler_stations['stid'][loc])
                # materialise the station series before moving on to the next one
                datasets.append(ds.compute())
            # Concatenate the station series along the 'location' coordinate
            combined_dataset = xr.concat(datasets, dim='location')
            combined_dataset['year'] = year
            file_path = f'{par_dir}/{year}.nc'
            # remove any file left by a previous run before writing
            if os.path.exists(file_path):
                os.remove(file_path)
            combined_dataset.to_netcdf(file_path)
            del combined_dataset
            print(par,level,year)
        # --- combining all years data ---#
        # The context manager closes the yearly files once the combined file is
        # written, so their handles do not stay open across iterations and re-runs.
        with xr.open_mfdataset(f'{par_dir}/*.nc',
                               parallel=True) as ds:
            file_path = f'data/ERA5_variables/{par}_{level}.nc'
            if os.path.exists(file_path):
                os.remove(file_path)
            ds.to_netcdf(file_path)
        print(par)

# Compute ML inputs derived from ERA5

In [5]:
def compute_wind_speed(par_name1,par_name2, par_name):
    """Compute a wind-speed magnitude from two component files.

    Reads ``data/ERA5_variables/{par_name1}.nc`` and
    ``data/ERA5_variables/{par_name2}.nc``, takes the variable named after each
    file, and returns ``sqrt(a**2 + b**2)`` renamed to ``par_name``.

    Parameters
    ----------
    par_name1, par_name2 : str
        Names of the two component variables (and of their files).
    par_name : str
        Name given to the resulting DataArray.

    Returns
    -------
    xarray.DataArray
        In-memory wind-speed magnitude named ``par_name``.
    """
    # load_dataset reads the file fully into memory and closes it, so no file
    # handle is left open after the call.
    ds1 = xr.load_dataset(f'data/ERA5_variables/{par_name1}.nc')
    ds2 = xr.load_dataset(f'data/ERA5_variables/{par_name2}.nc')
    speed = (ds1[par_name1]**2+ds2[par_name2]**2)**0.5
    return speed.rename(par_name)
def compute_alpha(dataset, par_name1,par_name2, par_name, height1=10, height2=100):
    """Compute the exponent ``ln(v2 / v1) / ln(height2 / height1)``.

    This is the exponent ``alpha`` for which
    ``v2 = v1 * (height2 / height1) ** alpha``.

    Parameters
    ----------
    dataset : mapping of name -> array-like
        Container indexed by variable name (an ``xarray.Dataset`` in this notebook).
    par_name1 : str
        Variable holding the value at ``height1``.
    par_name2 : str
        Variable holding the value at ``height2``.
    par_name : str
        Name given to the result.
    height1, height2 : float, optional
        Heights associated with ``par_name1`` and ``par_name2``. The defaults
        (10 and 100) reproduce the values previously hard-coded here.

    Returns
    -------
    Same array type as ``dataset[par_name2]``, renamed to ``par_name``.
    Entries where either input is zero come out as non-finite (inf/NaN), as
    dictated by ``np.log`` and division.
    """
    speed_ratio = dataset[par_name2]/dataset[par_name1]
    alpha = np.log(speed_ratio)/np.log(height2/height1)
    return alpha.rename(par_name)
def compute_gradient(dataset, par_name1,par_name2, par_name):
    """Return ``dataset[par_name2] - dataset[par_name1]`` renamed to ``par_name``.

    Note the order: the first-named variable is subtracted from the second.
    """
    upper = dataset[par_name2]
    lower = dataset[par_name1]
    return (upper - lower).rename(par_name)

In [19]:
# Assemble every model input into one dataset and save it as data/ERA5.nc.
# Variables are merged one at a time, in this order: wind speeds, the
# variables read straight from file, then the derived quantities.
combined_dataset = xr.Dataset()

# --- wind speeds: (first component, second component, output name) ---#
wind_speed_specs = [
    ('u10', 'v10', '10ws'),        # 10m wind
    ('u100', 'v100', '100ws'),     # 100m wind
    ('u_975', 'v_975', '975ws'),   # 975 wind
    ('u_950', 'v_950', '950ws'),   # 950 wind
]
for u_name, v_name, ws_name in wind_speed_specs:
    ds = compute_wind_speed(u_name, v_name, ws_name)
    combined_dataset = xr.merge([combined_dataset, ds])

# --- variables taken from their per-variable files unchanged ---#
par_names = ['zust','i10fg',
            't2m','skt','stl1','d2m','msl','blh','cbh',
            'ishf','ie','tcc','lcc','cape','cin','bld','t_975','t_950']

for par_name in par_names:
    file_path = f'data/ERA5_variables/{par_name}.nc'
    # load_dataset reads the file into memory and closes it, so no handle on
    # the per-variable file stays open.
    ds = xr.load_dataset(file_path)
    combined_dataset = xr.merge([combined_dataset, ds])
    print(par_name)

# === derived parameters === #
# --- 100 alpha ---#
ds = compute_alpha(combined_dataset,'10ws','100ws','100alpha')
combined_dataset = xr.merge([combined_dataset, ds])

# --- differences: (subtracted variable, variable subtracted from, output name) ---#
gradient_specs = [
    ('100ws', '975ws', '975wsgrad'),      # 975 wind gradient
    ('975ws', '950ws', '950wsgrad'),      # 950 wind gradient
    ('skt', 't2m', '2mtempgrad'),         # 2m temperature gradient
    ('stl1', 'skt', 'sktempgrad'),        # skin temperature gradient
    ('d2m', 't2m', 'dewtempsprd'),        # temperature dewpoint spread
    ('t_975', 't_950', '950tempgrad'),    # 950 temperature gradient
    ('t2m', 't_975', '975tempgrad'),      # 975 temperature gradient
]
for first_name, second_name, grad_name in gradient_specs:
    ds = compute_gradient(combined_dataset, first_name, second_name, grad_name)
    combined_dataset = xr.merge([combined_dataset, ds])

# === save file ===#
file_path = 'data/ERA5.nc'
# remove any file left by a previous run before writing
if os.path.exists(file_path):
    os.remove(file_path)
combined_dataset.to_netcdf(file_path)

zust
i10fg
t2m
skt
stl1
d2m
msl
blh
cbh
ishf
ie
tcc
lcc
cape
cin
bld
t_975
t_950


In [20]:
# Display the merged dataset (rich repr) to inspect its variables and coordinates.
combined_dataset