In [1]:
# CDS API

import cdsapi



# Libraries for working with multi-dimensional arrays

import xarray as xr

import pandas as pd

import numpy as np

import os

# Forecast verification metrics with xarray

import xskillscore as xs



# Date and calendar libraries

from dateutil.relativedelta import relativedelta

import calendar



# Libraries for plotting and geospatial data visualisation

from matplotlib import pyplot as plt

import cartopy.crs as ccrs

import cartopy.feature as cfeature



# Silence every Python warning for the rest of the session (originally added to
# quiet the CDS API download and matplotlib).
# TODO(review): decide whether a blanket filter is really needed; a narrower
# filter (by category or module) would keep other warnings visible.

import warnings
import re
import os

# No category/module argument is given, so this ignores all warnings.
warnings.filterwarnings('ignore')

In [2]:
# Load the ERA5 reference GRIB file that the hindcasts are verified against.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
file_path = "/home/mohamed/EHTPIII/MODELISATION/DATA/DATASET/IN/rea/era5_monthly_stmonth_RR_1993_2016.grib"
# cfgrib is the xarray backend used to decode GRIB.
era5_1deg = xr.open_dataset(file_path, engine="cfgrib")
# Bare last expression: renders the dataset summary as the cell output.
era5_1deg

Ignoring index file '/home/mohamed/EHTPIII/MODELISATION/DATA/DATASET/IN/rea/era5_monthly_stmonth_RR_1993_2016.grib.5b7b6.idx' incompatible with GRIB file


In [3]:
# Harmonise the ERA5 dataset with the naming used for the hindcasts further down:
# short lat/lon names, 'start_date' instead of 'time', and 'valid_time' as the
# dimension the data are indexed by.
# NOTE(review): this cell overwrites `era5_1deg`, so re-running it without first
# re-running the load cell will presumably fail on the rename - confirm.
era5_renames = {'latitude':'lat','longitude':'lon','time':'start_date'}
era5_1deg = era5_1deg.rename(era5_renames)
era5_1deg = era5_1deg.swap_dims({'start_date':'valid_time'})

# Normalise the valid_time stamps to midnight and write them back as the coordinate.
valid_time = pd.to_datetime(era5_1deg.valid_time)
valid_time_normalized = valid_time.normalize()
era5_1deg["valid_time"] = valid_time_normalized

# Anomalies are taken relative to the mean over *all* valid times
# (a single overall mean, not a per-calendar-month climatology).
era5_mean = era5_1deg.mean("valid_time", skipna=True)
era_anom = era5_1deg - era5_mean

In [4]:
# -----------------------------------------------------------------------------
# Hindcast verification loop.
#
# For every GRIB hindcast file found in DATAIN this cell:
#   1. reads the hindcast and converts the precipitation rate to monthly totals,
#   2. builds 1-month and 3-month (rolling mean) fields and their anomalies,
#      and saves them as netCDF files in DATAOUT,
#   3. prepares matching 1-month / 3-month observations from `era5_1deg` and
#      `era_anom` (defined in the cells above),
#   4. scores the hindcast against the observations per forecastMonth
#      (Spearman correlation, its square, Pearson correlation of anomalies,
#      and the two p-values) and saves the scores in SCOREDIR.
#
# `era5_1deg` and `era_anom` are only read here, never reassigned: each file gets
# its own filtered copy (`obs` / `obs_anom`). Filtering them in place would make
# every file see only the observations that survived the previous files' start
# month cut-off, so the scores would depend on the order of os.listdir().
# -----------------------------------------------------------------------------

# NOTE(review): `join` is not used in this cell; the import is kept (hoisted out
# of the loop) in case later cells rely on the name.
from os.path import join

DATAIN = "/home/mohamed/EHTPIII/MODELISATION/DATA/DATASET/IN/grib"
DATAOUT = "/home/mohamed/EHTPIII/MODELISATION/DATA/DATASET/OUT_2"
SCOREDIR = DATAOUT + '/SF/scores'

# Seconds in a nominal 30-day month, used to turn a rate into a monthly amount.
# NOTE(review): every month is treated as 30 days long - confirm this
# approximation is acceptable for the scores you need.
time_interval_seconds = 30 * 86400

# Sorted so the files are always processed (and logged) in the same order.
grib_files = sorted(file for file in os.listdir(DATAIN) if file.endswith('.grib'))

for file in grib_files:

    # The start month is encoded in the file name as "monthly_mean_<digits>_".
    match = re.search(r"monthly_mean_(\d+)_", file)
    if match is None:
        raise ValueError(f'Cannot extract the start month from file name {file!r}')
    start = int(match.group(1))

    config = dict(
        list_vars = ['tp', ],
        hcstarty = 1993,
        hcendy = 2016,
        start_month = start,
    )

    PLOTSDIR = DATAOUT + f'/SF/plots/stmonth{config["start_month"]:02d}'

    # Create any missing input/output folder and report it.
    for directory in [DATAIN, SCOREDIR, PLOTSDIR]:
        if not os.path.exists(directory):
            os.makedirs(directory)
            print(f'Creating folder {directory}')

    # Name of the start-date dimension in the GRIB file ('isLagged' is not set
    # in `config`, so this resolves to 'time').
    st_dim_name = 'time' if not config.get('isLagged',False) else 'indexing_time'

    print('Reading HCST data from file')

    # available_files = ["ukmo_602", "meteo_france_8", "ecmwf_51", "eccc_3", "eccc_2", "dwd_21", "cmcc_35"]

    hcst_fname = DATAIN + f'/{file}'
    hcst_bname = file.split(".grib")[0]

    # The variable to discard is named differently in the ECMWF files.
    unwanted_var = "t2m" if "ecmwf" in hcst_bname else "p167"
    hcst = xr.open_dataset(
        hcst_fname,
        engine='cfgrib',
        backend_kwargs=dict(time_dims=('forecastMonth', st_dim_name)),
        drop_variables=unwanted_var,
    )

    # Rate -> amount over the nominal month, then x1000 (presumably m -> mm, as
    # the units are set to 'mm' right after - confirm the source units).
    hcst['tprate'].values = hcst['tprate'].values*time_interval_seconds*1000
    hcst.attrs['units'] = 'mm'
    hcst = hcst.rename({"tprate":"tp"})
    # Also update the variable's own units so they match the converted values
    # (previously only the dataset-level attribute was changed).
    hcst['tp'].attrs['units'] = 'mm'

    # Force dask arrays, chunked along lead time, latitude and longitude.
    hcst = hcst.chunk({'forecastMonth':1, 'latitude':'auto', 'longitude':'auto'})
    hcst = hcst.rename({'latitude':'lat','longitude':'lon', st_dim_name:'start_date'})

    print ('Re-arranging time metadata in xr.Dataset object')

    # Add start_month to the xr.Dataset
    start_month = pd.to_datetime(hcst.start_date.values[0]).month
    hcst = hcst.assign_coords({'start_month':start_month})

    # Add valid_time to the xr.Dataset: start_date shifted by (forecastMonth - 1) months
    vt = xr.DataArray(dims=('start_date','forecastMonth'), coords={'forecastMonth':hcst.forecastMonth,'start_date':hcst.start_date})
    vt.data = [[pd.to_datetime(std)+relativedelta(months=fcmonth-1) for fcmonth in vt.forecastMonth.values] for std in vt.start_date.values]
    hcst = hcst.assign_coords(valid_time=vt)

    # CALCULATE 3-month AGGREGATIONS
    # NOTE rolling() assigns the label to the end of the N month period, so the
    # entries with forecastMonth < 3 do not cover a full window and are dropped.
    print('Computing 3-month aggregation')
    hcst_3m = hcst.rolling(forecastMonth=3,min_periods=1).mean(skipna=True)
    hcst_3m = hcst_3m.where(hcst_3m.forecastMonth>=3,drop=True)

    # CALCULATE ANOMALIES (and save to file)
    reference_period = '{hcstarty}-{hcendy}'.format(**config)

    print('Computing anomalies 1m')
    # Climatology = mean over ensemble members and start dates.
    hcmean = hcst.mean(['number','start_date'],skipna=True)
    anom = hcst - hcmean
    anom = anom.assign_attrs(reference_period=reference_period)

    # Full (non-anomaly) fields, saved alongside the anomalies for the rank correlation.
    hcst_2 = hcst.assign_attrs(reference_period=reference_period)
    hcst_2_3m = hcst_2.rolling(forecastMonth=3,min_periods=1).mean(skipna=True)
    hcst_2_3m = hcst_2_3m.where(hcst_2_3m.forecastMonth>=3,drop=True)

    print('Computing anomalies 3m')
    hcmean_3m = hcst_3m.mean(['number','start_date'],skipna=True)
    anom_3m = hcst_3m - hcmean_3m
    anom_3m = anom_3m.assign_attrs(reference_period=reference_period)

    print('Saving anomalies 1m/3m to netCDF files')
    anom.to_netcdf(f'{DATAOUT}/{hcst_bname}.1m.RR.anom.nc')
    hcst_2.to_netcdf(f'{DATAOUT}/{hcst_bname}.1m.RR.hcst_2.nc')
    hcst_2_3m.to_netcdf(f'{DATAOUT}/{hcst_bname}.3m.RR.hcst_2.nc')
    anom_3m.to_netcdf(f'{DATAOUT}/{hcst_bname}.3m.RR.anom.nc')

    # ------------------------------------------------------------------
    # OBSERVATIONS for this start month (per-iteration copies only)
    # ------------------------------------------------------------------
    # Lead month of every observed valid_time relative to this start month:
    # 1 for the start month itself, wrapping around the year (1..12).
    fcmonths = [
        (t.month - config['start_month']) % 12 + 1
        for t in pd.to_datetime(era5_1deg.valid_time.values)
    ]

    # Drop obs values not needed (earlier than first start date) - this is useful
    # to create well shaped 3-month aggregations from obs.
    first_valid_time = np.datetime64('{hcstarty}-{start_month:02d}-01'.format(**config))

    obs = era5_1deg.assign_coords(forecastMonth=('valid_time',fcmonths))
    obs = obs.where(obs.valid_time>=first_valid_time,drop=True)

    obs_anom = era_anom.assign_coords(forecastMonth=('valid_time',fcmonths))
    obs_anom = obs_anom.where(obs_anom.valid_time>=first_valid_time,drop=True)

    # CALCULATE 3-month AGGREGATIONS
    # NOTE rolling() assigns the label to the end of the N month period
    # NOTE care should be taken with the data available in the "obs" xr.Dataset
    #      so the rolling mean (over valid_time) is meaningful
    print('Calculate observation 3-monthly aggregations')
    era_anom_3m = obs_anom.rolling(valid_time=3,min_periods=1).mean(skipna=True)
    era_anom_3m = era_anom_3m.where(era_anom_3m.forecastMonth>=3)

    era5_1deg_3m = obs.rolling(valid_time=3,min_periods=1).mean(skipna=True)
    era5_1deg_3m = era5_1deg_3m.where(era5_1deg_3m.forecastMonth>=3)

    # 'forecastMonth' was only needed to mask the rolling means; remove it so it
    # does not clash with the hindcast coordinate of the same name.
    obs = obs.drop_vars('forecastMonth')
    obs_anom = obs_anom.drop_vars('forecastMonth')
    era5_1deg_3m = era5_1deg_3m.drop_vars('forecastMonth')
    era_anom_3m = era_anom_3m.drop_vars('forecastMonth')

    # ------------------------------------------------------------------
    # DETERMINISTIC SCORES, one pass per aggregation
    # ------------------------------------------------------------------
    obs_by_aggr = {
        '1m': (obs, obs_anom),
        '3m': (era5_1deg_3m, era_anom_3m),
    }

    for aggr, (o, o_anom) in obs_by_aggr.items():

        print(f'Computing deterministic scores for {aggr}-aggregation')

        # Read back the files written above. The context manager closes them once
        # the scores are saved, so the same paths can be rewritten on a re-run.
        anom_path = f'{DATAOUT}/{hcst_bname}.{aggr}.RR.anom.nc'
        full_path = f'{DATAOUT}/{hcst_bname}.{aggr}.RR.hcst_2.nc'

        with xr.open_dataset(anom_path) as anom_ds, xr.open_dataset(full_path) as full_ds:
            h_anom, h = anom_ds, full_ds

            if "tp" not in h.variables:
                # Fall back to the 'p228' name and rename it to 'tp'.
                h_anom = h_anom.rename({'p228': 'tp'})
                h = h.rename({'p228': 'tp'})

            is_fullensemble = 'number' in h_anom.dims

            l_corr = list()
            l_acc = list()
            l_corr_pval = list()
            l_acc_pval = list()

            for this_fcmonth in h.forecastMonth.values:
                print(f'forecastMonth={this_fcmonth}')
                # Index the hindcast by valid_time so it can be matched with the obs.
                thishcst_anom = h_anom.sel(forecastMonth=this_fcmonth).swap_dims({'start_date': 'valid_time'})
                thishcst = h.sel(forecastMonth=this_fcmonth).swap_dims({'start_date': 'valid_time'})

                # Keep only the observations valid at the hindcast valid times.
                thisobs_anom = o_anom.where(o_anom.valid_time == thishcst_anom.valid_time, drop=True)
                thisobs = o.where(o.valid_time == thishcst.valid_time, drop=True)

                # Align the forecast and observation data along all common dimensions
                thishcst_em, thisobs_aligned = xr.align(thishcst, thisobs, join='inner')
                thishcst_em_anom, thisobs_aligned_anom = xr.align(thishcst_anom, thisobs_anom, join='inner')

                # If it's a full ensemble, take the mean over the 'number' dimension
                if is_fullensemble:
                    thishcst_em = thishcst_em.mean('number',skipna=True)
                    thishcst_em_anom = thishcst_em_anom.mean('number',skipna=True)

                l_corr.append(xs.spearman_r(thishcst_em, thisobs_aligned, dim='valid_time'))
                l_acc.append(xs.pearson_r(thishcst_em_anom, thisobs_aligned_anom, dim='valid_time'))
                l_corr_pval.append(xs.spearman_r_p_value(thishcst_em, thisobs_aligned, dim='valid_time'))
                l_acc_pval.append(xs.pearson_r_p_value(thishcst_em_anom, thisobs_aligned_anom, dim='valid_time'))

            print(f'Concatenating (by fcmonth) correlation for {aggr}-aggregation')
            corr = xr.concat(l_corr, dim='forecastMonth')
            rsquared = corr ** 2
            corr_pval = xr.concat(l_corr_pval, dim='forecastMonth')
            acc = xr.concat(l_acc,dim='forecastMonth')
            acc_pval = xr.concat(l_acc_pval,dim='forecastMonth')

            print(f'Saving to netCDF file correlation for {aggr}-aggregation')
            corr.to_netcdf(f'{SCOREDIR}/{hcst_bname}.{aggr}.RR.corr.nc')
            corr_pval.to_netcdf(f'{SCOREDIR}/{hcst_bname}.{aggr}.RR.corr_pval.nc')
            rsquared.to_netcdf(f'{SCOREDIR}/{hcst_bname}.{aggr}.RR.rsquared.nc')

            acc.to_netcdf(f'{SCOREDIR}/{hcst_bname}.{aggr}.RR.acc.nc')
            acc_pval.to_netcdf(f'{SCOREDIR}/{hcst_bname}.{aggr}.RR.acc_pval.nc')

Creating folder /home/mohamed/EHTPIII/MODELISATION/DATA/DATASET/OUT_2/SF/scores
Creating folder /home/mohamed/EHTPIII/MODELISATION/DATA/DATASET/OUT_2/SF/plots/stmonth11
Reading HCST data from file
Re-arranging time metadata in xr.Dataset object
Computing 3-month aggregation
Computing anomalies 1m
Computing anomalies 3m
Saving anomalies 1m/3m to netCDF files
Calculate observation 3-monthly aggregations
Computing deterministic scores for 1m-aggregation
forecastMonth=2
forecastMonth=3
forecastMonth=4
Concatenating (by fcmonth) correlation for 1m-aggregation
Saving to netCDF file correlation for 1m-aggregation
Computing deterministic scores for 3m-aggregation
forecastMonth=3
forecastMonth=4
Concatenating (by fcmonth) correlation for 3m-aggregation
Saving to netCDF file correlation for 3m-aggregation
Creating folder /home/mohamed/EHTPIII/MODELISATION/DATA/DATASET/OUT_2/SF/plots/stmonth02
Reading HCST data from file
Re-arranging time metadata in xr.Dataset object
Computing 3-month aggregatio