In [1]:
import copy
import glob
import gzip
import json
import os
import shutil
import sys
import time
import warnings
import zipfile
from datetime import date

import cdsapi
import h5py
import numpy
import numpy as np
import pandas as pd
import urllib3
import xarray as xr

sys.path.append(os.getcwd() + "/../cds-backend/code/")
import cds_eua4 as eua

warnings.filterwarnings("ignore")

sys.path.insert(0, os.getcwd() + "/../resort/rasotools-master/")
import matplotlib
import matplotlib.font_manager as font_manager
import matplotlib.pylab as plt
import matplotlib.pyplot as maplt
import rasotools
import seaborn

# Global matplotlib defaults for every figure drawn in this notebook.
matplotlib.rcParams["figure.figsize"] = (20, 10)
font = {
    # "normal" is not a font family matplotlib can resolve: it made findfont
    # report "Font family ['normal'] not found" and fall back to its default
    # font. Naming the generic "sans-serif" family avoids the failed lookup.
    "family": "sans-serif",
    # 'weight' : 'bold',
    "size": 22,
}
# Sets font.family and font.size in one place. The former separate
# `font.size = 20` update was overridden by this call, so it has been dropped.
matplotlib.rc("font", **font)

In [2]:
def find_nearest(array, value):
    """Return the element of ``array`` with the smallest absolute distance to ``value``.

    ``array`` is converted with ``np.asarray``. When several elements are
    equally close, the first one (lowest index) is returned.
    """
    candidates = np.asarray(array)
    distance = np.abs(candidates - value)
    return candidates[distance.argmin()]


def datetime_to_seconds(dates, ref="1900-01-01T00:00:00"):
    """Convert datetime64 value(s) to whole seconds elapsed since ``ref``.

    Parameters
    ----------
    dates : numpy datetime64 scalar or array
    ref : str
        Reference instant, parsed with ``np.datetime64``
        (default 1900-01-01 00:00:00).

    Returns
    -------
    The elapsed seconds cast to ``np.int64`` (same shape as ``dates``).
    """
    epoch = np.datetime64(ref)
    one_second = np.timedelta64(1, "s")
    elapsed = (dates - epoch) / one_second
    return elapsed.astype(np.int64)


def seconds_to_datetime(seconds, ref="1900-01-01"):
    """Convert seconds counted from ``ref`` into pandas datetimes.

    ``seconds`` is passed through ``np.asarray`` and handed to
    ``pd.to_datetime`` with ``unit="s"`` and ``origin=ref``. Because of
    ``errors="coerce"``, values that cannot be converted become ``NaT``
    instead of raising.
    """
    return pd.to_datetime(
        np.asarray(seconds),
        unit="s",
        origin=ref,
        errors="coerce",
    )

In [3]:
# Load the "active.json" configuration of dataset versions v9 and v11.
# NOTE(review): both paths are absolute and user-specific; the notebook only
# runs where this scratch directory is mounted.
active_v9_path = "/users/staff/uvoggenberger/scratch/hug/config_v9/active.json"
active_v11_path = "/users/staff/uvoggenberger/scratch/hug/config_v11/active.json"

with open(active_v9_path) as json_file:
    active_v9 = json.load(json_file)

with open(active_v11_path) as json_file:
    active_v11 = json.load(json_file)

## CDS API
### Backend

In [4]:
import cdsapi

# Credentials must never be stored in the notebook. The API key is read from
# the CDSAPI_KEY environment variable; when that is unset, None is passed and
# cdsapi looks the key up in its rc file (~/.cdsapirc by default).
# The key that used to be hard-coded here should be treated as leaked and revoked.
c = cdsapi.Client(
    url="https://cds-test.copernicus-climate.eu/api/v2",
    key=os.environ.get("CDSAPI_KEY"),
    progress=True,
    retry_max=5,
)

# Download one day of 100 hPa air temperature for station 11035 as a zipped
# CSV, including the optional departure and bias-estimate columns.
r = c.retrieve(
    "insitu-comprehensive-upper-air-observation-network",
    {
        "day": ["1"],
        "format": "csv",
        "month": "7",
        "statid": "11035",
        "pressure_level": ["100"],  # USE hPa for CDSAPI! Pa everywhere else!
        "observed_variable": ["air_temperature"],
        "year": "1997",
        "optional": ["obs_minus_bg", "obs_minus_an", "bias_estimate"],
    },
    target="download3.zip",
)

2023-04-03 10:49:12,770 INFO Welcome to the CDS
2023-04-03 10:49:12,772 INFO Sending request to https://cds-test.copernicus-climate.eu/api/v2/resources/insitu-comprehensive-upper-air-observation-network
2023-04-03 10:49:12,866 INFO Request is completed
2023-04-03 10:49:12,867 INFO Downloading https://cds-test-download-0000.copernicus-climate.eu/cache-compute-0000/cache/data0/adaptor.comprehensive_upper_air.retrieve-1678953481.204305-2990-2-b759c9de-8303-45ba-ae0d-98cf4da5eefd.zip to download3.zip (805)
2023-04-03 10:49:13,033 INFO Download rate 4.8K/s                     


In [22]:
# Unpack the downloaded archive into a scratch directory, show every extracted
# file as a DataFrame and (optionally) remove the scratch directory again.
tempdir = "./tmp/"
delete_dl = True  # set to False to keep the extracted files for inspection

with zipfile.ZipFile("download3.zip", "r") as zip_ref:
    # exist_ok tolerates an already existing directory without hiding other
    # failures (e.g. permissions), unlike the former bare `except: pass`.
    os.makedirs(tempdir, exist_ok=True)
    zip_ref.extractall(tempdir)

try:
    for csv_path in glob.glob(tempdir + "*"):
        print(csv_path)
        # header=14: row 14 (0-based) holds the column names; the lines
        # before it are skipped by pandas.
        display(pd.read_csv(csv_path, header=14))
finally:
    # Clean up even when reading one of the files fails.
    if delete_dl:
        shutil.rmtree(tempdir)

./tmp/CDS_CUON_output_file.csv


Unnamed: 0,an_depar@body,biascorr@body,date_time,fg_depar@body,latitude,longitude,primary_id,report_id,z_coordinate,variable,observation_value
0,-0.633594,0.065361,1997-07-01 00:00:00+00,0.337387,48.25,16.370001,0-20001-0-11035,100000000070801,10000.0,air_temperature,218.899994
1,-0.80358,0.087323,1997-07-01 12:00:00+00,-0.866757,48.25,16.370001,0-20001-0-11035,100000000070803,10000.0,air_temperature,219.5


## EUA Request
### Backend

### local machine

In [5]:
# Request 850 hPa (85000 Pa) air temperature of station 06610 around the turn
# of the year 2000/2001 from a backend listening on this machine.
# NOTE(review): other requests in this notebook pass "optional" as a list;
# here it is a plain string -- confirm the backend accepts both forms.
rq = dict(
    statid=["06610"],
    variable=["air_temperature"],
    date="20001231-20010101",
    format="nc",
    pressure_level=["85000"],
    optional="report_timestamp",
)
local_vm_url = "http://127.0.0.1:8009"
df_v11 = eua.vm_request_wrapper(rq, overwrite=True, vm_url=local_vm_url)
df = df_v11.to_dataframe()
df

2023-04-03 10:49:13,263 INFO Reading Profile on slice(None, None, None)


Unnamed: 0,lat,lon,obs,plev,report_id,station_id,ta,time
0,46.82,6.95,0,85000.0,10000031335,PAYERNE (6610-0),266.899994,2000-12-31 11:00:00
1,46.82,6.95,0,85000.0,10000031337,PAYERNE (6610-0),273.600006,2000-12-31 23:00:00
2,46.82,6.95,0,85000.0,10000031339,PAYERNE (6610-0),277.399994,2001-01-01 11:00:00
3,46.82,6.95,0,85000.0,10000031341,PAYERNE (6610-0),277.799988,2001-01-01 23:00:00


### remote VM machine

In [6]:
# Same 85000 Pa air-temperature request for station 06610, but without a
# vm_url argument, so the wrapper's default backend address is used.
# (Pass vm_url="http://127.0.0.1:8009" to target a local backend instead.)
rq = dict(
    statid=["06610"],
    variable=["air_temperature"],
    date="20001231-20010101",
    format="nc",
    pressure_level=["85000"],
)
df_v11 = eua.vm_request_wrapper(rq, overwrite=True)
df = df_v11.to_dataframe()
df

2023-04-03 10:49:13,707 INFO Reading Profile on slice(None, None, None)


Unnamed: 0,lat,lon,obs,plev,report_id,station_id,ta,time
0,46.82,6.95,0,85000.0,10000031335,PAYERNE (6610-0),266.899994,2000-12-31 11:00:00
1,46.82,6.95,0,85000.0,10000031337,PAYERNE (6610-0),273.600006,2000-12-31 23:00:00
2,46.82,6.95,0,85000.0,10000031339,PAYERNE (6610-0),277.399994,2001-01-01 11:00:00
3,46.82,6.95,0,85000.0,10000031341,PAYERNE (6610-0),277.799988,2001-01-01 23:00:00


### multi file output

In [7]:
# Requesting two variables at once: the result is iterated by key and every
# entry is converted to its own DataFrame via to_dataframe().
# Optional wrapper arguments used elsewhere in this notebook:
# overwrite=True, vm_url='http://127.0.0.1:8009'
rq = dict(
    statid=["72520"],
    variable=["northward_wind_speed", "eastward_wind_speed"],
    optional=["wind_bias_estimate"],
    date="19450101-19451231",
    format="nc",
)
df_v11 = eua.vm_request_wrapper(rq)
for i in df_v11:
    # Print the key first so each table is preceded by its name.
    print(i)
    data = df_v11[i].to_dataframe()
    display(data)

2023-04-03 10:49:13,757 INFO Reading Profile on slice(None, None, None)


dest_0-20001-0-72520_northward_wind


Unnamed: 0,lat,lon,obs,plev,report_id,station_id,time,va,wind_bias_estimate
0,40.5317,-80.217201,0,3804.980225,300000000012343,"PITTSBURGH, PA.",1945-01-02 09:00:00,7.778174,-1.927224e-07
1,40.5317,-80.217201,0,4903.325195,300000000012343,"PITTSBURGH, PA.",1945-01-02 09:00:00,7.778174,-1.927224e-07
2,40.5317,-80.217201,0,3804.980225,300000000012344,"PITTSBURGH, PA.",1945-01-02 16:00:00,3.746066,4.045160e-08
3,40.5317,-80.217201,0,4903.325195,300000000012344,"PITTSBURGH, PA.",1945-01-02 16:00:00,6.363961,1.891087e-07
4,40.5317,-80.217201,0,9806.650391,300000000012344,"PITTSBURGH, PA.",1945-01-02 16:00:00,4.495279,1.911428e-07
...,...,...,...,...,...,...,...,...,...
34229,40.4800,-80.220001,0,94265.289062,200000000013826,"PITTSBURGH, PA.",1945-12-31 15:00:00,-3.371459,9.506144e-08
34230,40.4800,-80.220001,0,100362.593750,200000000013826,"PITTSBURGH, PA.",1945-12-31 15:00:00,-4.949748,1.034098e-07
34231,40.5317,-80.217201,0,3804.980225,300000000013827,"PITTSBURGH, PA.",1945-12-31 22:00:00,0.000000,-0.000000e+00
34232,40.5317,-80.217201,0,4903.325195,300000000013827,"PITTSBURGH, PA.",1945-12-31 22:00:00,0.000000,-0.000000e+00


2023-04-03 10:49:13,864 INFO Reading Profile on slice(None, None, None)


dest_0-20001-0-72520_eastward_wind


Unnamed: 0,lat,lon,obs,plev,report_id,station_id,time,ua,wind_bias_estimate
0,40.5317,-80.217201,0,3804.980225,300000000012343,"PITTSBURGH, PA.",1945-01-02 09:00:00,7.778174,-1.927224e-07
1,40.5317,-80.217201,0,4903.325195,300000000012343,"PITTSBURGH, PA.",1945-01-02 09:00:00,7.778174,-1.927224e-07
2,40.5317,-80.217201,0,3804.980225,300000000012344,"PITTSBURGH, PA.",1945-01-02 16:00:00,9.271839,-2.880536e-07
3,40.5317,-80.217201,0,4903.325195,300000000012344,"PITTSBURGH, PA.",1945-01-02 16:00:00,6.363961,1.891087e-07
4,40.5317,-80.217201,0,9806.650391,300000000012344,"PITTSBURGH, PA.",1945-01-02 16:00:00,11.126206,1.432088e-07
...,...,...,...,...,...,...,...,...,...
34229,40.4800,-80.220001,0,94265.289062,200000000013826,"PITTSBURGH, PA.",1945-12-31 15:00:00,8.344655,3.458252e-07
34230,40.4800,-80.220001,0,100362.593750,200000000013826,"PITTSBURGH, PA.",1945-12-31 15:00:00,4.949748,-1.034098e-07
34231,40.5317,-80.217201,0,3804.980225,300000000013827,"PITTSBURGH, PA.",1945-12-31 22:00:00,4.000000,-0.000000e+00
34232,40.5317,-80.217201,0,4903.325195,300000000013827,"PITTSBURGH, PA.",1945-12-31 22:00:00,9.000000,-0.000000e+00


## EUA CDM
### File Access

In [8]:
# Read the report/record timestamps and the source file names of one merged
# station file, then list the rows where both timestamps differ.
cdm_path = "/mnt/users/scratch/leo/scratch/converted_v11/long/0-20000-0-02365_CEUAS_merged_v1.nc"
with eua.CDMDataset(cdm_path) as file:
    df = file.to_dataframe(
        groups=["header_table", "recordindices", "source_configuration"],
        variables=["report_timestamp", "recordtimestamp", "source_file"],
    )

# Add human-readable versions of both second-based timestamp columns.
for ts_col in ("report_timestamp", "recordtimestamp"):
    df["conv_" + ts_col] = seconds_to_datetime(df[ts_col].values)

df[df["report_timestamp"] != df["recordtimestamp"]]

2023-04-03 10:49:14,115 INFO Reading Profile on slice(None, None, None)


Unnamed: 0,report_timestamp,recordtimestamp,source_file,conv_report_timestamp,conv_recordtimestamp
43489,3199519800,3199521600,0-20000-0-02365_igra2_harvested_SWM00002365-da...,2001-05-22 11:30:00,2001-05-22 12:00:00
47804,3399723960,3399732000,0-20000-0-02365_igra2_harvested_SWM00002365-da...,2007-09-25 15:46:00,2007-09-25 18:00:00
51168,3555660600,3555662400,0-20000-0-02365_igra2_harvested_SWM00002365-da...,2012-09-03 11:30:00,2012-09-03 12:00:00


## H5PY
### File Access

In [9]:
# Direct h5py access: select the records of 1946 via the record timestamps,
# look up the matching entries of record-index dataset "139" and list the
# distinct source files that went into the merged file.
h5_path = "/mnt/users/scratch/leo/scratch/converted_v11/long/0-20300-0-99010_CEUAS_merged_v1.nc"
with h5py.File(h5_path, "r") as file:
    dt_from = datetime_to_seconds(np.datetime64("1946-01-01"))
    dt_to = datetime_to_seconds(np.datetime64("1946-12-31"))

    rts = file["recordindices"]["recordtimestamp"][:]
    in_range = (rts >= dt_from) & (rts <= dt_to)
    idx = np.nonzero(in_range)[0]
    idx_d = file["recordindices"]["139"][idx]

    # source_file is stored as arrays of single bytes; join each row into one
    # bytes object before de-duplicating.
    source_names = [
        b"".join(chars) for chars in file["source_configuration"]["source_file"][:]
    ]
    print(np.unique(source_names))
    display(idx_d)

[b'0-20300-0-99010_era5_2_harvested_era5.conv._99005.gz.nc'
 b'0-20300-0-99010_era5_2_harvested_era5.conv._9:3702.gz.nc'
 b'0-20300-0-99010_era5_2_ship_harvested_era5.conv._0J0E.gz.nc'
 b'0-20300-0-99010_era5_2_ship_harvested_era5.conv._4YE.gz.nc'
 b'0-20300-0-99010_era5_2_ship_harvested_era5.conv._6:99005.gz.nc'
 b'0-20300-0-99010_igra2_harvested_ZZM00099010-data.txt.nc']


array([2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095611, 2095611, 2095611, 2095611, 2095611, 2095611,
       2095611, 2095

### DataFrame concatenation with h5py

In [10]:
# Build one wide DataFrame directly from a merged station file with h5py:
# one row per air-temperature observation on a standard pressure level, with
# the other variables joined on (date_time, z_coordinate).
conv_matches = glob.glob(
    "/mnt/users/scratch/leo/scratch/converted_v11/long//0-20001-0-11035_CEUAS_merged_v1.nc"
)
if not conv_matches:
    # Previously surfaced as a bare IndexError from `glob.glob(...)[0]`.
    raise FileNotFoundError("merged file for station 0-20001-0-11035 not found")
conv_file = conv_matches[0]

dt_from = datetime_to_seconds(np.datetime64("1970-01-01"))
dt_to = datetime_to_seconds(np.datetime64("1970-01-02"))

# z_coordinate values to keep (presumably standard pressure levels in Pa).
plevs = [
    1000,
    2000,
    3000,
    5000,
    7000,
    10000,
    15000,
    20000,
    25000,
    30000,
    40000,
    50000,
    70000,
    85000,
    92500,
    100000,
]

# Output column name -> name of the variable's dataset in "recordindices".
# NOTE: "relative_humidty" is misspelled, but the key doubles as the column
# name of the resulting DataFrame, so it is deliberately left unchanged.
var_d = {
    "air_temperature": "126",
    "relative_humidty": "138",
    "geopotential": "117",
    "eastward_wind_speed": "139",
    "northward_wind_speed": "140",
    "dew_point": "137",
    "specific_humidity": "39",
}

df_dict = {}
h_df_dict = {}  # never filled in this cell; h_df below is therefore empty
df_dict_d = {}
idx_d = {}
masks = {}

with h5py.File(conv_file, "r") as file:
    rts = file["recordindices"]["recordtimestamp"][:]
    idx = np.where(np.logical_and((rts >= dt_from), (rts <= dt_to)))[0]
    if idx.size == 0:
        # Without this guard the first `[0]` lookup below fails obscurely.
        raise ValueError("no records between dt_from and dt_to in " + conv_file)

    obs = file["observations_table"]

    for i in var_d:
        # Record-index values of the selected records for this variable; the
        # first and last one are used as row bounds into observations_table.
        # NOTE(review): if these are the first rows of each record, the slice
        # end excludes the observations of the last selected record -- confirm
        # that this is intended.
        idx_d[i] = file["recordindices"][var_d[i]][idx]
        masks[i] = np.isin(
            obs["z_coordinate"][idx_d[i][0] : idx_d[i][-1]], plevs
        )
        # masks[i] = np.isfinite(masks[i])

    # Air temperature is the base table and also carries the station metadata.
    mask = masks["air_temperature"]
    t_idx = idx_d["air_temperature"]
    t_rows = slice(t_idx[0], t_idx[-1])
    df_dict["z_coordinate"] = list(obs["z_coordinate"][t_rows][mask])
    df_dict["date_time"] = seconds_to_datetime(list(obs["date_time"][t_rows][mask]))
    df_dict["latitude"] = list(obs["latitude"][t_rows][mask])
    df_dict["longitude"] = list(obs["longitude"][t_rows][mask])
    # report_id is stored as a 2-D array of single bytes; view each row as one
    # fixed-width byte string and decode it to str.
    repid = np.asarray(obs["report_id"][t_rows][mask])
    df_dict["report_id"] = list(
        repid.view("|S{}".format(repid.shape[1])).flatten().astype(str)
    )
    # Columns of the "advanced_homogenisation" group (RASE_bias_estimate,
    # latitude_displacement, longitude_displacement, time_since_launch) were
    # read here with the same slice and mask in an earlier, disabled version.
    df_dict["air_temperature"] = list(obs["observation_value"][t_rows][mask])

    # Every other variable only needs the two join keys plus its own values.
    for i in masks:
        if i == "air_temperature":
            continue
        rows = slice(idx_d[i][0], idx_d[i][-1])
        df_dict_d[i] = {
            "z_coordinate": list(obs["z_coordinate"][rows][masks[i]]),
            "date_time": seconds_to_datetime(
                list(obs["date_time"][rows][masks[i]])
            ),
            i: list(obs["observation_value"][rows][masks[i]]),
        }

# Everything needed is in memory now; assemble the frames with the file closed.
df = pd.DataFrame.from_dict(df_dict)
h_df = pd.DataFrame.from_dict(h_df_dict)

# put dfs together: left joins keep every air-temperature row and leave NaN
# where a variable has no value for that time and level.
for i in df_dict_d:
    df = df.merge(
        pd.DataFrame.from_dict(df_dict_d[i]),
        how="left",
        on=["date_time", "z_coordinate"],
    )
df

Unnamed: 0,z_coordinate,date_time,latitude,longitude,report_id,air_temperature,relative_humidty,geopotential,eastward_wind_speed,northward_wind_speed,dew_point,specific_humidity
0,1000.0,1970-01-01 00:00:00,48.25,16.370001,200000000030705,226.5,,295950.0,47.454479,17.272017,,
1,2000.0,1970-01-01 00:00:00,48.25,16.370001,200000000030705,212.300003,,252420.0,43.134579,7.60579,,
2,3000.0,1970-01-01 00:00:00,48.25,16.370001,200000000030705,209.300003,,227790.0,31.799999,0.0,,
3,5000.0,1970-01-01 00:00:00,48.25,16.370001,200000000030705,210.899994,,197090.0,21.816664,1.908711,,
4,7000.0,1970-01-01 00:00:00,48.25,16.370001,200000000030705,213.5,,176620.0,15.6,0.0,,
5,10000.0,1970-01-01 00:00:00,48.25,16.370001,200000000030705,213.899994,,154740.0,7.051259,1.889379,,
6,15000.0,1970-01-01 00:00:00,48.25,16.370001,200000000030705,247.100006,,129550.0,14.870597,2.622087,273.299988,0.02606
7,20000.0,1970-01-01 00:00:00,48.25,16.370001,200000000030705,216.699997,,111490.0,10.3416,8.677632,,
8,30000.0,1970-01-01 00:00:00,48.25,16.370001,200000000030705,217.5,,86110.0,16.712479,19.917156,,
9,40000.0,1970-01-01 00:00:00,48.25,16.370001,200000000030705,231.699997,,67670.0,15.35,26.586981,,
