In [1]:
import glob
import gzip
import os
import shutil
import sys
import time
import warnings
import zipfile
from datetime import date

import cdsapi
import h5py
import numpy
import numpy as np
import pandas as pd
import urllib3
import xarray

warnings.filterwarnings("ignore")
import pycountry

sys.path.append(os.getcwd() + "/../cds-backend/code/")
import copy
import glob
import json

import cds_eua4 as eua
import numba
import pandas
import psutil
import requests
from numba import njit


def find_nearest(array, value):
    """Return the element of ``array`` that is closest to ``value``.

    Parameters
    ----------
    array : array_like
        Values to search; converted with ``np.asarray``. Inputs of any
        dimensionality (including scalars) are accepted.
    value : scalar
        Target value.

    Returns
    -------
    The element with the smallest absolute difference to ``value``. On ties
    the first such element in flattened order is returned.

    Raises
    ------
    ValueError
        If ``array`` is empty (raised by ``argmin``).
    """
    # argmin() yields an index into the *flattened* array, so the lookup has
    # to be done on the flattened view as well; indexing the N-d array with
    # it would select along the first axis instead.
    flat = np.asarray(array).ravel()
    return flat[np.abs(flat - value).argmin()]


def datetime_to_seconds(dates, ref="1900-01-01T00:00:00"):
    """Convert datetime64 value(s) to integer seconds elapsed since ``ref``.

    ``ref`` is parsed with ``np.datetime64``; the result is cast to
    ``np.int64``, so any fractional part of a second is dropped.
    """
    elapsed = dates - np.datetime64(ref)
    one_second = np.timedelta64(1, "s")
    seconds = elapsed / one_second
    return seconds.astype(np.int64)


def seconds_to_datetime(seconds, ref="1900-01-01"):
    """Convert second offsets counted from ``ref`` into pandas datetimes.

    The input is passed through ``np.asarray`` and handed to
    ``pd.to_datetime`` with ``unit="s"`` and ``origin=ref``.
    """
    offsets = np.asarray(seconds)
    return pd.to_datetime(offsets, origin=ref, unit="s")


import matplotlib
import matplotlib.pylab as plt
import matplotlib.pyplot as maplt

# Global plot defaults for this notebook: large figures and large text.
matplotlib.rcParams["figure.figsize"] = (20, 10)
font = {
    # "normal" is not a font family and only triggers findfont fallback
    # warnings; "sans-serif" is the generic family Matplotlib falls back to.
    "family": "sans-serif",
    # 'weight' : 'bold',
    # A separate rcParams "font.size" of 20 used to be set before this dict,
    # but it was overridden by the value here, so only this one is kept.
    "size": 22,
}
matplotlib.rc("font", **font)

## target file structure

In [5]:
# glob returns matches in arbitrary (filesystem) order; sort them so that
# comp_files[0] refers to the same file on every run.
comp_files = sorted(glob.glob("/users/staff/uvoggenberger/scratch/comp/*.nc"))
comp_files[0]

'/users/staff/uvoggenberger/scratch/comp/0-20100-0-00201_CEUAS_merged_v0.nc'

In [6]:
# Open the first merged file through the project's CDMDataset wrapper (used as
# a context manager) and show its summary of groups and variables.
with eua.CDMDataset(comp_files[0]) as file:
    display(file)

File: <HDF5 file "0-20100-0-00201_CEUAS_merged_v0.nc" (mode r)>
Filesize: 1.87 MB
Filename: /users/staff/uvoggenberger/scratch/comp/0-20100-0-00201_CEUAS_merged_v0.nc
(G)roups/(V)ariables: 

 - G | crs__________________________________________ : : 4
 - V | dateindex____________________________________ : : (13, 3)
 - V | days_________________________________________ : : (13,)
 - V | drange_______________________________________ : : (3,)
 - G | header_table_________________________________ : : 46
 - G | id_scheme____________________________________ : : 4
 - G | iri_rstype_map_______________________________ : : 8
 - G | observations_table___________________________ : : 50
 - G | observed_variable____________________________ : : 15
 - V | record_______________________________________ : : (23,)
 - G | recordindices________________________________ : : 5
 - G | sensor_configuration_________________________ : : 12
 - G | source_configuration_________________________ : : 33
 - G | station_confi

## codec to read header

In [6]:
# Table of codec names (with aliases and languages); the 'Codec' column is
# iterated further down when trying to decode the raw sounding file.
codec = pd.read_csv('./interc_2005/codec.csv')
codec

Unnamed: 0,Codec,Aliases,Languages
0,ascii,"646, us-ascii",English
1,big5,"big5-tw, csbig5",Traditional Chinese
2,big5hkscs,"big5-hkscs, hkscs",Traditional Chinese
3,cp037,"IBM037, IBM039",English
4,cp273,"273, IBM273, csIBM273",German New in version 3.4.
...,...,...,...
92,utf_16_be,UTF-16BE,all languages
93,utf_16_le,UTF-16LE,all languages
94,utf_7,"U7, unicode-1-1-utf-7",all languages
95,utf_8,"U8, UTF, utf8, cp65001",all languages


## correct meaning of columns?

In [7]:
# Read one Vaisala sounding: column names come from row index 2 (header=2)
# and fields are separated by single spaces.
# NOTE(review): the meaning/units of the columns are still to be confirmed.
pd.read_csv("./interc_2005/vaisala_01.csv", header=2, delimiter=' ')

Unnamed: 0,number,pressure,temperature,relative_humidity,geopotential,wdir,wspeed,height
0,0,961.6,28.5,71,425,60,6.2,425
1,1,961.2,26.7,74,428,61,5.1,431
2,2,960.8,25.2,75,433,62,4.8,436
3,3,960.2,25.0,76,437,62,5.0,439
4,4,959.6,25.0,77,443,61,5.3,444
...,...,...,...,...,...,...,...,...
4583,4583,6.8,-37.7,1,33681,64,23.6,33818
4584,4584,6.7,-37.7,1,33687,64,23.6,33827
4585,4585,6.7,-37.7,1,33695,65,23.6,33835
4586,4586,6.7,-37.8,1,33704,65,23.5,33843


## reading Meisei data

In [8]:
# glob returns matches in arbitrary (filesystem) order; sort them so that
# input_files[0] (read below) is the same file on every run.
input_files = sorted(glob.glob("./interc_2005/*.CSV"))
input_files[:3]

['./interc_2005/F2005020714S7420400.CSV',
 './interc_2005/F2005020723S7420401.CSV',
 './interc_2005/F2005020814S7420402.CSV']

In [21]:
# "cp932" -> Japanese encoding:
#
df = pd.read_csv(input_files[0], header=6, encoding="cp932")
keys = df.keys()
display(keys)

# English replacements for the original column headers, listed in the same
# order as the columns appear in the file.
google_translate_keys = [
    "Observation Time", "DCnt", "ST", "RE", "SondeN", "FCnt", "AGC",
    "Received FREQ", "WM", "Wind Direction", "Wind Speed", "Altitude",
    "X Distance", "Y Distance", "GF", "HDP", "PDP",
    "Positioning latitude", "Positioning Longitude", "V",
    "Barometric Pressure 0", "Temperature 0", "Humidity 0",
    "FE", "FRT", "FTI",
    " FVH", " FVL", " FSP1", " FSP2", " FSP3", " FSP4",
    "N", "N1", "N2", "N3", "N4", "N5", "N6", "N7", "N8",
]

# Pair every original header with the translation at the same position.
rename_dict = {
    original: google_translate_keys[position]
    for position, original in enumerate(keys)
}
df = df.rename(columns=rename_dict)
display(df)

Index(['観測時刻', 'DCnt', 'ST', 'RE', 'SondeN', ' FCnt', 'AGC', ' 受信FREQ', 'WM',
       ' 風向', '  風速', '   高度', '  Ｘ距離', '  Ｙ距離', 'GF', 'HDP', ' PDP', ' 測位緯度',
       '  測位経度', 'V', ' 気圧0', '温度0', '湿度0', '  FE', '   FRT', '   FTI',
       '   FVH', '   FVL', '  FSP1', '  FSP2', '  FSP3', '  FSP4', 'N', 'N1',
       'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8'],
      dtype='object')

Unnamed: 0,Observation Time,DCnt,ST,RE,SondeN,FCnt,AGC,Received FREQ,WM,Wind Direction,...,FSP4,N,N1,N2,N3,N4,N5,N6,N7,N8
0,04:00:01,8579,0,0,-------,-----,77,404.5000,1,180.0,...,------,0,--,--,--,--,--,--,--,--
1,04:00:02,8582,2,0,-------,-----,77,404.5000,1,180.0,...,------,0,--,--,--,--,--,--,--,--
2,04:00:03,8585,2,0,-------,-----,77,404.5000,1,180.0,...,------,0,--,--,--,--,--,--,--,--
3,13:36:55,8598,2,0,-------,00082,76,404.5000,1,180.0,...,0.00,0,--,--,--,--,--,--,--,--
4,13:36:56,8601,2,0,-------,-----,77,404.5000,1,180.0,...,------,0,--,--,--,--,--,--,--,--
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6103,15:20:21,9424,7,0,7420400,10320,32,404.5015,1,87.8,...,0.00,0,--,--,--,--,--,--,--,--
6104,15:20:22,9426,7,0,7420400,10322,48,404.5015,1,87.8,...,0.00,0,--,--,--,--,--,--,--,--
6105,15:20:23,9428,7,0,7420400,10322,41,404.5015,1,87.8,...,0.00,0,--,--,--,--,--,--,--,--
6106,15:20:25,9430,7,0,7420400,10326,57,404.5015,1,87.8,...,0.00,8,05,06,09,14,18,22,26,30


## direct access to data files?

In [2]:
!chardetect Vaisala.SND

Vaisala.SND: no result


In [3]:
!file -i Vaisala.SND

Vaisala.SND: application/octet-stream; charset=binary


In [13]:
# Brute-force check: try to decode the raw sounding file with every codec from
# the table and print the beginning of each successful decoding for inspection.
with open("Vaisala.SND", "rb") as f:
    # Read once; the raw content does not depend on the codec being tried.
    raw = f.read()

for codec_name in codec['Codec']:
    try:
        text = raw.decode(codec_name)
    except (UnicodeError, LookupError):
        # UnicodeError: the bytes are not valid in this codec.
        # LookupError: the codec name is unknown to this Python build.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        continue
    print(codec_name)
    print(text[:10000])
    print()

cp037
å 8¼áÃÃO  å 8ÛáC  Ñå YNáòö¯  èå F F F °Cá·  yå Ø9áòöü  öå 8\áÃÃ¶  6å 8láªrâ ¶ã Ì¡áÃÃ¶  Wå 0Iáòö¯  7å çPáªrr  }å F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 