## GHCN Example
GHCN is a collection of weather station data collected from the year 1763 to present.
This notebook illustrates how to read GHCN data using h5pyd and query for specific stations.
See: https://www.ncei.noaa.gov/products/land-based-station/global-historical-climatology-network-daily,
for a description of the format.

In [1]:
import h5pyd
import pandas as pd

In [3]:
# list the contents of the /shared/ghcn/ folder in the hdflab2 bucket
! hsls -H -v --bucket hdflab2 /shared/ghcn/

ghcn                                                folder   2021-11-09 04:58:44 /shared/ghcn/
ghcn                                   90.0G        domain   2022-11-25 15:38:14 /shared/ghcn/ghcn.h5
ghcn                                                folder   2021-11-12 18:56:19 /shared/ghcn/year
3 items
90.0G bytes


In [5]:
# Open the GHCN domain stored in the "hdflab2" bucket.
# use_cache is set to False because the domain is updated periodically
# (presumably so that stale cached data is not returned — confirm against h5pyd docs).
f = h5pyd.File("/shared/ghcn/ghcn.h5", bucket="hdflab2", use_cache=False)

In [6]:
# list the names of the top-level objects in the domain
list(f)

['data', 'stations']

In [7]:
# get the main dataset (one row per observation) and display its summary
dset = f['data']
dset

<HDF5 dataset "data": shape (3018181465,), type "|V32">

In [21]:
# total number of rows in the table
dset.nrows

3018181465

In [22]:
# check which h5pyd class is used to represent this dataset
type(dset)

h5pyd._hl.table.Table

In [24]:
# check whether dset is an instance of h5pyd.Dataset
isinstance(dset, h5pyd.Dataset)

True

In [26]:
# check whether dset is an instance of h5pyd.Table
isinstance(dset, h5pyd.Table)

True

In [8]:
# The dataset has a compound type: one named field for each of the
# reporting fields of an observation.
dset.dtype

dtype([('station_id', 'S11'), ('ymd', 'S8'), ('element', 'S4'), ('data_value', '<i2'), ('m_flag', 'S1'), ('q_flag', 'S1'), ('s_flag', 'S1'), ('obs_time', 'S4')])

In [9]:
# size in bytes of one row (one element of the compound type)
dset.dtype.itemsize

32

In [10]:
# chunk shape of the dataset (number of rows per chunk)
dset.chunks

(91268,)

In [12]:
# number of complete chunks along the first axis
# (floor division, so a trailing partially-filled chunk is not counted)
dset.shape[0] // dset.chunks[0]

33069

In [13]:
# read the first twelve rows into a Pandas dataframe
df = pd.DataFrame(dset[:12])
df

Unnamed: 0,station_id,ymd,element,data_value,m_flag,q_flag,s_flag,obs_time
0,b'ITE00100554',b'17630101',b'TMAX',-36,b'',b'',b'E',b''
1,b'ITE00100554',b'17630101',b'TMIN',-50,b'',b'',b'E',b''
2,b'ITE00100554',b'17630102',b'TMAX',-26,b'',b'',b'E',b''
3,b'ITE00100554',b'17630102',b'TMIN',-40,b'',b'',b'E',b''
4,b'ITE00100554',b'17630103',b'TMAX',-9,b'',b'',b'E',b''
5,b'ITE00100554',b'17630103',b'TMIN',-29,b'',b'',b'E',b''
6,b'ITE00100554',b'17630104',b'TMAX',-4,b'',b'',b'E',b''
7,b'ITE00100554',b'17630104',b'TMIN',-24,b'',b'',b'E',b''
8,b'ITE00100554',b'17630105',b'TMAX',21,b'',b'',b'E',b''
9,b'ITE00100554',b'17630105',b'TMIN',1,b'',b'',b'E',b''


In [15]:
# look at the date column; the values are byte strings in YYYYMMDD form
df['ymd']

0     b'17630101'
1     b'17630101'
2     b'17630102'
3     b'17630102'
4     b'17630103'
5     b'17630103'
6     b'17630104'
7     b'17630104'
8     b'17630105'
9     b'17630105'
10    b'17630106'
11    b'17630106'
Name: ymd, dtype: object

In [16]:
# show the last twelve rows as a Pandas dataframe
# (the negative slice start counts from the end of the dataset)
df = pd.DataFrame(dset[-12:])
df

Unnamed: 0,station_id,ymd,element,data_value,m_flag,q_flag,s_flag,obs_time
0,b'USR0000WGOL',b'20220821',b'TMAX',289,b'H',b'',b'U',b''
1,b'USR0000WGOL',b'20220821',b'TMIN',189,b'H',b'',b'U',b''
2,b'USR0000WGOL',b'20220821',b'TAVG',236,b'',b'',b'U',b''
3,b'USR0000WGRE',b'20220821',b'TMAX',283,b'H',b'',b'U',b''
4,b'USR0000WGRE',b'20220821',b'TMIN',156,b'H',b'',b'U',b''
5,b'USR0000WGRE',b'20220821',b'TAVG',211,b'',b'',b'U',b''
6,b'USR0000WGRN',b'20220821',b'TMAX',289,b'H',b'',b'U',b''
7,b'USR0000WGRN',b'20220821',b'TMIN',144,b'H',b'',b'U',b''
8,b'USR0000WGRN',b'20220821',b'TAVG',212,b'',b'',b'U',b''
9,b'USR0000WHAG',b'20220821',b'TMAX',267,b'H',b'',b'U',b''


In [17]:
# "stations" is a smaller dataset that has one row for each reporting station;
# fetch it and display its summary
stations = f["stations"]
stations

<HDF5 dataset "stations": shape (124248,), type "|V66">

In [18]:
# show the last twelve rows of the stations dataset as a Pandas dataframe
df = pd.DataFrame(stations[-12:])
df

Unnamed: 0,station_id,lat,lon,elev,state,name,gsn_flag,hcn_flag,wmo_id
0,b'ZI000067867',-19.450001,29.85,1429.0,b'',b'GWERU',b'',b'',b'67867'
1,b'ZI000067889',-18.283001,32.75,1880.0,b'',b'WYANGA',b'',b'',b'67889'
2,b'ZI000067964',-20.15,28.617001,1344.0,b'',b'BULAWAYO (GOETZ OBS',b'GSN',b'',b'67964'
3,b'ZI000067965',-20.017,28.617001,1326.0,b'',b'BULAWAYO AIRPORT',b'',b'',b'67965'
4,b'ZI000067969',-21.049999,29.367001,861.0,b'',b'WEST NICHOLSON',b'',b'',b'67969'
5,b'ZI000067975',-20.066999,30.867001,1095.0,b'',b'MASVINGO',b'',b'',b'67975'
6,b'ZI000067977',-21.017,31.583,430.0,b'',b'BUFFALO RANGE',b'',b'',b'67977'
7,b'ZI000067983',-20.200001,32.616001,1132.0,b'',b'CHIPINGE',b'GSN',b'',b'67983'
8,b'ZI000067991',-22.216999,30.0,457.0,b'',b'BEITBRIDGE',b'',b'',b'67991'
9,b'',0.0,0.0,0.0,b'',b'',b'',b'',b''


In [19]:
# let's see what the station in Darwin, Australia has reported;
# this id is the value matched against the station_id field in the query
station_id = 'ASN00014016'

In [27]:
# Querying the entire dataset takes a few minutes, so this query is limited
# to the first 1,000,000 rows via start/stop.
# (Presumably omitting start/stop scans every row — confirm against h5pyd docs.)
%time arr = dset.read_where(f"station_id == b'{station_id}'", start=0, stop=1_000_000)

CPU times: user 4.54 ms, sys: 0 ns, total: 4.54 ms
Wall time: 1.31 s


In [28]:
# number of rows that matched the query
arr.shape

(578,)

In [29]:
# show the rows returned by the query for this station id as a data frame
# (the extra "index" column appears to hold each match's row number in dset — confirm)
df = pd.DataFrame(arr)
df

Unnamed: 0,index,station_id,ymd,element,data_value,m_flag,q_flag,s_flag,obs_time
0,860214,b'ASN00014016',b'18690306',b'PRCP',351,b'',b'',b'a',b''
1,860406,b'ASN00014016',b'18690307',b'PRCP',0,b'',b'',b'a',b''
2,860579,b'ASN00014016',b'18690308',b'PRCP',0,b'',b'',b'a',b''
3,860770,b'ASN00014016',b'18690309',b'PRCP',127,b'',b'',b'a',b''
4,860961,b'ASN00014016',b'18690310',b'PRCP',0,b'',b'',b'a',b''
...,...,...,...,...,...,...,...,...,...
573,998934,b'ASN00014016',b'18701231',b'PRCP',0,b'',b'',b'a',b''
574,999175,b'ASN00014016',b'18710101',b'PRCP',140,b'',b'',b'a',b''
575,999421,b'ASN00014016',b'18710102',b'PRCP',36,b'',b'',b'a',b''
576,999688,b'ASN00014016',b'18710103',b'PRCP',0,b'',b'',b'a',b''


In [None]:
%%time
# get unique station ids per year
station_year_map = {}
cursor = dset.create_cursor()
count = 0
for row in cursor:
    station_id = row['station_id'].decode('ascii')
    ymd = row['ymd'].decode('ascii')
    if len(ymd) != 8:
        # print(f"unexpected ymd: {ymd}")
        count += 1
        continue
    year = int(ymd[:4])  # format YYYYMMDD
    if year not in station_year_map:
        station_year_map[year] = set()
    station_ids = station_year_map[year]
    station_ids.add(station_id)
print("bad lines:", count)

In [None]:
# years for which at least one station id was recorded
station_year_map.keys()

In [None]:
# number of distinct station ids recorded for the year 1876
len(station_year_map[1876])

In [None]:
# print the number of distinct station ids recorded for each year
for year, station_ids in station_year_map.items():
    print(f"{year} - {len(station_ids)}")