### Step1: Data conversion
Read the BAG dataset, convert it to a geopaquet.

In [None]:
from pathlib import Path
from datetime import datetime

# dask_geopandas, dask-expr, pyogrio
import dask_geopandas as dg

In [None]:
# Get metadata
data_BAG = dg.read_file('../data/downloaded/BAG/bag-light-NL.gpkg', npartitions=50)
data_BAG

In [None]:
# Create a path to save the data
path_BAG = Path('../data/dataset/BAG.parquet/')
path_BAG.mkdir(exist_ok=True, parents=True)

In [None]:
# This does not work, complains about the data type of the geometry column
# data_BAG.to_parquet(path_BAG)
# data_BAG[['rdf_seealso', 'identificatie', 'bouwjaar', 'status', 'gebruiksdoel',
#        'oppervlakte_min', 'oppervlakte_max', 'aantal_verblijfsobjecten']].to_parquet(path_BAG) # This works because there is no geometry column

for i in range(data_BAG.npartitions):
    data_partition = data_BAG.get_partition(i).compute()
    data_partition.to_parquet(path_BAG / f'bag_{i}.parquet')

### Compair performace of gpkg and geopaquet

Test getting two columns from the BAG dataset, and then compute the first partition. Reading from parquet is almost 50 times faster than reading from geopackage.

In [None]:
%%timeit
data_BAG_gpks = dg.read_file('../data/downloaded/BAG/bag-light-NL.gpkg', npartitions=50, columns=['bouwjaar', 'status'])
data_BAG_slice = data_BAG_gpks.get_partition(0).compute()

In [None]:
%%timeit
data_BAG_parque = dg.read_parquet('../data/dataset/BAG.parquet/', columns=['bouwjaar', 'status'])
data_BAG_parque.get_partition(0).compute()

## Convert KNMI station data

In [1]:
import pandas as pd
import xarray as xr
import numpy as np
import datetime

In [2]:
df_stations = pd.read_csv('../../data/downloaded/KNMI/knmistations.csv')
df_stations = df_stations.rename(columns={'STN': 'station'}).set_index('station')
df_stations

Unnamed: 0_level_0,STARTT,STOPT,LOCATIE,HOOGTE,POS_X,POS_Y,POS_NB,POS_OL
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
201,20090313,99991231,platformD15-FA-1,42.7,-4.5,707.1,54.3,2.9
203,20091109,99991231,platformP11-B,41.8,15.7,487.6,52.4,3.3
204,20061206,99991231,platformK14-FA-1C,41.8,37.6,588.3,53.3,3.6
205,20090313,99991231,platformA12-CPP,48.4,55.0,825.1,55.4,3.8
206,20061206,99991231,platformF16-A,43.4,65.1,682.0,54.1,4.0
...,...,...,...,...,...,...,...,...
370,19510101,99991231,Eindhoven,22.6,154.3,384.4,51.4,5.4
375,19510201,99991231,Volkel,22.0,177.1,407.7,51.7,5.7
377,19990501,99991231,Ell,30.0,181.2,356.3,51.2,5.8
380,19060101,99991231,Maastricht,114.3,181.3,323.9,50.9,5.8


In [12]:
ds = xr.Dataset.from_dataframe(df_stations)
# ds = ds.expand_dims('time')
ds

In [4]:
# Load the knmi file for station 280 as a pandas dataframe

# Get the column names
header = pd.read_csv('../../data/downloaded/KNMI/knmi_stn280_example_file.txt', skiprows=32, nrows=0, delimiter=',')
col_names = [col.replace(' ', '') for col in header.columns] # Remove the spaces from the column names
col_names = [col.replace('#', '') for col in col_names] # Remove the # from the column names

# skip 32 rows since they are comments
# keep row 31 as header
# remove row 32 since it only has a hashtag
df_one_stn = pd.read_csv('../../data/downloaded/KNMI/knmi_stn280_example_file.txt', skiprows=34, header=0)

# Add the column names
df_one_stn.columns = col_names

df_one_stn

Unnamed: 0,STN,YYYYMMDD,HH,DD,FH,FF,FX,T,T10,TD,...,VV,N,U,WW,IX,M,R,S,O,Y
0,280,20080101,2,210,0,10,10,-33,,-33,...,0,9,100,35,7,1,0,0,0,1
1,280,20080101,3,0,0,0,10,-31,,-31,...,0,9,100,35,7,1,0,0,0,1
2,280,20080101,4,0,0,0,10,-29,,-29,...,2,9,100,35,7,1,0,0,0,1
3,280,20080101,5,990,10,10,20,-22,,-23,...,17,8,99,20,7,1,0,0,0,1
4,280,20080101,6,990,10,10,20,-19,-35,-19,...,12,8,100,10,7,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,280,20080213,16,360,20,20,40,35,,17,...,36,8,88,10,7,0,0,0,0,0
961,280,20080213,17,20,20,30,40,27,,14,...,36,8,91,10,7,0,0,0,0,0
962,280,20080213,18,20,30,30,60,20,20,11,...,27,8,94,10,7,0,0,0,0,0
963,280,20080213,19,30,30,20,50,21,,12,...,30,8,94,10,7,0,0,0,0,0


In [5]:
# Replace blanks with NaN
for col in ['T10', 'WW']:
    df_one_stn[col] = df_one_stn[col].replace(r'^\s*$', float('nan'), regex=True).astype(float) # Replace empty strings with nan, then convert to float
df_one_stn.columns

Index(['STN', 'YYYYMMDD', 'HH', 'DD', 'FH', 'FF', 'FX', 'T', 'T10', 'TD', 'SQ',
       'Q', 'DR', 'RH', 'P', 'VV', 'N', 'U', 'WW', 'IX', 'M', 'R', 'S', 'O',
       'Y'],
      dtype='object')

In [6]:
df_one_stn['time'] = df_one_stn['YYYYMMDD'].astype(int).astype(str).apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d'))
df_one_stn = df_one_stn.drop(columns=['YYYYMMDD'])
ds_one_stn = df_one_stn.set_index('time').to_xarray()
ds_one_stn

In [7]:
stn = np.unique(ds_one_stn['STN'].values)
assert stn.shape[0]==1

ds_one_stn = ds_one_stn.expand_dims('station').assign_coords(station=stn)
ds_one_stn

In [13]:
ds

In [14]:
ds.merge(ds_one_stn, combine_attrs='override')