In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import warnings
# Silence every Python warning from this point on.
# NOTE(review): a blanket "ignore" also hides library diagnostics raised by the
# cells below — consider narrowing the filter to the specific category.
warnings.filterwarnings("ignore")

In [None]:
# Load the full source table. save_to_npz (defined in a later cell) reads its
# `ind`, `ts` and `dec` columns together with the feature / start / target columns.
data = pd.read_parquet('../data/data.pq')

In [None]:
def _pick_sample_group(data: pd.DataFrame, min_decs: int = 11, max_decs: int = 16) -> tuple:
    """Randomly pick one (ind, year) pair with ``min_decs``..``max_decs`` distinct ``dec`` values."""
    decs_per_group = data.groupby([data['ind'], data['ts'].dt.year])['dec'].nunique()
    eligible = decs_per_group[decs_per_group.between(min_decs, max_decs)]

    if eligible.empty:
        # The previous rejection loop would spin forever in this situation.
        raise ValueError(
            f'No (ind, year) group has between {min_decs} and {max_decs} distinct `dec` values'
        )

    # Uniform draw over the eligible pairs (same distribution as rejection sampling
    # over independently drawn `ind` and `year`, but guaranteed to terminate).
    return eligible.index[np.random.randint(len(eligible))]


def _remove_npz_with_nan(root: Path) -> None:
    """Delete every ``.npz`` archive under ``root`` whose ``v``, ``z0`` or ``z1`` contains NaN."""
    for path in tqdm(set(root.rglob('*.npz')), desc="Search data with NaN"):

        # Close the archive before unlinking: an open handle leaks a file
        # descriptor and prevents deletion on Windows.
        with np.load(path) as archive:
            has_nan = any(np.isnan(archive[name]).any() for name in ('v', 'z0', 'z1'))

        if has_nan:
            path.unlink()


def save_to_npz(data: pd.DataFrame, features: list, start: list, target: list, test_size: float = 0.1,
                out_dir: str = '../data/dataset') -> None:
    """Split ``data`` into per-group ``.npz`` samples under ``out_dir``.

    Rows are grouped by ``(ind, year of ts, dec)``. One randomly chosen
    ``(ind, year)`` pair that has 11 to 16 distinct ``dec`` values is held out
    as the ``sample`` split; the remaining groups are shuffled and divided
    into ``val`` (the first ``int(n * test_size)`` groups) and ``train``.

    Every group is written to ``<out_dir>/<split>/<ind>_<year>_<dec>.npz`` with
    the arrays ``v`` (all rows of ``features``), ``z0`` (first row of
    ``start``), ``z1`` (first row of ``target``) and the scalars ``ind``,
    ``year``, ``dec``. Afterwards every archive found under ``out_dir`` that
    contains NaN in ``v``, ``z0`` or ``z1`` is deleted.

    Args:
        data: Source table; must provide the columns ``ind``, ``ts``
            (datetime-like, ``.dt.year`` is used) and ``dec`` plus every
            column named in ``features``, ``start`` and ``target``.
        features: Column names stored as ``v``.
        start: Column names whose first row is stored as ``z0``.
        target: Column names whose first row is stored as ``z1``.
        test_size: Fraction of the non-sample groups assigned to ``val``.
        out_dir: Root directory of the generated dataset.

    Raises:
        ValueError: If no ``(ind, year)`` pair has 11 to 16 distinct ``dec`` values.

    Note:
        The hold-out choice and the train/val shuffle use NumPy's global RNG;
        call ``np.random.seed`` beforehand for a reproducible split.
    """
    out_dir = Path(out_dir)

    sample_ind, sample_year = _pick_sample_group(data)

    indexed = data.set_index(['ind', data.ts.dt.year, 'dec'])
    group_keys = indexed.index.unique()

    # Hold out exactly the chosen (ind, year) groups. The previous
    # `data.drop([ind, year])` passed two separate labels rather than the
    # (ind, year) key, so it never excluded just the sample group.
    is_sample = ((group_keys.get_level_values(0) == sample_ind)
                 & (group_keys.get_level_values(1) == sample_year))
    sample_idx = group_keys[is_sample].to_numpy()
    train_val_idx = group_keys[~is_sample].to_numpy()

    shuffled = np.random.permutation(train_val_idx.shape[0])
    break_point = int(len(shuffled) * test_size)
    all_idx = {
        'train': train_val_idx[shuffled[break_point:]],
        'val': train_val_idx[shuffled[:break_point]],
        'sample': sample_idx,
    }

    for key, keys_in_split in all_idx.items():

        split_dir = out_dir / key
        # np.savez_compressed does not create missing directories.
        split_dir.mkdir(parents=True, exist_ok=True)

        for ind, year, dec in tqdm(keys_in_split, desc=f'Saving {key} to npz'):

            # One index lookup per group instead of three.
            rows = indexed.loc[ind, year, dec]
            v = rows[features].to_numpy()
            z0 = rows[start].to_numpy()[0]
            z1 = rows[target].to_numpy()[0]

            np.savez_compressed(split_dir / f'{ind}_{year}_{dec}.npz',
                                v=v, z0=z0, z1=z1, ind=ind, year=year, dec=dec)

    _remove_npz_with_nan(out_dir)

In [None]:
# Columns whose rows are stored as the `v` array of every saved sample.
FEATURES_COLS = ['t2m', 'td2m', 'ff', 'R12', 'phi', 'air', 'soilw', 'precip', 'soiltype', 'covertype']

# Columns whose first row is stored as `z0` (start) and `z1` (target) of every saved sample.
START_VAL_COLS = ['val_1', 'val_2']
TARGET_COLS = ['val_1_next', 'val_2_next']

In [None]:
# Write the train / val / sample .npz files under ../data/dataset, using the
# default test_size of 0.1 for the validation share.
save_to_npz(data, FEATURES_COLS, START_VAL_COLS, TARGET_COLS)

Saving train to npz:   0%|          | 0/52800 [00:00<?, ?it/s]

Saving val to npz:   0%|          | 0/13200 [00:00<?, ?it/s]

Saving sample to npz:   0%|          | 0/16 [00:00<?, ?it/s]

Search data with NaN:   0%|          | 0/66016 [00:00<?, ?it/s]

In [3]:
import os

# Destructive clean-up: collect every .npz archive below ../data/dataset and
# delete it, i.e. wipe the dataset produced by save_to_npz.
alls = set(Path('../data/dataset').rglob('*.npz'))
for path in alls:
    os.remove(path)