In [1]:
import os
import tarfile
import re
import numpy as np
import h5py
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import zipfile
from datetime import datetime, timedelta

from netCDF4 import Dataset
from mpl_toolkits.basemap import Basemap

from batchcreator import minmax
from batchcreator import DataGenerator as dg
import tensorflow as tf
import config

Pysteps configuration file found at: /usr/people/schreurs/.local/lib/python3.9/site-packages/pysteps/pystepsrc



# Aart Radar Dataset

In [2]:
# Root directory of the Aart radar dataset
path_aart = '/nobackup_1/users/schreurs/project_GAN/dataset_aart'

# Alphabetically sorted names of the sub-directories directly under path_aart
folders = sorted(
    entry for entry in os.listdir(path_aart)
    if os.path.isdir(os.path.join(path_aart, entry))
)

## Unpack zip files

In [4]:
# Sanity-check the paths that will be used for the first folder before unpacking everything
unpack_folder = f'{path_aart}/{folders[0]}'
zip_path = f'{unpack_folder}/{folders[0]}.zip'
print("zip path: ", zip_path)
print("unpack in folder: ", unpack_folder)

zip path:  /nobackup/users/schreurs/project_GAN/dataset_aart/2009/2009.zip
unpack in folder:  /nobackup/users/schreurs/project_GAN/dataset_aart/2009


In [7]:
def unzip():
    '''
    Extracts the archive `<folder>/<folder>.zip` of every entry in `folders`
    into that same folder.
    Relies on the notebook-level variables `path_aart` and `folders`.
    '''
    for name in tqdm(folders):
        target_dir = f'{path_aart}/{name}'
        archive_path = f'{target_dir}/{name}.zip'

        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(target_dir)
# Call left disabled on purpose; uncomment to unpack all archives
#unzip()

## Convert NC to npy 

Numpy arrays take more memory than nc files. Therefore, I will only convert 1 year of data to numpy for now.

In [3]:
def get_filesnames(start_dt, end_dt):
    '''
    Builds the list of timestamp strings (format YYYYmmddHHMM) at a 5 minute
    interval, starting at start_dt and stopping before end_dt.
    start_dt: first date time (included)
    end_dt: last date time (excluded, as np.arange uses a half-open interval)
    '''
    step = timedelta(minutes=5)
    # np.arange yields datetime64 values; cast back to python datetime objects
    timestamps = np.arange(start_dt, end_dt, step).astype(datetime)
    return [format(stamp, '%Y%m%d%H%M') for stamp in timestamps]

def nc2npy(in_path, out_path, year=2018, overwrite=False, preprocess=False, filenames=None):
    '''
    Converts nc files to numpy files (one .npy file per timestamp).
    in_path: directory that contains one sub-directory per year with the .nc files
    out_path: directory to store numpy files.
              Note: when `filenames` is given this argument is ignored and
              config.dir_aart_prep is used instead.
    year: indicates which year to convert (only used when `filenames` is None)
    overwrite: If true then overwrite previously converted data. If false, then skips files that
                already exist in the output directory
    preprocess: If true then the rain field is passed through perform_preprocessing
                before it is saved
    filenames: optional list of timestamps (YYYYmmddHHMM) to convert.
               If None, every 5 minute timestamp of `year` is converted.
    Files that cannot be read or converted are reported with print and skipped.
    '''
    if filenames is not None:
        out_path = config.dir_aart_prep
    else:
        # get_filesnames excludes the end date time, so stop at the first
        # timestamp of the next year to include Dec 31st 23:55
        start = datetime(year, 1, 1, 0, 0)
        end = datetime(year + 1, 1, 1, 0, 0)
        filenames = get_filesnames(start, end)

    # Create directory if it does not exist
    os.makedirs(out_path, exist_ok=True)

    # Look up the already converted files in the actual output directory
    # (instead of relying on a notebook global that may be stale or undefined).
    # A set keeps the membership test cheap for long filename lists.
    already_converted = set() if overwrite else set(os.listdir(out_path))

    for filename in tqdm(filenames):
        if filename + '.npy' in already_converted:
            # Skip this file if already processed,
            # go to next file in list
            continue

        year_str = filename[:4]
        # os.path.join works with and without a trailing slash on in_path
        path_f = os.path.join(in_path, year_str, config.prefix_aart + filename + '.nc')

        try:
            with Dataset(path_f, mode='r') as ds:
                # Read the variable once; it holds both the values and the mask
                image = ds['image1_image_data'][:][0]
                rain = image.data
                mask = image.mask

                # Apply mask (masked pixels become 0)
                rain = rain * ~mask
                # convert to mm/h from mm/5min
                rain = rain*12
                if preprocess:
                    rain = perform_preprocessing(rain)
            np.save(out_path + '/{}.npy'.format(filename), rain)
        except Exception as e:
            # Best effort: report the problem and continue with the next file
            print(e)

In [4]:
def perform_preprocessing(y, downscale256=True):
    '''
    Preprocesses a single rain field.
    y: 2D rain field (h,w)
    downscale256: If true, center-crop to 384x384 and resize to 256x256
    Returns the field with an added channel axis (h,w,c). When downscale256 is
    true the result is whatever tf.image.resize returns.
    '''
    # convert to dbz and normalize to values between 0 and 1
    normalised = minmax(y, norm_method='minmax', undo=False, convert_to_dbz = True)

    # Add the channel axis: h,w -> h,w,c
    image = np.expand_dims(normalised, axis=-1)
    if not downscale256:
        return image

    # Temporarily add two leading axes so that the cropping function works: h,w,c -> 1,1,h,w,c
    batched = np.expand_dims(np.expand_dims(image, axis=0), axis=0)
    # Make the image square. crop_center is called on the class itself,
    # which is why the class is passed in place of an instance.
    cropped = dg.crop_center(dg, batched, cropx=384, cropy=384)

    # Drop the two temporary axes again and rescale
    return tf.image.resize(cropped[0][0], (256, 256))

In [5]:
path_aart = config.dir_aart
path_aart_prep = config.dir_aart_prep

# Sorted names of the files that are already present in the preprocessed directory
output_files = sorted(
    name for name in os.listdir(path_aart_prep)
    if os.path.isfile(os.path.join(path_aart_prep, name))
)

n_converted = len(output_files)
print(n_converted)
# 288 files per day (one every 5 minutes), 365 days per year
print('Approx {:.2f} years of data'.format(n_converted/288/365))

70462
Approx 0.67 years of data


In [6]:
# Preprocess the data of the datasets with a 30 minute interval and 3 target frames.
# Column 1 of each dataset holds the lists of target filenames.
fn_aart_train = np.load('datasets/train2008_2018_3y_30m.npy', allow_pickle = True)[:,1]
fn_aart_val = np.load('datasets/val2019_3y_30m.npy', allow_pickle = True)[:,1]
filenames_aart = np.append(fn_aart_train, fn_aart_val)

# Flatten the list of lists into one list of filenames
flat_filenames = []
for sublist in filenames_aart:
    flat_filenames.extend(sublist)
filenames_aart = flat_filenames
print(len(filenames_aart))
# Remove duplicate filenames and sort them:
filenames_aart = sorted(set(filenames_aart))
print(len(filenames_aart))
print(filenames_aart[0], filenames_aart[-1])

221025
80710
200801031330 201912301330


In [7]:
# Convert (with preprocessing) every file listed in filenames_aart; files that were converted before are skipped
nc2npy(path_aart, path_aart_prep, overwrite = False, preprocess=True, filenames=filenames_aart)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rain = ds['image1_image_data'][:][0].data
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  mask = ds['image1_image_data'][:][0].mask
100%|██████████| 80710/80710 [12:37<00:00, 106.53it/s] 


In [36]:
# Convert the 3y dataset to 1y
data = np.load('train2015_2018_3y_30m.npy', allow_pickle = True)
y = [[ys[0]] for ys in data[:,1]]
data[:,1] = y
np.save('train2015_2018_1y_30m.npy', data)

In [37]:
# Convert the 3y dataset to 1y
data = np.load('val2019_3y_30m.npy', allow_pickle = True)
y = [[ys[0]] for ys in data[:,1]]
data[:,1] = y
np.save('val2019_1y_30m.npy', data)

In [7]:
# Load the 2008-2018 training dataset (object array, therefore allow_pickle is needed)
# NOTE(review): path has no 'datasets/' prefix, unlike other cells - confirm the working directory
data = np.load('train2008_2018_3y_30m.npy', allow_pickle = True)

In [11]:
# Load the 2015-2018 training dataset and show its shape
# NOTE(review): reuses the name `data` that other cells assign from a different file
data = np.load('datasets/train2015_2018_3y_30m.npy', allow_pickle = True)
data.shape

(25409, 2)

In [8]:
# Shape of whichever dataset is currently bound to `data` (depends on cell execution order)
data.shape

(10493, 2)

In [48]:
# Drop every sample whose first filename in column 1 starts with the year 2008
without2008 = np.array(
    [sample for sample in data if sample[1][0][:4] != '2008']
)
without2008.shape

(9351, 2)

In [49]:
# Inspect the first remaining sample
without2008[0]

array([list(['200901180135', '200901180140', '200901180145', '200901180150', '200901180155', '200901180200']),
       list(['200901180230', '200901180300', '200901180330'])],
      dtype=object)

In [51]:
# Store the filtered dataset (samples with a 2008 target removed)
np.save('train2009_2018_3y_30m.npy', without2008)

In [15]:
from batchcreator import get_list_IDs

# For testing, preprocess some more files with different y interval settings
# NOTE(review): the later cell that filters list_IDs shows paths from 2008-2017, so it was run on a different list_IDs than the one created here
start_dt = datetime(2020,1,1,0,0)
end_dt =  datetime(2021,1,1,0,0)

list_IDs = get_list_IDs(start_dt, end_dt, x_seq_size=6, y_seq_size=3, filter_no_rain='avg0.01mm', y_interval=30)
print(list_IDs[-1])
print(len(list_IDs))

[Errno 2] No such file or directory: '/nobackup_1/users/schreurs/project_GAN/rtcor_heavy_rain_labels/2020/03/202003271505.npy'
[Errno 2] No such file or directory: '/nobackup_1/users/schreurs/project_GAN/rtcor_heavy_rain_labels/2020/07/202007160430.npy'
[Errno 2] No such file or directory: '/nobackup_1/users/schreurs/project_GAN/rtcor_heavy_rain_labels/2020/07/202007160435.npy'
[Errno 2] No such file or directory: '/nobackup_1/users/schreurs/project_GAN/rtcor_heavy_rain_labels/2020/10/202010011130.npy'
[Errno 2] No such file or directory: '/nobackup_1/users/schreurs/project_GAN/rtcor_heavy_rain_labels/2020/10/202010062040.npy'
[Errno 2] No such file or directory: '/nobackup_1/users/schreurs/project_GAN/rtcor_heavy_rain_labels/2020/10/202010190845.npy'
[Errno 2] No such file or directory: '/nobackup_1/users/schreurs/project_GAN/rtcor_heavy_rain_labels/2020/10/202010200020.npy'
[Errno 2] No such file or directory: '/nobackup_1/users/schreurs/project_GAN/rtcor_heavy_rain_labels/2020/11/20

In [13]:
import os
import config as conf

# Keep only the samples for which every referenced rtcor .h5 file exists on disk
list_IDs2 = []
print(len(list_IDs))
for io in tqdm(list_IDs):
    n_missing = 0
    # io holds the lists of timestamps that belong to one sample
    for fs in io:
        for f in fs:
            # Files are stored as <dir_rtcor><year>/<month>/<prefix><timestamp>.h5
            f_path = conf.dir_rtcor + f'{f[:4]}/{f[4:6]}/{conf.prefix_rtcor}{f}.h5'
            if os.path.isfile(f_path):
                continue
            # Report every missing file; the rest of the sample is still checked
            print(f_path)
            n_missing += 1
    if n_missing == 0:
        list_IDs2.append(io)

print(len(list_IDs2))

  3%|▎         | 2304/67153 [00:00<00:02, 23034.75it/s]

67153


 11%|█         | 7105/67153 [00:00<00:02, 23811.72it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/08/RAD_NL25_RAC_5M_200808150730.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/08/RAD_NL25_RAC_5M_200808150730.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/08/RAD_NL25_RAC_5M_200808150730.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/09/RAD_NL25_RAC_5M_200809231200.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/09/RAD_NL25_RAC_5M_200809231200.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/09/RAD_NL25_RAC_5M_200809231330.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/09/RAD_NL25_RAC_5M_200809231400.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/09/RAD_NL25_RAC_5M_200809231330.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/09/RAD_NL25_RAC_5M_200809231400.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/10/RAD_NL25_RAC_5M_200810080800.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2008/1

 29%|██▊       | 19285/67153 [00:01<00:02, 16909.65it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2010/05/RAD_NL25_RAC_5M_201005310000.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2010/05/RAD_NL25_RAC_5M_201005310000.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2010/05/RAD_NL25_RAC_5M_201005310030.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2010/05/RAD_NL25_RAC_5M_201005310000.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2010/05/RAD_NL25_RAC_5M_201005310030.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2010/05/RAD_NL25_RAC_5M_201005310100.h5


 33%|███▎      | 22420/67153 [00:01<00:06, 6735.61it/s] 

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2011/04/RAD_NL25_RAC_5M_201104041500.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2011/04/RAD_NL25_RAC_5M_201104041530.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2011/04/RAD_NL25_RAC_5M_201104041600.h5


 45%|████▍     | 30135/67153 [00:03<00:04, 7503.68it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2012/05/RAD_NL25_RAC_5M_201205160600.h5


 47%|████▋     | 31791/67153 [00:03<00:04, 7646.86it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2012/08/RAD_NL25_RAC_5M_201208140830.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2012/08/RAD_NL25_RAC_5M_201208140830.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2012/08/RAD_NL25_RAC_5M_201208140830.h5


 50%|█████     | 33749/67153 [00:03<00:04, 8113.04it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2012/11/RAD_NL25_RAC_5M_201211061230.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2012/11/RAD_NL25_RAC_5M_201211061230.h5


 53%|█████▎    | 35326/67153 [00:03<00:04, 6838.06it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2013/02/RAD_NL25_RAC_5M_201302091500.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2013/02/RAD_NL25_RAC_5M_201302091500.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2013/02/RAD_NL25_RAC_5M_201302091500.h5


 68%|██████▊   | 45950/67153 [00:04<00:01, 12332.50it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2014/08/RAD_NL25_RAC_5M_201408121330.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2014/08/RAD_NL25_RAC_5M_201408121330.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2014/08/RAD_NL25_RAC_5M_201408121330.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2014/08/RAD_NL25_RAC_5M_201408261330.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2014/08/RAD_NL25_RAC_5M_201408261330.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2014/08/RAD_NL25_RAC_5M_201408261330.h5


 75%|███████▍  | 50176/67153 [00:05<00:03, 4982.05it/s] 

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/06/RAD_NL25_RAC_5M_201506230530.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/06/RAD_NL25_RAC_5M_201506230530.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/06/RAD_NL25_RAC_5M_201506230530.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/07/RAD_NL25_RAC_5M_201507291600.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/07/RAD_NL25_RAC_5M_201507291600.h5


 78%|███████▊  | 52394/67153 [00:05<00:02, 5718.38it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/09/RAD_NL25_RAC_5M_201509141730.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/09/RAD_NL25_RAC_5M_201509141730.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/09/RAD_NL25_RAC_5M_201509141800.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/09/RAD_NL25_RAC_5M_201509141730.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/09/RAD_NL25_RAC_5M_201509141800.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2015/09/RAD_NL25_RAC_5M_201509141830.h5


 92%|█████████▏| 61808/67153 [00:07<00:00, 5994.05it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/03/RAD_NL25_RAC_5M_201703180430.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/03/RAD_NL25_RAC_5M_201703180430.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/03/RAD_NL25_RAC_5M_201703180430.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/03/RAD_NL25_RAC_5M_201703180530.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/03/RAD_NL25_RAC_5M_201703181400.h5


 95%|█████████▌| 63796/67153 [00:07<00:00, 4102.66it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/05/RAD_NL25_RAC_5M_201705201500.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/05/RAD_NL25_RAC_5M_201705201500.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/05/RAD_NL25_RAC_5M_201705201500.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/07/RAD_NL25_RAC_5M_201707191800.h5


 98%|█████████▊| 65889/67153 [00:08<00:00, 6565.75it/s]

/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/08/RAD_NL25_RAC_5M_201708021200.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/08/RAD_NL25_RAC_5M_201708021200.h5
/nobackup_1/users/schreurs/project_GAN/dataset_rtcor2/2017/08/RAD_NL25_RAC_5M_201708021200.h5


100%|██████████| 67153/67153 [00:08<00:00, 8025.02it/s]

67105





In [14]:
# Each sample holds lists of different lengths, so the nested list is ragged.
# Build the object array explicitly: implicit ragged conversion is deprecated
# and raises a ValueError in newer numpy versions.
# The file has to be loaded with allow_pickle=True.
np.save('datasets/train2008_2018_3y_30m', np.array(list_IDs2, dtype=object))

  return array(a, dtype, copy=False, order=order, subok=True)
