In [1]:
import os
import tarfile
import re
import numpy as np
import h5py
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import zipfile
from datetime import datetime, timedelta

from netCDF4 import Dataset
from mpl_toolkits.basemap import Basemap

from batchcreator import minmax
from batchcreator import DataGenerator as dg
import tensorflow as tf
import config

# Aart Radar Dataset

In [2]:
# Root directory of the Aart radar dataset
path_aart = '/nobackup_1/users/schreurs/project_GAN/dataset_aart'

# Sorted names of every sub-directory directly below the dataset root
folders = sorted(
    name for name in os.listdir(path_aart)
    if os.path.isdir(os.path.join(path_aart, name))
)

## Unpack zip files

In [4]:
# Testing path: show where the first archive is expected and where it would be unpacked
unpack_folder = '/'.join((path_aart, folders[0]))
zip_path = '/'.join((unpack_folder, folders[0] + '.zip'))
print("zip path: ", zip_path)
print("unpack in folder: ", unpack_folder)

zip path:  /nobackup/users/schreurs/project_GAN/dataset_aart/2009/2009.zip
unpack in folder:  /nobackup/users/schreurs/project_GAN/dataset_aart/2009


In [7]:
def unzip():
    '''
    Extracts the archive <path_aart>/<folder>/<folder>.zip into <path_aart>/<folder>
    for every entry of the global list folders.
    '''
    for name in tqdm(folders):
        target_dir = '/'.join((path_aart, name))
        archive_path = '/'.join((target_dir, name + '.zip'))

        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(target_dir)
#unzip()

## Convert NC to npy 

NumPy arrays take more memory than NC files. Therefore, I will only convert one year of data to NumPy for now.

In [2]:
def get_filesnames(start_dt, end_dt):
    '''
    Returns the timestamps from start_dt up to, but not including, end_dt
    in steps of 5 minutes, formatted as YYYYMMDDHHMM strings.
    start_dt: starting date time (first timestamp that is returned)
    end_dt: end date time (exclusive)
    '''
    step = timedelta(minutes=5)
    # np.arange produces the half-open range [start_dt, end_dt)
    timestamps = np.arange(start_dt, end_dt, step).astype(datetime)
    return [ts.strftime('%Y%m%d%H%M') for ts in timestamps]

def nc2npy(in_path, out_path, year=2018, as_int=True, overwrite=False, label_dir=None, preprocess=False, filenames=None):
    '''
    Converts nc files to numpy files and optionally stores a no-rain label per file.
    in_path: directory that holds one sub-directory per year (named YYYY) with the .nc files
    out_path: directory to store numpy files
    year: indicates which year to convert. Only used when filenames is None
    as_int: converts the decimal numbers to integers (values are multiplied by 100 first).
            Note that the data is discrete, therefore integers might be more suitable than floats
    overwrite: If true then overwrite previously converted data. If false, then skips files that
                are already present in the output directory
    label_dir: directory to store no rain labels. If none then do not label the data
    preprocess: If true then preprocesses the data with perform_preprocessing
                (converting rain to dbz, normalize to [0,1] and rescale to 256x256)
    filenames: optional list of timestamps (formatted as YYYYMMDDHHMM) to convert.
               When given, year is ignored and the output is written to config.dir_aart_prep
               instead of out_path

    Files that can not be read or converted are reported on stdout and skipped; if label_dir
    is set they are labelled as no-rain. Nothing is returned.
    '''
    if filenames is not None:
        # NOTE(review): an explicit file list silently overrides the out_path argument.
        # Kept for backward compatibility - confirm whether this is intended.
        out_path = config.dir_aart_prep
    else:
        # Get filenames of the corresponding year. get_filesnames excludes the end
        # timestamp, so the range ends at the first timestamp of the next year
        # to make sure that 31 December 23:55 is included.
        start = datetime(year, 1, 1, 0, 0)
        end = datetime(year + 1, 1, 1, 0, 0)
        filenames = get_filesnames(start, end)

    # Create directories if they do not exist
    os.makedirs(out_path, exist_ok=True)
    if label_dir:
        os.makedirs(label_dir, exist_ok=True)

    # Look up what is already converted in the actual output directory (as a set for
    # fast membership tests) instead of relying on a global defined in another cell
    already_converted = set() if overwrite else set(os.listdir(out_path))

    # Threshold below which a pixel counts as dry, expressed in the units of the raw
    # rain values (these are multiplied by 100 when as_int is set)
    no_rain_thresh = 30 if as_int else 0.3

    for filename in tqdm(filenames):
        if filename + '.npy' in already_converted:
            # Skip this file if already processed,
            # go to next file in list
            continue

        year_str = filename[:4]
        # os.path.join works regardless of whether in_path ends with a path separator
        path_f = os.path.join(in_path, year_str, config.prefix_aart + filename + '.nc')
        label_fn = label_dir + '/{}.npy'.format(filename) if label_dir else None

        try:
            with Dataset(path_f, mode='r') as ds:
                # Read the variable from disk only once
                image = ds['image1_image_data'][:][0]
                rain = image.data
                mask = image.mask

            # Apply mask: masked pixels are set to zero
            rain = rain * ~mask

            if as_int:
                # NOTE(review): astype(int) truncates towards zero; consider rounding
                # if the scaled values are not exactly integral - TODO confirm
                rain *= 100
                rain = rain.astype(int)

            if label_dir:
                # The label has to be derived from the raw rain values: after
                # preprocessing the values are rescaled and the threshold is meaningless
                nr_rainy_pixels = np.sum(rain >= no_rain_thresh)
                no_rain_label = nr_rainy_pixels == 0

            if preprocess:
                rain = perform_preprocessing(rain)

            np.save(out_path + '/{}.npy'.format(filename), rain)
            if label_dir:
                np.save(label_fn, no_rain_label)
        except Exception as e:
            # Best effort: report the problem and continue with the next file.
            # No array is written for a failed file, it is only labelled as no-rain.
            print(e)
            if label_dir:
                np.save(label_fn, True)
            print('Error: could not convrt the file {} to numpy'.format(filename))

In [7]:
def perform_preprocessing(y, downscale256=True):
    '''
    Preprocesses a single image.
    y: image to preprocess. It is passed to minmax with convert_to_dbz=True
       (presumably a 2D rain map of shape h,w - TODO confirm)
    downscale256: If true then the image is cropped with DataGenerator.crop_center
                  (cropx=384, cropy=384) and resized to 256x256 with tf.image.resize

    Returns the scaled image with an extra trailing channel axis. With downscale256
    this is the output of tf.image.resize, otherwise the output of np.expand_dims.
    '''
    scaled = minmax(y, tanh=False, undo=False, convert_to_dbz = True)
    # Append the channel axis: h,w -> h,w,c
    scaled = np.expand_dims(scaled, axis=-1)

    if not downscale256:
        return scaled

    # The cropping function expects two extra leading axes: h,w,c -> 1,1,h,w,c
    batched = scaled[np.newaxis, np.newaxis]
    # crop_center is called on the class itself, which is passed in place of self
    cropped = dg.crop_center(dg, batched, cropx=384, cropy=384)

    # Drop the two leading axes again before resizing
    return tf.image.resize(cropped[0][0], (256, 256))

In [8]:
# Input (.nc) and output (.npy) directories are taken from the project config
path_aart = config.dir_aart
path_aart_prep = config.dir_aart_prep

# Get files that are already converted to numpy
output_files = sorted(
    name for name in os.listdir(path_aart_prep)
    if os.path.isfile(os.path.join(path_aart_prep, name))
)

nr_converted = len(output_files)
print(nr_converted)
# One file per 5 minutes gives 288 files per day; a year is counted as 365 days
print('Approx {:.2f} years of data'.format(nr_converted/288/365))

4756
Approx 0.05 years of data


In [None]:
# Load all target files in the training and validation set
# (second column of the stored arrays)
fn_aart_train = np.load('train2015_2018.npy', allow_pickle = True)[:,1]
fn_aart_val = np.load('val2019.npy', allow_pickle = True)[:,1]

# Every entry is itself a sequence of filenames: collect them in one flat list
filenames_aart = []
for target_sequence in np.append(fn_aart_train, fn_aart_val):
    filenames_aart.extend(target_sequence)

nc2npy(path_aart, path_aart_prep, overwrite = True, preprocess=True, filenames=filenames_aart)

 65%|██████▌   | 3104/4756 [03:06<01:35, 17.31it/s]