# Training a neural network in PyTorch
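This notebook loads cloud base height data, preprocesses it, and demonstrates training a classifier in PyTorch that predicts which height layer contains the cloud base.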

In [1]:
# define imports
import os
import pathlib
import re
import iris

import numpy as np

import torch

In [2]:
# define settings to be used for the notebook

# When True, load a small hard-coded list of data files instead of every
# NetCDF file found in the scratch directory.
REDUCE_DATA_AMOUNT = True
# NOTE(review): not referenced in the cells visible here - presumably switches
# on a hyperparameter search in a later section; confirm before relying on it.
HYPERPARAMETER_TUNING = False

## Loading in the Cloud Base Height Data

In [3]:
if REDUCE_DATA_AMOUNT:
    # Small fixed subset of files for fast development runs.
    # NOTE(review): these are absolute paths into one user's scratch space, so
    # this branch only works where that directory is readable.
    paths_to_load = [
        '/scratch/hsouth/cbh_data/20160101T0000Z_glm_pa004.nc',
        '/scratch/hsouth/cbh_data/20160101T0000Z_glm_pa010.nc',
        '/scratch/hsouth/cbh_data/20160101T0000Z_glm_pa018.nc',
        '/scratch/hsouth/cbh_data/20160101T0000Z_glm_pa022.nc',
    ]
else:
    # Load every NetCDF file in the cbh_data directory of the user's scratch space.
    file_directory = pathlib.Path(os.environ['SCRATCH']) / 'cbh_data'
    # Compare the real file suffix rather than using a regex: an unescaped,
    # unanchored pattern such as r'.nc\b' also accepts 'sync.txt' or 'data.nc.bak'.
    # Sorting makes the load order independent of the directory listing order.
    paths_to_load = sorted(
        str(path) for path in file_directory.iterdir() if path.suffix == '.nc'
    )

print('Find files complete, list of paths:', paths_to_load)

cubes = iris.load(paths_to_load)

Find files complete, list of paths: ['/scratch/hsouth/cbh_data/20160101T0000Z_glm_pa004.nc', '/scratch/hsouth/cbh_data/20160101T0000Z_glm_pa010.nc', '/scratch/hsouth/cbh_data/20160101T0000Z_glm_pa018.nc', '/scratch/hsouth/cbh_data/20160101T0000Z_glm_pa022.nc']


In [4]:
# show cubes: prints one summary line per loaded cube (name, units, dimensions)
print(cubes)

0: m01s05i250 / (unknown)              (time: 4; model_level_number: 70; latitude: 480; longitude: 640)
1: cloud_volume_fraction_in_atmosphere_layer / (1) (time: 4; model_level_number: 70; latitude: 480; longitude: 640)
2: air_pressure / (Pa)                 (time: 4; model_level_number: 70; latitude: 480; longitude: 640)
3: air_temperature / (K)               (time: 4; model_level_number: 70; latitude: 480; longitude: 640)
4: convective_rainfall_flux / (kg m-2 s-1) (time: 4; latitude: 480; longitude: 640)
5: convective_snowfall_flux / (kg m-2 s-1) (time: 4; latitude: 480; longitude: 640)
6: specific_humidity / (kg kg-1)       (time: 4; model_level_number: 70; latitude: 480; longitude: 640)
7: stratiform_rainfall_flux / (kg m-2 s-1) (time: 4; latitude: 480; longitude: 640)
8: stratiform_snowfall_flux / (kg m-2 s-1) (time: 4; latitude: 480; longitude: 640)
9: upward_air_velocity / (m s-1)       (time: 4; model_level_number: 70; latitude: 480; longitude: 640)


## Preprocess the data for training

In [5]:
# Select the variables needed for the task: temperature, pressure and humidity
# as inputs, and cloud volume fraction as the target.
list_of_input_cubes = [ 'air_temperature',
                         'air_pressure',
                         'specific_humidity']
target_cube_name = ['cloud_volume_fraction_in_atmosphere_layer']

# Match on cube.name() for both selections: it returns the standard_name when
# set and otherwise falls back to the long_name, so inputs and target no longer
# depend on which of the two attributes a given cube happens to carry.
# The selected cubes keep the order they have in `cubes`, not the order of the
# name lists above.
target_cube = iris.cube.CubeList(cube for cube in cubes if cube.name() in target_cube_name)
input_cubes = iris.cube.CubeList(cube for cube in cubes if cube.name() in list_of_input_cubes)

# Fail here, with a clear message, rather than several cells later if a
# variable is missing or was loaded as more than one cube.
assert len(target_cube) == len(target_cube_name), (
    f"expected {len(target_cube_name)} target cube(s), found {len(target_cube)}")
assert len(input_cubes) == len(list_of_input_cubes), (
    f"expected {len(list_of_input_cubes)} input cube(s), found {len(input_cubes)}")

# verify success
print("target cube:\n",target_cube, '\n')
print("input cubes:\n",input_cubes)

target cube:
 0: cloud_volume_fraction_in_atmosphere_layer / (1) (time: 4; model_level_number: 70; latitude: 480; longitude: 640) 

input cubes:
 0: air_pressure / (Pa)                 (time: 4; model_level_number: 70; latitude: 480; longitude: 640)
1: air_temperature / (K)               (time: 4; model_level_number: 70; latitude: 480; longitude: 640)
2: specific_humidity / (kg kg-1)       (time: 4; model_level_number: 70; latitude: 480; longitude: 640)


In [6]:
# Flatten time and lat/long down to sample number
# defining function as preprocessing is applied to both input and target
def flatten_cubes_with_numpy(cube_list):
    """Stack cubes and merge their time, latitude and longitude axes into one sample axis.

    Parameters
    ----------
    cube_list : iterable
        Objects exposing a ``.data`` array of shape (time, height, lat, long).
        All arrays must share the same shape.

    Returns
    -------
    numpy.ndarray
        Array of shape (samples, height, cube_num) where
        samples = time * lat * long. When ``cube_list`` holds exactly one cube
        the trailing cube axis is dropped, giving (samples, height).

    Raises
    ------
    ValueError
        If the stacked data is not 5-dimensional (for example an empty list).
    """
    cube_array = np.array([cube.data for cube in cube_list])
    if cube_array.ndim != 5:
        raise ValueError(
            "expected cubes with (time, height, lat, long) data, "
            f"got stacked shape {cube_array.shape}")

    cube_num, time, height, lat, long = cube_array.shape

    # swap axis of time and height so that the reshape below merges only
    # time/lat/long and leaves height as its own axis
    cube_array = cube_array.transpose(0, 2, 1, 3, 4)
    cubes_flattened = np.reshape(cube_array, (cube_num, height, time * lat * long))

    # (cube_num, height, samples) -> (samples, height, cube_num)
    cube_to_return = cubes_flattened.T
    # Drop only the cube axis, and only when there is a single cube. A bare
    # squeeze() would also remove the sample or height axis if it had length 1.
    if cube_num == 1:
        cube_to_return = cube_to_return.squeeze(axis=-1)
    return cube_to_return

# Inputs and target go through the same flattening, so row i of both arrays
# refers to the same (time, lat, long) sample.
# input_array is (samples, height, variables); target_array is (samples, height)
# because its single-cube axis is squeezed away.
input_array = flatten_cubes_with_numpy(input_cubes)
target_array = flatten_cubes_with_numpy(target_cube)

# print("verify squeeze", target_array.shape)

(3, 4, 70, 480, 640)
(1, 4, 70, 480, 640)


## Preprocess the data into ML algorithm inputs

In [7]:
# Build the classification target.
# A height layer counts as containing cloud when its cloud volume fraction
# exceeds 2 oktas out of a possible 8.
cloud_threshold = 2 / 8
# (sample, height) coordinates of every above-threshold cell, listed in
# row-major order, so each sample's entries appear with ascending height index.
cloud_over_threshold = np.nonzero(target_array > cloud_threshold)
# For every sample with at least one cloudy layer, the position inside the
# coordinate arrays of its first entry, i.e. its lowest-index cloudy layer.
# Samples with no cloud do not appear here at all.
first_duplicate_indicies = np.unique(cloud_over_threshold[0], return_index=True)[1]
print("Number of cloud bases found:",first_duplicate_indicies.shape)
print("Out of samples:", target_array.shape[0])

Number of cloud bases found: (962499,)
Out of samples: 1228800


In [8]:
# For samples where no cloud base was found, the final height layer is used as
# a "no cloud base" marker. This relies on no cloud over threshold appearing
# in that layer in the data.

# verify the claim that no cloud bases appear in the final layer
# (can be strengthened to: no clouds exist in the final layer)
print("list of clouds at final height level:", np.where(target_array[:,-1]>cloud_threshold))

# encode the cloud base of each sample as a one-hot vector over height layers
one_hot_encoded_bases = np.zeros(target_array.shape)
base_samples = cloud_over_threshold[0][first_duplicate_indicies]
base_layers = cloud_over_threshold[1][first_duplicate_indicies]
one_hot_encoded_bases[base_samples, base_layers] = 1

# mark the end (final layer) if no cloud base detected; a vectorised boolean
# NOT replaces the per-element Python lambda wrapped in np.vectorize
no_base_found = ~one_hot_encoded_bases.any(axis=1)
one_hot_encoded_bases[no_base_found, -1] = 1

# Now reduce vectors as if each height layer is treated as a class where the
# model will predict, onehot -> class label e.g. 0,0,1,0 -> 2
class_label_encoded_bases = np.argmax(one_hot_encoded_bases,axis=1)

list of clouds at final height level: (array([], dtype=int64),)


In [16]:
# preprocess the inputs
# Min-max normalise each variable using its minimum and maximum taken over all
# samples and height levels. Reducing over the (sample, height) axes with
# keepdims=True leaves shape (1, 1, n_variables), which broadcasts against
# input_array directly: no transposes, no hard-coded variable count, and no
# use of the ndarray.ptp method (removed in NumPy 2.0).
feature_min = input_array.min(axis=(0, 1), keepdims=True)
feature_range = input_array.max(axis=(0, 1), keepdims=True) - feature_min
# a constant variable has zero range; divide by 1 so it maps to 0 rather than NaN
feature_range[feature_range == 0] = 1
data_x = (input_array - feature_min) / feature_range

# verify dimensions
assert data_x.shape == input_array.shape

(1228800, 70, 3)


In [11]:
# verify input and output shapes: both should share the same leading
# (sample, height) dimensions
print("Input dim:", data_x.shape)
print("Output dim:", one_hot_encoded_bases.shape)

Input dim: (1228800, 70, 3)
Output dim: (1228800, 70)


In [15]:
# create an extra positional encoding optionally for input use

sample_num, height_dim = data_x.shape[:2]

# Height level indices 0 .. height_dim-1, created directly in the dtype of the
# other features so that no full-size integer intermediate is allocated.
height_levels = np.arange(height_dim, dtype=data_x.dtype)
# Repeat for every sample, with a trailing axis so height acts as one feature:
# shape (sample_num, height_dim, 1).
height_position_vector = np.repeat(
    height_levels[np.newaxis, :, np.newaxis], sample_num, axis=0)
assert height_position_vector.shape == (sample_num, height_dim, 1)

# The height feature is deliberately not concatenated onto data_x here; the
# concat is left for within the model, after producing an embedding.

# verify datatypes
print("input dtype", data_x.dtype)
print("height encoding dtype", height_position_vector.dtype)

input dtype float32
height encoding dtype float32


## Split data into training, validation, and testing sets and load it into library-specific data structures

## Define the network

## Perform the network initialization and training

## Display and evaluate results