# Notebook to investigate the som.py script

## Running the original code

In [None]:
# imports for running the original code:
import random
import sys
import xarray as xr
import logging
import numpy as np

In [None]:
# Progress class from original code
class Progress(object):
    """Single-line text progress bar written to ``sys.stdout``.

    Parameters
    ----------
    label : str
        prefix printed at the start of every progress line
    silent : bool
        when True, ``report`` and ``complete`` print nothing
    """

    def __init__(self, label, silent=False):
        self.label = label
        self.silent = silent
        # fraction shown by the most recent printed report (None until the first one)
        self.last_progress_frac = None

    def report(self, msg, progress_frac):
        """Redraw the bar if progress moved at least 1% since the last redraw."""
        if self.silent:
            return
        previous = self.last_progress_frac
        if previous is not None and not ((progress_frac - previous) >= 0.01):
            return
        self.last_progress_frac = progress_frac
        # percentage is capped at 100; one '#' is drawn per two percent
        percent = min(int(100 * progress_frac), 100)
        bar = "#" * (percent // 2)
        sys.stdout.write("\r%s %s %-05s %s" % (self.label, msg, str(percent) + "%", bar))
        sys.stdout.flush()

    def complete(self, msg):
        """Finish the progress line and print a closing message on its own line."""
        if self.silent:
            return
        sys.stdout.write("\n%s %s\n" % (self.label, msg))
        sys.stdout.flush()

In [None]:
# The SelfOrganisingMap class from the original code. Includes the fit_transform function
# which is the bit that takes the time and will be modified to try to speed up code.

class SelfOrganisingMap(object):

    """
    Train Self Organising Map (SOM) with cells arranged in a 2-dimensional rectangular layout

    Parameters
    ----------
    gridwidth : int
        number of cells across the grid
    gridheight : int
        number of cells down the grid
    iters : int
        the number of training iterations to use when training the SOM
    initial_neighbourhood : int
        the initial neighbourhood size

    Keyword Parameters
    ------------------
    verbose : bool
        whether to print progress messages
    seed : int
        random seed - set to produce repeatable results (any value other than
        None is used, including 0)
    """

    def __init__(self, gridwidth, gridheight, iters, initial_neighbourhood, verbose=False, seed=None):
        self.gridheight = gridheight
        self.gridwidth = gridwidth
        self.iters = iters
        self.initial_neighbourhood = initial_neighbourhood
        self.verbose = verbose
        self.rng = random.Random()
        # Compare with None rather than truthiness so that seed=0 is honoured.
        if seed is not None:
            self.rng.seed(seed)
        self.learn_rate_initial = 0.5
        self.learn_rate_final = 0.05

    def get_weights(self, outputIndex):
        """Return the weight vector of output cell ``outputIndex`` as a list."""
        return self.weights[:, outputIndex].tolist()

    def fit_transform(self, instances):
        """Train the SOM on ``instances`` and return each instance's winning cell.

        Parameters
        ----------
        instances : 2-D array, one row per instance and one column per input

        Returns
        -------
        array of shape (nr_instances, 2) holding the (x, y) grid coordinates of
        the best matching cell; rows containing NaN are skipped during
        training and get (NaN, NaN).
        """
        self.neighbour_limit = 0
        self.nr_inputs = instances.shape[1]
        self.nr_instances = instances.shape[0]
        self.instance_mask = ~np.any(np.isnan(instances), axis=1)

        self.nr_outputs = self.gridwidth * self.gridheight
        self.nr_weights = self.nr_outputs * self.nr_inputs

        # Draw all weights in one go. The flat list is reshaped row-major, so the
        # random draws land in the same cells as an element-by-element fill.
        self.weights = np.array(
            [self.rng.random() for _ in range(self.nr_weights)], dtype=np.float64
        ).reshape(self.nr_inputs, self.nr_outputs)

        p = Progress("SOM", silent=not self.verbose)
        p.report("Starting", 0.0)
        for iteration in range(self.iters):
            # learning rate decays linearly from the initial to the final value
            learn_rate = (1.0 - float(iteration) / float(self.iters)) \
                         * (self.learn_rate_initial - self.learn_rate_final) + self.learn_rate_final
            neighbour_limit = self.initial_neighbourhood - int(
                (float(iteration) / float((self.iters + 1))) * self.initial_neighbourhood)
            logging.debug("iter=%d (of %d) / learning-rate=%f / neighbourhood=%d",
                          iteration, self.iters, learn_rate, neighbour_limit)
            for i in range(self.nr_instances):
                if self.instance_mask[i]:
                    values = instances[i, :]
                    winner = self.compute_activations(values)
                    self.update_network(winner, values, neighbour_limit, learn_rate)

            p.report("Training neighbourhood=%d" % (neighbour_limit), (iteration + 1) / self.iters)

        p.complete("SOM Training Complete")

        # masked-out instances keep their NaN coordinates
        scores = np.full((self.nr_instances, 2), np.nan)
        for i in range(self.nr_instances):
            if self.instance_mask[i]:
                scores[i, :] = self.coords(self.compute_activations(instances[i, :]))

        return scores

    def compute_activations(self, values):
        """Return the index of the output cell whose weights are closest to ``values``."""
        inarr = np.expand_dims(values, axis=1)
        sqdiffs = (self.weights - inarr) ** 2
        sumsdiffs = np.sum(sqdiffs, axis=0)
        return np.argmin(sumsdiffs)

    def update_network(self, winner, values, neighbour_limit, learn_rate):
        """Move the winner and its square neighbourhood towards ``values``.

        The neighbourhood spans ``neighbour_limit`` cells either side of the
        winner and is clipped at the grid edges.
        """
        (wx, wy) = self.coords(winner)
        for x in range(max(0, wx - neighbour_limit), min(self.gridwidth, wx + neighbour_limit + 1)):
            for y in range(max(0, wy - neighbour_limit), min(self.gridheight, wy + neighbour_limit + 1)):
                index = self.get_output(x, y)
                self.weights[:, index] -= learn_rate * (self.weights[:, index] - values)

    def coords(self, output):
        """Convert a flat output index into (x, y) grid coordinates."""
        return (output % self.gridwidth, output // self.gridwidth)

    def get_output(self, x, y):
        """Convert (x, y) grid coordinates into a flat output index."""
        return x + (y * self.gridwidth)


In [None]:
# need to download the test data:
#!wget https://gws-access.jasmin.ac.uk/public/nceo_uor/niall/sla_c3s_clim.nc -P data/

In [None]:
# The next few cells contain the main code in the som.py script
# that runs on the example file

# SOM training parameters
# we would like to be able to run gridsize=100, iters=100
# grid dimensions and number of training passes
gridsize, gridheight = 8, 8
iters = 10

# neighbourhood radius: a third of the grid side, capped at 2 cells
initial_neighbourhood = min(2, gridsize // 3)

# sea level anomalies averaged by month-of-year, lat and lon cell
da = xr.open_dataset("data/sla_c3s_clim.nc")["sla_c3s"]

stack_dims = ("lat", "lon")
# assumes axes 1 and 2 of the data array are lat and lon - TODO confirm
stack_sizes = tuple(da.shape[axis] for axis in (1, 2))

# Each (lat, lon) position becomes an independent case: flatten the lat and
# lon dimensions and transpose to (ncases, time), where ncases = nlat*nlon.
instances = da.stack(case=stack_dims).transpose("case", "month").values

In [None]:
# run SOM to reduce time dimension from 12 to 2
# pass gridheight as the grid height so that changing it in the parameters cell takes effect
s = SelfOrganisingMap(gridsize, gridheight, iters, initial_neighbourhood, seed=1, verbose=True)

In [None]:
%%time
# train the SOM and time the whole fit (training plus final scoring)
scores = s.fit_transform(instances)

In [None]:
# restore lat/lon dimensions and output
# one (x, y) SOM coordinate pair per (lat, lon) cell
a = scores.reshape(*stack_sizes, 2)
new_dims = (*stack_dims, "som_axis")
output = xr.DataArray(data=a, dims=new_dims, name="monthly_sla_som")
output.to_netcdf("som.nc")

## Using CuPy as drop in replacement for NumPy

In [None]:
import cupy as np

In [None]:
# The SelfOrganisingMap class with a small modification to work with CuPy:
# wx and wy need to be converted to integers in the update_network method.

class SelfOrganisingMap(object):

    """
    Train Self Organising Map (SOM) with cells arranged in a 2-dimensional rectangular layout

    In this section of the notebook ``np`` refers to CuPy (``import cupy as np``),
    so the arrays below live on the GPU.

    Parameters
    ----------
    gridwidth : int
        number of cells across the grid
    gridheight : int
        number of cells down the grid
    iters : int
        the number of training iterations to use when training the SOM
    initial_neighbourhood : int
        the initial neighbourhood size

    Keyword Parameters
    ------------------
    verbose : bool
        whether to print progress messages
    seed : int
        random seed - set to produce repeatable results (any value other than
        None is used, including 0)
    """

    def __init__(self, gridwidth, gridheight, iters, initial_neighbourhood, verbose=False, seed=None):
        self.gridheight = gridheight
        self.gridwidth = gridwidth
        self.iters = iters
        self.initial_neighbourhood = initial_neighbourhood
        self.verbose = verbose
        self.rng = random.Random()
        # Compare with None rather than truthiness so that seed=0 is honoured.
        if seed is not None:
            self.rng.seed(seed)
        self.learn_rate_initial = 0.5
        self.learn_rate_final = 0.05

    def get_weights(self, outputIndex):
        """Return the weight vector of output cell ``outputIndex`` as a list."""
        return self.weights[:, outputIndex].tolist()

    def fit_transform(self, instances):
        """Train the SOM on ``instances`` and return each instance's winning cell.

        Parameters
        ----------
        instances : 2-D array, one row per instance and one column per input

        Returns
        -------
        array of shape (nr_instances, 2) holding the (x, y) grid coordinates of
        the best matching cell; rows containing NaN are skipped during
        training and get (NaN, NaN).
        """
        self.neighbour_limit = 0
        self.nr_inputs = instances.shape[1]
        self.nr_instances = instances.shape[0]
        self.instance_mask = ~np.any(np.isnan(instances), axis=1)

        self.nr_outputs = self.gridwidth * self.gridheight
        self.nr_weights = self.nr_outputs * self.nr_inputs

        # Build the random weights as one Python list and create the array in a
        # single call, instead of assigning into the array one element at a
        # time. The flat list is reshaped row-major, so the draw order matches
        # an element-by-element fill.
        self.weights = np.array(
            [self.rng.random() for _ in range(self.nr_weights)], dtype=np.float32
        ).reshape(self.nr_inputs, self.nr_outputs)

        p = Progress("SOM", silent=not self.verbose)
        p.report("Starting", 0.0)
        for iteration in range(self.iters):
            # learning rate decays linearly from the initial to the final value
            learn_rate = (1.0 - float(iteration) / float(self.iters)) \
                         * (self.learn_rate_initial - self.learn_rate_final) + self.learn_rate_final
            neighbour_limit = self.initial_neighbourhood - int(
                (float(iteration) / float((self.iters + 1))) * self.initial_neighbourhood)
            logging.debug("iter=%d (of %d) / learning-rate=%f / neighbourhood=%d",
                          iteration, self.iters, learn_rate, neighbour_limit)
            for i in range(self.nr_instances):
                if self.instance_mask[i]:
                    values = instances[i, :]
                    winner = self.compute_activations(values)
                    self.update_network(winner, values, neighbour_limit, learn_rate)

            p.report("Training neighbourhood=%d" % (neighbour_limit), (iteration + 1) / self.iters)

        p.complete("SOM Training Complete")

        scores = np.zeros(shape=(self.nr_instances, 2))

        for i in range(self.nr_instances):
            if self.instance_mask[i]:
                winner = self.coords(self.compute_activations(instances[i, :]))
            else:
                winner = [np.nan, np.nan]
            scores[i, :] = np.array(winner)

        return scores

    def compute_activations(self, values):
        """Return the index of the output cell whose weights are closest to ``values``."""
        inarr = np.expand_dims(values, axis=1)
        sqdiffs = (self.weights - inarr) ** 2
        sumsdiffs = np.sum(sqdiffs, axis=0)
        return np.argmin(sumsdiffs)

    def update_network(self, winner, values, neighbour_limit, learn_rate):
        """Move the winner and its square neighbourhood towards ``values``.

        The neighbourhood spans ``neighbour_limit`` cells either side of the
        winner and is clipped at the grid edges.
        """
        (wx, wy) = self.coords(winner)
        # range() needs plain Python integers, not array scalars
        wx = int(wx)
        wy = int(wy)
        for x in range(max(0, wx - neighbour_limit), min(self.gridwidth, wx + neighbour_limit + 1)):
            for y in range(max(0, wy - neighbour_limit), min(self.gridheight, wy + neighbour_limit + 1)):
                index = self.get_output(x, y)
                self.weights[:, index] -= learn_rate * (self.weights[:, index] - values)

    def coords(self, output):
        """Convert a flat output index into (x, y) grid coordinates."""
        return (output % self.gridwidth, output // self.gridwidth)

    def get_output(self, x, y):
        """Convert (x, y) grid coordinates into a flat output index."""
        return x + (y * self.gridwidth)


In [None]:
# Main code from the som.py script with small modification to work with CuPy
# Because the instances array was set up using xarray, it was automatically a
# NumPy array so I needed to convert this to a CuPy array (here cupy imported as np!)

# SOM training parameters
# we would like to be able to run gridsize=100, iters=100
# grid dimensions and number of training passes
gridsize, gridheight = 8, 8
iters = 10

# neighbourhood radius: a third of the grid side, capped at 2 cells
initial_neighbourhood = min(2, gridsize // 3)

# sea level anomalies averaged by month-of-year, lat and lon cell
da = xr.open_dataset("data/sla_c3s_clim.nc")["sla_c3s"]

stack_dims = ("lat", "lon")
# assumes axes 1 and 2 of the data array are lat and lon - TODO confirm
stack_sizes = tuple(da.shape[axis] for axis in (1, 2))

# Each (lat, lon) position becomes an independent case: flatten the lat and
# lon dimensions and transpose to (ncases, time), where ncases = nlat*nlon.
# xarray hands back a NumPy array, so wrap it with np.array (np is CuPy in
# this section) to get the array type the class works with.
instances = np.array(da.stack(case=stack_dims).transpose("case", "month").values)

In [None]:
# run SOM to reduce time dimension from 12 to 2
# pass gridheight as the grid height so that changing it in the parameters cell takes effect
s = SelfOrganisingMap(gridsize, gridheight, iters, initial_neighbourhood, seed=1, verbose=True)

In [None]:
%%time
# same fit as in the NumPy section, now running on CuPy arrays
scores = s.fit_transform(instances)

In [None]:
# Using CuPy instead of NumPy slows things down considerably: with the default settings the run now takes
# around 22 minutes, whereas it previously took only about 2 minutes.

In [None]:
# How do the 2 methods compare for larger gridsize?
# Need to re-run the imports for numpy or cupy and their corresponding SelfOrganisingMap class
# before running this cell to get results for numpy vs cupy.
gridsize, gridheight = 100, 100
iters = 1  # just check for one iteration for timings comparison

# neighbourhood radius: a third of the grid side, capped at 2 cells
initial_neighbourhood = min(2, gridsize // 3)
da = xr.open_dataset("data/sla_c3s_clim.nc")["sla_c3s"]

stack_dims = ("lat", "lon")
# assumes axes 1 and 2 of the data array are lat and lon - TODO confirm
stack_sizes = tuple(da.shape[axis] for axis in (1, 2))

# flatten to (ncases, time) and store as float32 in whichever array library is bound to np
instances = np.array(
    da.stack(case=stack_dims).transpose("case", "month").values, dtype=np.float32
)

In [None]:
# for CuPy:
s = SelfOrganisingMap(gridsize, gridsize, iters, initial_neighbourhood, seed=1, verbose=True)
%time scores = s.fit_transform(instances)

In [None]:
# for NumPy:
s = SelfOrganisingMap(gridsize, gridsize, iters, initial_neighbourhood, seed=1, verbose=True)
%time scores = s.fit_transform(instances)

In [None]:
# NumPy is much slower on the larger grid than on the smaller one, whereas CuPy's run time grows far
# less with grid size. Even so, NumPy is still much faster than CuPy for the 100 x 100 grid.
# So CuPy doesn't really help us here for this size of problem.

In [None]:
# NOTE:
# Using the default setup where the instances and weights arrays use float64 values gives different
# results for the NumPy vs CuPy runs. The sums of squares are slightly different and the differences
# grow each time we go through the instance loop. I don't think it would make a difference to the
# conclusions drawn, but I want to have an exact comparison if possible.
# Using float32 should also run faster on the GPU, so it is useful to see if we get any improvement here.
# I've gone back and explicitly set the arrays to be float32 - unfortunately I am still getting different
# results for the full set of instances (they match for longer when testing with subsets).
# It also doesn't seem to have an impact on timings.

# Found this thread discussing same problem observed:
# https://github.com/cupy/cupy/issues/2559
# As I understand it, this implies the numpy result could be the one that is wrong when calculating np.sum(..., axis=0)
# Will accept the differences and continue for now, but would need to look more closely at methods if using
# any of this code in practice.

## Focus on BMU

In [None]:
# I tried a few different methods for calculating the best matching unit (BMU) using a GPU, and
# CuPy's ReductionKernel class seemed to be the fastest solution.
# (Using guvectorize and cuda jit was quite complicated and did not lead to faster results)

In [None]:
import numpy as np
import cupy as cp
from numba import njit
import matplotlib.pyplot as plt

In [None]:
# Original method
def cpu_distance(weights, inarr):
    """Return the index of the column of ``weights`` closest to ``inarr``.

    Closeness is the sum of squared differences down each column; ``inarr``
    is broadcast against ``weights``.
    """
    residuals = weights - inarr
    return (residuals ** 2).sum(axis=0).argmin()

# Original method, but with numba jit
@njit
def numba_cpu_distance(weights, inarr):
    """Numba-compiled BMU search: index of the column of ``weights`` closest to ``inarr``."""
    best = np.argmin(np.sum((weights - inarr) ** 2, axis=0))
    return best

# Original method, but replacing numpy with cupy
def gpu_distance(weights, inarr):
    """CuPy BMU search: index of the column of ``weights`` closest to ``inarr``."""
    squared = (weights - inarr) ** 2
    return cp.argmin(cp.sum(squared, axis=0))

# Reduction kernel: maps each pair of inputs to (x - y) * (x - y) and adds the
# results together, starting from 0. Both inputs share the element type T.
# The axis to reduce over is chosen where the kernel is called.
sqsum_kernel = cp.ReductionKernel(
    'T x, T y',  # input params
    'T z',  # output params
    '(x - y) * (x - y)',  # map
    'a + b',  # reduce
    'z = a',  # post-reduction map
    '0',  # identity value
    'sqsum'  # kernel name
    )

# GPU method using ReductionKernel
def gpu_reduction_distance(weights, inarr):
    """Return the index of the column of ``weights`` closest to ``inarr``.

    The squared differences are summed down each column (axis 0), giving one
    distance per column - the same quantity ``np.sum(sqdiffs, axis=0)``
    produces in the other distance functions. Reducing over axis 1 would sum
    across the columns and return one value per row instead.
    """
    return cp.argmin(sqsum_kernel(weights, inarr, axis=0))

In [None]:
n = 8  # grid side length; the test weight arrays below have n*n columns

In [None]:
# Make arrays with the same shape as those used in SOM code.
# host (NumPy) copies
weights_np = np.arange(12 * n * n, dtype=np.float32).reshape((12, n * n))
inarr_np = np.arange(12, dtype=np.float32).reshape((12, 1))

# device (CuPy) copies with the same contents
weights_cp = cp.arange(12 * n * n, dtype=cp.float32).reshape((12, n * n))
inarr_cp = cp.arange(12, dtype=cp.float32).reshape((12, 1))

In [None]:
%%timeit
# baseline: plain NumPy BMU search on the host arrays
cpu_distance(weights_np, inarr_np)

In [None]:
%%timeit
# the first call includes Numba's JIT compilation, so re-run the cell for a clean timing
numba_cpu_distance(weights_np, inarr_np)  # run twice to ignore compile time

In [None]:
%%timeit
gpu_distance(weights_cp, inarr_cp)  # run twice

In [None]:
%%timeit
gpu_reduction_distance(weights_cp, inarr_cp)  # run twice

In [None]:
# Create some lists containing results for different values of n:
n_vals = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
cpu_times = []
numba_cpu_times = []
gpu_times = []
gpu_reduction_times = []

for n in n_vals:
    weights_cp = cp.arange(n*n*12).reshape(12, n*n)
    inarr_cp = cp.arange(12).reshape(12, 1)

    weights_np = np.arange(n*n*12).reshape(12, n*n)
    inarr_np = np.arange(12).reshape(12, 1)
    
    result = %timeit -o cpu_distance(weights_np, inarr_np)
    cpu_times.append(result.average)
    result = %timeit -o numba_cpu_distance(weights_np, inarr_np)
    numba_cpu_times.append(result.average)
    result = %timeit -o gpu_distance(weights_cp, inarr_cp)
    gpu_times.append(result.average)
    result = %timeit -o gpu_reduction_distance(weights_cp, inarr_cp)
    gpu_reduction_times.append(result.average)

In [None]:
# one line per BMU method: average time against grid side length
for series, label in (
    (cpu_times, 'NumPy'),
    (numba_cpu_times, 'NumPy with Numba JIT wrapper'),
    (gpu_times, 'CuPy'),
    (gpu_reduction_times, 'CuPy reduction kernel'),
):
    plt.plot(n_vals, series, label=label)
plt.legend()

In [None]:
# Suggests it would be better to use GPU for gridsize > 50

The update-network step may not take longer as the grid grows;
however, it would make sense to run it on the GPU to avoid time-consuming copies to/from
the device when calculating the BMU.
If this calculation takes much longer on the GPU, it could wipe out any gains made by running
the distance calculations on the GPU.

## Update network calculations

In [None]:
# original functions:
def coords(output, gridsize):
    """Convert a flat output index into (x, y) coordinates on a grid ``gridsize`` wide."""
    column = output % gridsize
    row = output // gridsize
    return (column, row)

def get_output(x, y, gridsize):
    """Convert (x, y) grid coordinates into a flat output index (inverse of ``coords``)."""
    row_offset = y * gridsize
    return x + row_offset

def update_network(weights, winner, values, neighbour_limit, learn_rate, gridsize):
    """Move the winner and its square neighbourhood towards ``values``, in place.

    The neighbourhood spans ``neighbour_limit`` cells either side of the
    winner on a ``gridsize`` x ``gridsize`` grid and is clipped at the edges.
    """
    centre_x, centre_y = coords(winner, gridsize)
    x_cells = range(max(0, centre_x - neighbour_limit), min(gridsize, centre_x + neighbour_limit + 1))
    y_cells = range(max(0, centre_y - neighbour_limit), min(gridsize, centre_y + neighbour_limit + 1))
    for col in x_cells:
        for row in y_cells:
            cell = get_output(col, row, gridsize)
            weights[:, cell] -= learn_rate * (weights[:, cell] - values)

# GPU - parallelise the loop:
def update_network_gpu(weights, winner, values, neighbour_limit, learn_rate, gridsize):
    """Update the winner's whole neighbourhood in one vectorised CuPy operation.

    Parameters
    ----------
    weights : CuPy array with one column per grid cell; updated in place
    winner : flat index of the best matching unit
    values : CuPy array holding the input vector, as a 1-D array or a column
    neighbour_limit : number of cells either side of the winner to update
    learn_rate : fraction of the distance to ``values`` each cell is moved
    gridsize : side length of the square grid
    """
    (wx, wy) = coords(int(winner), gridsize)
    x = cp.arange(max(0, wx - neighbour_limit), min(gridsize, wx + neighbour_limit + 1))
    y = cp.arange(max(0, wy - neighbour_limit), min(gridsize, wy + neighbour_limit + 1))
    # Pair every x with every y. Adding the two 1-D ranges directly would pair
    # them element by element, which covers only the diagonal of the
    # neighbourhood and fails when the clipped ranges differ in length.
    indices = get_output(x[:, None], y[None, :], gridsize).ravel()
    # use a column so the input broadcasts across all selected cells
    column = values.reshape(-1, 1)
    weights[:, indices] -= learn_rate * (weights[:, indices] - column)
    
# original functions with numba jit:
@njit
def coords_jit(output, gridsize):
    """Numba-compiled conversion of a flat output index into (x, y) grid coordinates."""
    column = output % gridsize
    row = output // gridsize
    return (column, row)

@njit
def get_output_jit(x, y, gridsize):
    """Numba-compiled conversion of (x, y) grid coordinates into a flat output index."""
    row_offset = y * gridsize
    return x + row_offset

@njit
def update_network_jit(weights, winner, values, neighbour_limit, learn_rate, gridsize):
    """Numba-compiled in-place update of the winner's square neighbourhood."""
    (wx, wy) = coords_jit(winner, gridsize)
    x_start = max(0, wx - neighbour_limit)
    x_stop = min(gridsize, wx + neighbour_limit + 1)
    y_start = max(0, wy - neighbour_limit)
    y_stop = min(gridsize, wy + neighbour_limit + 1)
    for col in range(x_start, x_stop):
        for row in range(y_start, y_stop):
            cell = get_output_jit(col, row, gridsize)
            weights[:, cell] -= learn_rate * (weights[:, cell] - values)

In [None]:
n = 100

# settings shared by the CPU and GPU runs
winner = 0
neighbour_limit = 2
learn_rate = 0.5

# values for CPU
weights_np = np.arange(12 * n * n, dtype=np.float32).reshape((12, n * n))
inarr_np = np.arange(12, dtype=np.float32).reshape((12, 1))
weights_new_np = weights_np.copy()  # scratch copy that the update functions modify
values_np = inarr_np.squeeze()

# values for GPU
weights_cp = cp.arange(12 * n * n, dtype=cp.float32).reshape((12, n * n))
inarr_cp = cp.arange(12, dtype=cp.float32).reshape((12, 1))
weights_new_cp = weights_cp.copy()  # scratch copy that the update functions modify
values_cp = inarr_cp.squeeze()

In [None]:
%%timeit 
# original Python loop on NumPy arrays (updates weights_new_np in place on every run)
update_network(weights_new_np, winner, values_np, neighbour_limit, learn_rate, n)

In [None]:
%%timeit 
# Numba-compiled loop on NumPy arrays; the first call includes JIT compilation
update_network_jit(weights_new_np, winner, values_np, neighbour_limit, learn_rate, n)

In [None]:
%%timeit 
update_network(weights_new_cp, winner, values_cp, neighbour_limit, learn_rate, n)

In [None]:
%%timeit 
update_network_gpu(weights_new_cp, winner, inarr_cp, neighbour_limit, learn_rate, n)

In [None]:
# Significantly slower with GPU (changing n does not affect values)

## Combined BMU and network update

In [None]:
%%timeit
# Just CPU
n = 100
weights_np = np.arange(n*n*12, dtype=np.float32).reshape(12, n*n)
inarr_np = np.arange(12, dtype=np.float32).reshape(12, 1)
numba_cpu_distance(weights_np, inarr_np)
update_network_jit(weights_np, winner, inarr_np.squeeze(), neighbour_limit, learn_rate, n)

In [None]:
%%timeit
# Just GPU
n = 100
weights_cp = cp.arange(n*n*12, dtype=cp.float32).reshape(12, n*n)
inarr_cp = cp.arange(12, dtype=cp.float32).reshape(12, 1)
gpu_reduction_distance(weights_cp, inarr_cp)
update_network_gpu(weights_cp, winner, inarr_cp, neighbour_limit, learn_rate, n)

In [None]:
%%timeit
# Mixing GPU and CPU in the loop...
n = 100
weights_cp = cp.arange(n*n*12, dtype=cp.float32).reshape(12, n*n)
inarr_cp = cp.arange(12, dtype=cp.float32).reshape(12, 1)
np.argmin(gpu_reduction_distance(weights_cp, inarr_cp))
weights_np = cp.asnumpy(weights_cp)
inarr_np = cp.asnumpy(inarr_cp)
update_network_jit(weights_np, winner, inarr_np.squeeze(), neighbour_limit, learn_rate, n)

In [None]:
# results are VERY variable! But it does seem like the three different options give similar timings when
# we use n = 100, with GPU potentially giving faster results than CPU (sometimes the other way around!!)

In [None]:
# To get a better idea, we'll apply these methods to the original problem and look at timings....

## SOM 2 - using Niall's updated code that uses numba jit

In [None]:
@njit
def find_bmu(values,weights):
    """Return the index of the column of ``weights`` closest to the vector ``values``."""
    column = np.expand_dims(values, axis=1)
    distances = np.sum((weights - column) ** 2, axis=0)
    return np.argmin(distances)

@njit
def iterate(nr_instances,instances,instance_mask,weights,gridwidth,gridheight,neighbour_limit,learn_rate):
    """Run one training pass: update ``weights`` in place for every unmasked instance."""
    for i in range(nr_instances):
        if not instance_mask[i]:
            continue
        sample = instances[i, :]
        winner = find_bmu(sample, weights)
        # unravel the flat winner index into grid coordinates
        wx = winner % gridwidth
        wy = winner // gridwidth
        update_network(weights, gridwidth, gridheight, wx, wy, sample, neighbour_limit, learn_rate)

@njit
def update_network(weights, gridwidth, gridheight, wx, wy, values, neighbour_limit, learn_rate):
    """Move the cells around (wx, wy) towards ``values``, in place, clipping at the grid edges."""
    x_start = max(0, wx - neighbour_limit)
    x_stop = min(gridwidth, wx + neighbour_limit + 1)
    y_start = max(0, wy - neighbour_limit)
    y_stop = min(gridheight, wy + neighbour_limit + 1)
    for col in range(x_start, x_stop):
        for row in range(y_start, y_stop):
            cell = col + (row * gridwidth)
            weights[:, cell] -= learn_rate * (weights[:, cell] - values)

@njit
def compute_scores(nr_instances,instance_mask,instances,weights,gridwidth):
    """Return the grid position of the best matching unit for every instance.

    The result has shape (nr_instances, 2), each row holding (x, y).
    Instances whose mask entry is False get (NaN, NaN).
    """
    scores = np.zeros(shape=(nr_instances, 2))
    for i in range(nr_instances):
        if instance_mask[i]:
            bmu = find_bmu(instances[i, :],weights)
            # unravel the flat BMU index into grid coordinates
            wx = bmu % gridwidth
            wy = bmu // gridwidth
        else:
            # masked-out instances have no winner
            wx = np.nan
            wy = np.nan
        scores[i, :] = np.array([wx,wy])

    return scores

In [None]:
class SelfOrganisingMap(object):

    """
    Train Self Organising Map (SOM) with cells arranged in a 2-dimensional rectangular layout

    Training and scoring are delegated to the module-level ``iterate`` and
    ``compute_scores`` functions.

    Parameters
    ----------
    gridwidth : int
        number of cells across the grid
    gridheight : int
        number of cells down the grid
    iters : int
        the number of training iterations to use when training the SOM
    initial_neighbourhood : int
        the initial neighbourhood size

    Keyword Parameters
    ------------------
    verbose : bool
        whether to print progress messages
    seed : int
        random seed - set to produce repeatable results (any value other than
        None is used, including 0)
    """

    def __init__(self, gridwidth, gridheight, iters, initial_neighbourhood, verbose=False, seed=None):
        self.gridheight = gridheight
        self.gridwidth = gridwidth
        self.iters = iters
        self.initial_neighbourhood = initial_neighbourhood
        self.verbose = verbose
        self.rng = random.Random()
        # Compare with None rather than truthiness so that seed=0 is honoured.
        if seed is not None:
            self.rng.seed(seed)
        self.learn_rate_initial = 0.5
        self.learn_rate_final = 0.05

    def get_weights(self, outputIndex):
        """Return the weight vector of output cell ``outputIndex`` as a list."""
        return self.weights[:, outputIndex].tolist()

    def fit_transform(self, instances):
        """Train the SOM on ``instances`` and return the result of ``compute_scores``.

        Parameters
        ----------
        instances : 2-D array, one row per instance and one column per input;
            rows containing NaN are masked out
        """
        self.neighbour_limit = 0
        self.nr_inputs = instances.shape[1]
        self.nr_instances = instances.shape[0]
        self.instance_mask = ~np.any(np.isnan(instances), axis=1)

        self.nr_outputs = self.gridwidth * self.gridheight
        self.nr_weights = self.nr_outputs * self.nr_inputs

        # Draw all weights in one go. The flat list is reshaped row-major, so the
        # random draws land in the same cells as an element-by-element fill.
        self.weights = np.array(
            [self.rng.random() for _ in range(self.nr_weights)], dtype=np.float64
        ).reshape(self.nr_inputs, self.nr_outputs)

        p = Progress("SOM", silent=not self.verbose)
        p.report("Starting", 0.0)
        for iteration in range(self.iters):
            # learning rate decays linearly from the initial to the final value
            learn_rate = (1.0 - float(iteration) / float(self.iters)) \
                         * (self.learn_rate_initial - self.learn_rate_final) + self.learn_rate_final
            neighbour_limit = self.initial_neighbourhood - int(
                (float(iteration) / float((self.iters + 1))) * self.initial_neighbourhood)
            logging.debug("iter=%d (of %d) / learning-rate=%f / neighbourhood=%d",
                          iteration, self.iters, learn_rate, neighbour_limit)

            iterate(self.nr_instances, instances, self.instance_mask, self.weights,
                    self.gridwidth, self.gridheight, neighbour_limit, learn_rate)

            p.report("Training neighbourhood=%d" % (neighbour_limit), (iteration + 1) / self.iters)

        p.complete("SOM Training Complete")

        return compute_scores(self.nr_instances, self.instance_mask, instances, self.weights, self.gridwidth)

    def coords(self, output):
        """Convert a flat output index into (x, y) grid coordinates."""
        return (output % self.gridwidth, output // self.gridwidth)

    def get_output(self, x, y):
        """Convert (x, y) grid coordinates into a flat output index."""
        return x + (y * self.gridwidth)

In [None]:
# SOM training parameters
# we would like to be able to run gridsize=100, iters=100
# grid dimensions and number of training passes
gridsize, gridheight = 200, 200
iters = 1

# neighbourhood radius: a third of the grid side, capped at 2 cells
initial_neighbourhood = min(2, gridsize // 3)

# sea level anomalies averaged by month-of-year, lat and lon cell
da = xr.open_dataset("data/sla_c3s_clim.nc")["sla_c3s"]

stack_dims = ("lat", "lon")
# assumes axes 1 and 2 of the data array are lat and lon - TODO confirm
stack_sizes = tuple(da.shape[axis] for axis in (1, 2))

# Each (lat, lon) position becomes an independent case: flatten the lat and
# lon dimensions and transpose to (ncases, time), where ncases = nlat*nlon.
instances = da.stack(case=stack_dims).transpose("case", "month").values

In [None]:
# run SOM to reduce time dimension from 12 to 2
import time

# pass gridheight as the grid height so that changing it in the parameters cell takes effect
s = SelfOrganisingMap(gridsize, gridheight, iters, initial_neighbourhood, seed=1, verbose=True)
start_time = time.time()
scores = s.fit_transform(instances)
end_time = time.time()
print("Elapsed time: %d seconds" % (int(end_time-start_time)))

## Numba JIT plus GPU reduction kernel for BMU calculation

In [None]:
# Reduction kernel: maps each pair of inputs to (x - y) * (x - y) and adds the
# results together, starting from 0. Both inputs share the element type T.
# The axis to reduce over is chosen where the kernel is called.
sqsum_kernel = cp.ReductionKernel(
    'T x, T y',  # input params
    'T z',  # output params
    '(x - y) * (x - y)',  # map
    'a + b',  # reduce
    'z = a',  # post-reduction map
    '0',  # identity value
    'sqsum'  # kernel name
    )

def iterate(nr_instances,instances,instance_mask,weights,gridwidth,gridheight,neighbour_limit,learn_rate):
    """Run one training pass, finding each BMU on the GPU and updating on the CPU.

    ``weights`` is a host array and is updated in place, so the caller sees
    the trained values. Instances whose mask entry is False are skipped.
    """
    for i in range(nr_instances):
        if not instance_mask[i]:
            continue
        values = instances[i, :]
        # Copy the current weights to the device under a separate name. Rebinding
        # `weights` here would detach it from the caller's array, and the
        # updates below would then be lost.
        weights_gpu = cp.asarray(weights)
        inarr = cp.asarray(np.expand_dims(values, axis=1))
        # Sum the squared differences down each column (axis 0) to get one
        # distance per grid cell.
        np_ss = cp.asnumpy(sqsum_kernel(weights_gpu, inarr, axis=0))
        winner = int(np.argmin(np_ss))
        wx = winner % gridwidth
        wy = winner // gridwidth
        update_network(weights, gridwidth, gridheight, wx, wy, values, neighbour_limit,
                       learn_rate)

@njit
def update_network(weights, gridwidth, gridheight, wx, wy, values, neighbour_limit, learn_rate):
    """Move the cells around (wx, wy) towards ``values``, in place, clipping at the grid edges."""
    first_col = max(0, wx - neighbour_limit)
    end_col = min(gridwidth, wx + neighbour_limit + 1)
    first_row = max(0, wy - neighbour_limit)
    end_row = min(gridheight, wy + neighbour_limit + 1)
    for col in range(first_col, end_col):
        for row in range(first_row, end_row):
            cell = col + (row * gridwidth)
            weights[:, cell] -= learn_rate * (weights[:, cell] - values)

#@njit
def compute_scores(nr_instances,instance_mask,instances,weights,gridwidth):
    """Return the grid position of the best matching unit for every instance.

    The BMU search runs on the GPU. The result is a host array of shape
    (nr_instances, 2), each row holding (x, y); instances whose mask entry is
    False get (NaN, NaN).
    """
    scores = np.full((nr_instances, 2), np.nan)
    # the weights do not change during scoring, so copy them to the device once
    weights_gpu = cp.asarray(weights)
    for i in range(nr_instances):
        if not instance_mask[i]:
            continue
        inarr = cp.asarray(np.expand_dims(instances[i, :], axis=1))
        # sum the squared differences down each column (axis 0): one distance per grid cell
        bmu = int(cp.argmin(sqsum_kernel(weights_gpu, inarr, axis=0)))
        scores[i, 0] = bmu % gridwidth
        scores[i, 1] = bmu // gridwidth

    return scores

In [None]:
class SelfOrganisingMap(object):

    """
    Train Self Organising Map (SOM) with cells arranged in a 2-dimensional rectangular layout

    Training and scoring are delegated to the module-level ``iterate`` and
    ``compute_scores`` functions; the weights are stored as float32.

    Parameters
    ----------
    gridwidth : int
        number of cells across the grid
    gridheight : int
        number of cells down the grid
    iters : int
        the number of training iterations to use when training the SOM
    initial_neighbourhood : int
        the initial neighbourhood size

    Keyword Parameters
    ------------------
    verbose : bool
        whether to print progress messages
    seed : int
        random seed - set to produce repeatable results (any value other than
        None is used, including 0)
    """

    def __init__(self, gridwidth, gridheight, iters, initial_neighbourhood, verbose=False, seed=None):
        self.gridheight = gridheight
        self.gridwidth = gridwidth
        self.iters = iters
        self.initial_neighbourhood = initial_neighbourhood
        self.verbose = verbose
        self.rng = random.Random()
        # Compare with None rather than truthiness so that seed=0 is honoured.
        if seed is not None:
            self.rng.seed(seed)
        self.learn_rate_initial = 0.5
        self.learn_rate_final = 0.05

    def get_weights(self, outputIndex):
        """Return the weight vector of output cell ``outputIndex`` as a list."""
        return self.weights[:, outputIndex].tolist()

    def fit_transform(self, instances):
        """Train the SOM on ``instances`` and return the result of ``compute_scores``.

        Parameters
        ----------
        instances : 2-D array, one row per instance and one column per input;
            rows containing NaN are masked out
        """
        self.neighbour_limit = 0
        self.nr_inputs = instances.shape[1]
        self.nr_instances = instances.shape[0]
        self.instance_mask = ~np.any(np.isnan(instances), axis=1)

        self.nr_outputs = self.gridwidth * self.gridheight
        self.nr_weights = self.nr_outputs * self.nr_inputs

        # Draw all weights in one go. The flat list is reshaped row-major, so the
        # random draws land in the same cells as an element-by-element fill.
        self.weights = np.array(
            [self.rng.random() for _ in range(self.nr_weights)], dtype=np.float32
        ).reshape(self.nr_inputs, self.nr_outputs)

        p = Progress("SOM", silent=not self.verbose)
        p.report("Starting", 0.0)
        for iteration in range(self.iters):
            # learning rate decays linearly from the initial to the final value
            learn_rate = (1.0 - float(iteration) / float(self.iters)) \
                         * (self.learn_rate_initial - self.learn_rate_final) + self.learn_rate_final
            neighbour_limit = self.initial_neighbourhood - int(
                (float(iteration) / float((self.iters + 1))) * self.initial_neighbourhood)
            logging.debug("iter=%d (of %d) / learning-rate=%f / neighbourhood=%d",
                          iteration, self.iters, learn_rate, neighbour_limit)

            iterate(self.nr_instances, instances, self.instance_mask, self.weights,
                    self.gridwidth, self.gridheight, neighbour_limit, learn_rate)

            p.report("Training neighbourhood=%d" % (neighbour_limit), (iteration + 1) / self.iters)

        p.complete("SOM Training Complete")

        return compute_scores(self.nr_instances, self.instance_mask, instances, self.weights, self.gridwidth)

    def coords(self, output):
        """Convert a flat output index into (x, y) grid coordinates."""
        return (output % self.gridwidth, output // self.gridwidth)

    def get_output(self, x, y):
        """Convert (x, y) grid coordinates into a flat output index."""
        return x + (y * self.gridwidth)

In [None]:
# SOM training parameters
# we would like to be able to run gridsize=100, iters=100
# grid dimensions and number of training passes
gridsize, gridheight = 200, 200
iters = 1

# neighbourhood radius: a third of the grid side, capped at 2 cells
initial_neighbourhood = min(2, gridsize // 3)

# sea level anomalies averaged by month-of-year, lat and lon cell
da = xr.open_dataset("data/sla_c3s_clim.nc")["sla_c3s"]

stack_dims = ("lat", "lon")
# assumes axes 1 and 2 of the data array are lat and lon - TODO confirm
stack_sizes = tuple(da.shape[axis] for axis in (1, 2))

# Each (lat, lon) position becomes an independent case: flatten the lat and
# lon dimensions and transpose to (ncases, time), where ncases = nlat*nlon.
# The values are stored as float32.
instances = np.array(
    da.stack(case=stack_dims).transpose("case", "month").values, dtype=np.float32
)

In [None]:
# run SOM to reduce time dimension from 12 to 2
import time

# pass gridheight as the grid height so that changing it in the parameters cell takes effect
s = SelfOrganisingMap(gridsize, gridheight, iters, initial_neighbourhood, seed=1, verbose=True)
start_time = time.time()
scores = s.fit_transform(instances)
end_time = time.time()
print("Elapsed time: %d seconds" % (int(end_time-start_time)))

## Original code, swapping out np for cp, using reduction and parallel network update

In [None]:
import cupy as np

In [None]:
# The SelfOrganisingMap class with a small modification to work with CuPy:
# wx and wy need to be converted to integers in the update_network method.
# Plus using the reduction kernel to calculate the sum of squares

# Reduction kernel: maps each pair of (broadcast) elements to its squared
# difference and sums the results along the axis given at call time.
sqsum_kernel = cp.ReductionKernel(
    in_params='T x, T y',
    out_params='T z',
    map_expr='(x - y) * (x - y)',
    reduce_expr='a + b',
    post_map_expr='z = a',
    identity='0',
    name='sqsum',
)

class SelfOrganisingMap(object):

    """
    Train Self Organising Map (SOM) with cells arranged in a 2-dimensional rectangular layout

    Array operations go through the module-level name ``np`` (CuPy in this
    section of the notebook) and the ``sqsum_kernel`` reduction kernel.

    Parameters
    ----------
    gridwidth : int
        number of cells across the grid
    gridheight : int
        number of cells down the grid
    iters : int
        the number of training iterations to use when training the SOM
    initial_neighbourhood : int
        the initial neighbourhood size

    Keyword Parameters
    ------------------
    verbose : bool
        whether to print progress messages
    seed : int
        random seed - set to produce repeatable results
    """

    def __init__(self, gridwidth, gridheight, iters, initial_neighbourhood, verbose=False, seed=None):
        self.gridheight = gridheight
        self.gridwidth = gridwidth
        self.iters = iters
        self.initial_neighbourhood = initial_neighbourhood
        self.verbose = verbose
        self.rng = random.Random()
        # Compare against None (not truthiness) so that seed=0 is honoured.
        if seed is not None:
            self.rng.seed(seed)
        self.learn_rate_initial = 0.5
        self.learn_rate_final = 0.05

    def get_weights(self, outputIndex):
        """Return the weight vector of one output cell as a Python list."""
        return self.weights[:, outputIndex].tolist()

    def fit_transform(self, instances):
        """Train the SOM on ``instances`` and return each instance's winning cell.

        Parameters
        ----------
        instances : 2-d array, shape (nr_instances, nr_inputs)
            training data; rows containing any NaN are skipped during training

        Returns
        -------
        2-d array, shape (nr_instances, 2)
            the (x, y) grid coordinates of the winning cell for every
            instance; rows that were skipped because of NaNs are (nan, nan)
        """
        self.neighbour_limit = 0
        self.nr_inputs = instances.shape[1]
        self.nr_instances = instances.shape[0]
        self.instance_mask = ~np.any(np.isnan(instances), axis=1)

        self.nr_outputs = self.gridwidth * self.gridheight
        self.nr_weights = self.nr_outputs * self.nr_inputs

        # Draw the random initial weights on the host (same row-major draw order
        # as an element-by-element fill) and create the array in one step,
        # rather than assigning nr_inputs*nr_outputs single elements.
        initial_weights = [[self.rng.random() for _ in range(self.nr_outputs)]
                           for _ in range(self.nr_inputs)]
        self.weights = np.array(initial_weights, dtype=np.float32).reshape(
            self.nr_inputs, self.nr_outputs)

        # Indices of the usable (NaN-free) instances, fetched once as a plain
        # list so the loops below do not read the mask one element at a time.
        valid_instances = np.flatnonzero(self.instance_mask).tolist()

        p = Progress("SOM", silent=not self.verbose)
        p.report("Starting", 0.0)
        for iteration in range(self.iters):
            # learning rate decays linearly from learn_rate_initial towards learn_rate_final
            learn_rate = (1.0 - float(iteration) / float(self.iters)) \
                         * (self.learn_rate_initial - self.learn_rate_final) + self.learn_rate_final
            # neighbourhood radius shrinks in integer steps as training progresses
            neighbour_limit = self.initial_neighbourhood - int(
                (float(iteration) / float((self.iters + 1))) * self.initial_neighbourhood)
            logging.debug("iter=%d (of %d) / learning-rate=%f / neighbourhood=%d",
                          iteration, self.iters, learn_rate, neighbour_limit)

            # Updates are applied one instance at a time: each update changes
            # the weights that the next instance's winner search sees.
            for i in valid_instances:
                winner = self.compute_activations(instances[i, :])
                self.update_network(winner, instances[i, :], neighbour_limit, learn_rate)

            progress_frac = (iteration + 1) / self.iters
            p.report("Training neighbourhood=%d" % (neighbour_limit), progress_frac)

        p.complete("SOM Training Complete")

        # Skipped instances keep their NaN coordinates.
        scores = np.full((self.nr_instances, 2), np.nan)
        for i in valid_instances:
            winner = self.coords(self.compute_activations(instances[i, :]))
            scores[i, :] = np.array(winner)

        return scores

    def compute_activations(self, values):
        """Return the flat index of the output cell whose weights best match ``values``.

        The match is the sum of squared differences between ``values`` and the
        cell's weight vector; the cell with the smallest sum wins.
        """
        inarr = np.expand_dims(values, axis=1)
        # weights is (nr_inputs, nr_outputs): reduce over the input axis (0) so
        # there is one distance per output cell. Reducing over axis 1 would
        # give one value per input and argmin would not be a cell index.
        sumsdiffs = sqsum_kernel(self.weights, inarr, axis=0)
        return np.argmin(sumsdiffs)

    def update_network(self, winner, values, neighbour_limit, learn_rate):
        """Move the weights of the winner and its neighbours towards ``values``.

        Parameters
        ----------
        winner : int or 0-d integer array
            flat index of the winning output cell
        values : 1-d array, length nr_inputs
            the training instance
        neighbour_limit : int
            neighbourhood radius in cells; the square window around the
            winner is clipped to the grid
        learn_rate : float
            fraction of the distance to ``values`` that the weights move
        """
        inarr = np.expand_dims(values, axis=1)
        (wx, wy) = self.coords(winner)
        # coords may be 0-d arrays; plain ints are needed for max/min/arange.
        wx = int(wx)
        wy = int(wy)
        # Clip against this map's own dimensions (width for x, height for y),
        # not against a notebook-level global.
        x = np.arange(max(0, wx - neighbour_limit), min(self.gridwidth, wx + neighbour_limit + 1))
        y = np.arange(max(0, wy - neighbour_limit), min(self.gridheight, wy + neighbour_limit + 1))
        # Broadcast the column of x against the row of y to get every cell in the window.
        x = np.expand_dims(x, axis=1)
        y = np.expand_dims(y, axis=0)
        indices = self.get_output(x, y).flatten()
        self.weights[:, indices] -= learn_rate * (self.weights[:, indices] - inarr)

    def coords(self, output):
        """Convert a flat output-cell index into (x, y) grid coordinates."""
        return (output % self.gridwidth, output // self.gridwidth)

    def get_output(self, x, y):
        """Convert (x, y) grid coordinates into a flat output-cell index."""
        return x + (y*self.gridwidth)


In [None]:
# Main code from the som.py script with small modification to work with CuPy
# Because the instances array was set up using xarray, it was automatically a
# NumPy array so I needed to convert this to a CuPy array (here cupy imported as np!)

# SOM training parameters
# we would like to be able to run gridsize=100, iters=100
gridsize, gridheight = 100, 100
iters = 1

initial_neighbourhood = min(2, int(gridsize / 3))

# sea level anomalies averaged by month-of-year, lat and lon cell
da = xr.open_dataset("data/sla_c3s_clim.nc")["sla_c3s"]

stack_dims = ("lat", "lon")
stack_sizes = tuple(da.shape[axis] for axis in (1, 2))

# each (lat,lon) position becomes an independent case:
# flatten the lat and lon dimensions into a single "case" dimension and
# transpose to arrange by (ncases, time), where ncases = nlat*nlon
# (modified: the values are wrapped in np.array, which is CuPy in this section)
instances = np.array(
    da.stack(case=stack_dims).transpose("case", "month").values,
    dtype=np.float32,
)

In [None]:
# run SOM to reduce time dimension from 12 to 2
import time

s = SelfOrganisingMap(gridsize, gridsize, iters, initial_neighbourhood, seed=1, verbose=True)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
# NOTE(review): GPU work may still be queued when fit_transform returns; if
# precise GPU timings matter, consider synchronising the device before
# reading the end time - TODO confirm.
start_time = time.perf_counter()
scores = s.fit_transform(instances)
end_time = time.perf_counter()
print("Elapsed time: %d seconds" % (int(end_time-start_time)))

In [None]:
# Testing with grid 100 x 100 gave results along the lines of:
# CPU njit: 1min
# CPU njit mixed with GPU reduction kernel: 2min
# GPU all: 3min

# Testing with grid 200 x 200:
# CPU: 4.5min
# mix: 3.5min
# GPU: 3min

In [None]:
# From the results in this notebook it would seem that it is faster and simpler to run
# with numba jit on the CPU for the types of grid sizes being used here.
# If the grid size is around 200x200 or more then we start to see a benefit of using
# a GPU.
# The timings above include just 1 iteration, so for runs of ~100 iters the differences are
# going to be more significant.