In [None]:
import numpy as np
import umap
import joblib
import os

from matplotlib import pyplot as plt
import matplotlib

%matplotlib inline

In [2]:
# Fixed seed for NumPy's global random number generator, set once up front.
seed = 42
np.random.seed(seed)

In [3]:
# Font defaults handed to matplotlib for the figures drawn afterwards.
font = dict(size=10)
matplotlib.rc('font', **font)


In [4]:
# Folder holding the input scans; the fitted model and the arrays are also written here.
DATA_FOLDER = '/data1/scidata/meteotn_data_new'

# NOTE(review): informational message only; nothing in this cell checks the dtype.
print("USING float16 data")

# Destination for the saved figures; created on demand, no error if it already exists.
PLOT_FOLDER = '../../plots/umap_embeddings/'
os.makedirs(PLOT_FOLDER, exist_ok=True)

USING float16 data


In [5]:
# Load the stack of scans stored under the default key of the .npz archive.
# np.load returns an NpzFile that keeps the archive open, so it is used as a
# context manager: the file handle is released as soon as the array is read.
all_scans_npz = os.path.join(DATA_FOLDER, 'runs_64x64.npz')
with np.load(all_scans_npz) as scans_archive:
    all_scans = scans_archive['arr_0']

print(all_scans.shape)

(362233, 64, 64)


### Preprocess data

In [7]:
# Keep a single decimal digit by truncation (no rounding).
# np.trunc discards the whole fractional part, so the values are scaled by 10
# first, truncated, and then scaled back down.

all_scans = np.trunc(all_scans * 10) / 10

In [8]:
# Min-max scaling: the global minimum maps to 0 and the global maximum to 1.
scans_min = all_scans.min()
scans_max = all_scans.max()
all_scans = (all_scans - scans_min) / (scans_max - scans_min)

# Sanity check on the rescaled range.
assert all_scans.min() == 0
assert all_scans.max() == 1

### Split train and val data

In [9]:
# Sequential split: the first `i` scans form the training set, the rest the validation set.
i = 200000
train_data = all_scans[:i]
val_data = all_scans[i:]

# Last expression of the cell: shown as its output.
train_data.shape, val_data.shape

((200000, 64, 64), (162233, 64, 64))

### UMAP parameters
- `n_components` = 5
- `n_neighbors` = 200
- `metric` = euclidean
- `min_dist` = 0.1

In [10]:
# Settings passed to umap.UMAP when the model is constructed.
n_components = 5
n_neighbors = 200
metric = 'euclidean'

min_dist = 0.1

In [11]:
# Collapse every scan into a single row vector: (n, h, w) -> (n, h * w).
train_data_flatten = train_data.reshape(len(train_data), -1)
val_data_flatten = val_data.reshape(len(val_data), -1)

In [12]:
# Gather the UMAP settings in one mapping, build the model, then fit it on the
# training vectors and keep their embedding.
# NOTE(review): no random_state is passed here; confirm that the global NumPy
# seed set at the top of the notebook is what makes this fit repeatable.
umap_params = dict(
    n_components=n_components,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
)
model = umap.UMAP(**umap_params)
train_emb = model.fit_transform(train_data_flatten)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../.local/share/virtualenvs/meteo-tn-meteotn-bssZidOA/lib/python3.7/site-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../.local/share/virtualenvs/meteo-tn-meteotn-bssZidOA/lib/python3.7/site-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_st

In [13]:
# Embed the validation vectors with the already-fitted model (no refit).
val_emb = model.transform(val_data_flatten)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../.local/share/virtualenvs/meteo-tn-meteotn-bssZidOA/lib/python3.7/site-packages/umap/nndescent.py", line 124:
    @numba.njit(parallel=True)
    def init_from_random(n_neighbors, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../.local/share/virtualenvs/meteo-tn-meteotn-bssZidOA/lib/python3.7/site-packages/umap/nndescent.py", line 135:
    @numba.njit(parallel=True)
    def init_from_tree(tree, data, query_points, heap, rng_state):
    ^

  state

### Plot UMAP embeddings for train and val

In [14]:
def plot_umap_mosaic(embeddings, war, idx=None, title='', umap_c=5):
    """Draw every pair of embedding components as a grid of scatter plots.

    A new 10x10 inch figure is created with a (umap_c - 1) x (umap_c - 1)
    subplot grid. Row ``f`` holds the panels "f vs s" for
    s = f + 1 .. umap_c - 1, filled from the left, so the used cells form a
    triangle. The figure is left as the current pyplot figure; nothing is
    returned.

    Parameters
    ----------
    embeddings : 2-D array, indexed as ``embeddings[idx, component]``.
    war : per-sample values used to colour the points (indexed with ``idx``).
    idx : row indices selecting (and ordering) the points; all rows if None.
    title : figure-level title; skipped when empty.
    umap_c : number of leading embedding components to pair up.
    """
    if idx is None:
        idx = np.arange(len(embeddings))

    plt.figure(figsize=(10, 10))

    if title:
        plt.suptitle(title, fontsize=20)

    grid = umap_c - 1
    pairs = [(f, s) for f in range(umap_c) for s in range(f + 1, umap_c)]

    for f, s in pairs:
        # Row f starts at slot f * grid + 1; the panel sits s - f - 1 cells in.
        plt.subplot(grid, grid, f * grid + (s - f))

        plt.title(f"{f} vs {s}")

        plt.scatter(
            embeddings[idx, f],
            embeddings[idx, s],
            c=war[idx],
            cmap='Spectral_r',
            marker='.',
            s=1,
            alpha=.1,
        )

        plt.gca().set_aspect('equal', 'datalim')


In [15]:
# Cut-off compared against the normalized pixel values when computing the ratios below.
threshold = 0.14

In [16]:
# Per-scan count of pixels above the threshold, divided by pi * 32**2
# (the area of a radius-32 circle; presumably the circular footprint inside
# the 64x64 frame, TODO confirm).
train_above = (train_data > threshold).sum(axis=(1, 2))
train_war = train_above / (32 * 32 * np.pi)
print(train_war.min(), train_war.max())

# Ascending order, so the scans with the highest ratio are drawn last (on top).
train_idx = np.argsort(train_war)

plot_umap_mosaic(train_emb, train_war, train_idx, title='UMAP train (c=5, n=200)')

# Write the figure to disk, then close it.
outfile = os.path.join(PLOT_FOLDER, "umap_train_64x64.png")
plt.savefig(outfile, dpi=300, bbox_inches='tight')
plt.close()



0.0 0.7385784077858268


### Val

In [17]:
# Per-scan count of pixels above the threshold, divided by pi * 32**2
# (the area of a radius-32 circle; presumably the circular footprint inside
# the 64x64 frame, TODO confirm).
val_war = np.sum(val_data > threshold, axis=(1, 2)) / (32 * 32 * np.pi)
# Only the last expression of a cell is displayed, so the range is printed
# explicitly here, matching the training cell.
print(val_war.min(), val_war.max())

# Ascending order, so the scans with the highest ratio are drawn last (on top).
val_idx = np.argsort(val_war)

plot_umap_mosaic(val_emb, val_war, val_idx, title='UMAP val (c=5, n=200)')


# Write the figure to disk, then close it.
outfile = os.path.join(PLOT_FOLDER, "umap_val_64x64.png")

plt.savefig(
    outfile,
    dpi=300,
    bbox_inches='tight'
)
plt.close()





### Save UMAP model and embeddings

In [18]:
# Persist the fitted UMAP model in the data folder, passing compression level 9.
fname = os.path.join(DATA_FOLDER, "umap.joblib.gz")
# Last expression of the cell: the value returned by joblib.dump is shown as output.
joblib.dump(model, fname, compress=9)


['/data1/scidata/meteotn_data_new/umap.joblib.gz']

In [19]:
# Save the preprocessed train / validation scans as compressed .npz archives.
for outfile_path, scans in (
    (os.path.join(DATA_FOLDER, "train_data.npz"), train_data),
    (os.path.join(DATA_FOLDER, "val_data.npz"), val_data),
):
    with open(outfile_path, "wb") as outfile:
        np.savez_compressed(outfile, scans)


In [20]:
# Save the train / validation embeddings as compressed .npz archives.
for outfile_path, embedding in (
    (os.path.join(DATA_FOLDER, "train_emb.npz"), train_emb),
    (os.path.join(DATA_FOLDER, "val_emb.npz"), val_emb),
):
    with open(outfile_path, "wb") as outfile:
        np.savez_compressed(outfile, embedding)
