## Unsupervised learning: dimensionality reduction

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) parts: comparing version strings directly
# orders them lexicographically (e.g. "0.9" >= "0.20" evaluates to True).
_sk_version = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
assert _sk_version >= (0, 20)

# Common imports
import numpy as np
import pandas as pd
import os
import xarray as xr

# Seed NumPy's global random number generator so that results drawn from it
# are the same on every Restart & Run All
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# Font sizes for axis labels and tick labels, applied group by group
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

In [None]:
# Open the ERA5 mean-sea-level-pressure file(s) as a single dataset
DATADIR = '../data/ERA5/'
mslp_path = DATADIR + 'Daymean_era5_2deg_MSL_EU_19790101-20210902.nc'
mslp = xr.open_mfdataset(mslp_path, combine='by_coords')
# Convert to hPa (divide by 100), writing the result back into the dataset
mslp["MSL"].values = mslp["MSL"].values / 100
# Keep handles on the coordinate arrays for later reshaping/plotting
lon, lat = mslp["lon"], mslp["lat"]

In [None]:
# Dimensions of the MSL variable
mslp["MSL"].shape

In [None]:
# Quick look at a single time step (index 200) of the MSL field
example_day = mslp["MSL"].isel(time=200)
example_day.plot();

In [None]:
# Seasonal means (a monthly version would group by "time.month" instead)
season_groups = mslp.groupby("time.season")
seas_means = season_groups.mean()

In [None]:
# One panel per season; apart from col/col_wrap the keyword arguments are the
# same ones a non-faceted plot would take
facet_kwargs = dict(col="season", col_wrap=4, robust=True, cmap=mpl.cm.RdYlBu_r)
fg = seas_means.MSL.plot(**facet_kwargs)

# Overlay black contour lines on every panel. This call is the last expression
# of the cell, so its return value is what the notebook displays.
contour_kwargs = dict(x="lon", y="lat", colors="k", levels=13, add_colorbar=False)
fg.map_dataarray(xr.plot.contour, **contour_kwargs)

# Starting the analysis
1. Calculate anomalies (input for PCA?)

In [None]:
# Time-mean field over the whole record
climatology = mslp.mean(dim='time')
# Time-mean field computed separately for each season
season_climatology = mslp.groupby('time.season').mean(dim='time')


In [None]:
# Anomalies relative to the all-time climatology
anom_mslp = mslp["MSL"] - climatology
# Anomalies relative to the climatology of the matching season
season_groups = mslp.groupby('time.season')
anom_seas_mslp = season_groups - season_climatology

2. PCA

In [None]:
# start using the whole data set for PCA. Then, anomalies can be used
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Some studies weight the field by latitude before the PCA so that every grid
# point contributes in proportion to the area it represents.
weights = np.cos(np.deg2rad(mslp.lat))
weights.name = "weights"
# PCA works on variances, so the field itself is scaled by sqrt(cos(lat)):
# the variance is then weighted by cos(lat).
r_weights = np.sqrt(weights)
# Mean over all dimensions (a single value per variable)
data_mean = mslp.mean()
# Apply the weights by MULTIPLYING. Dividing would inflate the high-latitude
# points (and diverge where cos(lat) -> 0), the opposite of area weighting.
mslp_weights = (mslp - data_mean) * r_weights

In [None]:
# Collapse the two spatial dimensions into a single "latlon" axis,
# giving data laid out as [time, lat x lon]
spatial_dims = ('lat', 'lon')
mslp_weights_stacked = mslp_weights.stack(latlon=spatial_dims)

In [None]:
# Same reshaping for the unweighted field: [time, lat x lon]
spatial_dims = ('lat', 'lon')
mslp_stacked = mslp.stack(latlon=spatial_dims)

In [None]:
# Load in memory for computing the PCA

In [None]:
# Bring the weighted, stacked data into memory; load() returns the same
# object, which is then displayed as the cell output
mslp_weights_stacked = mslp_weights_stacked.load()
mslp_weights_stacked

In [None]:
# Bring the unweighted, stacked data into memory; load() returns the same
# object, which is then displayed as the cell output
mslp_stacked = mslp_stacked.load()
mslp_stacked

In [None]:
# Not the last expression of the cell, so this value is not displayed
type(mslp_stacked)
# X: unweighted MSL matrix, XW: latitude-weighted MSL matrix
X, XW = mslp_stacked["MSL"], mslp_weights_stacked["MSL"]

In [None]:
# There are 1025 features: 41 longitude points * 25 latitude points.
# Standardise each feature (column) before the PCA.
from sklearn.preprocessing import StandardScaler
# fit() returns the fitted scaler itself, so it can be bound in one step
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
# Fit a PCA that keeps every component (PCA(n_components=4) would keep only
# the first 4). fit() returns the estimator, which is shown as the cell output.
pca = PCA().fit(X)
pca

In [None]:
# Running total of the explained-variance fractions
cumsum = pca.explained_variance_ratio_.cumsum()
# Number of leading components needed to reach at least 95% of the variance
# (argmax gives the 0-based position of the first True)
d = 1 + np.argmax(cumsum >= 0.95)

In [None]:
# Scree plot: percentage of variance explained by each of the first 20 PCs
n_shown = 20
pc_numbers = range(1, n_shown + 1)
pct_variance = pca.explained_variance_ratio_[0:n_shown] * 100

f, ax = plt.subplots(figsize=(6, 6))
ax.plot(pc_numbers, pct_variance)        # connecting line
ax.plot(pc_numbers, pct_variance, 'ro')  # red marker on each PC
ax.grid(ls=':')
ax.set_xticks(pc_numbers)
ax.set_xlabel('PC#')
ax.set_ylabel("% variance");

In [None]:
# 4 or 5 components would be a reasonable choice; the literature often keeps 4
# (e.g. Cortesi et al., 2021).
# NOTE(review): n is currently set to 12, not 4 -- confirm which is intended.
n = 12  # number of retained components; can be changed
# Fraction of the total variance explained by the first n components
np.sum(pca.explained_variance_ratio_[:n])

In [None]:
# Additional checks
# pca = PCA(n_components=0.95)
# X_reduced = pca.fit_transform(X)
# I will need to include 18 PCs to get 95%
# pca.n_components_

In [None]:
# `pca` has already been fitted on X, so project the data with transform()
# instead of fit_transform(), which would repeat the whole decomposition.
PCs = pca.transform(X)
# Keep only the first n principal-component time series
PCs_n = PCs[:,:n]

In [None]:
# Put the retained components into a DataFrame indexed by time,
# with columns named PC1, PC2, ...
pc_names = ["PC%s" % (k) for k in range(1, PCs_n.shape[1] + 1)]
PCdf = pd.DataFrame(PCs_n, index=mslp['time'], columns=pc_names)
# Show the first rows
PCdf.head()

The EOFs (empirical orthogonal functions) contain the spatial patterns associated with each PC.


In [None]:
# Rows of pca.components_ are the EOFs; keep the first n of them
EOFs = pca.components_[:n, :]
EOFs.shape

In [None]:
# Unflatten each EOF back onto the (lat, lon) grid
n_lat, n_lon = len(lat), len(lon)
EOFs_r = EOFs.reshape(n, n_lat, n_lon)
EOFs_r.shape

In [None]:
# Explained-variance fractions of the first n components.
# nn:      cumulative fraction up to and including each component
# tot_var: fraction explained by each component on its own
# The per-component value is taken directly from explained_variance_ratio_
# rather than rebuilt as a difference of two partial sums, which repeated the
# summation for every component and added floating-point round-off.
leading_ratios = pca.explained_variance_ratio_[:n]
nn = list(np.cumsum(leading_ratios))
tot_var = list(leading_ratios)

In [None]:
# Wrap the EOF maps in a DataArray for plotting; the "PCA" coordinate carries
# the per-component variance fraction stored in tot_var
eof_coords = [
    ("PCA", tot_var),
    ("lat", lat),
    ("lon", lon),
]
XD_EOFs_r = xr.DataArray(data=EOFs_r, coords=eof_coords)


In [None]:
# One panel per retained component; apart from col/col_wrap the keyword
# arguments are the same ones a non-faceted plot would take
facet_kwargs = dict(col="PCA", col_wrap=4, robust=True, cmap=mpl.cm.RdYlBu_r)
fg = XD_EOFs_r.plot(**facet_kwargs)

# Overlay black contour lines on every panel. This call is the last expression
# of the cell, so its return value is what the notebook displays.
contour_kwargs = dict(x="lon", y="lat", colors="k", levels=13, add_colorbar=False)
fg.map_dataarray(xr.plot.contour, **contour_kwargs)

# K-means cluster analysis

In [None]:
from sklearn.cluster import KMeans

In [None]:
# K-means clustering on the principal components obtained above
nclusters = 12
kmeans = KMeans(
    n_clusters=nclusters,
    init='k-means++',
    n_init=10,
)

In [None]:
# Fit on the PC values as a plain array; fit() returns the estimator, which is
# displayed as the cell output
pc_matrix = PCdf.values
kmeans.fit(pc_matrix)

In [None]:
# A second K-means model with a fixed random_state, so its labels are
# reproducible.
kmeans2 = KMeans(n_clusters=nclusters, random_state=42)
# Fit and predict with `kmeans2` itself. Calling fit_predict on `kmeans` here
# would leave `kmeans2` unused and silently re-fit (and change) the model that
# the later cells read labels from.
y_pred = kmeans2.fit_predict(PCdf.values)

In [None]:
# Predicted cluster label for each row of PCdf
y_pred

In [None]:
# NOTE: a notebook cell only displays its last expression, so the cluster
# centres below are evaluated but not shown; only the labels are displayed.
kmeans.cluster_centers_
kmeans.labels_

Each day belongs to a cluster, labelled by `kmeans.labels_`.

In [None]:
# Distinct cluster labels that were actually assigned
np.unique(kmeans.labels_)

In [None]:
# One row per time step, holding the cluster that time step was assigned to
labels = pd.DataFrame(
    data=kmeans.labels_,
    index=mslp['time'],
    columns=['cluster'],
)

In [None]:
# Select the rows assigned to cluster 0 (their count is the number of days)
cluster_id = 0
index = labels.query('cluster == {}'.format(cluster_id))

In [None]:
# Number of rows in the selection above
index.shape[0]

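For each cluster we calculate the mean MSLP field over all the days assigned to it, together with the cluster's frequency (percentage of days).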

In [None]:
# Total number of labelled time steps
num_tot = labels["cluster"].size
# clusters: mean field of each cluster; nbdays: its frequency in percent
clusters, nbdays = [], []
for iclus in range(nclusters):
    # Rows (time steps) assigned to this cluster
    index = labels.query('cluster == {}'.format(iclus))
    # Share of all time steps, as a percentage rounded to 2 decimals
    freq = round((len(index) / num_tot) * 100, 2)
    nbdays.append(freq)
    # Average the field over the time steps of this cluster
    cluster = mslp.sel(time=index.index).mean('time')
    clusters.append(cluster)

In [None]:
# Join the per-cluster mean fields along a new "cluster" dimension.
# `clusters` is rebound from a list to the concatenated object here.
clusters = xr.concat(clusters, 'cluster')

In [None]:
# assign_coords returns a NEW object and leaves the original untouched, so the
# result has to be bound back to `clusters`; otherwise later cells still see
# the unlabelled "cluster" dimension.
# NOTE(review): if two clusters round to the same frequency the coordinate
# values repeat -- confirm that is acceptable for the faceted plot.
clusters = clusters.assign_coords(cluster=nbdays)
clusters

In [None]:
# One panel per cluster; apart from col/col_wrap the keyword arguments are the
# same ones a non-faceted plot would take
facet_kwargs = dict(col="cluster", col_wrap=4, robust=True, cmap=mpl.cm.RdYlBu_r)
fg_C = clusters.MSL.plot(**facet_kwargs)

# Overlay black contour lines on every panel. This call is the last expression
# of the cell, so its return value is what the notebook displays.
contour_kwargs = dict(x="lon", y="lat", colors="k", levels=13, add_colorbar=False)
fg_C.map_dataarray(xr.plot.contour, **contour_kwargs)

# need to change the labels