## Import libraries

In [2]:
# from the datasets module of scikit learn, import the load_digits function
from sklearn.datasets import load_digits
# from the matplotlib library, import the pyplot module, for plotting. We give it the name plt as this is standard practice
from matplotlib import pyplot as plt
# from scikit learn, import Principal Components Analysis
from sklearn.decomposition import PCA
# import numpy and name it np, as this is standard practice
import numpy as np

## Digits example

In [3]:
# let us load the digits dataset that ships with scikit-learn (load_digits comes from sklearn.datasets)
# the result is kept in the variable digits, which the cells below inspect and plot
digits = load_digits()

In [None]:
# the variable digits is a dictionary-like object: the cells below access its entries both as digits['key'] and as digits.key
# let's see what keys it has
print(list(digits.keys()))

In [None]:
# we can inspect the value of each of these keys. For instance, the shape of the value for data shows that it is a pointcloud of 1797 points in R^64
digits['data'].shape

In [None]:
# the value of target contains the label for each of the 1797 images; printing its shape confirms there is one label per image
print(digits['target'].shape)

# let's display the labels of the first 25 digits (the same 25 images are plotted further below)
print(digits['target'][:25])

In [None]:
# the data consists of a specific encoding of each image (presumably the 8 * 8 = 64 pixel values laid out as one vector). The images themselves are the value of the "images" key:
digits['images'].shape

# we see that there are 1797 elements, each consisting of an 8 by 8 matrix

In [None]:
# let us display the first 25 images, with their corresponding label

# a single 5x5 inch figure that will hold a 5 by 5 grid of small images
fig = plt.figure(figsize=(5, 5))

for i in range(25):
    # subplots are numbered from 1; the ticks are removed since pixel coordinates carry no information here
    ax = fig.add_subplot(5, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary)
    # write the label in the bottom-left corner of the image (x=0, y=7 in pixel coordinates)
    ax.text(0, 7, str(digits.target[i]))

# render the grid; calling plt.figure() here would instead open a second, empty figure
plt.show()

In [None]:
# project the whole pointcloud onto its first two principal components and plot the result

# PCA object configured to keep 2 components
pca = PCA(n_components=2)

# fit the PCA on the digits data and get the 2D coordinates of every point in one step
proj2d = pca.fit_transform(digits["data"])

# scatter plot of the projection (the two columns of proj2d are the x and y coordinates),
# with each point colored according to its label
plt.scatter(*proj2d.T, c=digits["target"], cmap="Paired")
plt.colorbar()

In [None]:
# restrict attention to a single class and project it to 2D, to try to interpret what the 2 principal components recover in that case

# positions, within the dataset, of the images whose label is 1
indices = np.where(digits.target == 1)[0]

# fit a fresh 2-component PCA on the 1's alone and plot their 2D coordinates
pca = PCA(n_components=2)
proj = pca.fit_transform(digits.data[indices])
plt.scatter(*proj.T)
plt.show()

In [None]:
# there seems to be a fair amount of structure in the plot above.
# for instance, there seem to be two distinct clusters, one with significantly fewer points. The larger cluster also seems to have nontrivial structure.
# let's now plot some of the digits on top of their corresponding point in the 2D embedding, to try to understand what the 2 principal components are capturing

# plotting all the images would be too much, so let us just plot (at most) 100 distinct ones.
# replace=False guarantees that no image is picked twice (sampling is with replacement by default),
# min(...) keeps the request valid even if there are fewer than 100 images of this class,
# and the locally seeded generator makes the figure identical on every run without touching numpy's global random state
subsample = np.random.default_rng(0).choice(len(indices), size=min(100, len(indices)), replace=False)

from matplotlib.offsetbox import OffsetImage, AnnotationBbox

fig, ax = plt.subplots(figsize=(15,15))
# the underlying points; this also sets the axis limits so that all thumbnails are visible
ax.scatter(proj[subsample,0], proj[subsample,1])

# place the image of each selected digit at its 2D coordinates
# (subsample indexes into indices/proj, hence the double indexing of the images)
for x0, y0, im in zip(proj[subsample,0], proj[subsample,1],digits["images"][indices][subsample]):
    ab = AnnotationBbox(OffsetImage(im, cmap='binary',zoom=2), (x0, y0))
    ax.add_artist(ab)

In [None]:
# interpret the plot above!

## Persistent homology example

In [None]:
# we conclude by computing persistent homology in a very simple example to see how the Python wrapper for Ripser works

In [None]:
# install Ripser in the cloud, to use in this session
! pip install ripser

In [None]:
# import ripser, to compute persistent homology, and persim, to plot persistence diagrams
from ripser import ripser
from persim import plot_diagrams

In [None]:
# sample a noisy circle

n_points = 100
# endpoint=False: the angles 0 and 2*pi describe the same point of the circle,
# so including both would produce a duplicate and only n_points - 1 distinct samples
angles = np.linspace(0, 2*np.pi, n_points, endpoint=False)
circle = np.array([[np.sin(x),np.cos(x)] for x in angles])

# perturb every point with independent Gaussian noise of variance var in each coordinate;
# the generator is seeded so that the pointcloud (and the persistence diagram below) is the same on every run
var = 0.01
rng = np.random.default_rng(0)
circle += rng.multivariate_normal([0,0],var * np.array([[1,0],[0,1]]), n_points)

# assigning to _ keeps the scatter object from being echoed as the cell output
_ = plt.scatter(circle[:,0],circle[:,1])

In [None]:
# compute the persistence diagrams of dimensions 0 and 1 (hence maxdim = 1) of the noisy circle and plot them
# the diagrams are the 'dgms' entry of what ripser returns
pds = ripser(circle, maxdim = 1)['dgms']
# plot_diagrams comes from persim and draws the diagrams computed above
plot_diagrams(pds)

In [None]:
# interpret the diagram above!