# Using giotto-tda for persistent homology

First, let's write a function to create some nice data ;)

In [None]:
import numpy as np
def _add_uniform_noise(coords, noise, rng):
    """Return ``coords`` with uniform noise in ``[-noise/2, noise/2)`` added to every entry."""
    # ``rand`` fills its output in row-major order, i.e. point by point and
    # coordinate by coordinate, which is the order a per-value loop would use.
    return coords + noise * (rng.rand(*coords.shape) - 0.5)


def make_point_clouds(n_samples_per_shape: int, n_points: int, noise: float, random_state=None):
    """Make point clouds for circles, spheres, and tori with random noise.

    Every cloud has ``n_points ** 2`` points in 3D. The angles used to
    parametrise the shapes are the integers ``0, 1, 2, ...`` interpreted as
    radians. Circles lie in the plane ``z = 0`` and only their x and y
    coordinates are perturbed; spheres have radius 1; tori have major
    radius 2 and minor radius 1.

    Parameters
    ----------
    n_samples_per_shape : int
        Number of point clouds to generate for each of the three shapes.
    n_points : int
        Number of values taken by each angular parameter, so that each
        cloud contains ``n_points ** 2`` points.
    noise : float
        Width of the uniform noise: each perturbed coordinate is shifted by
        a value drawn uniformly from ``[-noise / 2, noise / 2)``.
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness. ``None`` (default) uses NumPy's global random
        state, so code relying on ``np.random.seed`` behaves as before. An
        int seeds a private ``RandomState``; an existing ``RandomState`` is
        used as is.

    Returns
    -------
    point_clouds : numpy.ndarray
        Array of shape ``(3 * n_samples_per_shape, n_points ** 2, 3)``
        holding the circles first, then the spheres, then the tori.
    labels : numpy.ndarray
        Array of shape ``(3 * n_samples_per_shape,)`` with label 0 for
        circles, 1 for spheres and 2 for tori.
    """
    if random_state is None:
        rng = np.random  # module-level functions draw from the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Noise-free geometry is identical for every sample: compute it once.
    circle_angles = np.arange(n_points ** 2, dtype=float)
    circle_xy = np.column_stack((np.sin(circle_angles), np.cos(circle_angles)))
    circle_z = np.zeros((n_points ** 2, 1))  # the z coordinate is never perturbed

    # Flattened (t, s) grid with t as the slow index and s as the fast one.
    angles = np.arange(n_points, dtype=float)
    t = np.repeat(angles, n_points)
    s = np.tile(angles, n_points)
    sphere_xyz = np.column_stack(
        (np.cos(s) * np.cos(t), np.cos(s) * np.sin(t), np.sin(s))
    )
    torus_xyz = np.column_stack(
        ((2 + np.cos(s)) * np.cos(t), (2 + np.cos(s)) * np.sin(t), np.sin(s))
    )

    # All circles are drawn first, then all spheres, then all tori, so the
    # random stream is consumed in a fixed, documented order.
    circle_point_clouds = [
        np.hstack((_add_uniform_noise(circle_xy, noise, rng), circle_z))
        for _ in range(n_samples_per_shape)
    ]
    sphere_point_clouds = [
        _add_uniform_noise(sphere_xyz, noise, rng)
        for _ in range(n_samples_per_shape)
    ]
    torus_point_clouds = [
        _add_uniform_noise(torus_xyz, noise, rng)
        for _ in range(n_samples_per_shape)
    ]

    # Labels: circles -> 0, spheres -> 1, tori -> 2.
    circle_labels = np.zeros(n_samples_per_shape)
    sphere_labels = np.ones(n_samples_per_shape)
    torus_labels = np.full(n_samples_per_shape, 2.0)

    point_clouds = np.concatenate((circle_point_clouds, sphere_point_clouds, torus_point_clouds))
    labels = np.concatenate((circle_labels, sphere_labels, torus_labels))

    return point_clouds, labels


# Circles, spheres and tori ;)
# Generate 10 noisy point clouds per shape and report the array dimensions.
n_samples_per_class = 10
point_clouds, labels = make_point_clouds(
    n_samples_per_shape=n_samples_per_class,
    n_points=10,
    noise=0.1,
)
print(f"There are {point_clouds.shape[0]} point clouds in {point_clouds.shape[2]} dimensions, "
      f"each with {point_clouds.shape[1]} points.")

# Calculate persistent homology

``VietorisRipsPersistence``, and all other “persistent homology”
transformers in ``gtda.homology``, expect input in the form of a 3D array
or, in some cases, a list of 2D arrays. For each entry in the input (here,
for each point cloud in ``point_clouds``) they compute a topological
summary which is also a 2D array, and then stack all these summaries into
a single output 3D array. So, in our case, ``diagrams[i]`` represents the
topology of ``point_clouds[i]``.

``diagrams[i]`` is interpreted as follows:

- Each row is a triplet describing a single topological feature found in
  ``point_clouds[i]``.
- The first and second entries in the triplet denote the values of the
  “filtration parameter” at which the feature appears and disappears,
  respectively. They are referred to as the “birth” and “death” values of
  the feature. The meaning of “filtration parameter” depends on the
  specific transformer, but in the case of ``VietorisRipsPersistence`` on
  point clouds it has the interpretation of a length scale.
- A topological feature can be a connected component, 1D hole/loop, 2D
  cavity, or more generally a $d$-dimensional “void” which exists in the
  data at scales between its birth and death values. The integer $d$ is
  the *homology dimension* (or degree) of the feature and is stored as the
  third entry in the triplet. In this example, the shapes should have 2D
  cavities so we explicitly tell ``VietorisRipsPersistence`` to look for
  these by using the ``homology_dimensions`` parameter!

In [None]:
from gtda.homology import VietorisRipsPersistence
from gtda.plotting import plot_diagram

# Look for connected components (dimension 0), loops (1) and 2D cavities (2).
VR = VietorisRipsPersistence(homology_dimensions=[0, 1, 2])  # Parameter explained in the text
diagrams = VR.fit_transform(point_clouds)
# diagrams[k] is the topological summary (persistence diagram) of point_clouds[k].

# Plot the persistence diagram of the first point cloud.
i = 0
plot_diagram(diagrams[i])

# Extract features
Instantiate a ``PersistenceEntropy`` transformer and extract scalar features from the persistence diagrams.

In [None]:
from gtda.diagrams import PersistenceEntropy

# Extract scalar features from the persistence diagrams computed above.
PE = PersistenceEntropy()
features = PE.fit_transform(diagrams)

# Use the new features in a standard classifier
Leverage the compatibility with ``scikit-learn`` to perform a train-test split and score the features.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out part of the feature rows (and their labels) for validation.
X_train, X_valid, y_train, y_valid = train_test_split(features, labels)
# ``fit`` returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier().fit(X_train, y_train)
# Score the trained classifier on the held-out samples.
model.score(X_valid, y_valid)

# Encapsulate the steps above in a pipeline
- Define an end-to-end pipeline by chaining transformers from giotto-tda with scikit-learn ones

- Train-test split the input point cloud data and labels.

- Fit the pipeline on the training data.

- Score the fitted pipeline on the test data.

In [None]:
from sklearn.pipeline import make_pipeline

# Same three stages as above, chained end to end:
# point clouds -> persistence diagrams -> scalar features -> classifier.
steps = [VietorisRipsPersistence(homology_dimensions=[0, 1, 2]),
         PersistenceEntropy(),
         RandomForestClassifier()]

pipeline = make_pipeline(*steps)

# Split the raw point clouds (not precomputed features): the pipeline
# computes diagrams and features itself when fitting and scoring.
pcs_train, pcs_valid, labels_train, labels_valid = train_test_split(point_clouds, labels)
pipeline.fit(pcs_train, labels_train)
# Score the fitted pipeline on the held-out point clouds.
pipeline.score(pcs_valid, labels_valid)