# Project Laboratory - Topological Data Analysis
Final Version 8.0

Students:
* Holger Espinola
* Li Siyuan

Dataset: Iris

Task 02:
- Construction of simplicial complexes and their invariants 
- Persistence diagrams,
  finding metrics (Wasserstein, bottleneck) 
- Classification of obtained metrics in any programming language, 
  you can use any packages

Installing package giotto-tda

In [1]:
# install library giotto-tda
import sys
!{sys.executable} -m pip install -U giotto-tda



Preprocessing of data

In [1]:
# load the dataset
from sklearn.datasets import load_iris

# load iris dataset 
x, y = load_iris(return_X_y = True)

# show the first 10 points
print("X: ", x[:10])
print("Y: ", y[:10])

X:  [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]]
Y:  [0 0 0 0 0 0 0 0 0 0]


In [2]:
# check the dimensionality of data
print("dim x = ", x.shape)
print("dim y = ", y.shape)

dim x =  (150, 4)
dim y =  (150,)


In [3]:
# exploratory visualization - plot cloud of points
from gtda.plotting import plot_point_cloud

# for sepal-length and sepal-width
plot_point_cloud(x[:, :2])

In [4]:
# for petal-length and petal-width
plot_point_cloud(x[:, 2:])

In [5]:
# reshape x to have an extra dimension
x = x.reshape((x.shape[0], x.shape[1], 1))
print("dim x = ", x.shape)

dim x =  (150, 4, 1)


In [6]:
# split dataset in train-set and test-set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 1, test_size = 0.5)

Calculate persistence homology

In [7]:
# calculation of persistence homology
from gtda.homology import VietorisRipsPersistence

persistence_homology = VietorisRipsPersistence(
    metric = "euclidean",
    homology_dimensions = [0, 1]
)


In [8]:
# generate the diagrams of persistence for train
diagrams_train = persistence_homology.fit_transform(x_train)
print(diagrams_train)
print("dim diag-train = ", diagrams_train.shape)

[[[0.         1.39999998 0.        ]
  [0.         1.60000002 0.        ]
  [0.         2.0999999  0.        ]
  [0.         0.         1.        ]]

 [[0.         1.         0.        ]
  [0.         1.20000005 0.        ]
  [0.         2.5        0.        ]
  [0.         0.         1.        ]]

 [[0.         1.         0.        ]
  [0.         1.29999995 0.        ]
  [0.         1.70000005 0.        ]
  [0.         0.         1.        ]]

 [[0.         0.30000001 0.        ]
  [0.         0.80000001 0.        ]
  [0.         2.70000005 0.        ]
  [0.         0.         1.        ]]

 [[0.         1.20000005 0.        ]
  [0.         1.5        0.        ]
  [0.         1.5        0.        ]
  [0.         0.         1.        ]]

 [[0.         1.29999995 0.        ]
  [0.         1.70000005 0.        ]
  [0.         2.20000005 0.        ]
  [0.         0.         1.        ]]

 [[0.         1.29999995 0.        ]
  [0.         1.60000002 0.        ]
  [0.         1.79999995 0

In [9]:
# plot persistence diagram for train set
persistence_homology.plot(diagrams_train)

In [10]:
# generate the diagram of persistence for test
diagrams_test = persistence_homology.transform(x_test)
print(diagrams_test)
print("dim diag-test = ", diagrams_test.shape)

[[[0.         1.         0.        ]
  [0.         1.79999995 0.        ]
  [0.         2.79999995 0.        ]
  [0.         0.         1.        ]]

 [[0.         0.5        0.        ]
  [0.         1.39999998 0.        ]
  [0.         2.0999999  0.        ]
  [0.         0.         1.        ]]

 [[0.         1.39999998 0.        ]
  [0.         1.60000002 0.        ]
  [0.         2.20000005 0.        ]
  [0.         0.         1.        ]]

 [[0.         0.89999998 0.        ]
  [0.         1.5        0.        ]
  [0.         2.5999999  0.        ]
  [0.         0.         1.        ]]

 [[0.         1.5        0.        ]
  [0.         1.79999995 0.        ]
  [0.         2.5999999  0.        ]
  [0.         0.         1.        ]]

 [[0.         1.39999998 0.        ]
  [0.         1.60000002 0.        ]
  [0.         1.70000005 0.        ]
  [0.         0.         1.        ]]

 [[0.         0.80000001 0.        ]
  [0.         1.79999995 0.        ]
  [0.         2.         0

In [11]:
persistence_homology.plot(diagrams_test)

In [12]:
# import packages
from gtda.diagrams import BettiCurve, PersistenceLandscape, PairwiseDistance, PersistenceEntropy

Calculate metric wasserstein

In [13]:
# calculate metric of wasserstein
pw = 2 
wdis = PairwiseDistance(metric = "wasserstein", 
                      metric_params = {"p": pw, "delta": 0.1},
                      order = None)

In [15]:
# wassertein distance between x and x_train
wdis_train = wdis.fit_transform(diagrams_train)
print("dim wassertein-dis-train = ", wdis_train.shape)


dim wassertein-dis-train =  (75, 75, 2)


In [16]:
# wassertein distance between x and x_test
wdis_test = wdis.transform(diagrams_test)
print("dim wassertein-dis test ", wdis_test.shape)

dim wassertein-dis test  (75, 75, 2)


In [17]:
# flatten the wasserstein distance arrays into 1D arrays
wdis_train_flatten = wdis_train.reshape(wdis_train.shape[0], -1)
wdis_test_flatten = wdis_test.reshape(wdis_test.shape[0], -1)

print(wdis_train_flatten.shape)
print(wdis_test_flatten.shape)


(75, 150)
(75, 150)


Calculate metric bottleneck

In [18]:
bottdis = PairwiseDistance(metric = "bottleneck",
                           metric_params = {"delta": 0.1},
                           order = None)

In [19]:
# bottleneck distance for train
bottdis_train = bottdis.fit_transform(diagrams_train)
print("dim bottleneck-dis train = ", bottdis_train.shape)

dim bottleneck-dis train =  (75, 75, 2)


In [20]:
# bottleneck distance for test
bottdis_test = bottdis.transform(diagrams_test)
print("dim bottleneck-dis test = ", bottdis_test.shape)

dim bottleneck-dis test =  (75, 75, 2)


In [21]:
# flatten the wasserstein distance arrays into 1D arrays
bott_train_flatten = bottdis_train.reshape(bottdis_train.shape[0], -1)
bott_test_flatten = bottdis_test.reshape(bottdis_test.shape[0], -1)

print(bott_train_flatten.shape)
print(bott_test_flatten.shape)

(75, 150)
(75, 150)


Betti curve and Betti distance

In [22]:
# calculate the betti curve
bc = BettiCurve()
betti_curve = bc.fit_transform(diagrams_train)
bc.plot(betti_curve, sample = 0)

In [24]:
# calculate the betti distance
pd2 = PairwiseDistance(metric = "betti",
                       metric_params = {"p": 1},
                       order = None)
betti_dis = pd2.fit_transform(diagrams_train)
print(betti_dis.shape)

(75, 75, 2)


Persistence landscape

In [25]:
# build the persitence landscape
pl = PersistenceLandscape()
landscapes = pl.fit_transform(diagrams_train)

In [26]:
pl.plot(landscapes, sample = 0)

In [27]:
# calculate the persistence landscape distance
pd3 = PairwiseDistance(metric = "landscape", 
                       metric_params = {"p": 1},
                       order = None)
landscape_dis = pd3.fit_transform(diagrams_train)
print(landscape_dis.shape)

(75, 75, 2)


Classification process 01 - wassertein distance + SVM

In [28]:
# define hyperparameter tuning
param_grid = {"C": [0.01, 0.1, 1, 10, 100], 
              "kernel": ["linear", "poly", "sigmoid", "rbf"], 
              "gamma": [0.001, 0.01, 0.1, 1, 10]}

In [29]:
# define SVM-classifier
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

classifier = SVC()
grid_search = GridSearchCV(classifier, param_grid)
grid_search.fit(wdis_train_flatten, y_train)


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10],
                         'kernel': ['linear', 'poly', 'sigmoid', 'rbf']})

In [30]:
# get the results
print("best parameters: ", grid_search.best_params_)
print("best train-accuracy: ", grid_search.best_score_)

best parameters:  {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
best train-accuracy:  0.7066666666666668


In [31]:
# make predictions on test set
y_pred = grid_search.predict(wdis_test_flatten)

# get the accuracy
acc_test = accuracy_score(y_test, y_pred)
print("test-accuracy = ", acc_test)

test-accuracy =  0.64


Classification process 02 - bottleneck distance + Random Forest

In [32]:
# define the hyperparameter tuning
param_grid2 = {'n_estimators': [50, 100, 200],
              'max_depth': [None, 5, 10, 20],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

In [33]:
# define Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 42)
grid_search2 = GridSearchCV(estimator = rf, 
                            param_grid = param_grid2,
                            cv = 5)
grid_search2.fit(bott_train_flatten, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [None, 5, 10, 20],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]})

In [34]:
# get the results
print("best parameters: ", grid_search2.best_params_)
print("best train-accuracy: ", grid_search2.best_score_)

best parameters:  {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
best train-accuracy:  0.6799999999999999


In [35]:
# make predictions on test set
y_pred2 = grid_search2.predict(bott_test_flatten)

# get the accuracy
acc_test2 = accuracy_score(y_test, y_pred2)
print("test-accuracy = ", acc_test2)

test-accuracy =  0.6533333333333333


### Exercise 3: Eirene + Julia  
Construction of simplicial complexes and traking the zero-homologies  
Classification based on zero homology groups

In [36]:
# connect with julia using api
from julia.api import Julia

In [37]:
# create an object of julia
jl = Julia(compiled_modules = False)

In [38]:
# use the Eirene package
jl.using("Eirene")

In [45]:
u = x.reshape((x.shape[1], x.shape[0]))
u.shape

(4, 150)

In [47]:
from julia import Main

c = Main.eirene(u, model = "pc")

In [48]:
import numpy as np

# calculate the zero-homologies
a0 = Main.barcode(c, dim = 0)
a0 = np.append(np.array(a0), np.zeros((len(a0), 1)), axis = 1)

In [49]:
# check the zero-homologies
print(a0)
print(a0.shape)

[[0.         0.36055513 0.        ]
 [0.         0.14142136 0.        ]
 [0.         0.51961524 0.        ]
 [0.         0.17320508 0.        ]
 [0.         0.33166248 0.        ]
 [0.         0.24494897 0.        ]
 [0.         0.37416574 0.        ]
 [0.         0.31622777 0.        ]
 [0.         0.43588989 0.        ]
 [0.         0.24494897 0.        ]
 [0.         0.57445626 0.        ]
 [0.         0.45825757 0.        ]
 [0.         0.34641016 0.        ]
 [0.         0.2236068  0.        ]
 [0.         0.3        0.        ]
 [0.         0.36055513 0.        ]
 [0.         0.46904158 0.        ]
 [0.         0.41231056 0.        ]
 [0.         0.54772256 0.        ]
 [0.         0.14142136 0.        ]
 [0.         0.24494897 0.        ]
 [0.         0.42426407 0.        ]
 [0.         0.50990195 0.        ]
 [0.         0.42426407 0.        ]
 [0.         0.87749644 0.        ]
 [0.         0.2236068  0.        ]
 [0.         0.53851648 0.        ]
 [0.         0.2        0.  

In [50]:
from gtda.plotting import plot_diagram

plot_diagram(a0)

Classification process 03 - zero homologies + KNN

In [52]:
# define hyperparameter tuning
param_grid3 = {
    "n_neighbors": [3, 5, 7],
    "weights": ["uniform", "distance"],
    "p": [1, 2]
}

In [59]:
# define KNN-classifier
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
grid_search3 = GridSearchCV(knn, param_grid3)
grid_search3.fit(a0[:-1, 1].reshape(-1, 1), y[:-1])

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [3, 5, 7], 'p': [1, 2],
                         'weights': ['uniform', 'distance']})

In [60]:
# get the results
print("best parameters: ", grid_search3.best_params_)
print("best train-accuracy: ", grid_search3.best_score_)

best parameters:  {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
best train-accuracy:  0.3691954022988506
