# Giotto-tda

* Source repository: https://github.com/giotto-ai/giotto-tda
* API reference (v0.5.1): https://giotto-ai.github.io/gtda-docs/0.5.1/library.html
* Upstream Mapper quickstart notebook: https://github.com/giotto-ai/giotto-tda/blob/master/examples/mapper_quickstart.ipynb

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fuyu-quant/data-science-wiki/blob/main/nlp/text_embedding(japanese)/sentencebert.ipynb)

In [1]:
# Data wrangling
import numpy as np
import pandas as pd  # Not a requirement of giotto-tda, but is compatible with the gtda.mapper module

# Data viz
from gtda.plotting import plot_point_cloud

# TDA magic
from gtda.mapper import (
    CubicalCover,
    make_mapper_pipeline,
    Projection,
    plot_static_mapper_graph,
    plot_interactive_mapper_graph,
    MapperInteractivePlotter
)

# ML tools
from sklearn import datasets
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

In [2]:
# Generate the point cloud with scikit-learn's make_circles. The second
# returned value is discarded, and random_state is fixed so reruns give the
# same sample.
data, _ = datasets.make_circles(
    n_samples=5000,
    noise=0.05,
    factor=0.3,
    random_state=42,
)

# Last expression of the cell: its return value is rendered as the cell output.
plot_point_cloud(data)

In [3]:
# Mapper ingredients, each built on its own so any one can be swapped later.
n_jobs = 1                                              # parallelism of the clustering step
clusterer = DBSCAN()                                    # clustering algorithm, default settings
cover = CubicalCover(n_intervals=10, overlap_frac=0.3)  # cover of the filter values
filter_func = Projection(columns=[0, 1])                # filter: keep columns 0 and 1

# Assemble the pipeline from the pieces above, with step logging switched off.
pipe = make_mapper_pipeline(
    filter_func=filter_func, cover=cover, clusterer=clusterer,
    verbose=False, n_jobs=n_jobs,
)

In [4]:
# Build the static Mapper graph figure for `data`; no optional arguments are
# passed, so layout and colouring are left at the function's defaults.
fig = plot_static_mapper_graph(pipe, data)

# Hand the scrollZoom option to the figure's show() call.
show_config = {'scrollZoom': True}
fig.show(config=show_config)

In [5]:
# Nested overrides forwarded to the plot: set the node trace's colour scale.
plotly_params = {
    "node_trace": {
        "marker_colorscale": "Blues",
    },
}

# `data` is passed both as the Mapper input and as the colouring data.
fig = plot_static_mapper_graph(pipe, data, color_data=data, plotly_params=plotly_params)
fig.show(config={'scrollZoom': True})

In [6]:
# One-component PCA, handed to the plot as the `color_features` estimator.
pca = PCA(n_components=1)

fig = plot_static_mapper_graph(pipe, data, color_data=data, color_features=pca)
fig.show(config={'scrollZoom': True})

In [7]:
def half_mean(x):
    """Return half of the mean of ``x`` (custom per-node colour statistic)."""
    return np.mean(x) / 2


# Same PCA colouring as before, but summarised with the callable above.
fig = plot_static_mapper_graph(
    pipe,
    data,
    color_data=data,
    color_features=pca,
    node_color_statistic=half_mean,
)
fig.show(config={'scrollZoom': True})

In [8]:
# Run the pipeline directly to obtain the graph object instead of a figure.
graph = pipe.fit_transform(data)

# Per-vertex "node_elements" attribute: the row indices belonging to each node.
node_elements = graph.vs["node_elements"]

print(
    f"There are {len(node_elements)} nodes.\nThe first node consists of row indices {node_elements[0]}."
)

There are 90 nodes.
The first node consists of row indices [   0   85  100  167  253  264  276  372  385  390  427  462  504  508
  542  616  626  627  711  762  801  811  825  856  911  921  945  956
  963 1025 1029 1038 1058 1122 1134 1211 1280 1315 1347 1352 1357 1383
 1390 1398 1406 1450 1507 1585 1616 1650 1725 1738 1739 1744 1774 1786
 1817 1864 1899 1902 1932 1944 1964 1997 2009 2081 2231 2234 2270 2274
 2425 2450 2500 2512 2615 2632 2716 2756 2769 2773 2818 2830 2843 2876
 2893 2942 2956 2957 3010 3100 3101 3118 3150 3229 3321 3352 3383 3429
 3451 3455 3527 3531 3567 3589 3600 3666 3697 3708 3726 3738 3748 3761
 3779 3823 3947 3988 4048 4223 4282 4501 4506 4561 4587 4628 4749 4865
 4887 4912 4938 4945 4952 4967].


In [9]:
# One value per node (0, 1, 2, ...), passed as an array rather than a callable.
node_indices = np.arange(len(node_elements))

fig = plot_static_mapper_graph(pipe, data, node_color_statistic=node_indices)
fig.show(config={'scrollZoom': True})

In [10]:
# Wrap the point cloud in a DataFrame with named coordinate columns.
column_names = ["x", "y"]
df = pd.DataFrame(data, columns=column_names)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,x,y
0,-0.711917,-0.546609
1,0.306951,-0.007028
2,0.288193,0.123284
3,-0.892223,0.502352
4,-0.143615,0.938935


In [11]:
# Replace the filter step with a projection addressed by column name, to match
# the DataFrame input; the trailing semicolon suppresses the cell output.
named_projection = Projection(columns=["x", "y"])
pipe.set_params(filter_func=named_projection);

In [12]:
# The DataFrame serves as both the Mapper input and the colouring data.
fig = plot_static_mapper_graph(
    pipe,
    df,
    color_data=df,
)

show_config = {'scrollZoom': True}
fig.show(config=show_config)

In [13]:
# Rows whose squared distance from the origin is below 0.25 are labelled "B",
# all others "A"; the label column is written in a single assignment.
within_threshold = df["x"] ** 2 + df["y"] ** 2 < 0.25
df["Circle"] = within_threshold.map({False: "A", True: "B"})

In [14]:
# One-hot encode the categorical label so it can be used as colouring data.
color_data = pd.get_dummies(df["Circle"], prefix="Circle")

# Only the coordinate columns are fed to the pipeline itself.
coordinates = df[["x", "y"]]
fig = plot_static_mapper_graph(pipe, coordinates, color_data=color_data)
fig.show(config={'scrollZoom': True})

In [15]:
# Switch the filter back to positional column indices for NumPy array input;
# the trailing semicolon suppresses the cell output.
positional_projection = Projection(columns=[0, 1])
pipe.set_params(filter_func=positional_projection);

In [16]:
# Same graph, drawn with an explicitly chosen layout algorithm.
fig = plot_static_mapper_graph(
    pipe,
    data,
    layout="fruchterman_reingold",
    color_data=data,
)
fig.show(config={'scrollZoom': True})

In [17]:
# Request a three-dimensional layout instead of the default.
fig = plot_static_mapper_graph(
    pipe,
    data,
    layout_dim=3,
    color_data=data,
)
fig.show(config={'scrollZoom': True})

In [18]:
# Node size scaling passed to the 3D plot below.
node_scale = 30

fig = plot_static_mapper_graph(
    pipe,
    data,
    layout_dim=3,
    node_scale=node_scale,
)
fig.show(config={'scrollZoom': True})

In [19]:
# Run the pipeline on `data` and keep the resulting graph object so its
# vertex attributes can be inspected in the following cells.
graph = pipe.fit_transform(data)

In [20]:
# List the names of the attributes stored on the graph's vertices.
graph.vs.attributes()

['pullback_set_label', 'partial_cluster_label', 'node_elements']

In [21]:
# Row indices grouped by node, read from the vertex attribute of the graph.
node_elements = graph.vs["node_elements"]

# Which node to inspect.
node_id = 0

# Report the node's row indices and the corresponding rows of `data`.
print(
    f"""
Node ID: {node_id}
Node elements: {node_elements[node_id]}
Data points: {data[node_elements[node_id]]}
"""
)


Node ID: 0
Node elements: [   0   85  100  167  253  264  276  372  385  390  427  462  504  508
  542  616  626  627  711  762  801  811  825  856  911  921  945  956
  963 1025 1029 1038 1058 1122 1134 1211 1280 1315 1347 1352 1357 1383
 1390 1398 1406 1450 1507 1585 1616 1650 1725 1738 1739 1744 1774 1786
 1817 1864 1899 1902 1932 1944 1964 1997 2009 2081 2231 2234 2270 2274
 2425 2450 2500 2512 2615 2632 2716 2756 2769 2773 2818 2830 2843 2876
 2893 2942 2956 2957 3010 3100 3101 3118 3150 3229 3321 3352 3383 3429
 3451 3455 3527 3531 3567 3589 3600 3666 3697 3708 3726 3738 3748 3761
 3779 3823 3947 3988 4048 4223 4282 4501 4506 4561 4587 4628 4749 4865
 4887 4912 4938 4945 4952 4967]
Data points: [[-0.7119167  -0.54660896]
 [-0.91976898 -0.43025704]
 [-0.73571693 -0.62569565]
 [-0.73630218 -0.66957306]
 [-0.79134262 -0.57047771]
 [-0.84548577 -0.61571079]
 [-0.85448295 -0.60924645]
 [-0.78585898 -0.67334614]
 [-0.75059868 -0.72410451]
 [-0.63086658 -0.66158518]
 [-0.8793931  -0.45

In [22]:
# Use a plain callable as the filter this time instead of a transformer object.
filter_func = np.sum

# Rebuild the pipeline with the earlier cover and clusterer, now with step
# logging switched on.
pipe = make_mapper_pipeline(
    filter_func=filter_func, cover=cover, clusterer=clusterer,
    verbose=True, n_jobs=n_jobs,
)

In [23]:
# Plot with the verbose pipeline; the per-step log lines are printed while the
# figure is being built.
fig = plot_static_mapper_graph(pipe, data)

show_config = {'scrollZoom': True}
fig.show(config=show_config)

[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ....... (step 2 of 3) Processing filter_func, total=   0.0s
[Pipeline] ............. (step 3 of 3) Processing cover, total=   0.0s
[Pipeline] .... (step 1 of 3) Processing pullback_cover, total=   0.0s
[Pipeline] ........ (step 2 of 3) Processing clustering, total=   0.0s
[Pipeline] ............. (step 3 of 3) Processing nerve, total=   0.0s


In [24]:
# Start again from a pipeline built with no arguments, i.e. all defaults.
pipe = make_mapper_pipeline()

# Last expression of the cell: the interactive widget is rendered as output.
plot_interactive_mapper_graph(
    pipe,
    data,
    color_data=data,
)

VBox(children=(HBox(children=(VBox(children=(HTML(value='<b>Cover parameters</b>'), Text(value='uniform', cont…

In [25]:
# Object-based variant: build the plotter first so it remains available
# after the widget has been shown.
MIP = MapperInteractivePlotter(pipe, data)

# Last expression of the cell: the interactive widget is rendered as output.
MIP.plot(color_data=data)

VBox(children=(HBox(children=(VBox(children=(HTML(value='<b>Cover parameters</b>'), Text(value='uniform', cont…

In [26]:
# Public names on the plotter that end with an underscore: keep those with a
# trailing "_" and drop anything that also starts with one.
trailing_underscore_attrs = [
    name
    for name in dir(MIP)
    if name.endswith("_") and not name.startswith("_")
]

print(
    "Attributes created by `.plot` and updated during the interactive session:\n",
    trailing_underscore_attrs,
)

Attributes created by `.plot` and updated during the interactive session:
 ['color_features_', 'figure_', 'graph_', 'node_summaries_', 'pipeline_']
