
# Digits Dataset

This digits example shows two ways of customizing the tooltips options in the HTML visualization. It generates the visualization with tooltips set as the y-label, or number of the image. The second generated result uses the actual image in the tooltips.

`Visualization with y-label tooltip <../../_static/digits_ylabel_tooltips.html>`_

`Visualization with custom tooltips <../../_static/digits_custom_tooltips.html>`_


In [5]:
pip install pillow

Note: you may need to restart the kernel to use updated packages.


In [6]:
import io
import sys
import base64

import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
import kmapper as km

try:
    from PIL import Image
except ImportError as e:
    print("This example requires Pillow. Run `pip install pillow` and then try again.")
    sys.exit()


# Load digits data
data, labels = datasets.load_digits().data, datasets.load_digits().target

# Raw data is (0, 16), so scale to 8 bits (pillow can't handle 4-bit greyscale PNG depth)
scaler = MinMaxScaler(feature_range=(0, 255))
data = scaler.fit_transform(data).astype(np.uint8)

# Create images for a custom tooltip array
tooltip_s = []
for image_data in data:
    with io.BytesIO() as output:
        img = Image.fromarray(image_data.reshape((8, 8)), "L")
        img.save(output, "PNG")
        contents = output.getvalue()
        img_encoded = base64.b64encode(contents)
        img_tag = """<img src="data:image/png;base64,{}">""".format(
            img_encoded.decode("utf-8")
        )
        tooltip_s.append(img_tag)

tooltip_s = np.array(
    tooltip_s
)  # need to make sure to feed it as a NumPy array, not a list

# Initialize to use t-SNE with 2 components (reduces data to 2 dimensions). Also note high overlap_percentage.
mapper = km.KeplerMapper(verbose=2)

# Fit and transform data
projected_data = mapper.fit_transform(data, projection=sklearn.manifold.TSNE())

# Create the graph (we cluster on the projected data and suffer projection loss)
graph = mapper.map(
    projected_data,
    clusterer=sklearn.cluster.DBSCAN(eps=0.3, min_samples=15),
    cover=km.Cover(35, 0.4),
)

# Create the visualizations (increased the graph_gravity for a tighter graph-look.)
print("Output graph examples to html")
# Tooltips with image data for every cluster member
mapper.visualize(
    graph,
    title="Handwritten digits Mapper",
    path_html="output/digits_custom_tooltips.html",
    color_values=labels,
    color_function_name="labels",
    custom_tooltips=tooltip_s,
)
# Tooltips with the target y-labels for every cluster member
mapper.visualize(
    graph,
    title="Handwritten digits Mapper",
    path_html="output/digits_ylabel_tooltips.html",
    custom_tooltips=labels,
)

# Matplotlib examples
km.draw_matplotlib(graph, layout="spring")
plt.show()

KeplerMapper(verbose=2)
..Composing projection pipeline of length 1:
	Projections: TSNE()
	Distance matrices: False
	Scalers: MinMaxScaler()
..Projecting on data shaped (1797, 64)

..Projecting data using: 
	TSNE(verbose=2)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1797 samples in 0.001s...
[t-SNE] Computed neighbors for 1797 samples in 0.119s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1797
[t-SNE] Computed conditional probabilities for sample 1797 / 1797
[t-SNE] Mean sigma: 157.044611
[t-SNE] Computed conditional probabilities in 0.130s
[t-SNE] Iteration 50: error = 75.2938538, gradient norm = 0.1244358 (50 iterations in 0.261s)
[t-SNE] Iteration 100: error = 63.8273544, gradient norm = 0.0456754 (50 iterations in 0.206s)
[t-SNE] Iteration 150: error = 62.3973579, gradient norm = 0.0293449 (50 iterations in 0.225s)
[t-SNE] Iteration 200: error = 61.9336815, gradient norm = 0.0246261 (50 iterations in 0.222s)
[t-SNE] Iteration 250: error = 61.71945

   > Found 1 clusters in hypercube 385.
Cube_386 is empty.

Cube_387 is empty.

Cube_388 is empty.

Cube_389 is empty.

Cube_390 is empty.

Cube_391 is empty.

Cube_392 is empty.

Cube_393 is empty.

Cube_394 is empty.

   > Found 1 clusters in hypercube 395.
Cube_396 is empty.

Cube_397 is empty.

Cube_398 is empty.

Cube_399 is empty.

Cube_400 is empty.

Cube_401 is empty.

   > Found 1 clusters in hypercube 402.
   > Found 1 clusters in hypercube 403.
   > Found 1 clusters in hypercube 404.
Cube_405 is empty.

Cube_406 is empty.

Cube_407 is empty.

Cube_408 is empty.

Cube_409 is empty.

Cube_410 is empty.

Cube_411 is empty.

   > Found 1 clusters in hypercube 412.
   > Found 1 clusters in hypercube 413.
   > Found 1 clusters in hypercube 414.
   > Found 1 clusters in hypercube 415.
Cube_416 is empty.

Cube_417 is empty.

   > Found 1 clusters in hypercube 418.
   > Found 1 clusters in hypercube 419.
   > Found 1 clusters in hypercube 420.
   > Found 1 clusters in hypercube 421.


FileNotFoundError: [Errno 2] No such file or directory: 'output/digits_custom_tooltips.html'

In [7]:
from statistics import mean
graphlens = []
for points in graph["nodes"].values():
    if points == []:
        continue
    graphlens.append(mean([lens[i][0] for i in points]))
print(graph["nodes"].keys())

import networkx as nx
nx_graph = km.adapter.to_nx(graph)
nx.write_graph6(nx_graph,"digits.graph6")

graphlens

dict_keys(['cube1_cluster0', 'cube2_cluster0', 'cube5_cluster0', 'cube6_cluster0', 'cube7_cluster0', 'cube10_cluster0', 'cube11_cluster0', 'cube12_cluster0', 'cube17_cluster0', 'cube18_cluster0', 'cube19_cluster0', 'cube23_cluster0', 'cube24_cluster0', 'cube27_cluster0', 'cube28_cluster0', 'cube29_cluster0', 'cube33_cluster0', 'cube34_cluster0', 'cube35_cluster0', 'cube36_cluster0', 'cube47_cluster0', 'cube48_cluster0', 'cube52_cluster0', 'cube53_cluster0', 'cube54_cluster0', 'cube55_cluster0', 'cube58_cluster0', 'cube60_cluster0', 'cube61_cluster0', 'cube69_cluster0', 'cube71_cluster0', 'cube72_cluster0', 'cube75_cluster0', 'cube77_cluster0', 'cube78_cluster0', 'cube79_cluster0', 'cube89_cluster0', 'cube93_cluster0', 'cube94_cluster0', 'cube95_cluster0', 'cube107_cluster0', 'cube108_cluster0', 'cube109_cluster0', 'cube120_cluster0', 'cube121_cluster0', 'cube134_cluster0', 'cube145_cluster0', 'cube147_cluster0', 'cube148_cluster0', 'cube153_cluster0', 'cube161_cluster0', 'cube162_clust

[0.494470605216406,
 0.5250811683319966,
 0.5547656501215521,
 0.5144832340140439,
 0.5557590717707123,
 0.5798432533244197,
 0.5286613494601218,
 0.5448111166624332,
 0.5061459647127453,
 0.48965169334776,
 0.5017295517886569,
 0.5463011773175607,
 0.505220566237519,
 0.5348211731395368,
 0.49520881391574945,
 0.5259546481101841,
 0.488567395814133,
 0.5204826880770703,
 0.5002968183460869,
 0.5064907389420243,
 0.4720709129958684,
 0.5257091922318812,
 0.5302710699725586,
 0.46972199642805074,
 0.5270392002988917,
 0.5278593229358283,
 0.5956158100141586,
 0.5594269558970423,
 0.528827570599993,
 0.6208487964126188,
 0.5291657939375259,
 0.5489183651589353,
 0.5531947177186987,
 0.5761076915977023,
 0.46897727852354254,
 0.4755360456127137,
 0.574134370262166,
 0.5447774165134499,
 0.513736123606663,
 0.5490786200122357,
 0.5607465765024948,
 0.559090532773815,
 0.5594032811721665,
 0.5561676175344578,
 0.511115765666528,
 0.49410846461220925,
 0.42782032596116865,
 0.611637002015326

In [2]:
len(graphlens)

20

# Cat dataset

In [1]:
import numpy as np
import sklearn
import kmapper as km
import numpy as np
data = np.genfromtxt("data/cat-reference.csv", delimiter=",")

mapper = km.KeplerMapper(verbose=2)

lens = mapper.fit_transform(data)

graph = mapper.map(
    lens,
    data,
    clusterer=sklearn.cluster.DBSCAN(eps=0.1, min_samples=5),
    cover=km.Cover(n_cubes=15, perc_overlap=0.2),
)

mapper.visualize(graph, path_html="cat.html")

from statistics import mean
graphlens = []
for points in graph["nodes"].values():
    if points == []:
        continue
    graphlens.append(mean([lens[i][0] for i in points]))
print(graph["nodes"].keys())

import networkx as nx
nx_graph = km.adapter.to_nx(graph)
nx.write_graph6(nx_graph,"cat.graph6")

graphlens

KeplerMapper(verbose=2)
..Composing projection pipeline of length 1:
	Projections: sum
	Distance matrices: False
	Scalers: MinMaxScaler()
..Projecting on data shaped (7207, 3)

..Projecting data using: sum

..Scaling with: MinMaxScaler()

Mapping on data shaped (7207, 3) using lens shaped (7207, 1)

Minimal points in hypercube before clustering: 5
Creating 15 hypercubes.
   > Found 2 clusters in hypercube 0.
   > Found 2 clusters in hypercube 1.
   > Found 2 clusters in hypercube 2.
   > Found 1 clusters in hypercube 3.
   > Found 2 clusters in hypercube 4.
   > Found 2 clusters in hypercube 5.
   > Found 1 clusters in hypercube 6.
   > Found 1 clusters in hypercube 7.
   > Found 1 clusters in hypercube 8.
   > Found 1 clusters in hypercube 9.
   > Found 1 clusters in hypercube 10.
   > Found 1 clusters in hypercube 11.
   > Found 1 clusters in hypercube 12.
   > Found 1 clusters in hypercube 13.
   > Found 1 clusters in hypercube 14.

Created 19 edges and 20 nodes in 0:00:00.114535.
W

[0.05821261969831075,
 0.031150753583003548,
 0.08954872397051421,
 0.09857525385113208,
 0.16629887151321296,
 0.16476644766417609,
 0.2336163493160702,
 0.29731944709888036,
 0.34000858078143714,
 0.36313136300937987,
 0.37699007243025967,
 0.4366291077891086,
 0.4905688524345674,
 0.5662495918092447,
 0.6320591064182264,
 0.7009019680051876,
 0.7713299046859922,
 0.842344226540508,
 0.8952833587938017,
 0.945359228014147]

# Lion dataset

In [8]:
import numpy as np
import sklearn
import kmapper as km
import numpy as np
data = np.genfromtxt("data/lion-reference.csv", delimiter=",")

mapper = km.KeplerMapper(verbose=2)

lens = mapper.fit_transform(data)

graph = mapper.map(
    lens,
    data,
    clusterer=sklearn.cluster.DBSCAN(eps=0.1, min_samples=5),
    cover=km.Cover(n_cubes=15, perc_overlap=0.2),
)

mapper.visualize(graph, path_html="lion.html")

from statistics import mean
graphlens = []
for points in graph["nodes"].values():
    if points == []:
        continue
    graphlens.append(mean([lens[i][0] for i in points]))
print(graph["nodes"].keys())

import networkx as nx
nx_graph = km.adapter.to_nx(graph)
nx.write_graph6(nx_graph,"lion.graph6")

graphlens

KeplerMapper(verbose=2)
..Composing projection pipeline of length 1:
	Projections: sum
	Distance matrices: False
	Scalers: MinMaxScaler()
..Projecting on data shaped (5000, 3)

..Projecting data using: sum

..Scaling with: MinMaxScaler()

Mapping on data shaped (5000, 3) using lens shaped (5000, 1)

Minimal points in hypercube before clustering: 5
Creating 15 hypercubes.
   > Found 1 clusters in hypercube 0.
   > Found 2 clusters in hypercube 1.
   > Found 3 clusters in hypercube 2.
   > Found 2 clusters in hypercube 3.
   > Found 1 clusters in hypercube 4.
   > Found 2 clusters in hypercube 5.
   > Found 2 clusters in hypercube 6.
   > Found 1 clusters in hypercube 7.
   > Found 1 clusters in hypercube 8.
   > Found 1 clusters in hypercube 9.
   > Found 1 clusters in hypercube 10.
   > Found 1 clusters in hypercube 11.
   > Found 1 clusters in hypercube 12.
   > Found 1 clusters in hypercube 13.
   > Found 1 clusters in hypercube 14.

Created 20 edges and 21 nodes in 0:00:00.079892.
W

[0.039928079328225216,
 0.08831831147617844,
 0.1147018731037132,
 0.18712375836228468,
 0.15467059782315284,
 0.15813593981707055,
 0.22686088057832582,
 0.23308245655207782,
 0.30025938194582186,
 0.36300185980417926,
 0.3981066300055593,
 0.43002419150092447,
 0.4345483791127312,
 0.5044392501848346,
 0.5656462748697907,
 0.6242883899864906,
 0.699988505169165,
 0.7790166017524458,
 0.8403023414143805,
 0.8974501211116412,
 0.9555435933561892]

# Horse

In [9]:
import numpy as np
import sklearn
import kmapper as km
import numpy as np
data = np.genfromtxt("data/horse-reference.csv", delimiter=",")

mapper = km.KeplerMapper(verbose=2)

lens = mapper.fit_transform(data)

graph = mapper.map(
    lens,
    data,
    clusterer=sklearn.cluster.DBSCAN(eps=0.1, min_samples=5),
    cover=km.Cover(n_cubes=15, perc_overlap=0.2),
)

mapper.visualize(graph, path_html="horse.html")

from statistics import mean
graphlens = []
for points in graph["nodes"].values():
    if points == []:
        continue
    graphlens.append(mean([lens[i][0] for i in points]))
print(graph["nodes"].keys())

import networkx as nx
nx_graph = km.adapter.to_nx(graph)
nx.write_graph6(nx_graph,"horse.graph6")

graphlens

KeplerMapper(verbose=2)
..Composing projection pipeline of length 1:
	Projections: sum
	Distance matrices: False
	Scalers: MinMaxScaler()
..Projecting on data shaped (8431, 3)

..Projecting data using: sum

..Scaling with: MinMaxScaler()

Mapping on data shaped (8431, 3) using lens shaped (8431, 1)

Minimal points in hypercube before clustering: 5
Creating 15 hypercubes.
   > Found 2 clusters in hypercube 0.
   > Found 3 clusters in hypercube 1.
   > Found 3 clusters in hypercube 2.
   > Found 2 clusters in hypercube 3.
   > Found 2 clusters in hypercube 4.
   > Found 2 clusters in hypercube 5.
   > Found 1 clusters in hypercube 6.
   > Found 1 clusters in hypercube 7.
   > Found 1 clusters in hypercube 8.
   > Found 1 clusters in hypercube 9.
   > Found 1 clusters in hypercube 10.
   > Found 1 clusters in hypercube 11.
   > Found 1 clusters in hypercube 12.
   > Found 1 clusters in hypercube 13.
   > Found 1 clusters in hypercube 14.

Created 22 edges and 23 nodes in 0:00:00.119720.
W

[0.0430658109471472,
 0.0690867702967714,
 0.11994831346252788,
 0.09758857269880156,
 0.1080859664445717,
 0.16389740748346315,
 0.16238086727824272,
 0.16402253353788693,
 0.23289411545642125,
 0.27092283070524437,
 0.2977037675689274,
 0.3106494792633316,
 0.36055878985824985,
 0.37374464157984083,
 0.43204056519908346,
 0.4964080660235876,
 0.5662378313865974,
 0.6289727458229478,
 0.6955275146233175,
 0.7658797243916117,
 0.843413566046764,
 0.9051344139088321,
 0.961966923337052]

# Breast Cancer

This example generates a Mapper built from the `Wisconsin Breast Cancer Dataset`_.



The reasoning behind the choice of lenses in the demonstration below is:

- **For lens1:** Lenses that make biological sense; in other words, lenses that highlight special features in the data, that I know about.
- **For lens2:** Lenses that disperse the data, as opposed to clustering many points together.

In the case of this particular data, using an anomaly score (in this case calculated using the IsolationForest from sklearn) makes biological sense since cancer cells are anomalous. For the second lens, we use the $l^2$ norm.

For an interactive exploration of lens for the breast cancer, see the `Choosing a lens notebook <../../notebooks/Cancer-demo.html>`_.

KeplerMapper also permits setting multiple datapoint color functions and node color functions in its html visualizations.
The example code below demonstrates three ways this might be done. The rendered visualizations are also viewable:

- `Visualization of the breat cancer mapper using multiple datapoint color functions <../../_static/breast-cancer-multiple-color-functions.html>`_
- `Visualization of the breat cancer mapper using multiple node color functions <../../_static/breast-cancer-multiple-node-color-functions.html>`_
- `Visualization of the breat cancer mapper using multiple datapoint and node color functions <../../_static/breast-cancer-multiple-color-functions-and-multiple-node-color-functions.html>`_


In [None]:
import sys

try:
    import pandas as pd
except ImportError as e:
    print(
        "pandas is required for this example. Please install with `pip install pandas` and then try again."
    )
    sys.exit()

import numpy as np
import kmapper as km
import sklearn
from sklearn import ensemble

# For data we use the Wisconsin Breast Cancer Dataset
# Via:
df = pd.read_csv("data/breast-cancer.csv")
feature_names = [c for c in df.columns if c not in ["id", "diagnosis"]]
df["diagnosis"] = df["diagnosis"].apply(lambda x: 1 if x == "M" else 0)
X = np.array(df[feature_names].fillna(0))  # quick and dirty imputation
y = np.array(df["diagnosis"])

# We create a custom 1-D lens with Isolation Forest
model = ensemble.IsolationForest(random_state=1729)
model.fit(X)
lens1 = model.decision_function(X).reshape((X.shape[0], 1))

# We create another 1-D lens with L2-norm
mapper = km.KeplerMapper(verbose=3)
lens2 = mapper.fit_transform(X, projection="l2norm")

# Combine both lenses to create a 2-D [Isolation Forest, L^2-Norm] lens
lens = np.c_[lens1, lens2]

# Create the simplicial complex
graph = mapper.map(
    lens,
    X,
    cover=km.Cover(n_cubes=15, perc_overlap=0.4),
    clusterer=sklearn.cluster.KMeans(n_clusters=2, random_state=1618033),
)


import matplotlib.pyplot as plt

km.draw_matplotlib(graph)
plt.show()

In [None]:
import networkx as nx
nx_graph = km.adapter.to_nx(graph)
nx.generate_graphml(nx_graph)

In [None]:
from statistics import mean
graphlens = []
for points in graph["nodes"].values():
    if points == []:
        continue
    graphlens.append(mean([lens[i][0] for i in points]))
print(graph["nodes"].keys())

graphlens