In [None]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from sklearn import model_selection
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing 
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import pairwise_distances, log_loss
from sklearn.manifold import MDS

from scipy.spatial.distance import directed_hausdorff
from mpl_toolkits.mplot3d import Axes3D

In [None]:
from metricmapper import *

In [None]:
def estimate_scale(X, N=100, inp="point cloud", beta=0., C=10.):
    """
    Compute estimated scale of a point cloud or a distance matrix.

    The estimate is the average, over N random subsamples of size m drawn without replacement, of the
    directed Hausdorff distance from the whole data set to the subsample, where
    m = num_points / (log(num_points) / log(C))^(1 + beta), clamped to the range [1, num_points].

    Parameters:
        X (numpy array of shape (num_points) x (num_coordinates) if point cloud and (num_points) x (num_points) 
            if distance matrix): input point cloud or distance matrix.
        N (int): subsampling iterations (default 100). 
        inp (string): either "point cloud" or "distance matrix". Type of input data (default "point cloud").
        beta (double): exponent parameter (default 0.).
        C (double): constant parameter (default 10.).

    Returns:
        delta (double): estimated scale that can be used with eg agglomerative clustering.

    Raises:
        ValueError: if inp is neither "point cloud" nor "distance matrix".
    """
    # Fail early with a clear message instead of hitting an unbound variable inside the loop.
    if inp not in ("point cloud", "distance matrix"):
        raise ValueError(
            'inp must be either "point cloud" or "distance matrix", got {!r}'.format(inp)
        )

    num_pts = X.shape[0]
    m = int(num_pts / np.exp((1+beta) * np.log(np.log(num_pts)/np.log(C))))
    # Sampling without replacement cannot draw more than num_pts indices (the formula exceeds
    # num_pts when num_pts < C), and an empty subsample has no meaningful distance.
    m = max(1, min(m, num_pts))

    delta = 0.
    for _ in range(N):
        subpop = np.random.choice(num_pts, size=m, replace=False)
        if inp == "point cloud":
            d = directed_hausdorff(X, X[subpop,:])[0]
        else:
            # Distance matrix: distance of every point to its closest subsampled point, then the worst case.
            d = np.max(np.min(X[:,subpop], axis=1), axis=0)
        delta += d/N
    return delta

In [None]:
def mapper2networkx(M, get_attrs=False):
    """
    Turn the 1-skeleton of M (computed after calling fit() method) into a networkx graph.
    This function requires networkx (https://networkx.org/documentation/stable/install.html).

    Parameters:
        M (MetricMapperComplex): simplicial complex
        get_attrs (bool): if True, the color functions will be used as attributes for the networkx graph.

    Returns:
        G (networkx graph): graph representing the 1-skeleton of the cover complex.
    """
    G = nx.Graph()
    # get_skeleton(1) is iterated as (simplex, value) pairs; only the simplex is used here.
    for (splx, _) in M.mapper_.get_skeleton(1):
        if len(splx) == 1:
            G.add_node(splx[0])
        elif len(splx) == 2:
            G.add_edge(splx[0], splx[1])
    if get_attrs:
        # The rest of this notebook reads the per-node dictionary as `node_info_`;
        # fall back to `node_info` for objects that only expose the un-suffixed name.
        node_info = M.node_info_ if hasattr(M, "node_info_") else M.node_info
        # Each node gets its colors stored under the attribute key "attr_name".
        attrs = {k: {"attr_name": node_info[k]["colors"]} for k in G.nodes()}
        nx.set_node_attributes(G, attrs)
    return G

# Annulus

## Dataset

### Underlying manifold

In [None]:
# Number of sampled points and radius of the underlying circle.
num_pts, radius = 5000, 1.

In [None]:
# Sample angles uniformly and place the corresponding points on the circle.
theta = np.random.uniform(low=0., high=2*np.pi, size=num_pts)
xs = radius * np.cos(theta)
ys = radius * np.sin(theta)

# NOTE(review): the "noise" is drawn around xs / ys (not around 0) and then added to xs / ys,
# so the resulting ring has radius about 2*radius. Confirm this is intended; downstream plot
# ranges appear to rely on it.
noise_x = np.random.normal(loc=xs, scale=.1, size=num_pts)
noise_y = np.random.normal(loc=ys, scale=.1, size=num_pts)

# Stack the two coordinates as the columns of a (num_pts, 2) array.
X = np.column_stack([xs + noise_x, ys + noise_y])

In [None]:
%matplotlib notebook
plt.scatter(X[:,0], X[:,1], s=3)
plt.show()

In [None]:
# Scale estimate of the point cloud, averaged over 100 random subsamples.
delta = estimate_scale(X, N=100)
print(delta)

### Probability distributions

#### Conditional distributions

In [None]:
# Container for the conditional distributions: the cells below store one array of samples per data point.
distributions = []

In [None]:
# Number of samples drawn for each conditional distribution.
num_samples = 10000

Gaussian distribution.

In [None]:
# One Gaussian sample set per data point, centred on the point's first coordinate.
# The list is rebuilt (not appended to) so that re-running this cell cannot grow it
# beyond num_pts entries.
distributions = [
    np.random.normal(loc=X[i,0], scale=0.5, size=num_samples) for i in range(num_pts)
]

Bimodal distribution.

In [None]:
# One bimodal sample set per data point: half of the samples around X[i,0]+2 and half around
# -X[i,0]-2. The list is rebuilt here so that it holds exactly num_pts entries even when the
# Gaussian cell was run before (appending would leave 2*num_pts entries, which no longer
# match the num_pts points of X in the plots below).
half_samples = num_samples // 2
distributions = []
for i in range(num_pts):
    # Draw only the samples that are actually kept for each mode.
    d1 = np.random.normal(loc=X[i,0]+2,  scale=0.1, size=half_samples)
    d2 = np.random.normal(loc=-X[i,0]-2, scale=0.1, size=half_samples)
    distrib = np.concatenate([d1, d2])
    # Shuffle in place so the two modes are interleaved.
    np.random.shuffle(distrib)
    distributions.append(distrib)

Visualization.

In [None]:
%matplotlib notebook
plt.figure()
plt.hist(np.array(distributions[0]), bins=300, range=[-4.3,4.3])
plt.show()

In [None]:
# Height used in the 3D plot: the empirical mean of each distribution
# (alternative kept for reference: the first sample, distributions[i][0]).
z = list(map(np.mean, distributions))

In [None]:
%matplotlib notebook
fig = plt.figure()
ax  = fig.add_subplot(111, projection="3d")
ax.scatter(X[:,0], X[:,1], z, s=1.)
ax.view_init(elev=33, azim=64)
plt.show()

#### Single observations

Gaussian distributions.

In [None]:
# A single Gaussian observation per data point, centred on the point's first coordinate.
real = [
    np.random.normal(loc=X[i, 0], scale=0.5, size=1)[0]
    for i in range(num_pts)
]

Bimodal distributions.

In [None]:
# A single observation per data point from the bimodal model: pick one of the two modes
# at random, then draw one Gaussian value around that mode's centre.
real = []
for i in range(num_pts):
    mode = np.random.choice(2, 1)
    if mode == 0:
        centre = X[i, 0] + 2
    else:
        centre = -X[i, 0] - 2
    observation = np.random.normal(loc=centre, scale=0.1, size=1)[0]
    real.append(observation)

Visualization.

In [None]:
%matplotlib notebook
fig = plt.figure()
ax  = fig.add_subplot(111, projection="3d")
ax.scatter(X[:,0], X[:,1], real, s=1.)
ax.view_init(elev=33, azim=64)
plt.show()

#### Graphs

In [None]:
# Reset the observations; the next cell appends one random graph per data point.
real = []

In [None]:
# Range of the first coordinate, used to rescale it to a probability in [0, 1].
first_coord = X[:, 0]
M, m = max(first_coord), min(first_coord)

# NOTE(review): after this cell `real` holds networkx graphs, not scalars; the cells below
# that call np.array(real).min() expect scalar observations -- confirm the intended run order.
for i in range(num_pts):
    edge_prob = (X[i, 0] - m) / (M - m)
    # Random 0/1 matrix on 5 vertices; entries are Bernoulli(edge_prob).
    A = np.random.binomial(1, edge_prob, size=(5, 5))
    # Symmetrise: keep the upper triangle (with diagonal) and mirror its strict part below.
    symmetric = np.triu(A) + np.triu(A, 1).T
    real.append(nx.Graph(symmetric))

In [None]:
# Draw one of the random graphs (index 60) as an example.
example_graph = real[60]
plt.figure()
nx.draw_networkx(example_graph, with_labels=False)
plt.show()

## Single realization Mapper

In [None]:
# Single observations as a 1-D filter: the cover spans their range with 15 intervals and gain 0.3.
real_values = np.array(real)
cover = HypercubeCover(
    cover_mode="implicit",
    bnds=np.array([[real_values.min(), real_values.max()]]),
    resolutions=np.array([15]),
    gains=np.array([.3]),
)

# Single-linkage clustering with a fixed distance threshold of 1.
single_linkage = AgglomerativeClustering(n_clusters=None, linkage="single", distance_threshold=1.)

# Nodes are coloured by the first coordinate of the points.
mapper = MetricMapperComplex(
    filters=real_values[:, np.newaxis],
    colors=X[:, 0:1],
    codomain="vectors",
    cover=cover,
    clustering=single_linkage,
).fit(X)

In [None]:
# Draw the 1-skeleton of the Mapper, each node coloured by its first color value.
G = mapper2networkx(mapper)
node_colors = [mapper.node_info_[node]["colors"][0] for node in G.nodes()]
plt.figure()
nx.draw_networkx(G, with_labels=False, node_color=node_colors)
plt.show()

## Mean-based Mapper 

In [None]:
# Infer one distribution per point from the observations in its neighborhood (parameter 3*delta),
# then summarise each distribution by its mean, stored as a column vector.
neighborhood_size = 3*delta
distributions = infer_distributions_from_neighborhood(real, X, neighborhood_size, "point cloud")
means = np.array(list(map(np.mean, distributions)))[:, np.newaxis]

In [None]:
# 1-D cover over the range of the means: 10 intervals with gain 0.3.
cover = HypercubeCover(
    cover_mode="implicit",
    bnds=np.array([[means.min(), means.max()]]),
    resolutions=np.array([10]),
    gains=np.array([.3]),
)

# Single-linkage clustering cut at the estimated scale delta.
single_linkage = AgglomerativeClustering(n_clusters=None, linkage="single", distance_threshold=delta)

mapper = MetricMapperComplex(
    filters=means,
    colors=X[:, 0:1],
    cover=cover,
    codomain="vectors",
    clustering=single_linkage,
).fit(X)

In [None]:
# Draw the 1-skeleton of the mean-based Mapper, each node coloured by its first color value.
G = mapper2networkx(mapper)
node_colors = [mapper.node_info_[node]["colors"][0] for node in G.nodes()]
plt.figure()
nx.draw_networkx(G, with_labels=False, node_color=node_colors)
plt.show()

## Histogram-based Mapper

In [None]:
# Mapper whose filter values are distributions inferred from the single observations.
# Single-linkage clustering is cut at the estimated scale delta.
single_linkage = AgglomerativeClustering(n_clusters=None, linkage="single", distance_threshold=delta)
patch_cover = kPDTMCover(num_patches=10, h=3, threshold=delta/10, tol=1e-7)

mapper = MetricMapperComplex(
    filters=real,
    codomain="distributions",
    infer_distributions=True,
    threshold=1.,
    num_bins=100,
    mode="NW",
    kernel=GaussianKernel(h=0.1),
    cover=patch_cover,
    correct_Rips=False,
    delta=delta,
    num_subdivisions=10,
    colors=np.reshape(X[:, 0], [-1, 1]),
    mask=5,
    clustering=single_linkage,
).fit(X)

In [None]:
# Draw the 1-skeleton of the histogram-based Mapper, each node coloured by its first color value.
G = mapper2networkx(mapper)
node_colors = [mapper.node_info_[node]["colors"][0] for node in G.nodes()]
plt.figure()
nx.draw_networkx(G, with_labels=False, node_color=node_colors)
plt.show()

## Metric Mapper

Distances for conditional probability distributions.

In [None]:
# Infer one distribution per point, turn each into a 100-bin histogram, and compute the
# pairwise Euclidean distances between the histograms.
neighborhood_size = 3*delta
distributions = infer_distributions_from_neighborhood(real, X, neighborhood_size, "point cloud")
histogram = Histogram(num_bins=100)
H, _ = histogram.fit_transform(distributions)
euclidean = EuclideanDistance()
dists = euclidean.compute_matrix(H)

Distances for combinatorial graphs.

In [None]:
# Symmetric matrix of graph edit distances between all pairs of observed graphs.
dists = np.zeros([num_pts, num_pts])
for row in range(num_pts):
    for col in range(row + 1, num_pts):
        # The generator yields successive estimates; the last one yielded is kept.
        # If nothing is yielded, the entry keeps its initial value 0.
        last_estimate = dists[row, col]
        for last_estimate in nx.optimize_graph_edit_distance(real[row], real[col], upper_bound=20):
            pass
        dists[row, col] = last_estimate
        dists[col, row] = dists[row, col]

In [None]:
# Mapper whose filter is given directly as a pairwise distance matrix.
# Single-linkage clustering is cut at the estimated scale delta.
single_linkage = AgglomerativeClustering(n_clusters=None, linkage="single", distance_threshold=delta)
voronoi_cover = VoronoiCover(num_patches=10, threshold=0.01)

mapper = MetricMapperComplex(
    filters=dists,
    codomain="distance matrix",
    cover=voronoi_cover,
    distance=EuclideanDistance(),
    colors=np.reshape(X[:, 0], [-1, 1]),
    mask=5,
    clustering=single_linkage,
).fit(X)

In [None]:
# Draw the 1-skeleton of the metric Mapper, each node coloured by its first color value.
G = mapper2networkx(mapper)
node_colors = [mapper.node_info_[node]["colors"][0] for node in G.nodes()]
plt.figure()
nx.draw_networkx(G, with_labels=False, node_color=node_colors)
plt.show()

# Machine Learning

## Synthetic

In [None]:
# Seed NumPy's global random number generator for the cells below.
np.random.seed(0)

Generate data.

In [None]:
# 5000 two-dimensional blob samples, split in order: 3000 train, 1000 validation, 1000 test.
X, y = make_blobs(n_samples=5000, n_features=2, random_state=42, cluster_std=5.0)

train_end, valid_end = 3000, 4000
X_train, X_valid, X_test = X[:train_end], X[train_end:valid_end], X[valid_end:]
y_train, y_valid, y_test = y[:train_end], y[train_end:valid_end], y[valid_end:]

# Train and validation parts together, for the model that is not calibrated.
X_train_valid, y_train_valid = X[:valid_end], y[:valid_end]

In [None]:
%matplotlib notebook
plt.figure()
plt.scatter(X[:,0], X[:,1], c=y, s=5, cmap="rainbow")
plt.show()

Train an uncalibrated random forest classifier on the combined training and validation data, and evaluate it on the test data.

In [None]:
# Fit on train + validation data, predict class probabilities on the test data,
# and measure them with the log loss.
clf = RandomForestClassifier(n_estimators=25)
clf = clf.fit(X_train_valid, y_train_valid)
clf_probs = clf.predict_proba(X_test)
score = log_loss(y_test, clf_probs)

Train random forest classifier, calibrate on validation data and evaluate on test data.

In [None]:
# Fit on the training split only and predict class probabilities on the test data.
# NOTE(review): the heading above mentions calibration on the validation data, but no
# calibration step is performed in this cell -- confirm whether one is missing.
clf = RandomForestClassifier(n_estimators=25)
clf = clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)

In [None]:
# Shape of the test features.
print(X_test.shape)

In [None]:
# The filter is the vector of predicted class probabilities (3 columns here), so each
# coordinate is covered on [0, 1] with 10 intervals and gain 0.3.
num_filters = 3
cover = HypercubeCover(
    cover_mode="implicit",
    bnds=np.tile([0., 1.], (num_filters, 1)),
    resolutions=np.full(num_filters, 10),
    gains=np.full(num_filters, .3),
)

# Single-linkage clustering with a fixed distance threshold of 10.
single_linkage = AgglomerativeClustering(n_clusters=None, linkage="single", distance_threshold=10)

# Options previously tried and left disabled: correct_Rips=False, delta=10, correct_mode="cover_refinement".
mapper = MetricMapperComplex(
    filters=clf_probs,
    colors=clf_probs,
    codomain="vectors",
    cover=cover,
    clustering=single_linkage,
).fit(X_test)

In [None]:
# Number of vertices reported by the fitted Mapper complex.
print(mapper.mapper_.num_vertices())

In [None]:
%matplotlib notebook
G = stm.mapper2networkx(mapper)
plt.figure()
nx.draw_networkx(G, with_labels=False, pos=nx.kamada_kawai_layout(G), 
                 node_color=[np.var(mapper.node_info_[name]["colors"]) for name in G.nodes()])

## Accelero

Data can be downloaded at https://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones

In [None]:
# Directory containing the dataset files; the cells below read "train/..." files from it.
file_path= "./uci/"

Features.

In [None]:
# Whitespace-separated feature table without a header row.
# sep=r"\s+" replaces the deprecated delim_whitespace=True and parses the file the same way.
data_path = file_path + "train/X_train.txt"
activity_features = pd.read_csv(data_path, sep=r"\s+", header=None)

Activities.

In [None]:
# Whitespace-separated label file without a header row.
# sep=r"\s+" replaces the deprecated delim_whitespace=True and parses the file the same way.
data_path = file_path + "train/y_train.txt"
activity  = pd.read_csv(data_path, sep=r"\s+", header=None)
# Keep the first column and shift the labels down by one.
activity  = activity.values[:,0] - 1
activity_names = ['WALKING','WALKING_UPSTAIRS','WALKING_DOWNSTAIRS','SITTING','STANDING','LAYING']

Subjects.

In [None]:
# Whitespace-separated subject file without a header row.
# sep=r"\s+" replaces the deprecated delim_whitespace=True and parses the file the same way.
data_path = file_path + "train/subject_train.txt"
sujet = pd.read_csv(data_path, sep=r"\s+", header=None)
# Keep the first column as a flat array.
sujet = sujet.values[:,0]

In [None]:
# Standardize the feature table with sklearn's preprocessing.scale.
features_nor = preprocessing.scale(activity_features)

Use naive Bayes for prediction.

In [None]:
# Fit a Gaussian naive Bayes model on all the data and predict on the same data.
gnb = GaussianNB().fit(features_nor, activity)
pred = gnb.predict(features_nor)

# 10-fold cross-validation scores of a fresh (unfitted) model on the same data.
scores = model_selection.cross_val_score(GaussianNB(), features_nor, activity, cv=10)
print(scores)

Estimation of a posteriori probabilities.

In [None]:
# Class probabilities predicted by the fitted naive Bayes model on the features it was fitted on.
posterior = gnb.predict_proba(features_nor)

In [None]:
# Scale estimate of the standardized features, averaged over 100 random subsamples.
delta = estimate_scale(features_nor, 100)

In [None]:
# The filter is the vector of posterior class probabilities (6 columns here), so each
# coordinate is covered on [0, 1] with 6 intervals and gain 0.4.
num_filters = 6
cover = HypercubeCover(
    cover_mode="implicit",
    bnds=np.tile([0., 1.], (num_filters, 1)),
    resolutions=np.full(num_filters, 6),
    gains=np.full(num_filters, .4),
)

# Single-linkage clustering cut at the estimated scale delta.
single_linkage = AgglomerativeClustering(n_clusters=None, linkage="single", distance_threshold=delta)

mapper = MetricMapperComplex(
    filters=posterior,
    colors=posterior,
    codomain="vectors",
    cover=cover,
    clustering=single_linkage,
).fit(features_nor)

In [None]:
%matplotlib notebook
G = stm.mapper2networkx(mapper)
plt.figure()
nx.draw_networkx(G, with_labels=False,
                 node_color=[np.var(mapper.node_info_[name]["colors"]) for name in G.nodes()],
                 vmin=0., vmax=0.1)