A simple example that visualizes the result of semi-supervised graph clustering.

# Open data

In [None]:
from sklearn import datasets

For this example we use scikit-learn's handwritten digits dataset (`load_digits`), a small MNIST-like collection of digit images.

In [None]:
# Load the ten-class digits dataset and unpack its features and labels.
digits = datasets.load_digits(n_class=10)
data, label = digits.data, digits.target
classes = digits.target_names
# One cluster is requested per class.
k = len(classes)

# Compute constraints

In [None]:
import numpy as np
# Seed NumPy's global random generator so that the random draws made with
# np.random.choice in the following cells are repeatable from run to run.
np.random.seed(42) # For reproducibility

We compute the constraints between all pairs of points, then keep only a training subset: 100 constraints drawn at random among the pairs formed by 100 randomly selected points (cf. remark 3 in the README).

In [None]:
# Build the full pairwise constraint matrix from the labels:
# +1 when two samples share a label, -1 when they do not.
same_class = np.equal.outer(label, label)
ground_truth = np.where(same_class, 1, -1)
# A sample carries no constraint with itself.
np.fill_diagonal(ground_truth, 0)

In [None]:
# Create a subset of training points: 100 distinct sample indices drawn at random.
train_selection = np.random.choice(np.arange(len(label)), size = 100, replace = False)
# Every remaining index is a test point. np.setdiff1d returns them sorted in
# increasing order (the same order the previous list comprehension produced)
# without re-scanning train_selection once per sample. The result is an
# integer array, usable for indexing exactly like the former list.
test_selection = np.setdiff1d(np.arange(len(label)), train_selection)

In [None]:
# Computes a subset of constraints using only training points.
train_constraint = np.zeros_like(ground_truth)
# Candidate pairs: each training point paired with every training point that
# precedes it in train_selection, so each unordered pair appears exactly once.
# The position variable is named `pos` (not `k`) so it can never be confused
# with -- or, on Python 2, overwrite -- the global number of clusters `k`.
index_train = np.array([(i, j) for pos, i in enumerate(train_selection) for j in train_selection[:pos]])
# Keep 100 distinct candidate pairs, chosen at random.
random_index = np.random.choice(np.arange(len(index_train)), size = 100, replace = False)
# Copy the ground-truth value of each selected pair into both symmetric entries.
rows, cols = index_train[random_index].T
train_constraint[rows, cols] = ground_truth[rows, cols]
train_constraint[cols, rows] = ground_truth[rows, cols]

# Compute affinity matrix

In [None]:
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics.pairwise import euclidean_distances

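For this example we compute a single RBF kernel whose bandwidth is set with the median heuristic: gamma is the inverse of the median squared Euclidean distance between points.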

In [None]:
# Squared Euclidean distance between every pair of samples; its median is used
# to set the gamma of the RBF kernel.
eucldist = euclidean_distances(data, data, squared=True)

In [None]:
# RBF kernel over the samples, with gamma set to the inverse of the median
# squared pairwise distance.
median_sq_dist = np.median(eucldist)
affinity = rbf_kernel(data, gamma=1. / median_sq_dist)

# Clustering

In [None]:
from sskkmeans import ssKmeans
from KernelConstrainedKmeans.initialization import farthestInitialization

In [None]:
# Initial per-sample cluster assignment, computed from the kernel matrix, the
# number of clusters k and the training constraints.
# NOTE(review): presumably a farthest-first seeding that takes the constraints
# into account -- confirm in KernelConstrainedKmeans.initialization.
initialization = farthestInitialization(affinity, k, train_constraint)

In [None]:
# Scale applied to the constraint matrix before clustering: number of samples
# divided by (number of clusters * number of candidate training pairs).
# `digits` is a dict-like Bunch, so len(digits) would count its fields rather
# than its samples; the sample count is len(data).
# NOTE(review): len(index_train) counts every candidate pair of training
# points, not only the 100 sampled constraints -- confirm this is the intended
# normaliser.
weight = len(data) / (k * len(index_train))

In [None]:
# Run the semi-supervised kernel k-means on the kernel matrix, starting from the
# initial assignment, with the "ratio cut" option and the training constraints
# scaled by `weight`.
# A copy of `initialization` is passed, presumably so that ssKmeans cannot
# modify the array that is evaluated separately afterwards -- TODO confirm.
assignation = ssKmeans(affinity, initialization.copy(), "ratio cut", train_constraint * weight)

# Performances

In [None]:
from sklearn.metrics import v_measure_score

To show that the algorithm improves the clustering, we compute the performance after initialization and after the kernel constrained k-means.

## After initialization

In [None]:
# V-measure of the initial assignment against the true labels, reported
# separately for the training points and for the held-out points.
init_train_score = v_measure_score(label[train_selection], initialization[train_selection])
init_test_score = v_measure_score(label[test_selection], initialization[test_selection])
print("Performance on training : {:.2f}".format(init_train_score))
print("Performance on testing : {:.2f}".format(init_test_score))

## After algo

In [None]:
# V-measure of the final assignment against the true labels, reported
# separately for the training points and for the held-out points.
final_train_score = v_measure_score(label[train_selection], assignation[train_selection])
final_test_score = v_measure_score(label[test_selection], assignation[test_selection])
print("Performance on training : {:.2f}".format(final_train_score))
print("Performance on testing : {:.2f}".format(final_test_score))

# Visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

Compute a t-SNE projection of the data and visualize the different results.

In [None]:
# Embed the samples with t-SNE using its default settings; columns 0 and 1 of
# the result are the coordinates drawn in the figures.
embedder = TSNE()
tsne = embedder.fit_transform(data)

## Ground Truth

In [None]:
# Scatter the projected samples coloured by their true label, then overlay the
# training constraints: positive entries as blue dotted segments, negative
# entries as faint red dotted segments.
plt.figure()
plt.scatter(tsne[:, 0], tsne[:, 1], c = label)
for i, const in enumerate(train_constraint):
    # np.flatnonzero yields plain integer indices. np.argwhere would yield
    # one-element index rows, making each coordinate list mix a scalar with a
    # 1-element array, which recent NumPy can reject as ragged input.
    for j in np.flatnonzero(const > 0):
        plt.plot([tsne[i, 0], tsne[j, 0]], [tsne[i, 1], tsne[j, 1]], color = "blue", ls = ":")
    for j in np.flatnonzero(const < 0):
        plt.plot([tsne[i, 0], tsne[j, 0]], [tsne[i, 1], tsne[j, 1]], color = "red", ls = ":", alpha = 0.1)
plt.show()

## After computation

In [None]:
plt.figure()
# Draw the projection, coloured by the cluster each sample was assigned to.
xs, ys = tsne[:, 0], tsne[:, 1]
plt.scatter(xs, ys, c=assignation)
plt.show()