A simple example that visualizes the results of semi-supervised graph clustering.

# Open data

In [None]:
from sklearn import datasets

For this example we use the scikit-learn handwritten digits dataset (`load_digits`), a small MNIST-like dataset.

In [None]:
# Load the ten-class digits dataset shipped with scikit-learn
digits = datasets.load_digits(n_class=10)
data, label = digits.data, digits.target
classes = digits.target_names
# One cluster is requested per class
k = len(classes)

# Compute constraints

In [None]:
from scipy.sparse import coo_matrix
def random_indices(list_points, number_indices):
    """
        Generates a list of distinct pairs of points to apply on the
        constraint matrix, without redundancy

        A pair is drawn by picking a position i uniformly, then a position j
        uniformly among the positions located after i. Pairs are therefore
        not uniformly distributed: a pair whose first position is late in
        list_points is more likely to be drawn.

        Arguments:
            list_points {List of Int / Int} -- Number of points in dataset or list of points to take into account
            number_indices {Int} -- Number of indices needed

        Returns:
            List of pairs of coordinates (list_points[i], list_points[j]) with i < j

        Raises:
            ValueError -- If number_indices is larger than the number of
                pairs that can be built from list_points
    """
    # NumPy integers (e.g. the result of np.sum or len-like computations)
    # are accepted as a point count, like plain Python ints
    if isinstance(list_points, (int, np.integer)):
        list_points = np.arange(list_points)

    length = len(list_points)

    # Only length * (length - 1) / 2 position pairs (i < j) exist: asking for
    # more would make the sampling loop below run forever
    max_pairs = length * (length - 1) // 2
    if number_indices > max_pairs:
        raise ValueError(
            "Cannot draw {} distinct pairs from {} points (at most {})".format(
                number_indices, length, max_pairs))

    # NOTE: if list_points contains repeated values, fewer distinct pairs
    # than max_pairs may exist -- callers are assumed to pass unique points
    indices = set()
    while len(indices) < number_indices:
        i = np.random.randint(length - 1)
        j = np.random.randint(i + 1, length)
        # The set silently drops a pair that was already drawn
        indices.add((list_points[i], list_points[j]))

    return list(indices)

def generate_constraint(labels, indices):
    """
        Builds the sparse symmetric matrix of pairwise constraints

        For each pair (i, j), entries (i, j) and (j, i) are set to 1 when
        both points share the same label and to -1 otherwise

        Arguments:
            labels {Array n} -- Ground truth labels
            indices {List of (i int, j int)} -- Pairs of points to constrain

        Returns:
            Sparse matrix (COO format) of shape n x n
    """
    pairs = list(indices)
    n_points = len(labels)

    # Each pair contributes two mirrored entries to keep the matrix symmetric
    rows = [point for i, j in pairs for point in (i, j)]
    cols = [point for i, j in pairs for point in (j, i)]
    vals = []
    for i, j in pairs:
        sign = 1 if (labels[i] == labels[j]) else -1
        vals += [sign, sign]

    return coo_matrix((vals, (rows, cols)), shape = (n_points, n_points))

In [None]:
import numpy as np
# Fix the seed of NumPy's global random generator so that the np.random.*
# draws made afterwards are repeatable from one run to the next
np.random.seed(42) # For reproducibility

We compute the constraints between all pairs of points, then build the training constraints by randomly sampling 100 constraints among 100 randomly selected points (cf. remark 3 in the readme).

In [None]:
# Constraints between every pair of points:
# +1 when the two labels match, -1 when they differ, 0 on the diagonal
same_label = label[:, None] == label[None, :]
ground_truth = 2 * same_label - 1
np.fill_diagonal(ground_truth, 0.)

In [None]:
# Create a subset of training points: 100 distinct indices drawn at random
train_selection = np.random.choice(np.arange(len(label)), size = 100, replace = False)
# Every other point is a testing point. Membership is checked against a set
# because `i not in <ndarray>` rescans the whole array for each candidate.
train_lookup = set(train_selection)
test_selection = [i for i in np.arange(len(label)) if i not in train_lookup]

In [None]:
# Draw 100 distinct pairs among the training points, then encode each pair
# as a +1 (same label) / -1 (different labels) entry of the constraint matrix
random_index = random_indices(list_points=train_selection, number_indices=100)
train_constraint = generate_constraint(labels=label, indices=random_index)

# Compute affinity matrix

In [None]:
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics.pairwise import euclidean_distances

For this example we compute one RBF kernel whose bandwidth is set with the median heuristic (gamma = 1 / median of the squared pairwise distances).

In [None]:
# Squared euclidean distance between every pair of points
# (when Y is omitted, scikit-learn compares data with itself)
eucldist = euclidean_distances(data, squared=True)

In [None]:
# Kernel scale derived from the median of the squared pairwise distances
median_sq_dist = np.median(eucldist)
affinity = rbf_kernel(data, gamma=1. / median_sq_dist)

In [None]:
# One RBF kernel per scaling factor of the median-based gamma.
# The median does not depend on alpha, so it is computed once instead of
# once per kernel.
base_gamma = 1. / np.median(eucldist)
affinities = [rbf_kernel(data, gamma = base_gamma * alpha) for alpha in [0.1, 0.5, 1, 5, 10]]

# Clustering

In [None]:
from sskkmeans import ssKmeans, crossValidationSskmeans
from KernelConstrainedKmeans.initialization import Initialization

In [None]:
# Initial cluster assignment computed from the training constraints and the
# affinity matrix (farthest initialization strategy of the project's helper)
initializer = Initialization(k, train_constraint)
initialization = initializer.farthest_initialization(affinity)

In [None]:
# Weight applied to the constraint matrix: number of points divided by
# (number of clusters * number of training points).
# `len(digits)` counted the keys of the dataset container rather than the
# samples, and `index_train` was never defined; the sample count and the
# training selection are used instead.
# NOTE(review): assumes the training points (train_selection) were meant --
# confirm against the readme.
weight = len(data) / (k * len(train_selection))

In [None]:
# Cluster with the weighted training constraints, starting from a copy of
# the initial assignment so that `initialization` itself is left untouched
weighted_constraint = train_constraint * weight
assignation = ssKmeans(affinity, initialization.copy(), "ratio cut", weighted_constraint)

In [None]:
# Same clustering, but the helper is given every candidate kernel
# (presumably to select among them by cross validation -- see sskkmeans)
weighted_constraint = train_constraint * weight
assignation_cv = crossValidationSskmeans(affinities, k, "ratio cut", weighted_constraint)

# Performances

In [None]:
from sklearn.metrics import v_measure_score

To show that the algorithm improves the clustering performance, we compute the performance after initialization and after the kernel constrained k-means.

## After initialization

In [None]:
# V-measure of the initial assignment, on the training points then on the
# testing points
train_score = v_measure_score(label[train_selection], initialization[train_selection])
print("Performance on training : {:.2f}".format(train_score))
test_score = v_measure_score(label[test_selection], initialization[test_selection])
print("Performance on testing : {:.2f}".format(test_score))

## After algo

In [None]:
# V-measure of the final assignment, on the training points then on the
# testing points
train_score = v_measure_score(label[train_selection], assignation[train_selection])
print("Performance on training : {:.2f}".format(train_score))
test_score = v_measure_score(label[test_selection], assignation[test_selection])
print("Performance on testing : {:.2f}".format(test_score))

In [None]:
# V-measure of the cross-validated assignment, on the training points then
# on the testing points
train_score = v_measure_score(label[train_selection], assignation_cv[train_selection])
print("Performance on training : {:.2f}".format(train_score))
test_score = v_measure_score(label[test_selection], assignation_cv[test_selection])
print("Performance on testing : {:.2f}".format(test_score))

# Visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

Compute a t-SNE projection of the data and visualize the different results.

In [None]:
# Project the data with t-SNE, using scikit-learn's default settings
projector = TSNE()
tsne = projector.fit_transform(data)

## Ground Truth

In [None]:
# Scatter plot of the projection, one color per ground truth label
first_axis, second_axis = tsne[:, 0], tsne[:, 1]
plt.figure()
plt.scatter(first_axis, second_axis, c = label)
plt.show()

## After computation

In [None]:
# Scatter plot of the projection, one color per assigned cluster
first_axis, second_axis = tsne[:, 0], tsne[:, 1]
plt.figure()
plt.scatter(first_axis, second_axis, c = assignation)
plt.show()