<a href="https://colab.research.google.com/github/LarsHadidi/PRONTO/blob/mathprogram/mp/PDP-GEO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Progressive Dinner Party: Geometric Program

# Method: Solving clustered subsets

Clustering is the task of grouping a set of objects in such a way that objects in the same group (called a cluster) are more similar to each other than to those in other groups.

<img alt="clusters" src="https://raw.githubusercontent.com/benedekrozemberczki/awesome-community-detection/master/coms.png" width="25%"/>

## Imports

In [1]:
!pip install scikit-learn -q -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import bokeh.palettes
from scipy import spatial
import bokeh.plotting as bkh
from sklearn import datasets, cluster

# Configure Bokeh so that figures passed to `bkh.show()` are rendered inline in the notebook.
bkh.output_notebook()

## Data

In [3]:
# Fixed seed so that "Restart & Run All" regenerates exactly the same point cloud.
SEED = 42

# Sample 3000 two-dimensional points from three Gaussian blobs (std 0.5) ...
data, _ = datasets.make_blobs(n_samples=3000, centers=3, cluster_std=0.5, random_state=SEED)
# ... and push them through a fixed linear map so the blobs are sheared/stretched.
data = np.dot(data, [[-0.7, 0.5], [-0.2, -0.7]])
X = data[:, 0]
Y = data[:, 1]

# Dense, symmetric (n_samples x n_samples) matrix of pairwise Euclidean distances.
C = spatial.distance.squareform(spatial.distance.pdist(data, metric='euclidean'))

bkh.output_notebook()
p = bkh.figure(width=800, height=800, title='Generated sample points')
p.scatter(X, Y, fill_color='blue', alpha=0.2, size=10)
bkh.show(p)

## Clustering

In [4]:
# Complete-linkage agglomerative clustering on the precomputed distance matrix C.
# n_clusters=None leaves the number of clusters open: merging stops once the
# linkage distance between two clusters reaches distance_threshold.
clusterer = cluster.AgglomerativeClustering(
    metric='precomputed', linkage='complete', distance_threshold=5, n_clusters=None
)
labels = clusterer.fit_predict(C)

# Mean position of the members of every cluster (a label of -1 is excluded).
centroids = {label: np.mean(data[labels == label, :], axis=0) for label in set(labels) if label != -1}

# The 'Paired' palettes only exist for a limited range of sizes (3 to 12 colours).
# Clamp the requested size to that range and cycle through the colours below, so
# that a clustering with more than 12 labels no longer fails with a KeyError.
paired = bokeh.palettes.all_palettes['Paired']
cmap = paired[int(np.clip(len(set(labels)), min(paired), max(paired)))]

source = bkh.ColumnDataSource(data=dict(
    x=X,
    y=Y,
    color=[cmap[i % len(cmap)] for i in labels],
    label=labels
))

p = bkh.figure(width=800, height=800, title='Agglomerative clusters and their centroids')
# x/y are taken from the 'x' and 'y' columns of `source`; one legend entry per label value.
p.scatter(source=source, fill_color='color', size=10, legend_group='label')
# Stack the centroids once into an (n_clusters, 2) array and mark them in red.
centroid_xy = np.vstack(list(centroids.values()))
p.scatter(centroid_xy[:, 0], centroid_xy[:, 1], fill_color='red', marker='triangle_pin', size=25)
bkh.show(p)

In [7]:
# Trim every cluster to a size divisible by GROUP_SIZE by relabelling the surplus
# members as dropouts (label -1). NOTE: this rewrites `labels` in place.
GROUP_SIZE = 3
DROPOUT = -1
rng = np.random.default_rng(42)  # seeded so the same points are dropped on every run

for current_cluster in np.unique(labels):
  if current_cluster == DROPOUT:
    continue  # points that are already dropouts (e.g. when this cell is re-run)
  members = np.flatnonzero(labels == current_cluster)
  r = members.size % GROUP_SIZE
  # replace=False is essential: sampling with replacement (the default) can draw
  # the same index twice, which would remove fewer than r members and leave the
  # cluster size not divisible by GROUP_SIZE.
  dropout_idx = rng.choice(members, size=r, replace=False)
  labels[dropout_idx] = DROPOUT

# Report the cluster sizes in ascending label order; '*' marks a divisible size.
for c in np.unique(labels[labels != DROPOUT]):
  n = np.count_nonzero(labels == c)
  print(f'{"*" if n % GROUP_SIZE == 0 else " "}Cluster {c}: {n}')

# The dropout summary is printed last, and only when there are dropouts.
n = np.count_nonzero(labels == DROPOUT)
if n:
  print(f'\n {n} dropouts')

*Cluster 0: 999
*Cluster 1: 999
*Cluster 2: 999

 3 dropouts


In [8]:
# Re-plot the points coloured by their current label.
# The 'Paired' palettes only exist for a limited range of sizes (3 to 12 colours).
# Clamp the requested size to that range and cycle through the colours, so that
# more than 12 distinct labels no longer fail with a KeyError.
paired = bokeh.palettes.all_palettes['Paired']
cmap = paired[int(np.clip(len(set(labels)), min(paired), max(paired)))]

source = bkh.ColumnDataSource(data=dict(
    x=X,
    y=Y,
    # A label of -1 wraps around to the last palette entry (-1 % len(cmap)).
    color=[cmap[i % len(cmap)] for i in labels],
    label=labels
))

p = bkh.figure(width=800, height=800, title='Clusters after marking dropouts')
# x/y are taken from the 'x' and 'y' columns of `source`; one legend entry per label value.
p.scatter(source=source, fill_color='color', size=10, legend_group='label')
bkh.show(p)