In [None]:
import numpy as np
import matplotlib.pyplot as plt
import umap
import os
from PCH import HDBSCAN
from PCH.utils import constraints_from_estimate, augment_labels
from sklearn.metrics import adjusted_rand_score
from matplotlib import collections as mc

In [None]:
def plot_constraints(visual_embedding, selected_labels, constraints, s=.1):
    """Scatter a 2-D embedding and overlay pairwise constraints as dashed lines.

    Parameters
    ----------
    visual_embedding : array
        Points to draw; columns 0 and 1 are used as x and y.
    selected_labels : array-like
        Per-point values passed to ``scatter`` as colours (``tab20`` colormap).
    constraints : dict
        May contain ``"ML"`` and/or ``"CL"`` entries. Each entry is used to
        index rows of ``visual_embedding`` and the result is drawn as line
        segments, so it is assumed to be a sequence of ``(i, j)`` row-index
        pairs (presumably must-link / cannot-link -- confirm against PCH).
        A missing or empty entry is skipped instead of raising.
    s : float, default 0.1
        Marker size forwarded to ``scatter``.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(
        visual_embedding[:, 0],
        visual_embedding[:, 1],
        c=selected_labels,
        s=s,
        cmap="tab20",
    )
    # (key, colour, alpha): "ML" lines are drawn prominently, "CL" lines faintly.
    line_styles = (("ML", "purple", .85), ("CL", "black", .25))
    for key, color, alpha in line_styles:
        pairs = constraints.get(key)
        if pairs is None or len(pairs) == 0:
            continue
        ax.add_collection(
            mc.LineCollection(
                # An explicit index array makes the (n, 2) fancy-indexing
                # unambiguous regardless of how the pairs are stored.
                visual_embedding[np.asarray(pairs)],
                linewidths=2,
                color=color,
                linestyle="dashed",
                alpha=alpha,
            )
        )
    # plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
    plt.show()

In [None]:
def merge_constraints(current_constraints, new_constraints):
    """Return a new dict combining two constraint dicts key by key.

    For every key, the result holds the items of ``current_constraints``
    followed by the items of ``new_constraints``. Keys present in only one
    input are carried over unchanged.

    Neither argument is modified: the result owns fresh lists, so re-running
    a cell that merges into an existing constraint set cannot silently grow
    one of the inputs.

    Parameters
    ----------
    current_constraints : dict
        Mapping from constraint kind (e.g. ``"ML"``, ``"CL"``) to an iterable
        of constraints.
    new_constraints : dict
        Mapping of the same form whose items are appended.

    Returns
    -------
    dict
        Newly allocated mapping from each key to a list of constraints.
    """
    merged = {key: list(pairs) for key, pairs in current_constraints.items()}
    for key, pairs in new_constraints.items():
        merged.setdefault(key, []).extend(pairs)
    return merged

In [None]:
# Run selection: these three parts form the directory the arrays are read from.
MODEL = "vae"
PARENT_DIR = "embeddings"
RUN_NAME = "overfit_5"
# Previously used location: "logs/vae/overfit_2/"
SAVE_DIR = os.path.join(PARENT_DIR, MODEL, RUN_NAME) + "/"

# Load the embeddings and their labels from the run directory.
data, labels = (
    np.load(SAVE_DIR + file_name)
    for file_name in ("embeddings.npy", "labels.npy")
)

In [None]:
# Sanity check: show the dimensions of the loaded embedding array.
print(data.shape)

In [None]:
def get_visual(data):
    """Return the UMAP projection of ``data``, cached as ``visual.npy``.

    The projection is stored in ``SAVE_DIR`` (module-level setting) and
    reused on later calls. A cached array whose row count differs from
    ``data`` is treated as stale -- it would otherwise be plotted against
    labels it does not line up with -- and is recomputed and overwritten.

    Parameters
    ----------
    data : array-like
        Samples to project, one row per sample.

    Returns
    -------
    numpy.ndarray
        Projection with one row per row of ``data``.
    """
    cache_path = SAVE_DIR + "visual.npy"
    if os.path.exists(cache_path):
        cached = np.load(cache_path)
        if len(cached) == len(data):
            return cached
    visual = umap.UMAP(min_dist=0).fit_transform(data)
    np.save(cache_path, visual)
    return visual

In [None]:
# Projection of `data` used for all plots and clustering below
# (computed by UMAP or loaded from the on-disk cache by get_visual).
embedding = get_visual(data)

In [None]:
# Projection coloured by the labels loaded from labels.npy (large markers).
fig, ax = plt.subplots(1, 1, figsize=(14, 10))
# The title follows MODEL so it cannot disagree with the run that was loaded.
ax.set_title(f'UMAP projection of the {MODEL.upper()} embeddings', fontsize=18)
ax.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='tab20', s=1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()

In [None]:
# Same projection with small markers so dense regions remain readable.
fig, ax = plt.subplots(1, 1, figsize=(14, 10))
# The title follows MODEL so it cannot disagree with the run that was loaded.
ax.set_title(f'UMAP projection of the {MODEL.upper()} embeddings', fontsize=18)
ax.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()

In [None]:
# Baseline: HDBSCAN on the projection without any constraints.
hdb = HDBSCAN(min_cluster_size=500)
estimated_visual_labels = hdb.fit_predict(embedding)

fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('HDBSCAN Labels using visual embedding', fontsize=18)
ax.scatter(embedding[:, 0], embedding[:, 1], c=estimated_visual_labels, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()

In [None]:
# Adjusted Rand index of the unconstrained clustering against the loaded labels.
print(adjusted_rand_score(labels, estimated_visual_labels))

In [None]:
# Post-process the baseline labels with PCH's augment_labels helper.
# NOTE(review): presumably this assigns labels to points HDBSCAN left
# unclustered -- confirm against PCH.utils.
augmented_labels = augment_labels(embedding, estimated_visual_labels)

In [None]:
# First round of constraints, derived from the augmented baseline labels and
# the loaded ground truth.
constraints = constraints_from_estimate(
    embedding,
    labels=augmented_labels,
    ground_truth=labels,
    n_samples=100,
    n_subsample=10000,
)
# Overlay the constraints once on the ground truth, once on the estimate.
for point_colours in (labels, augmented_labels):
    plot_constraints(embedding, point_colours, constraints)

In [None]:
# Round 1: re-cluster with the first batch of constraints. Results of each
# constrained round are collected in `constrained_labels`.
constrained_labels = []
hdb = HDBSCAN(min_cluster_size=500, constraint_mode="t-synthetic")
hdb.fit(embedding, constraints=constraints)
constrained_labels.append(hdb.labels_)

fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('HDBSCAN Labels using visual embedding + constraints', fontsize=18)
ax.scatter(embedding[:, 0], embedding[:, 1], c=constrained_labels[0], cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()
print(adjusted_rand_score(labels, constrained_labels[0]))

In [None]:
# Augment the round-1 constrained labels (rebinds `augmented_labels`).
augmented_labels = augment_labels(embedding, constrained_labels[0])

In [None]:
# Augmented round-1 labels and their agreement with the ground truth.
fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('Augmented HDBSCAN Labels using visual embedding + constraints', fontsize=18)
ax.scatter(embedding[:, 0], embedding[:, 1], c=augmented_labels, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()
print(adjusted_rand_score(labels, augmented_labels))

In [None]:
# Persist the augmented labels in the run directory.
constrained_labels_path = SAVE_DIR + "constrained_labels.npy"
np.save(constrained_labels_path, augmented_labels)
print(f"Saving to {SAVE_DIR}constrained_labels.npy")

In [None]:
# Second round: sample constraints from the augmented round-1 labels, then
# combine them with the constraints gathered so far (new ones first).
fresh_constraints = constraints_from_estimate(
    embedding,
    labels=augmented_labels,
    ground_truth=labels,
    n_samples=100,
    n_subsample=10000,
)
constraints = merge_constraints(fresh_constraints, constraints)
# Overlay the combined constraints on the ground truth and on the estimate.
for point_colours in (labels, augmented_labels):
    plot_constraints(embedding, point_colours, constraints)

In [None]:
# Round 2: re-cluster with both batches of constraints.
hdb = HDBSCAN(min_cluster_size=500, constraint_mode="t-synthetic")
hdb.fit(embedding, constraints=constraints)
# Slice assignment instead of append keeps this cell idempotent: re-running it
# replaces the round-2 entry, so constrained_labels[1] is always the latest fit.
constrained_labels[1:] = [hdb.labels_]

fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('HDBSCAN Labels using visual embedding + 2x constraints', fontsize=18)
ax.scatter(embedding[:, 0], embedding[:, 1], c=constrained_labels[1], cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()
print(adjusted_rand_score(labels, constrained_labels[1]))

In [None]:
# Third round: a larger batch of constraints sampled from the round-2 labels,
# combined with everything gathered so far (new ones first).
constraints = merge_constraints(
    constraints_from_estimate(
        embedding,
        labels=constrained_labels[1],
        ground_truth=labels,
        n_samples=1000,
        n_subsample=20000,
    ),
    constraints,
)
plot_constraints(embedding, labels, constraints)
# Overlay on the labels the constraints were derived from (round 2), matching
# the earlier rounds; the round-1 labels would show a stale clustering.
plot_constraints(embedding, constrained_labels[1], constraints)

In [None]:
# Round 3: re-cluster with all three batches of constraints. The result is
# kept in `clabel` rather than appended to `constrained_labels`.
hdb = HDBSCAN(min_cluster_size=500, constraint_mode="t-synthetic")
hdb.fit(embedding, constraints=constraints)
clabel = hdb.labels_

fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('HDBSCAN Labels using visual embedding + 3x constraints', fontsize=18)
ax.scatter(embedding[:, 0], embedding[:, 1], c=clabel, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()
print(adjusted_rand_score(labels, clabel))

In [None]:
# Re-renders the round-3 labels and score without re-fitting the model.
fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('HDBSCAN Labels using visual embedding + 3x constraints', fontsize=18)
ax.scatter(embedding[:, 0], embedding[:, 1], c=clabel, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()
print(adjusted_rand_score(labels, clabel))

In [None]:
# Augment the round-3 labels (rebinds `augmented_labels`) and score the result
# against the ground truth.
augmented_labels = augment_labels(embedding, clabel)
print(adjusted_rand_score(labels, augmented_labels))

In [None]:
# Augmented round-3 labels. The title says "Augmented", as in the round-1
# plot, so this figure is not mistaken for the raw round-3 clustering.
fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('Augmented HDBSCAN Labels using visual embedding + 3x constraints', fontsize=18)
ax.scatter(embedding[:, 0], embedding[:, 1], c=augmented_labels, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()

In [None]:
# Persist the augmented round-3 labels in the run directory (same file name as
# the earlier save, so that file is overwritten).
constrained_labels_path = SAVE_DIR + "constrained_labels.npy"
np.save(constrained_labels_path, augmented_labels)
print(f"Saving to {SAVE_DIR}constrained_labels.npy")

In [None]:
from torchvision.datasets import FashionMNIST
from torchvision import transforms

# Control experiment: the FashionMNIST training split, downloaded into
# "FMNIST" when it is not already there.
fmnist_train = FashionMNIST("FMNIST", train=True, download=True)

# Flatten each 28x28 image to a 784-vector and divide the pixel values by 255.
control_data = fmnist_train.data.view(-1, 28 * 28).float().numpy() / 255
control_labels = fmnist_train.targets.numpy()

In [None]:
# 2-D UMAP projection of the flattened control images.
# NOTE(review): no random_state is passed and the result is not cached, so the
# layout may differ between runs -- confirm whether that matters here.
visual = umap.UMAP(n_components=2, min_dist=0).fit_transform(control_data)

In [None]:
# Control projection coloured by the dataset's own class labels. The title
# names what is actually drawn: FashionMNIST pixels with ground-truth labels.
fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('UMAP projection of FashionMNIST with ground-truth labels', fontsize=18)
# control_labels is the NumPy copy of fmnist_train.targets made at load time.
ax.scatter(visual[:, 0], visual[:, 1], c=control_labels, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()

In [None]:
# Baseline for the control data: HDBSCAN on the 2-D projection, no constraints.
hdb = HDBSCAN(min_cluster_size=500, min_samples=5)
estimated_labels = hdb.fit_predict(visual)

In [None]:
# Control projection coloured by the unconstrained HDBSCAN labels.
fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('UMAP projection of FashionMNIST with estimated labels', fontsize=18)
ax.scatter(visual[:, 0], visual[:, 1], c=estimated_labels, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()

In [None]:
# Adjusted Rand index of the unconstrained control clustering against the
# dataset's class labels.
print(adjusted_rand_score(fmnist_train.targets, estimated_labels))

In [None]:
# Constrained re-clustering of the control projection. This rebinds
# `constraints`, which the first part of the notebook also uses.
# NOTE(review): constraints are sampled from `control_data` (flattened pixels)
# while clustering runs on `visual`; the first part of the notebook passes the
# clustered array itself -- confirm this is intended.
# NOTE(review): the positional 100 is presumably n_samples, and the mode here
# is "synthetic" whereas earlier cells use "t-synthetic" -- confirm both.
hdb = HDBSCAN(min_cluster_size=500, min_samples=5, constraint_mode="synthetic")
constraints = constraints_from_estimate(control_data, estimated_labels, fmnist_train.targets, 100)
hdb.fit(visual, constraints=constraints)
second_estimated_labels = hdb.labels_

In [None]:
# Control projection coloured by the labels from the first constrained fit.
fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('UMAP projection of FashionMNIST with estimated labels (1x constraints)', fontsize=18)
ax.scatter(visual[:, 0], visual[:, 1], c=second_estimated_labels, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()

In [None]:
# Adjusted Rand index after the first constrained fit on the control data.
print(adjusted_rand_score(fmnist_train.targets, second_estimated_labels))

In [None]:
# Draw a second batch of constraints from the latest estimate and fold it into
# the running set with the notebook's shared helper instead of per-key extends
# (existing constraints first, new ones after, as before).
new_constraints = constraints_from_estimate(control_data, second_estimated_labels, fmnist_train.targets, 100)
constraints = merge_constraints(constraints, new_constraints)

In [None]:
# Re-cluster the control projection with both batches of constraints.
hdb = HDBSCAN(min_cluster_size=500, min_samples=5, constraint_mode="synthetic")
hdb.fit(visual, constraints=constraints)
third_estimated_labels = hdb.labels_

In [None]:
# Control projection coloured by the labels from the second constrained fit.
fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('UMAP projection of FashionMNIST with estimated labels (2x constraints)', fontsize=18)
ax.scatter(visual[:, 0], visual[:, 1], c=third_estimated_labels, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()

In [None]:
# Adjusted Rand index after the second constrained fit on the control data.
print(adjusted_rand_score(fmnist_train.targets, third_estimated_labels))

In [None]:
# 16-D UMAP embedding of the control data. Uses the same prepared NumPy array
# (flattened, float, divided by 255) as the 2-D projection, rather than the raw
# uint8 torch tensor, so both control embeddings start from identical input.
control_embedding = umap.UMAP(n_components=16, min_dist=0).fit_transform(control_data)

In [None]:
# Subsample the control embedding to as many rows as `data` has, using a fixed
# seed so the same rows are chosen on every fresh run.
# NOTE: `control_embedding` is overwritten, so re-running this cell alone draws
# from the already-subsampled array and `sample_idxs` no longer refers to rows
# of the full dataset.
# NOTE(review): `replace=False` requires data.shape[0] <= len(control_embedding).
import numpy as np
rng = np.random.RandomState(42)
sample_idxs = rng.choice(len(control_embedding), data.shape[0], replace=False)
control_embedding = control_embedding[sample_idxs]


In [None]:
# Cluster the subsampled 16-D control embedding.
hdb = HDBSCAN(min_cluster_size=50)
estimated_labels_control = hdb.fit_predict(control_embedding)

# `control_embedding` holds the rows picked by `sample_idxs`; take the same
# rows of the 2-D control projection so points and labels line up one-to-one.
control_visual = visual[sample_idxs]

fig, ax = plt.subplots(1, 1, figsize=(14, 10))
ax.set_title('UMAP projection of FashionMNIST with control labels', fontsize=18)
ax.scatter(control_visual[:, 0], control_visual[:, 1], c=estimated_labels_control, cmap='tab20', s=.1)
# plt.show() renders on notebook (non-GUI) backends, where fig.show() only warns.
plt.show()

In [None]:
# Score the clustering of the subsampled 16-D control embedding. The ground
# truth is restricted to the same sampled rows so both label arrays align.
print(adjusted_rand_score(control_labels[sample_idxs], estimated_labels_control))