In [3]:
from sklearn.cluster import KMeans
from sklearn import preprocessing
import numpy as np
from scipy.spatial import distance
from rac.correlation_clustering import max_correlation_dynamic_K
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, v_measure_score

sim_init = 1

def sim_matrix_from_clustering(clustering, N):
    pairwise_similarities = -sim_init*np.ones((N, N))
    for cind in clustering:
        pairwise_similarities[np.ix_(cind, cind)] = sim_init
    return pairwise_similarities


def clustering_from_clustering_solution(clustering_solution):
    num_clusters = np.max(clustering_solution) + 1
    clustering = [[] for _ in range(num_clusters)]
    for i in range(len(clustering_solution)):
        clustering[clustering_solution[i]].append(i)
    return clustering, num_clusters

def test_dataset(X, Y, rs, distance_metric="euclidean", normalize=False, n_clusters=None):
    if n_clusters is None:
        n_classes = len(np.unique(Y))
    else:
        n_classes = n_clusters
    if normalize:
        X = preprocessing.StandardScaler().fit_transform(X)
    kmeans = KMeans(n_clusters=n_classes, random_state=1).fit(X)
    print("kmeans score: ", adjusted_rand_score(Y, kmeans.labels_))
    clustering_solution = np.array(kmeans.labels_)
    clustering, cnum_clusters = clustering_from_clustering_solution(clustering_solution)
    pairwise_similarities_kmeans = sim_matrix_from_clustering(clustering, len(Y))
    np.fill_diagonal(pairwise_similarities_kmeans, 0.0)

    #D = distance.cdist(X, X, distance_metric)
    #sim_matrix = np.max(D) - D + np.min(D)
    #pairwise_similarities_inverse = sim_init * (2 * sim_matrix - np.max(sim_matrix) -  np.min(sim_matrix)) / (np.max(sim_matrix) - np.min(sim_matrix))
    ##pairwise_similarities_inverse = sim_init * (2 * sim_matrix - np.max(sim_matrix) -  np.min(sim_matrix)) / (np.max(sim_matrix) - np.min(sim_matrix))
    
    #np.fill_diagonal(pairwise_similarities_inverse, 0.0)

    labels_kmeans, _ = max_correlation_dynamic_K(pairwise_similarities_kmeans, n_classes, 3, rs)
    #labels_inverse, _ = max_correlation_dynamic_K(pairwise_similarities_inverse, 5, 3, rr)
    print("kmeans score2: ", adjusted_rand_score(Y, labels_kmeans))
    #print("inverse score: ", adjusted_rand_score(Y, labels_inverse))

In [4]:
def random_data_sample(X, Y, size, rs):
    if size <= 1:
        num_samples = int(len(Y)*size)
    else:
        num_samples = np.minimum(size, len(Y))
    inds = rs.choice(len(Y), num_samples)
    return X[inds], Y[inds]

## 20newsgroups

In [None]:
from sklearn import datasets

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
cats = ["rec.sport.baseball", "soc.religion.christian", "rec.autos", "talk.politics.mideast", "misc.forsale"]
data = datasets.fetch_20newsgroups(data_home="../datasets/", subset="all", categories=cats)
Y = data.target
X = data.data

In [None]:
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
normalize = True
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)
data = None
for sen in X:
    sentence = Sentence(sen)
    document_embeddings.embed(sentence)
    dat = sentence.get_embedding()
    if data is None:
        data = dat.cpu().numpy().reshape(1, 768)
    else:
        data = np.vstack((data, dat.cpu().numpy().reshape(1, 768)))
X = data
#np.save("datasets/20newsgroups_small.npy", X)
#X = np.load("20newsgroups.npy")
#X = TfidfVectorizer().fit_transform(X)

In [40]:
from sklearn import datasets
data = datasets.fetch_20newsgroups(data_home="../datasets/20newsgroups_data/", subset="all")
Y = data.target
X = np.load("../datasets/20newsgroups_data/20newsgroups.npy")
X.shape

(18846, 768)

In [5]:
X = preprocessing.StandardScaler().fit_transform(X)

In [34]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
X = pca.fit_transform(X)

In [41]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [42]:
X_sample.shape

(1000, 768)

In [9]:
X_sample = np.load("../datasets/20newsgroups_data/X.npy")
Y_sample = np.load("../datasets/20newsgroups_data/Y.npy")

In [43]:
X_sample.shape

(1000, 768)

In [45]:
rs = np.random.RandomState(19)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=False, n_clusters=20)

kmeans score:  0.2132062041869545
kmeans score2:  0.2132062041869545


In [46]:
#np.save("../datasets/20newsgroups_data/X.npy", X_sample)
#np.save("../datasets/20newsgroups_data/Y.npy", Y_sample)

## 20newsgroups_small

In [15]:
from sklearn import datasets
data = datasets.fetch_20newsgroups(data_home="../datasets/20newsgroups_data/", subset="all")
Y = data.target
X = np.load("../datasets/20newsgroups_data/20newsgroups.npy")
X.shape

(18846, 768)

In [16]:
Y.shape

(18846,)

In [6]:
X = preprocessing.StandardScaler().fit_transform(X)

In [11]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 0.025, rs)

In [13]:
X_sample.shape

(471, 768)

In [14]:
np.unique(Y_sample, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 array([25, 28, 19, 18, 24, 23, 21, 28, 24, 23, 18, 29, 21, 25, 35, 18, 27,
        30, 23, 12], dtype=int64))

In [15]:
rs = np.random.RandomState(22)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=20)

kmeans score:  0.1826667801365706
kmeans score2:  0.1826667801365706


In [13]:
#np.save("../datasets/20newsgroups_small_data/X.npy", X_sample)
#np.save("../datasets/20newsgroups_small_data/Y.npy", Y_sample)

In [17]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/20newsgroups_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/20newsgroups_data/Y.npy", Y_sample)

## CIFAR10

In [6]:
import torch
from torchvision import transforms
from torchvision import datasets

cifar_training_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
        ])
cifar10_train = datasets.CIFAR10(
                "../datasets/cifar10_original_data",
                train=True,
                download=True,
                transform=cifar_training_transform,
                target_transform=torch.tensor
            )

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../datasets/cifar10_original_data\cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ../datasets/cifar10_original_data\cifar-10-python.tar.gz to ../datasets/cifar10_original_data


In [19]:
X = cifar10_train.data
Y = np.array(cifar10_train.targets)

In [21]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 2000, rs)

In [23]:
print(X_sample.shape)
print(Y_sample.shape)

(2000, 32, 32, 3)
(2000,)


In [None]:
np.save("../datasets/cifar10_original_data/X.npy", X_sample)
np.save("../datasets/cifar10_original_data/Y.npy", Y_sample)

In [47]:
X = np.load("../datasets/cifar10_data/cifar10_embedding.npy")
Y = np.load("../datasets/cifar10_data/cifar10_labels.npy")

In [12]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

In [13]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
X = pca.fit_transform(X)

In [48]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [49]:
X_sample.shape

(1000, 512)

In [24]:

#X_sample = np.load("../datasets/cifar10_data/X.npy")
#Y_sample = np.load("../datasets/cifar10_data/Y.npy")

In [50]:
X_sample.shape

(1000, 512)

In [62]:
rs = np.random.RandomState(19)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=False, n_clusters=3)

kmeans score:  0.24435933918413816
kmeans score2:  0.24435933918413816


In [61]:
#np.save("../datasets/cifar10_data/X.npy", X_sample)
#np.save("../datasets/cifar10_data/Y.npy", Y_sample)

In [33]:
#np.save("C:\Github_Projects/robust-active-clustering/datasets_small/cifar10_data/X.npy", X_sample)
#np.save("C:\Github_Projects/robust-active-clustering/datasets_small/cifar10_data/Y.npy", Y_sample)

## CIFAR10_small

In [40]:
X = np.load("../datasets/cifar10_data/cifar10_embedding.npy")
Y = np.load("../datasets/cifar10_data/cifar10_labels.npy")

In [41]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

In [42]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
X = pca.fit_transform(X)

In [43]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 0.06, rs)

In [44]:
X_sample.shape

(3000, 100)

In [45]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([278, 284, 310, 272, 302, 340, 289, 312, 308, 305], dtype=int64))

In [26]:
rs = np.random.RandomState(19)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=20)

kmeans score:  0.437681481133735
kmeans score2:  0.437681481133735


In [27]:
#np.save("../datasets/cifar10_small_data/X.npy", X_sample)
#np.save("../datasets/cifar10_small_data/Y.npy", Y_sample)

## Mushrooms

In [63]:
import pandas as pd
df = pd.read_csv("../datasets/mushrooms_data/mushrooms.csv")

In [64]:
df = df.astype('category')
df.dtypes

class                       category
cap-shape                   category
cap-surface                 category
cap-color                   category
bruises                     category
odor                        category
gill-attachment             category
gill-spacing                category
gill-size                   category
gill-color                  category
stalk-shape                 category
stalk-root                  category
stalk-surface-above-ring    category
stalk-surface-below-ring    category
stalk-color-above-ring      category
stalk-color-below-ring      category
veil-type                   category
veil-color                  category
ring-number                 category
ring-type                   category
spore-print-color           category
population                  category
habitat                     category
dtype: object

In [65]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for column in df.columns:
    df[column] = labelencoder.fit_transform(df[column])

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [66]:
Y = df["class"].values
X = df.drop(["class"], axis=1).values

In [67]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [68]:
X_sample.shape

(1000, 22)

In [52]:
np.unique(Y_sample, return_counts=True)

(array([0, 1]), array([503, 497], dtype=int64))

In [70]:
rs = np.random.RandomState(19)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=False, n_clusters=2)

kmeans score:  0.18257417983623456
kmeans score2:  0.18257417983623456


In [22]:
#np.save("../datasets/mushrooms_data/X.npy", X_sample)
#np.save("../datasets/mushrooms_data/Y.npy", Y_sample)

In [53]:
Xtest = np.load("../datasets/mushrooms_data/X.npy")
ytest = np.load("../datasets/mushrooms_data/Y.npy")
print(Xtest.shape)
print(ytest.shape)

(1000, 22)
(1000,)


In [55]:
rs = np.random.RandomState(19)
test_dataset(Xtest, ytest, rs, "euclidean", normalize=True, n_clusters=2)

kmeans score:  0.19833580805474468
kmeans score2:  0.19833580805474468


In [46]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/mushrooms_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/mushrooms_data/Y.npy", Y_sample)

In [41]:
import numpy as np
np.load("../datasets/mushrooms_data/X.npy").shape

(4874, 22)

## Breast Cancer data

In [92]:
df = pd.read_csv("../datasets/breast_cancer_data/data.csv")
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


In [93]:
df.drop(columns=['Unnamed: 32','id'],inplace=True)
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,1,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,1,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,1,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [94]:
X = df.drop(['diagnosis'], axis=1)
Y = df['diagnosis']

In [95]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

In [98]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1, rs)

In [99]:
X_sample.shape

(569, 30)

In [100]:
np.unique(Y_sample, return_counts=True)

(array([0, 1], dtype=int64), array([356, 213], dtype=int64))

In [103]:
rs = np.random.RandomState(19)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=False, n_clusters=5)

kmeans score:  0.35977281312435305
kmeans score2:  0.35977281312435305


In [31]:
np.save("../datasets/breast_cancer_data/X.npy", X_sample)
np.save("../datasets/breast_cancer_data/Y.npy", Y_sample)

In [65]:
Xtest = np.load("../datasets/breast_cancer_data/X.npy")
ytest = np.load("../datasets/breast_cancer_data/Y.npy")

In [76]:
rs = np.random.RandomState(19)
test_dataset(Xtest, ytest, rs, "euclidean", normalize=True, n_clusters=10)

kmeans score:  0.2234799017682122
kmeans score2:  0.2234799017682122


In [118]:
Xtest.shape

(1, 30)

In [119]:
ytest.shape

(1,)

In [59]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/breast_cancer_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/breast_cancer_data/Y.npy", Y_sample)

## Cardiotocography

In [104]:
import pandas as pd
data = pd.read_excel('../datasets/cardiotocography_data/CTG.xls', sheet_name = 1, skiprows = 1)

In [105]:
data.drop(data.iloc[:, :10], inplace = True, axis = 1) 
data.drop(data.iloc[:, 22:33], inplace = True, axis = 1)
data = data.drop(['Unnamed: 31', 'Unnamed: 44'], axis = 1)
data = data.dropna()
data = data.drop_duplicates()
data

Unnamed: 0,LB,AC.1,FM.1,UC.1,DL.1,DS.1,DP.1,ASTV,MSTV,ALTV,...,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,CLASS,NSP
0,120.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,73.0,0.5,43.0,...,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,9.0,2.0
1,132.0,0.006380,0.000000,0.006380,0.003190,0.0,0.0,17.0,2.1,0.0,...,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,6.0,1.0
2,133.0,0.003322,0.000000,0.008306,0.003322,0.0,0.0,16.0,2.1,0.0,...,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,6.0,1.0
3,134.0,0.002561,0.000000,0.007682,0.002561,0.0,0.0,16.0,2.4,0.0,...,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,6.0,1.0
4,132.0,0.006515,0.000000,0.008143,0.000000,0.0,0.0,16.0,2.4,0.0,...,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000000,0.000000,0.007426,0.000000,0.0,0.0,79.0,0.2,25.0,...,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,5.0,2.0
2122,140.0,0.000775,0.000000,0.006971,0.000000,0.0,0.0,78.0,0.4,22.0,...,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,5.0,2.0
2123,140.0,0.000980,0.000000,0.006863,0.000000,0.0,0.0,79.0,0.4,20.0,...,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,5.0,2.0
2124,140.0,0.000679,0.000000,0.006110,0.000000,0.0,0.0,78.0,0.4,27.0,...,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,5.0,2.0


In [106]:
X = data.drop(['CLASS'], axis=1)
Y = data['CLASS'].to_numpy().astype(int) - 1

In [107]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

In [108]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [109]:
X_sample.shape

(1000, 22)

In [110]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([180, 275,  27,  35,  31, 148, 114,  62,  28, 100], dtype=int64))

In [112]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=False, n_clusters=5)

kmeans score:  0.16652625565510235
kmeans score2:  0.16652625565510235


In [39]:
np.save("../datasets/cardiotocography_data/X.npy", X_sample)
np.save("../datasets/cardiotocography_data/Y.npy", Y_sample)

In [71]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/cardiotocography_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/cardiotocography_data/Y.npy", Y_sample)

In [84]:
Xtest = np.load("../datasets/cardiotocography_data/X.npy")
ytest = np.load("../datasets/cardiotocography_data/Y.npy")
print(Xtest.shape)
print(Xtest.shape)

(1000, 22)
(1000, 22)


In [87]:
rs = np.random.RandomState(19)
test_dataset(Xtest, ytest, rs, "euclidean", normalize=True, n_clusters=10)

kmeans score:  0.22526070454276287
kmeans score2:  0.22526070454276287


In [104]:
np.unique(Y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([393, 591,  57,  78,  69, 312, 228, 123,  62, 202], dtype=int64))

In [25]:
type(Y[0])


numpy.int32

## Ecoli

In [113]:
df=pd.read_csv("../datasets/ecoli_data/ecoli.csv")
df

Unnamed: 0,SEQUENCE_NAME,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,AAT_ECOLI,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,ACEA_ECOLI,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,ACEK_ECOLI,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,ACKA_ECOLI,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,ADI_ECOLI,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...,...
331,TREA_ECOLI,0.74,0.56,0.48,0.5,0.47,0.68,0.30,pp
332,UGPB_ECOLI,0.71,0.57,0.48,0.5,0.48,0.35,0.32,pp
333,USHA_ECOLI,0.61,0.60,0.48,0.5,0.44,0.39,0.38,pp
334,XYLF_ECOLI,0.59,0.61,0.48,0.5,0.42,0.42,0.37,pp


In [114]:
df=df.drop(["SEQUENCE_NAME"],axis=1)

In [115]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
df["SITE"] = labelencoder.fit_transform(df["SITE"])

In [116]:
X = df.drop(["SITE"],axis=1)
Y = df["SITE"]

In [117]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)
Y = Y.to_numpy()

In [118]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1, rs)

In [119]:
X_sample.shape

(336, 7)

In [47]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([137,  76,   1,   2,  37,  26,   5,  52], dtype=int64))

In [63]:
np.unique(Y_sample)

array([0, 1, 2, 3, 4, 5, 6, 7])

In [88]:
X_sample = np.load("../datasets/ecoli_data/X.npy")
Y_sample = np.load("../datasets/ecoli_data/Y.npy")

In [121]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=False, n_clusters=10)

kmeans score:  0.35993322338951433
kmeans score2:  0.35993322338951433


In [48]:
np.save("../datasets/ecoli_data/X.npy", X_sample)
np.save("../datasets/ecoli_data/Y.npy", Y_sample)

In [82]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/ecoli_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/ecoli_data/Y.npy", Y_sample)

## Forest Type Mapping

In [152]:
df_train = pd.read_csv("../datasets/ForestTypeMapping_data/training.csv")
df_test = pd.read_csv("../datasets/ForestTypeMapping_data/testing.csv")
#df = pd.concat([df_train, df_test], axis=1)
#df
df = pd.concat([df_train, df_test])

In [153]:
df

Unnamed: 0,class,b1,b2,b3,b4,b5,b6,b7,b8,b9,...,pred_minus_obs_H_b9,pred_minus_obs_S_b1,pred_minus_obs_S_b2,pred_minus_obs_S_b3,pred_minus_obs_S_b4,pred_minus_obs_S_b5,pred_minus_obs_S_b6,pred_minus_obs_S_b7,pred_minus_obs_S_b8,pred_minus_obs_S_b9
0,d,39,36,57,91,59,101,93,27,60,...,-2.36,-18.41,-1.88,-6.43,-21.03,-1.60,-6.18,-22.50,-5.20,-7.86
1,h,84,30,57,112,51,98,92,26,62,...,-2.26,-16.27,-1.95,-6.25,-18.79,-1.99,-6.18,-23.41,-8.87,-10.83
2,s,53,25,49,99,51,93,84,26,58,...,-1.46,-15.92,-1.79,-4.64,-17.73,-0.48,-4.69,-19.97,-4.10,-7.07
3,s,59,26,49,103,47,92,82,25,56,...,2.68,-13.77,-2.53,-6.34,-22.03,-2.34,-6.60,-27.10,-7.99,-10.81
4,d,57,49,66,103,64,106,114,28,59,...,-2.94,-21.74,-1.64,-4.62,-23.74,-0.85,-5.50,-22.83,-2.74,-5.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,o,55,56,72,91,79,113,86,35,68,...,-12.86,-23.08,-0.08,-3.46,-27.52,-1.04,-4.73,-22.85,-1.49,-4.10
321,d,69,49,76,91,52,91,92,25,57,...,-2.36,-11.47,-0.40,-3.74,-16.90,-0.78,-4.15,-11.13,-1.48,-3.55
322,s,49,26,48,107,59,104,62,22,53,...,2.32,-23.48,1.44,-1.59,-26.98,-1.36,-4.81,-24.50,-2.53,-4.97
323,s,55,26,52,92,55,98,65,23,56,...,-0.77,-23.74,1.27,-1.30,-25.53,-1.21,-4.70,-24.39,-2.21,-4.72


In [154]:
df.isnull().values.any()

False

In [155]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
df["class"] = labelencoder.fit_transform(df["class"])

In [156]:
X = df.drop(["class"],axis=1)
Y = df["class"]

In [157]:
from sklearn import preprocessing
X = preprocessing.MinMaxScaler().fit_transform(X)
Y = Y.to_numpy()

In [148]:
X = np.array(X)
Y = np.array(Y)

In [158]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1, rs)

In [159]:
X_sample.shape

(523, 27)

In [130]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3]), array([168,  84,  86, 185], dtype=int64))

In [58]:
np.unique(Y_sample)

array([0, 1, 2, 3])

In [96]:
X_sample = np.load("../datasets/ForestTypeMapping_data/X.npy")
Y_sample = np.load("../datasets/ForestTypeMapping_data/Y.npy")

In [97]:
X_sample.shape

(523, 27)

In [161]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=False, n_clusters=3)

kmeans score:  0.3342099395207977
kmeans score2:  0.3342099395207977


In [59]:
np.save("../datasets/ForestTypeMapping_data/X.npy", X_sample)
np.save("../datasets/ForestTypeMapping_data/Y.npy", Y_sample)

In [96]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/ForestTypeMapping_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/ForestTypeMapping_data/Y.npy", Y_sample)

## User knowledge data

In [163]:
import pandas as pd
import numpy as np
#df=pd.read_csv("../datasets/user_knowledge_data/user_knowledge.csv")
df1 = pd.read_excel('../datasets/user_knowledge_data/user_knowledge_data.xls', sheet_name = 1)
df2 = pd.read_excel('../datasets/user_knowledge_data/user_knowledge_data.xls', sheet_name = 2)
df = pd.concat([df1, df2])
df

Unnamed: 0,STG,SCG,STR,LPR,PEG,UNS,Unnamed: 6,Unnamed: 7,Attribute Information:
0,0.00,0.00,0.00,0.00,0.00,very_low,,,STG (The degree of study time for goal object ...
1,0.08,0.08,0.10,0.24,0.90,High,,,SCG (The degree of repetition number of user f...
2,0.06,0.06,0.05,0.25,0.33,Low,,,STR (The degree of study time of user for rela...
3,0.10,0.10,0.15,0.65,0.30,Middle,,,LPR (The exam performance of user for related ...
4,0.08,0.08,0.08,0.98,0.24,Low,,,PEG (The exam performance of user for goal obj...
...,...,...,...,...,...,...,...,...,...
140,0.90,0.78,0.62,0.32,0.89,High,,,
141,0.85,0.82,0.66,0.83,0.83,High,,,
142,0.56,0.60,0.77,0.13,0.32,Low,,,
143,0.66,0.68,0.81,0.57,0.57,Middle,,,


In [164]:
df = df.drop(['Unnamed: 6', 'Unnamed: 7', "Attribute Information:"], axis = 1)
df

Unnamed: 0,STG,SCG,STR,LPR,PEG,UNS
0,0.00,0.00,0.00,0.00,0.00,very_low
1,0.08,0.08,0.10,0.24,0.90,High
2,0.06,0.06,0.05,0.25,0.33,Low
3,0.10,0.10,0.15,0.65,0.30,Middle
4,0.08,0.08,0.08,0.98,0.24,Low
...,...,...,...,...,...,...
140,0.90,0.78,0.62,0.32,0.89,High
141,0.85,0.82,0.66,0.83,0.83,High
142,0.56,0.60,0.77,0.13,0.32,Low
143,0.66,0.68,0.81,0.57,0.57,Middle


In [165]:
df.columns

Index(['STG', 'SCG', 'STR', 'LPR', 'PEG', ' UNS'], dtype='object')

In [166]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
df[" UNS"] = labelencoder.fit_transform(df[" UNS"])
df

Unnamed: 0,STG,SCG,STR,LPR,PEG,UNS
0,0.00,0.00,0.00,0.00,0.00,4
1,0.08,0.08,0.10,0.24,0.90,0
2,0.06,0.06,0.05,0.25,0.33,1
3,0.10,0.10,0.15,0.65,0.30,2
4,0.08,0.08,0.08,0.98,0.24,1
...,...,...,...,...,...,...
140,0.90,0.78,0.62,0.32,0.89,0
141,0.85,0.82,0.66,0.83,0.83,0
142,0.56,0.60,0.77,0.13,0.32,1
143,0.66,0.68,0.81,0.57,0.57,2


In [167]:
X = df.drop([" UNS"],axis=1)
Y = df[" UNS"]

In [168]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)
Y = Y.to_numpy()

In [169]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1, rs)

In [170]:
X_sample.shape

(403, 5)

In [117]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4]), array([111, 129, 116,  28,  19], dtype=int64))

In [119]:
X_sample = np.load("../datasets/user_knowledge_data/X.npy")
Y_sample = np.load("../datasets/user_knowledge_data/Y.npy")

In [120]:
X_sample.shape

(403, 5)

In [175]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=False, n_clusters=5)

kmeans score:  0.10627365742250956
kmeans score2:  0.10627365742250956


In [68]:
np.save("../datasets/user_knowledge_data/X.npy", X_sample)
np.save("../datasets/user_knowledge_data/Y.npy", Y_sample)

In [107]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/user_knowledge_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/user_knowledge_data/Y.npy", Y_sample)

In [130]:
file1 = open('../experiment_results/real_world_benchmark/completed_experiments_new.txt', 'r')
Lines = file1.readlines()

new_lines = []
 
count = 0
# Strips the newline character
for line in Lines:
    count += 1
    #if "cardiotocography" not in line:
        #new_lines.append(line)

    if "custom" not in line:
        new_lines.append(line)




In [131]:
#new_lines = []
file1 = open('../experiment_results/real_world_benchmark/completed_experiments_new.txt', 'w')
file1.writelines(new_lines)
file1.close()

## MNIST

In [99]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import numpy as np

# Define the CNN model
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x_out = self.fc2(x)
        return x_out, x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transform = transforms.Compose([transforms.Resize((28, 28)),
                                    transforms.ToTensor(),
                                    transforms.Normalize((0.5,), (0.5,))])

# Load MNIST data
train_data = datasets.MNIST(root="../datasets/mnist_data", train=True, transform=transform, download=True)
test_data = datasets.MNIST(root="../datasets/mnist_data", train=False, transform=transform, download=True)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Train the CNN
model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 10

model.train()
for epoch in range(epochs):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs, _ = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Extract embeddings and labels of test data
model.eval()
embeddings_list = []
labels_list = []
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        _, embeddings = model(images)
        embeddings_list.append(embeddings.cpu().numpy())
        labels_list.append(labels.cpu().numpy())

# Save embeddings and labels to X.npy and Y.npy
X = np.vstack(embeddings_list)
Y = np.concatenate(labels_list)
np.save("../datasets/mnist_data/X_full.npy", X)
np.save("../datasets/mnist_data/Y_full.npy", Y)

In [176]:
X = np.load("../datasets/mnist_data/X_full.npy")
Y = np.load("../datasets/mnist_data/Y_full.npy")

In [177]:
from sklearn import preprocessing
X = preprocessing.MinMaxScaler().fit_transform(X)
#Y = Y.to_numpy()

In [178]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
X = pca.fit_transform(X)

In [179]:
rs = np.random.RandomState(39)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [180]:
X_sample.shape

(1000, 20)

In [74]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64),
 array([105, 109, 111, 112, 104,  86,  99,  88,  88,  98], dtype=int64))

In [182]:
X_sample = np.load("../datasets/mnist_data/X.npy")
Y_sample = np.load("../datasets/mnist_data/Y.npy")

In [183]:
X_sample.shape

(1000, 20)

In [188]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=False, n_clusters=3)

kmeans score:  0.2399923464484798
kmeans score2:  0.2399923464484798


In [75]:
np.save("../datasets/mnist_data/X.npy", X_sample)
np.save("../datasets/mnist_data/Y.npy", Y_sample)

In [119]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/mnist_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/mnist_data/Y.npy", Y_sample)

## Yeast

In [189]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [190]:
data_set=pd.read_csv("../datasets/yeast_data/yeast.csv")
data_set

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,name
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT
...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,NUC
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,NUC


In [191]:
from sklearn.preprocessing import LabelEncoder
name_encoder=LabelEncoder()
data_set["name"]=name_encoder.fit_transform(data_set["name"].values)

In [192]:
X =data_set.iloc[:,:-1].values
Y =data_set.iloc[:,-1].values

In [193]:
from sklearn import preprocessing
X = preprocessing.MinMaxScaler().fit_transform(X)

In [194]:
rs = np.random.RandomState(39)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [195]:
X_sample.shape

(1000, 8)

In [83]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([319,   4,  31,  17,  28, 131, 169, 271,  12,  18], dtype=int64))

In [199]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=False, n_clusters=10)

kmeans score:  0.14825968169179507
kmeans score2:  0.14825968169179507


In [85]:
np.save("../datasets/yeast_data/X.npy", X_sample)
np.save("../datasets/yeast_data/Y.npy", Y_sample)

In [197]:
X_sample = np.load("../datasets/yeast_data/X.npy")
Y_sample = np.load("../datasets/yeast_data/Y.npy") 

In [130]:
X_sample.shape

(1000, 8)

In [131]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=10)

kmeans score:  0.1926289892995529
kmeans score2:  0.1926289892995529


In [135]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/yeast_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/yeast_data/Y.npy", Y_sample)

In [2]:
import numpy as np
def random_row_generator(number):
    return set(np.random.choice(np.arange(1, 36), size=number, replace=False))

In [7]:
l1 = random_row_generator(10)
l2 = random_row_generator(4)
print(l1, l2)

{5, 9, 11, 17, 18, 20, 21, 23, 27, 29} {24, 27, 21, 14}


In [8]:
len(l1.intersection(l2))

2

In [1]:
from sklearn import datasets, metrics
from active_semi_clustering.semi_supervised.pairwise_constraints import PCKMeans, MPCKMeans, COPKMeans
from active_semi_clustering.active.pairwise_constraints import ExampleOracle, ExploreConsolidate, MinMax, NPU
import numpy as np

In [2]:
def clustering_from_clustering_solution(clustering_solution):
    num_clusters = np.max(clustering_solution) + 1
    clustering = [[] for _ in range(num_clusters)]
    for i in range(len(clustering_solution)):
        clustering[clustering_solution[i]].append(i)
    return clustering, num_clusters

def sim_matrix_from_clustering(clustering, N):
    pairwise_similarities = -np.ones((N, N))
    for cind in clustering:
        pairwise_similarities[np.ix_(cind, cind)] = 1
    return pairwise_similarities

In [3]:
def get_constraints(prop_pos, prop_neg, sim_matrix, noise_level=0.0):
    N = sim_matrix.shape[0]
    lower_triangle_indices = np.tril_indices(N, -1)
    num_flips = int(noise_level*len(sim_matrix[lower_triangle_indices]))

    noise_inds = np.random.choice(len(sim_matrix[lower_triangle_indices]), num_flips)
    ind1_noise, ind2_noise = lower_triangle_indices[0][noise_inds], lower_triangle_indices[1][noise_inds]
    sim_matrix[ind1_noise, ind2_noise] *= -1

    ind_pos = np.where(sim_matrix[lower_triangle_indices] == 1)[0]
    ind_neg = np.where(sim_matrix[lower_triangle_indices] == -1)[0]
    num_pos = int(len(ind_pos)*prop_pos)
    num_neg = int(len(ind_neg)*prop_neg)
    print("num_pos: ", num_pos)
    print("num_neg: ", num_neg)
    ind_pos = np.random.choice(ind_pos, num_pos)
    ind_neg = np.random.choice(ind_neg, num_neg)
    
    ind1_pos, ind2_pos = lower_triangle_indices[0][ind_pos], lower_triangle_indices[1][ind_pos]
    ind1_neg, ind2_neg = lower_triangle_indices[0][ind_neg], lower_triangle_indices[1][ind_neg]
    ml = [(i1, i2) for i1, i2 in zip(ind1_pos, ind2_pos)]
    cl = [(i1, i2) for i1, i2 in zip(ind1_neg, ind2_neg)]
    return ml, cl 

In [4]:
X = np.load("../datasets/cifar10_data/X.npy")
y = np.load("../datasets/cifar10_data/Y.npy")
#X, y = datasets.load_iris(return_X_y=True)

In [5]:
X.shape

(1000, 512)

In [6]:
clustering_solution = clustering_from_clustering_solution(y)
sim_matrix = sim_matrix_from_clustering(clustering_solution[0], len(y))

In [7]:
ml, cl = get_constraints(0.002, 0.002, sim_matrix, noise_level=0.4)

num_pos:  363
num_neg:  635


In [8]:
ml

[(927, 244),
 (846, 174),
 (996, 866),
 (816, 624),
 (516, 264),
 (65, 28),
 (668, 438),
 (159, 149),
 (550, 524),
 (784, 532),
 (792, 735),
 (799, 792),
 (883, 287),
 (803, 725),
 (74, 59),
 (698, 207),
 (604, 373),
 (349, 234),
 (609, 463),
 (284, 245),
 (991, 141),
 (799, 246),
 (758, 504),
 (887, 608),
 (395, 285),
 (971, 18),
 (649, 178),
 (543, 340),
 (913, 734),
 (970, 933),
 (822, 342),
 (370, 109),
 (370, 315),
 (397, 0),
 (883, 25),
 (765, 35),
 (801, 152),
 (485, 139),
 (764, 300),
 (824, 481),
 (986, 797),
 (796, 776),
 (740, 390),
 (696, 520),
 (980, 529),
 (810, 521),
 (520, 103),
 (481, 55),
 (953, 391),
 (944, 257),
 (295, 31),
 (889, 321),
 (447, 212),
 (554, 298),
 (860, 212),
 (765, 317),
 (249, 165),
 (429, 13),
 (667, 417),
 (387, 14),
 (950, 415),
 (344, 56),
 (376, 129),
 (675, 368),
 (873, 13),
 (233, 12),
 (527, 86),
 (988, 884),
 (368, 16),
 (909, 783),
 (826, 554),
 (662, 423),
 (154, 21),
 (745, 197),
 (406, 57),
 (258, 50),
 (678, 232),
 (601, 318),
 (680, 

In [9]:
from active_semi_clustering.semi_supervised.pairwise_constraints.constraints import preprocess_constraints

In [10]:
ml_graph, cl_graph, neighborhoods = preprocess_constraints(ml, cl, 1000)

WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS
WAS INCONS

In [11]:
neighborhoods

[[397, 42, 751, 0],
 [472, 835, 878, 527, 86, 5],
 [349, 234, 233, 12, 384, 816, 624, 519, 7],
 [559, 9],
 [655, 11],
 [512, 188, 840, 652, 626, 873, 429, 13],
 [894, 387, 189, 14],
 [978, 675, 368, 16],
 [141, 991, 758, 504, 971, 18],
 [646, 154, 563, 834, 21],
 [357, 22],
 [568, 225, 156, 965, 263, 285, 395, 705, 703, 892, 868, 24],
 [932, 287, 883, 254, 25],
 [861, 27],
 [65, 28],
 [738,
  496,
  426,
  494,
  218,
  938,
  219,
  187,
  284,
  245,
  89,
  169,
  809,
  606,
  295,
  31],
 [680, 317, 765, 35],
 [967, 699, 38],
 [116, 139, 485, 869, 820, 766, 115, 724, 47, 39],
 [57, 406, 979, 687, 91, 249, 615, 165, 374, 396, 597, 303, 136, 958, 470, 40],
 [298, 826, 554, 41],
 [265, 824, 829, 297, 184, 283, 403, 677, 78, 451, 389, 55, 481, 43],
 [362, 46],
 [574, 48],
 [258, 50],
 [149, 159, 859, 51],
 [191, 344, 56],
 [937, 58],
 [74, 924, 59],
 [157, 60],
 [712, 194, 62],
 [113, 802, 67],
 [102, 69],
 [856, 385, 77, 71],
 [449, 80],
 [734, 913, 528, 209, 890, 271, 101, 731, 772,

In [332]:
cl

[(755, 663),
 (974, 176),
 (903, 187),
 (998, 755),
 (446, 355),
 (375, 365),
 (528, 498),
 (604, 273),
 (893, 692),
 (906, 203),
 (203, 169),
 (793, 647),
 (716, 618),
 (941, 752),
 (662, 212),
 (582, 6),
 (460, 326),
 (967, 166),
 (275, 48),
 (540, 438),
 (288, 42),
 (728, 582),
 (860, 785),
 (366, 167),
 (659, 401),
 (632, 47),
 (945, 191),
 (774, 327),
 (904, 489),
 (597, 328),
 (185, 63),
 (891, 186),
 (503, 468),
 (733, 664),
 (508, 214),
 (959, 520),
 (921, 577),
 (970, 518),
 (850, 180),
 (958, 195),
 (514, 416),
 (762, 568),
 (812, 762),
 (744, 319),
 (154, 148),
 (854, 226),
 (515, 220),
 (601, 25),
 (878, 629),
 (755, 292),
 (887, 289),
 (852, 796),
 (361, 215),
 (779, 132),
 (576, 99),
 (958, 413),
 (910, 129),
 (853, 464),
 (941, 92),
 (705, 330),
 (553, 156),
 (857, 614),
 (746, 72),
 (921, 505),
 (717, 525),
 (511, 437),
 (835, 224),
 (809, 278),
 (322, 176),
 (929, 690),
 (645, 294),
 (285, 19),
 (769, 588),
 (929, 51),
 (520, 508),
 (557, 423),
 (551, 407),
 (767, 287)

In [333]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
X = StandardScaler().fit_transform(X)
#X = MinMaxScaler().fit_transform(X)

In [334]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X = pca.fit_transform(X)

In [12]:
clusterer = MPCKMeans(n_clusters=10, max_iter=10, w=1)
clusterer.fit(X, y, ml, cl)

FloatingPointError: overflow encountered in det

In [340]:
len(np.unique(y))

10

In [338]:
metrics.adjusted_rand_score(y, clusterer.labels_)

0.9757615832890811

In [247]:
clusterer2 = COPKMeans(n_clusters=5)
clusterer2.fit(X=X, y=y, ml=ml, cl=cl)

<active_semi_clustering.semi_supervised.pairwise_constraints.copkmeans.COPKMeans at 0x24cf482d310>

In [248]:
metrics.adjusted_rand_score(y, clusterer2.labels_)

0.6166288563335512

In [271]:
clusterer3 = PCKMeans(n_clusters=4)
clusterer3.fit(X=X, y=y, ml=ml, cl=cl)

<active_semi_clustering.semi_supervised.pairwise_constraints.pckmeans.PCKMeans at 0x24ce89c51f0>

In [272]:
metrics.adjusted_rand_score(y, clusterer3.labels_)

0.7054615619182371

In [335]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10, random_state=0).fit(X)
metrics.adjusted_rand_score(y, kmeans.labels_)

0.9540048625671941