In [1]:
from sklearn.cluster import KMeans
from sklearn import preprocessing
import numpy as np
from scipy.spatial import distance
from rac.correlation_clustering import max_correlation_dynamic_K
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, v_measure_score

sim_init = 1

def sim_matrix_from_clustering(clustering, N):
    pairwise_similarities = -sim_init*np.ones((N, N))
    for cind in clustering:
        pairwise_similarities[np.ix_(cind, cind)] = sim_init
    return pairwise_similarities


def clustering_from_clustering_solution(clustering_solution):
    num_clusters = np.max(clustering_solution) + 1
    clustering = [[] for _ in range(num_clusters)]
    for i in range(len(clustering_solution)):
        clustering[clustering_solution[i]].append(i)
    return clustering, num_clusters

def test_dataset(X, Y, rs, distance_metric="euclidean", normalize=False, n_clusters=None):
    if n_clusters is None:
        n_classes = len(np.unique(Y))
    else:
        n_classes = n_clusters
    if normalize:
        X = preprocessing.StandardScaler().fit_transform(X)
    kmeans = KMeans(n_clusters=n_classes, random_state=1).fit(X)
    print("kmeans score: ", adjusted_rand_score(Y, kmeans.labels_))
    clustering_solution = np.array(kmeans.labels_)
    clustering, cnum_clusters = clustering_from_clustering_solution(clustering_solution)
    pairwise_similarities_kmeans = sim_matrix_from_clustering(clustering, len(Y))
    np.fill_diagonal(pairwise_similarities_kmeans, 0.0)

    #D = distance.cdist(X, X, distance_metric)
    #sim_matrix = np.max(D) - D + np.min(D)
    #pairwise_similarities_inverse = sim_init * (2 * sim_matrix - np.max(sim_matrix) -  np.min(sim_matrix)) / (np.max(sim_matrix) - np.min(sim_matrix))
    ##pairwise_similarities_inverse = sim_init * (2 * sim_matrix - np.max(sim_matrix) -  np.min(sim_matrix)) / (np.max(sim_matrix) - np.min(sim_matrix))
    
    #np.fill_diagonal(pairwise_similarities_inverse, 0.0)

    labels_kmeans, _ = max_correlation_dynamic_K(pairwise_similarities_kmeans, n_classes, 3, rs)
    #labels_inverse, _ = max_correlation_dynamic_K(pairwise_similarities_inverse, 5, 3, rr)
    print("kmeans score2: ", adjusted_rand_score(Y, labels_kmeans))
    #print("inverse score: ", adjusted_rand_score(Y, labels_inverse))

In [2]:
def random_data_sample(X, Y, size, rs):
    if size <= 1:
        num_samples = int(len(Y)*size)
    else:
        num_samples = np.minimum(size, len(Y))
    inds = rs.choice(len(Y), num_samples)
    return X[inds], Y[inds]

## 20newsgroups

In [None]:
from sklearn import datasets

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
cats = ["rec.sport.baseball", "soc.religion.christian", "rec.autos", "talk.politics.mideast", "misc.forsale"]
data = datasets.fetch_20newsgroups(data_home="../datasets/", subset="all", categories=cats)
Y = data.target
X = data.data

In [None]:
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
normalize = True
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)
data = None
for sen in X:
    sentence = Sentence(sen)
    document_embeddings.embed(sentence)
    dat = sentence.get_embedding()
    if data is None:
        data = dat.cpu().numpy().reshape(1, 768)
    else:
        data = np.vstack((data, dat.cpu().numpy().reshape(1, 768)))
X = data
#np.save("datasets/20newsgroups_small.npy", X)
#X = np.load("20newsgroups.npy")
#X = TfidfVectorizer().fit_transform(X)

In [9]:
from sklearn import datasets
data = datasets.fetch_20newsgroups(data_home="../datasets/20newsgroups_data/", subset="all")
Y = data.target
X = np.load("../datasets/20newsgroups_data/20newsgroups.npy")
X.shape

(18846, 768)

In [10]:
X = preprocessing.StandardScaler().fit_transform(X)

In [5]:
#from sklearn.decomposition import PCA
#pca = PCA(n_components=100)
#X = pca.fit_transform(X)

In [12]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 2000, rs)

In [13]:
X_sample.shape

(2000, 768)

In [14]:
#np.save("../datasets/20newsgroups_data/X.npy", X_sample)
#np.save("../datasets/20newsgroups_data/Y.npy", Y_sample)

In [9]:
rs = np.random.RandomState(22)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=10)

kmeans score:  0.21049628881712648
kmeans score2:  0.21049628881712648


## 20newsgroups_small

In [15]:
from sklearn import datasets
data = datasets.fetch_20newsgroups(data_home="../datasets/20newsgroups_data/", subset="all")
Y = data.target
X = np.load("../datasets/20newsgroups_data/20newsgroups.npy")
X.shape

(18846, 768)

In [16]:
Y.shape

(18846,)

In [6]:
X = preprocessing.StandardScaler().fit_transform(X)

In [11]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 0.025, rs)

In [13]:
X_sample.shape

(471, 768)

In [14]:
np.unique(Y_sample, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 array([25, 28, 19, 18, 24, 23, 21, 28, 24, 23, 18, 29, 21, 25, 35, 18, 27,
        30, 23, 12], dtype=int64))

In [15]:
rs = np.random.RandomState(22)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=20)

kmeans score:  0.1826667801365706
kmeans score2:  0.1826667801365706


In [13]:
#np.save("../datasets/20newsgroups_small_data/X.npy", X_sample)
#np.save("../datasets/20newsgroups_small_data/Y.npy", Y_sample)

In [17]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/20newsgroups_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/20newsgroups_data/Y.npy", Y_sample)

## CIFAR10

In [6]:
import torch
from torchvision import transforms
from torchvision import datasets

cifar_training_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
        ])
cifar10_train = datasets.CIFAR10(
                "../datasets/cifar10_original_data",
                train=True,
                download=True,
                transform=cifar_training_transform,
                target_transform=torch.tensor
            )

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../datasets/cifar10_original_data\cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ../datasets/cifar10_original_data\cifar-10-python.tar.gz to ../datasets/cifar10_original_data


In [19]:
X = cifar10_train.data
Y = np.array(cifar10_train.targets)

In [21]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 2000, rs)

In [23]:
print(X_sample.shape)
print(Y_sample.shape)

(2000, 32, 32, 3)
(2000,)


In [None]:
np.save("../datasets/cifar10_original_data/X.npy", X_sample)
np.save("../datasets/cifar10_original_data/Y.npy", Y_sample)

In [14]:
cifar10_train.targets

[6,
 9,
 9,
 4,
 1,
 1,
 2,
 7,
 8,
 3,
 4,
 7,
 7,
 2,
 9,
 9,
 9,
 3,
 2,
 6,
 4,
 3,
 6,
 6,
 2,
 6,
 3,
 5,
 4,
 0,
 0,
 9,
 1,
 3,
 4,
 0,
 3,
 7,
 3,
 3,
 5,
 2,
 2,
 7,
 1,
 1,
 1,
 2,
 2,
 0,
 9,
 5,
 7,
 9,
 2,
 2,
 5,
 2,
 4,
 3,
 1,
 1,
 8,
 2,
 1,
 1,
 4,
 9,
 7,
 8,
 5,
 9,
 6,
 7,
 3,
 1,
 9,
 0,
 3,
 1,
 3,
 5,
 4,
 5,
 7,
 7,
 4,
 7,
 9,
 4,
 2,
 3,
 8,
 0,
 1,
 6,
 1,
 1,
 4,
 1,
 8,
 3,
 9,
 6,
 6,
 1,
 8,
 5,
 2,
 9,
 9,
 8,
 1,
 7,
 7,
 0,
 0,
 6,
 9,
 1,
 2,
 2,
 9,
 2,
 6,
 6,
 1,
 9,
 5,
 0,
 4,
 7,
 6,
 7,
 1,
 8,
 1,
 1,
 2,
 8,
 1,
 3,
 3,
 6,
 2,
 4,
 9,
 9,
 5,
 4,
 3,
 6,
 7,
 4,
 6,
 8,
 5,
 5,
 4,
 3,
 1,
 8,
 4,
 7,
 6,
 0,
 9,
 5,
 1,
 3,
 8,
 2,
 7,
 5,
 3,
 4,
 1,
 5,
 7,
 0,
 4,
 7,
 5,
 5,
 1,
 0,
 9,
 6,
 9,
 0,
 8,
 7,
 8,
 8,
 2,
 5,
 2,
 3,
 5,
 0,
 6,
 1,
 9,
 3,
 6,
 9,
 1,
 3,
 9,
 6,
 6,
 7,
 1,
 0,
 9,
 5,
 8,
 5,
 2,
 9,
 0,
 8,
 8,
 0,
 6,
 9,
 1,
 1,
 6,
 3,
 7,
 6,
 6,
 0,
 6,
 6,
 1,
 7,
 1,
 5,
 8,
 3,
 6,
 6,
 8,
 6,
 8,
 4,
 6,
 6,


In [15]:
X = np.load("../datasets/cifar10_data/cifar10_embedding.npy")
Y = np.load("../datasets/cifar10_data/cifar10_labels.npy")

In [16]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

In [17]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
X = pca.fit_transform(X)

In [18]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 2000, rs)

In [19]:
X_sample.shape

(2000, 100)

In [31]:
rs = np.random.RandomState(19)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=10)

kmeans score:  0.14552704721544038
kmeans score2:  0.14552704721544038


In [20]:
#np.save("../datasets/cifar10_data/X.npy", X_sample)
#np.save("../datasets/cifar10_data/Y.npy", Y_sample)

In [33]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/cifar10_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/cifar10_data/Y.npy", Y_sample)

## CIFAR10_small

In [40]:
X = np.load("../datasets/cifar10_data/cifar10_embedding.npy")
Y = np.load("../datasets/cifar10_data/cifar10_labels.npy")

In [41]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

In [42]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
X = pca.fit_transform(X)

In [43]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 0.06, rs)

In [44]:
X_sample.shape

(3000, 100)

In [45]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([278, 284, 310, 272, 302, 340, 289, 312, 308, 305], dtype=int64))

In [26]:
rs = np.random.RandomState(19)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=20)

kmeans score:  0.437681481133735
kmeans score2:  0.437681481133735


In [27]:
#np.save("../datasets/cifar10_small_data/X.npy", X_sample)
#np.save("../datasets/cifar10_small_data/Y.npy", Y_sample)

## Mushrooms

In [21]:
import pandas as pd
df = pd.read_csv("../datasets/mushrooms_data/mushrooms.csv")

In [22]:
df = df.astype('category')
df.dtypes

class                       category
cap-shape                   category
cap-surface                 category
cap-color                   category
bruises                     category
odor                        category
gill-attachment             category
gill-spacing                category
gill-size                   category
gill-color                  category
stalk-shape                 category
stalk-root                  category
stalk-surface-above-ring    category
stalk-surface-below-ring    category
stalk-color-above-ring      category
stalk-color-below-ring      category
veil-type                   category
veil-color                  category
ring-number                 category
ring-type                   category
spore-print-color           category
population                  category
habitat                     category
dtype: object

In [23]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for column in df.columns:
    df[column] = labelencoder.fit_transform(df[column])

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [24]:
Y = df["class"].values
X = df.drop(["class"], axis=1).values

In [25]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 2000, rs)

In [26]:
X_sample.shape

(2000, 22)

In [44]:
np.unique(Y_sample, return_counts=True)

(array([0, 1]), array([195, 211], dtype=int64))

In [45]:
rs = np.random.RandomState(19)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=4)

kmeans score:  0.28843838085278295
kmeans score2:  0.28843838085278295


In [27]:
#np.save("../datasets/mushrooms_data/X.npy", X_sample)
#np.save("../datasets/mushrooms_data/Y.npy", Y_sample)

In [129]:
Xtest = np.load("../datasets/mushrooms_data/X.npy")
ytest = np.load("../datasets/mushrooms_data/Y.npy")
print(Xtest.shape)
print(ytest.shape)

(2000, 22)
(2000,)


In [46]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/mushrooms_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/mushrooms_data/Y.npy", Y_sample)

In [41]:
import numpy as np
np.load("../datasets/mushrooms_data/X.npy").shape

(4874, 22)

## Breast Cancer data

In [120]:
df = pd.read_csv("../datasets/breast_cancer_data/data.csv")
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


In [121]:
df.drop(columns=['Unnamed: 32','id'],inplace=True)
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,1,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,1,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,1,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [122]:
X = df.drop(['diagnosis'], axis=1)
Y = df['diagnosis']

In [123]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

In [124]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1, rs)

In [125]:
X_sample.shape

(569, 30)

In [126]:
np.unique(Y_sample, return_counts=True)

(array([0, 1], dtype=int64), array([356, 213], dtype=int64))

In [58]:
rs = np.random.RandomState(19)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=5)

kmeans score:  0.37709037338532786
kmeans score2:  0.37709037338532786


In [128]:
#np.save("../datasets/breast_cancer_data/X.npy", X_sample)
#np.save("../datasets/breast_cancer_data/Y.npy", Y_sample)

In [117]:
Xtest = np.load("../datasets/breast_cancer_data/X.npy")
ytest = np.load("../datasets/breast_cancer_data/Y.npy")

In [118]:
Xtest.shape

(1, 30)

In [119]:
ytest.shape

(1,)

In [59]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/breast_cancer_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/breast_cancer_data/Y.npy", Y_sample)

## Cardiotocography

In [45]:
import pandas as pd
data = pd.read_excel('../datasets/cardiotocography_data/CTG.xls', sheet_name = 1, skiprows = 1)

In [46]:
data.drop(data.iloc[:, :10], inplace = True, axis = 1) 
data.drop(data.iloc[:, 22:33], inplace = True, axis = 1)
data = data.drop(['Unnamed: 31', 'Unnamed: 44'], axis = 1)
data = data.dropna()
data = data.drop_duplicates()
data

Unnamed: 0,LB,AC.1,FM.1,UC.1,DL.1,DS.1,DP.1,ASTV,MSTV,ALTV,...,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,CLASS,NSP
0,120.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,73.0,0.5,43.0,...,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,9.0,2.0
1,132.0,0.006380,0.000000,0.006380,0.003190,0.0,0.0,17.0,2.1,0.0,...,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,6.0,1.0
2,133.0,0.003322,0.000000,0.008306,0.003322,0.0,0.0,16.0,2.1,0.0,...,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,6.0,1.0
3,134.0,0.002561,0.000000,0.007682,0.002561,0.0,0.0,16.0,2.4,0.0,...,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,6.0,1.0
4,132.0,0.006515,0.000000,0.008143,0.000000,0.0,0.0,16.0,2.4,0.0,...,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000000,0.000000,0.007426,0.000000,0.0,0.0,79.0,0.2,25.0,...,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,5.0,2.0
2122,140.0,0.000775,0.000000,0.006971,0.000000,0.0,0.0,78.0,0.4,22.0,...,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,5.0,2.0
2123,140.0,0.000980,0.000000,0.006863,0.000000,0.0,0.0,79.0,0.4,20.0,...,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,5.0,2.0
2124,140.0,0.000679,0.000000,0.006110,0.000000,0.0,0.0,78.0,0.4,27.0,...,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,5.0,2.0


In [47]:
X = data.drop(['CLASS'], axis=1)
Y = data['CLASS'].to_numpy().astype(int) - 1

In [48]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

In [51]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 2000, rs)

In [52]:
X_sample.shape

(2000, 22)

In [53]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([373, 555,  54,  68,  66, 294, 217, 116,  61, 196], dtype=int64))

In [70]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=5)

kmeans score:  0.23830570835370218
kmeans score2:  0.23830570835370218


In [54]:
#np.save("../datasets/cardiotocography_data/X.npy", X_sample)
#np.save("../datasets/cardiotocography_data/Y.npy", Y_sample)

In [71]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/cardiotocography_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/cardiotocography_data/Y.npy", Y_sample)

In [130]:
Xtest = np.load("../datasets/cardiotocography_data/X.npy")
ytest = np.load("../datasets/cardiotocography_data/Y.npy")
print(Xtest.shape)
print(ytest.shape)

(2000, 22)
(2000,)


In [104]:
np.unique(Y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([393, 591,  57,  78,  69, 312, 228, 123,  62, 202], dtype=int64))

In [25]:
type(Y[0])


numpy.int32

## Ecoli

In [55]:
df=pd.read_csv("../datasets/ecoli_data/ecoli.csv")
df

Unnamed: 0,SEQUENCE_NAME,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,AAT_ECOLI,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,ACEA_ECOLI,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,ACEK_ECOLI,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,ACKA_ECOLI,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,ADI_ECOLI,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...,...
331,TREA_ECOLI,0.74,0.56,0.48,0.5,0.47,0.68,0.30,pp
332,UGPB_ECOLI,0.71,0.57,0.48,0.5,0.48,0.35,0.32,pp
333,USHA_ECOLI,0.61,0.60,0.48,0.5,0.44,0.39,0.38,pp
334,XYLF_ECOLI,0.59,0.61,0.48,0.5,0.42,0.42,0.37,pp


In [56]:
df=df.drop(["SEQUENCE_NAME"],axis=1)

In [57]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
df["SITE"] = labelencoder.fit_transform(df["SITE"])

In [58]:
X = df.drop(["SITE"],axis=1)
Y = df["SITE"]

In [59]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)
Y = Y.to_numpy()

In [60]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1, rs)

In [61]:
X_sample.shape

(336, 7)

In [62]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([137,  76,   1,   2,  37,  26,   5,  52], dtype=int64))

In [63]:
np.unique(Y_sample)

array([0, 1, 2, 3, 4, 5, 6, 7])

In [81]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=10)

kmeans score:  0.3944918184999811
kmeans score2:  0.3944918184999811


In [64]:
#np.save("../datasets/ecoli_data/X.npy", X_sample)
#np.save("../datasets/ecoli_data/Y.npy", Y_sample)

In [82]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/ecoli_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/ecoli_data/Y.npy", Y_sample)

## Forest Type Mapping

In [65]:
df_train = pd.read_csv("../datasets/ForestTypeMapping_data/training.csv")
df_test = pd.read_csv("../datasets/ForestTypeMapping_data/testing.csv")
#df = pd.concat([df_train, df_test], axis=1)
#df
df = pd.concat([df_train, df_test])

In [66]:
df

Unnamed: 0,class,b1,b2,b3,b4,b5,b6,b7,b8,b9,...,pred_minus_obs_H_b9,pred_minus_obs_S_b1,pred_minus_obs_S_b2,pred_minus_obs_S_b3,pred_minus_obs_S_b4,pred_minus_obs_S_b5,pred_minus_obs_S_b6,pred_minus_obs_S_b7,pred_minus_obs_S_b8,pred_minus_obs_S_b9
0,d,39,36,57,91,59,101,93,27,60,...,-2.36,-18.41,-1.88,-6.43,-21.03,-1.60,-6.18,-22.50,-5.20,-7.86
1,h,84,30,57,112,51,98,92,26,62,...,-2.26,-16.27,-1.95,-6.25,-18.79,-1.99,-6.18,-23.41,-8.87,-10.83
2,s,53,25,49,99,51,93,84,26,58,...,-1.46,-15.92,-1.79,-4.64,-17.73,-0.48,-4.69,-19.97,-4.10,-7.07
3,s,59,26,49,103,47,92,82,25,56,...,2.68,-13.77,-2.53,-6.34,-22.03,-2.34,-6.60,-27.10,-7.99,-10.81
4,d,57,49,66,103,64,106,114,28,59,...,-2.94,-21.74,-1.64,-4.62,-23.74,-0.85,-5.50,-22.83,-2.74,-5.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,o,55,56,72,91,79,113,86,35,68,...,-12.86,-23.08,-0.08,-3.46,-27.52,-1.04,-4.73,-22.85,-1.49,-4.10
321,d,69,49,76,91,52,91,92,25,57,...,-2.36,-11.47,-0.40,-3.74,-16.90,-0.78,-4.15,-11.13,-1.48,-3.55
322,s,49,26,48,107,59,104,62,22,53,...,2.32,-23.48,1.44,-1.59,-26.98,-1.36,-4.81,-24.50,-2.53,-4.97
323,s,55,26,52,92,55,98,65,23,56,...,-0.77,-23.74,1.27,-1.30,-25.53,-1.21,-4.70,-24.39,-2.21,-4.72


In [67]:
df.isnull().values.any()

False

In [68]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
df["class"] = labelencoder.fit_transform(df["class"])

In [69]:
X = df.drop(["class"],axis=1)
Y = df["class"]

In [70]:
from sklearn import preprocessing
X = preprocessing.MinMaxScaler().fit_transform(X)
Y = Y.to_numpy()

In [71]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1, rs)

In [72]:
X_sample.shape

(523, 27)

In [73]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3]), array([168,  84,  86, 185], dtype=int64))

In [74]:
np.unique(Y_sample)

array([0, 1, 2, 3])

In [95]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=7)

kmeans score:  0.2948734133616552
kmeans score2:  0.2948734133616552


In [75]:
#np.save("../datasets/ForestTypeMapping_data/X.npy", X_sample)
#np.save("../datasets/ForestTypeMapping_data/Y.npy", Y_sample)

In [96]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/ForestTypeMapping_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/ForestTypeMapping_data/Y.npy", Y_sample)

## User knowledge data

In [86]:
import pandas as pd
import numpy as np
#df=pd.read_csv("../datasets/user_knowledge_data/user_knowledge.csv")
df1 = pd.read_excel('../datasets/user_knowledge_data/user_knowledge_data.xls', sheet_name = 1)
df2 = pd.read_excel('../datasets/user_knowledge_data/user_knowledge_data.xls', sheet_name = 2)
df = pd.concat([df1, df2])
df

Unnamed: 0,STG,SCG,STR,LPR,PEG,UNS,Unnamed: 6,Unnamed: 7,Attribute Information:
0,0.00,0.00,0.00,0.00,0.00,very_low,,,STG (The degree of study time for goal object ...
1,0.08,0.08,0.10,0.24,0.90,High,,,SCG (The degree of repetition number of user f...
2,0.06,0.06,0.05,0.25,0.33,Low,,,STR (The degree of study time of user for rela...
3,0.10,0.10,0.15,0.65,0.30,Middle,,,LPR (The exam performance of user for related ...
4,0.08,0.08,0.08,0.98,0.24,Low,,,PEG (The exam performance of user for goal obj...
...,...,...,...,...,...,...,...,...,...
140,0.90,0.78,0.62,0.32,0.89,High,,,
141,0.85,0.82,0.66,0.83,0.83,High,,,
142,0.56,0.60,0.77,0.13,0.32,Low,,,
143,0.66,0.68,0.81,0.57,0.57,Middle,,,


In [87]:
df = df.drop(['Unnamed: 6', 'Unnamed: 7', "Attribute Information:"], axis = 1)
df

Unnamed: 0,STG,SCG,STR,LPR,PEG,UNS
0,0.00,0.00,0.00,0.00,0.00,very_low
1,0.08,0.08,0.10,0.24,0.90,High
2,0.06,0.06,0.05,0.25,0.33,Low
3,0.10,0.10,0.15,0.65,0.30,Middle
4,0.08,0.08,0.08,0.98,0.24,Low
...,...,...,...,...,...,...
140,0.90,0.78,0.62,0.32,0.89,High
141,0.85,0.82,0.66,0.83,0.83,High
142,0.56,0.60,0.77,0.13,0.32,Low
143,0.66,0.68,0.81,0.57,0.57,Middle


In [88]:
df.columns

Index(['STG', 'SCG', 'STR', 'LPR', 'PEG', ' UNS'], dtype='object')

In [89]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
df[" UNS"] = labelencoder.fit_transform(df[" UNS"])
df

Unnamed: 0,STG,SCG,STR,LPR,PEG,UNS
0,0.00,0.00,0.00,0.00,0.00,4
1,0.08,0.08,0.10,0.24,0.90,0
2,0.06,0.06,0.05,0.25,0.33,1
3,0.10,0.10,0.15,0.65,0.30,2
4,0.08,0.08,0.08,0.98,0.24,1
...,...,...,...,...,...,...
140,0.90,0.78,0.62,0.32,0.89,0
141,0.85,0.82,0.66,0.83,0.83,0
142,0.56,0.60,0.77,0.13,0.32,1
143,0.66,0.68,0.81,0.57,0.57,2


In [90]:
X = df.drop([" UNS"],axis=1)
Y = df[" UNS"]

In [91]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)
Y = Y.to_numpy()

In [92]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1, rs)

In [93]:
X_sample.shape

(403, 5)

In [94]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4]), array([111, 129, 116,  28,  19], dtype=int64))

In [106]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=2)

kmeans score:  0.3956886570821377
kmeans score2:  0.3956886570821377


In [95]:
#np.save("../datasets/user_knowledge_data/X.npy", X_sample)
#np.save("../datasets/user_knowledge_data/Y.npy", Y_sample)

In [107]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/user_knowledge_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/user_knowledge_data/Y.npy", Y_sample)

In [130]:
file1 = open('../experiment_results/real_world_benchmark/completed_experiments_new.txt', 'r')
Lines = file1.readlines()

new_lines = []
 
count = 0
# Strips the newline character
for line in Lines:
    count += 1
    #if "cardiotocography" not in line:
        #new_lines.append(line)

    if "custom" not in line:
        new_lines.append(line)




In [131]:
#new_lines = []
file1 = open('../experiment_results/real_world_benchmark/completed_experiments_new.txt', 'w')
file1.writelines(new_lines)
file1.close()

## MNIST

In [99]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import numpy as np

# Define the CNN model
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x_out = self.fc2(x)
        return x_out, x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transform = transforms.Compose([transforms.Resize((28, 28)),
                                    transforms.ToTensor(),
                                    transforms.Normalize((0.5,), (0.5,))])

# Load MNIST data
train_data = datasets.MNIST(root="../datasets/mnist_data", train=True, transform=transform, download=True)
test_data = datasets.MNIST(root="../datasets/mnist_data", train=False, transform=transform, download=True)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Train the CNN
model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 10

model.train()
for epoch in range(epochs):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs, _ = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Extract embeddings and labels of test data
model.eval()
embeddings_list = []
labels_list = []
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        _, embeddings = model(images)
        embeddings_list.append(embeddings.cpu().numpy())
        labels_list.append(labels.cpu().numpy())

# Save embeddings and labels to X.npy and Y.npy
X = np.vstack(embeddings_list)
Y = np.concatenate(labels_list)
np.save("../datasets/mnist_data/X_full.npy", X)
np.save("../datasets/mnist_data/Y_full.npy", Y)

In [100]:
X = np.load("../datasets/mnist_data/X_full.npy")
Y = np.load("../datasets/mnist_data/Y_full.npy")

In [101]:
from sklearn import preprocessing
X = preprocessing.MinMaxScaler().fit_transform(X)
#Y = Y.to_numpy()

In [102]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
X = pca.fit_transform(X)

In [103]:
rs = np.random.RandomState(39)
X_sample, Y_sample = random_data_sample(X, Y, 2000, rs)

In [104]:
X_sample.shape

(2000, 20)

In [105]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64),
 array([207, 230, 218, 205, 201, 173, 194, 192, 175, 205], dtype=int64))

In [106]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=4)

kmeans score:  0.4236933244817407
kmeans score2:  0.4236933244817407


In [107]:
#np.save("../datasets/mnist_data/X.npy", X_sample)
#np.save("../datasets/mnist_data/Y.npy", Y_sample)

In [119]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/mnist_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/mnist_data/Y.npy", Y_sample)

## Yeast

In [108]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [109]:
data_set=pd.read_csv("../datasets/yeast_data/yeast.csv")
data_set

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,name
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT
...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,NUC
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,NUC


In [110]:
from sklearn.preprocessing import LabelEncoder
name_encoder=LabelEncoder()
data_set["name"]=name_encoder.fit_transform(data_set["name"].values)

In [111]:
X =data_set.iloc[:,:-1].values
Y =data_set.iloc[:,-1].values

In [112]:
from sklearn import preprocessing
X = preprocessing.MinMaxScaler().fit_transform(X)

In [113]:
rs = np.random.RandomState(39)
X_sample, Y_sample = random_data_sample(X, Y, 2000, rs)

In [114]:
X_sample.shape

(1484, 8)

In [115]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([467,   5,  47,  34,  43, 193, 257, 395,  20,  23], dtype=int64))

In [132]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, rs, "euclidean", normalize=True, n_clusters=10)

kmeans score:  0.2023352752459405
kmeans score2:  0.2023352752459405


In [116]:
#np.save("../datasets/yeast_data/X.npy", X_sample)
#np.save("../datasets/yeast_data/Y.npy", Y_sample)

In [134]:
np.load("../datasets/yeast_data/X.npy").shape

(1484, 8)

In [135]:
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/yeast_data/X.npy", X_sample)
np.save("C:\Github_Projects/robust-active-clustering/datasets_small/yeast_data/Y.npy", Y_sample)