In [96]:
from sklearn.cluster import KMeans
from sklearn import preprocessing
import numpy as np
import pandas as pd
from scipy.spatial import distance
from rac.correlation_clustering import max_correlation_dynamic_K
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, v_measure_score
from ucimlrepo import fetch_ucirepo 

def test_dataset(X, Y, seed=1):
    n_classes = len(np.unique(Y))
    kmeans = KMeans(n_clusters=n_classes, random_state=seed).fit(X)
    print("kmeans score: ", adjusted_rand_score(Y, kmeans.labels_))

    X_sc = preprocessing.StandardScaler().fit_transform(X)
    kmeans = KMeans(n_clusters=n_classes, random_state=seed).fit(X_sc)
    print("kmeans score (SC): ", adjusted_rand_score(Y, kmeans.labels_))

    X_mm = preprocessing.MinMaxScaler().fit_transform(X)
    kmeans = KMeans(n_clusters=n_classes, random_state=seed).fit(X_mm)
    print("kmeans score (MM): ", adjusted_rand_score(Y, kmeans.labels_))

def random_data_sample(X, Y, size, rs):
    if size <= 1:
        num_samples = int(len(Y)*size)
    else:
        num_samples = np.minimum(size, len(Y))
    inds = rs.choice(len(Y), num_samples)
    return X[inds], Y[inds]


## 20newsgroups

In [None]:
from sklearn import datasets

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
cats = ["rec.sport.baseball", "soc.religion.christian", "rec.autos", "talk.politics.mideast", "misc.forsale"]
data = datasets.fetch_20newsgroups(data_home="../datasets/", subset="all", categories=cats)
Y = data.target
X = data.data

In [None]:
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
normalize = True
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)
data = None
for sen in X:
    sentence = Sentence(sen)
    document_embeddings.embed(sentence)
    dat = sentence.get_embedding()
    if data is None:
        data = dat.cpu().numpy().reshape(1, 768)
    else:
        data = np.vstack((data, dat.cpu().numpy().reshape(1, 768)))
X = data
#np.save("datasets/20newsgroups_small.npy", X)
#X = np.load("20newsgroups.npy")
#X = TfidfVectorizer().fit_transform(X)

In [47]:
from sklearn import datasets
data = datasets.fetch_20newsgroups(data_home="../datasets/20newsgroups_data/", subset="all")
Y = data.target
X = np.load("../datasets/20newsgroups_data/20newsgroups.npy")
X.shape

(18846, 768)

In [57]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
X = pca.fit_transform(X)

In [58]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [65]:
test_dataset(X_sample, Y_sample, seed=79)

kmeans score:  0.2314524110145444
kmeans score (SC):  0.24636367652759042
kmeans score (MM):  0.25051530744696626


In [66]:
X_sample = preprocessing.MinMaxScaler().fit_transform(X_sample)

In [67]:
X_sample.shape

(1000, 50)

In [68]:
#np.save("../datasets/20newsgroups_data/X.npy", X_sample)
#np.save("../datasets/20newsgroups_data/Y.npy", Y_sample)

In [85]:
X_sample = np.load("../datasets/20newsgroups_data/X.npy")
Y_sample = np.load("../datasets/20newsgroups_data/Y.npy")

In [69]:
np.unique(Y_sample, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 array([46, 57, 45, 48, 63, 47, 48, 65, 51, 53, 39, 57, 46, 56, 65, 47, 52,
        48, 38, 29], dtype=int64))

## CIFAR10

In [70]:
# see cifar10_prep.ipynb for the code to prepare embeddings
X = np.load("../datasets/cifar10_data/cifar10_embedding.npy")
Y = np.load("../datasets/cifar10_data/cifar10_labels.npy")

In [71]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
X = pca.fit_transform(X)

In [72]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [73]:
X_sample.shape

(1000, 3)

In [78]:
test_dataset(X_sample, Y_sample, seed=25)

kmeans score:  0.6773329544559815
kmeans score (SC):  0.6594862669369495
kmeans score (MM):  0.6861314428049142


In [109]:
#np.save("../datasets/cifar10_data/X.npy", X_sample)
#np.save("../datasets/cifar10_data/Y.npy", Y_sample)

In [113]:
X_sample = np.load("../datasets/cifar10_data/X.npy")
Y_sample = np.load("../datasets/cifar10_data/Y.npy")

In [114]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 91,  96, 107,  89,  99, 113,  96,  93, 112, 104], dtype=int64))

## Mushrooms

In [79]:
import pandas as pd
mushroom = fetch_ucirepo(id=73) 
X = mushroom.data.features 
Y = mushroom.data.targets 

In [80]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)
X = pd.get_dummies(X, columns=X.columns).values

  y = column_or_1d(y, warn=True)


In [81]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [82]:
X_sample.shape

(1000, 116)

In [86]:
test_dataset(X_sample, Y_sample, seed=45)

kmeans score:  0.611150390579939
kmeans score (SC):  0.5179268416222097
kmeans score (MM):  0.611150390579939


In [87]:
np.unique(Y_sample, return_counts=True)

(array([0, 1]), array([503, 497], dtype=int64))

In [291]:
#np.save("../datasets/mushrooms_data/X.npy", X_sample)
#np.save("../datasets/mushrooms_data/Y.npy", Y_sample)

In [196]:
X_sample = np.load("../datasets/mushrooms_data/X.npy")
Y_sample = np.load("../datasets/mushrooms_data/Y.npy")

## Breast Cancer data

In [97]:
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
X = breast_cancer_wisconsin_diagnostic.data.features.values
Y = breast_cancer_wisconsin_diagnostic.data.targets 

In [98]:
Y = Y['Diagnosis'].map({'M': 1, 'B': 0})

In [99]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [100]:
X_sample.shape

(569, 30)

In [101]:
np.unique(Y_sample, return_counts=True)

(array([0, 1], dtype=int64), array([356, 213], dtype=int64))

In [103]:
test_dataset(X_sample, Y_sample, seed=55)

kmeans score:  0.486626341880326
kmeans score (SC):  0.7186399055093501
kmeans score (MM):  0.8050419613541757


In [321]:
X_sample = preprocessing.MinMaxScaler().fit_transform(X_sample)

In [323]:
#np.save("../datasets/breast_cancer_data/X.npy", X_sample)
#np.save("../datasets/breast_cancer_data/Y.npy", Y_sample)

## Cardiotocography

In [143]:
from ucimlrepo import fetch_ucirepo 
cardiotocography = fetch_ucirepo(id=193) 
X = cardiotocography.data.features.values
Y = cardiotocography.data.targets 
Y = Y['CLASS'].to_numpy().astype(int) - 1

In [144]:
cardiotocography.data.features

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency
0,120,0.000,0.000,0.000,0.000,0.0,0.0,73,0.5,43,...,64,62,126,2,0,120,137,121,73,1
1,132,0.006,0.000,0.006,0.003,0.0,0.0,17,2.1,0,...,130,68,198,6,1,141,136,140,12,0
2,133,0.003,0.000,0.008,0.003,0.0,0.0,16,2.1,0,...,130,68,198,5,1,141,135,138,13,0
3,134,0.003,0.000,0.008,0.003,0.0,0.0,16,2.4,0,...,117,53,170,11,0,137,134,137,13,1
4,132,0.007,0.000,0.008,0.000,0.0,0.0,16,2.4,0,...,117,53,170,9,0,137,136,138,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140,0.000,0.000,0.007,0.000,0.0,0.0,79,0.2,25,...,40,137,177,4,0,153,150,152,2,0
2122,140,0.001,0.000,0.007,0.000,0.0,0.0,78,0.4,22,...,66,103,169,6,0,152,148,151,3,1
2123,140,0.001,0.000,0.007,0.000,0.0,0.0,79,0.4,20,...,67,103,170,5,0,153,148,152,4,1
2124,140,0.001,0.000,0.006,0.000,0.0,0.0,78,0.4,27,...,66,103,169,6,0,152,147,151,4,1


In [145]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [146]:
X_sample.shape

(1000, 21)

In [147]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([183, 259,  26,  36,  43, 161, 127,  44,  27,  94], dtype=int64))

In [150]:
test_dataset(X_sample, Y_sample, seed=10)

kmeans score:  0.1860288398150711
kmeans score (SC):  0.19692264903491544
kmeans score (MM):  0.1860288398150711


In [149]:
X_sample = preprocessing.MinMaxScaler().fit_transform(X_sample)

In [117]:
#np.save("../datasets/cardiotocography_data/X.npy", X_sample)
#np.save("../datasets/cardiotocography_data/Y.npy", Y_sample)

In [119]:
X_sample = np.load("../datasets/cardiotocography_data/X.npy")
Y_sample = np.load("../datasets/cardiotocography_data/Y.npy")


In [120]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([183, 259,  26,  36,  43, 161, 127,  44,  27,  94], dtype=int64))

## Ecoli

In [173]:
ecoli = fetch_ucirepo(id=39) 
X = ecoli.data.features.values
Y = ecoli.data.targets 

In [174]:
Y = label_encoder.fit_transform(Y)

  y = column_or_1d(y, warn=True)


In [175]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [176]:
X_sample.shape

(336, 7)

In [196]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([137,  76,   1,   2,  37,  26,   5,  52], dtype=int64))

In [193]:
test_dataset(X_sample, Y_sample, seed=125)

kmeans score:  0.46094126907932487
kmeans score (SC):  0.46094126907932487
kmeans score (MM):  0.46589614653749645


In [192]:
X_sample = preprocessing.StandardScaler().fit_transform(X_sample)

In [194]:
#np.save("../datasets/ecoli_data/X.npy", X_sample)
#np.save("../datasets/ecoli_data/Y.npy", Y_sample)

In [195]:
X_sample = np.load("../datasets/ecoli_data/X.npy")
Y_sample = np.load("../datasets/ecoli_data/Y.npy")

## Forest Type Mapping

In [214]:
import pandas as pd
import requests
import zipfile
import io

# Step 1: Download the dataset
url = "https://archive.ics.uci.edu/static/public/333/forest+type+mapping.zip"
response = requests.get(url)

# Step 2: Unzip the dataset
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    # List the contents of the zip file
    print(z.namelist())  # Uncomment to see the list of files

    # Load the specific CSV files (assuming 'training.csv' and 'testing.csv' are the main datasets)
    with z.open('training.csv') as f:
        train_df = pd.read_csv(f)
        
    with z.open('testing.csv') as f:
        test_df = pd.read_csv(f)

# Step 3: Concatenate the training and test DataFrames
df = pd.concat([train_df, test_df], ignore_index=True)

['training.csv', 'testing.csv']


pandas.core.frame.DataFrame

In [215]:
df.isnull().values.any()

False

In [216]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
df["class"] = labelencoder.fit_transform(df["class"])

In [217]:
X = df.drop(["class"],axis=1).values
Y = df["class"].values

In [218]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [227]:
X_sample.shape

(523, 27)

In [238]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3]), array([168,  84,  86, 185], dtype=int64))

In [237]:
test_dataset(X_sample, Y_sample, seed=105)

kmeans score:  0.46999210897765215
kmeans score (SC):  0.20779091514147743
kmeans score (MM):  0.46999210897765215


In [234]:
X_sample = preprocessing.MinMaxScaler().fit_transform(X_sample)

In [235]:
#np.save("../datasets/ForestTypeMapping_data/X.npy", X_sample)
#np.save("../datasets/ForestTypeMapping_data/Y.npy", Y_sample)

In [236]:
X_sample = np.load("../datasets/ForestTypeMapping_data/X.npy")
Y_sample = np.load("../datasets/ForestTypeMapping_data/Y.npy")

## User knowledge data

In [249]:
# fetch dataset 
user_knowledge_modeling = fetch_ucirepo(id=257) 
  
# data (as pandas dataframes) 
X = user_knowledge_modeling.data.features.values
Y = user_knowledge_modeling.data.targets 

In [250]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
Y = labelencoder.fit_transform(Y)

  y = column_or_1d(y, warn=True)


In [251]:
rs = np.random.RandomState(19)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [252]:
X_sample.shape

(403, 5)

In [253]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4]), array([111, 129, 116,  28,  19], dtype=int64))

In [263]:
test_dataset(X_sample, Y_sample, seed=105)

kmeans score:  0.25701347578335876
kmeans score (SC):  0.06420688553233227
kmeans score (MM):  0.25701347578335876


In [260]:
X_sample = preprocessing.MinMaxScaler().fit_transform(X_sample)

In [261]:
#np.save("../datasets/user_knowledge_data/X.npy", X_sample)
#np.save("../datasets/user_knowledge_data/Y.npy", Y_sample)

In [262]:
X_sample = np.load("../datasets/user_knowledge_data/X.npy")
Y_sample = np.load("../datasets/user_knowledge_data/Y.npy")

## MNIST

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import numpy as np

# Define the CNN model
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x_out = self.fc2(x)
        return x_out, x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transform = transforms.Compose([transforms.Resize((28, 28)),
                                    transforms.ToTensor(),
                                    transforms.Normalize((0.5,), (0.5,))])

# Load MNIST data
train_data = datasets.MNIST(root="../datasets/mnist_data", train=True, transform=transform, download=True)
test_data = datasets.MNIST(root="../datasets/mnist_data", train=False, transform=transform, download=True)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Train the CNN
model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 10

model.train()
for epoch in range(epochs):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs, _ = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Extract embeddings and labels of test data
model.eval()
embeddings_list = []
labels_list = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        _, embeddings = model(images)
        embeddings_list.append(embeddings.cpu().numpy())
        labels_list.append(labels.cpu().numpy())

# Save embeddings and labels to X.npy and Y.npy
X = np.vstack(embeddings_list)
Y = np.concatenate(labels_list)
#np.save("../datasets/mnist_data/X_full.npy", X)
#np.save("../datasets/mnist_data/Y_full.npy", Y)

In [24]:
test_data.data.shape

torch.Size([10000, 28, 28])

In [290]:
X = np.load("../datasets/mnist_data/X_full.npy")
Y = np.load("../datasets/mnist_data/Y_full.npy")

In [291]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
X = pca.fit_transform(X)

In [292]:
rs = np.random.RandomState(39)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [293]:
X_sample.shape

(1000, 3)

In [294]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64),
 array([105, 109, 111, 112, 104,  86,  99,  88,  88,  98], dtype=int64))

In [295]:
test_dataset(X_sample, Y_sample)

kmeans score:  0.814716314574021
kmeans score (SC):  0.8032048085421609
kmeans score (MM):  0.8156188428041204


In [297]:
#np.save("../datasets/mnist_data/X.npy", X_sample)
#np.save("../datasets/mnist_data/Y.npy", Y_sample)

In [None]:
X_sample = np.load("../datasets/mnist_data/X.npy")
Y_sample = np.load("../datasets/mnist_data/Y.npy")

## Yeast

In [328]:
yeast = fetch_ucirepo(id=110) 
X = yeast.data.features.values
Y = yeast.data.targets 

In [329]:
from sklearn.preprocessing import LabelEncoder
name_encoder=LabelEncoder()
Y=name_encoder.fit_transform(Y)

  y = column_or_1d(y, warn=True)


In [330]:
rs = np.random.RandomState(39)
X_sample, Y_sample = random_data_sample(X, Y, 1000, rs)

In [331]:
X_sample.shape

(1000, 8)

In [332]:
np.unique(Y_sample, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([319,   4,  31,  17,  28, 131, 169, 271,  12,  18], dtype=int64))

In [335]:
rs = np.random.RandomState(25)
test_dataset(X_sample, Y_sample, seed=165)

kmeans score:  0.1908858405237394
kmeans score (SC):  0.1908858405237394
kmeans score (MM):  0.14679384946227442


In [334]:
X_sample = preprocessing.StandardScaler().fit_transform(X_sample)

In [336]:
#np.save("../datasets/yeast_data/X.npy", X_sample)
#np.save("../datasets/yeast_data/Y.npy", Y_sample)

In [122]:
X_sample = np.load("../datasets/yeast_data/X.npy")
Y_sample = np.load("../datasets/yeast_data/Y.npy") 