## Experiment 1 : Comparing our implementation of CopKMeans with an existing one

### Loading useful libraries

In [1]:
import numpy as np
from sklearn.datasets import make_classification
from cop_kmeans import CopKMeans
import matplotlib.pyplot as plt
from active_semi_clustering.semi_supervised.pairwise_constraints import COPKMeans
from sklearn.metrics import mutual_info_score, adjusted_mutual_info_score, normalized_mutual_info_score
import os
import pandas as pd
from itertools import product

### Getting the datasets names

In [2]:
datasets = []
folder_data = 'data_processed/'
for file in os.listdir(folder_data):
    datasets.append('data_processed/' + file)

In [3]:
for file in datasets:
    df = pd.read_csv(file)
    print(file, len(df))

data_processed/magic04.csv 19020
data_processed/connect-4.csv 67557
data_processed/dermatology.csv 358
data_processed/nursery.csv 12960
data_processed/tae.csv 151
data_processed/winequality-white.csv 4898
data_processed/dataset_18_mfeat-morphological.csv 2000
data_processed/lymphography.csv 148
data_processed/balance-scale.csv 625
data_processed/car.csv 1728
data_processed/appendicitis.csv 106
data_processed/monks-problems-2.csv 601
data_processed/jungle_chess_2pcs_raw_endgame_complete.csv 44819
data_processed/heart_failure_clinical_records_dataset.csv 299
data_processed/online_shoppers_intention.csv 12245
data_processed/irish.csv 474
data_processed/blood-transfusion-service-center.csv 748
data_processed/analcatdata_chlamydia.csv 100
data_processed/hayes_roth.csv 132
data_processed/artificial-characters.csv 10218


### Function to calculate and return a percentage of all constraints

In [4]:
def calculate_constraints(X, y, perc):
    ml = []
    cl = []

    n = len(y)
    idx1 = np.random.choice(list(range(n)), int(np.sqrt(perc)*len(y)))

    idx2 = np.random.choice(list(range(n)), int(np.sqrt(perc)*len(y)))

    sampled_pairs = list(product(y[idx1], y[idx2]))


    for pt1, pt2 in sampled_pairs:
        if y[pt1] == y[pt2] and pt1 != pt2:
            ml.append((pt1, pt2))
        elif y[pt1] != y[pt2]:
            cl.append((pt1, pt2))

    # Sampling 10% of ml and 10% of cl
    ml, cl = np.array(ml), np.array(cl)
    # ml_subset = ml[np.random.choice(len(ml), int(perc*len(ml)))]
    # cl_subset = cl[np.random.choice(len(cl), int(perc*len(cl)))]

    ml = [tuple(l) for l in ml]
    cl = [tuple(l) for l in cl]
    
    return ml, cl

In [10]:
def our_cop(X, y, ml, cl):
    model = CopKMeans(3, 200)

    # model.fit(X, pairwise_constraints[0], pairwise_constraints[1])
    model.fit(X, ml, cl)

    try:
        prediction = model.predict(X)
    except:
        return -1, -1

    nmi = normalized_mutual_info_score(y, prediction)
    ami = adjusted_mutual_info_score(y, prediction)

    print(f"Our model : {nmi = }, {ami = }")
    
    return nmi, ami

In [26]:
def theirs(X, y, ml, cl):

    model = COPKMeans(3, 200)

    try:
        model.fit(X, ml, cl)
    # prediction = model.predict(X)
    except:
        return -1, -1

    nmi = normalized_mutual_info_score(y, model.labels_)
    ami = adjusted_mutual_info_score(y, model.labels_)

    print(f"Theirs : {nmi = }, {ami = }")
    
    return nmi, ami

### Running both algorithms on 20 datasets

In [31]:
# Dataset, algorithm, metric
scores = np.zeros((len(datasets), 2, 2))

for idx_data, file in enumerate(datasets):
    # if idx_data in [0, 1, 3, 5, 12, 14, 19]:
    #     continue
    if idx_data in [1, 12]:
        continue
    
    data = pd.read_csv(file)
    print(idx_data, file, data.shape)

    X, y = data.iloc[:,:-1].to_numpy(), data.iloc[:,-1].to_numpy().reshape(-1)

    ml, cl = calculate_constraints(X, y, 0.1)

    # print(pairwise_constraints)
    # print(len(pairwise_constraints[0]), len(pairwise_constraints[1]))
    # ml, cl = pairwise_constraints[0], pairwise_constraints[1]

    nmi, ami = our_cop(X, y, ml, cl)
    scores[idx_data, 0] = [nmi, ami]

    nmi, ami = theirs(X, y, ml, cl)
    scores[idx_data, 1] = [nmi, ami]


0 data_processed/magic04.csv (19020, 11)
Itération 0
Itération 50
Itération 100
Itération 150
Our model : nmi = 0.015047632276512672, ami = 0.014985965631531293


: 

In [28]:
# For 50% of links
print(scores)

[[[ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[-1.00000000e+00 -1.00000000e+00]
  [ 6.87273964e-02  5.90395260e-02]]

 [[ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[ 5.75504223e-03 -6.60985774e-03]
  [ 3.20808765e-03 -9.23970856e-03]]

 [[ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[ 4.20659860e-01  4.19100477e-01]
  [ 4.08175527e-01  4.06577316e-01]]

 [[-1.00000000e+00 -1.00000000e+00]
  [ 8.63445144e-02  5.00059127e-02]]

 [[ 8.56550857e-02  8.27103837e-02]
  [-1.00000000e+00 -1.00000000e+00]]

 [[ 7.96126712e-02  7.79231975e-02]
  [ 2.86459276e-02  2.68824414e-02]]

 [[ 1.79390182e-01  1.68684364e-01]
  [ 2.16174758e-01  2.05916740e-01]]

 [[ 5.20504489e-03  3.22291706e-03]
  [ 1.77124596e-03 -2.17642413e-04]]

 [[ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[ 6.85967600e-03  2.64688636e-03]
  

In [30]:
# For 10% of links
print(scores)

[[[ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[-1.00000000e+00 -1.00000000e+00]
  [ 5.35928902e-02  4.38029955e-02]]

 [[ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[ 1.44323841e-02  1.96936674e-03]
  [ 8.31684029e-03 -4.13977000e-03]]

 [[ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[ 4.17527483e-01  4.15959212e-01]
  [ 4.08175527e-01  4.06577316e-01]]

 [[-1.00000000e+00 -1.00000000e+00]
  [ 3.69143999e-02 -4.03753955e-03]]

 [[ 3.27724892e-02  2.96540240e-02]
  [-1.00000000e+00 -1.00000000e+00]]

 [[ 2.94194587e-02  2.76586973e-02]
  [ 2.25039944e-01  2.23635897e-01]]

 [[ 1.60870372e-01  1.49085710e-01]
  [ 1.57240611e-01  1.45185096e-01]]

 [[ 5.12172234e-04 -1.46643864e-03]
  [ 3.55585629e-03  1.59069933e-03]]

 [[ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[ 9.34456284e-03  5.00773949e-03]
  