# Example
## Imports and set-up

In [4]:
from pathlib import Path
from sklearn.metrics import adjusted_rand_score
from noise_robust_cobras.cobras import COBRAS
from noise_robust_cobras.querier.noisy_labelquerier import ProbabilisticNoisyQuerier
import numpy as np

# Seed passed as `random_seed` to every noisy querier created in this notebook.
querier_seed = 123

# Absolute location of the dataset, resolved against the current working directory.
dataset_path = Path('iris.data').absolute()

# Stop right away when the file is missing; the message reports the working directory.
assert (
    dataset_path.exists()
), f"the dataset does not exist here the root path is {Path().absolute()}"

e:\Projecten\Unif_proj\ThesisCode\example\iris.data


Let's start off by loading a dataset. The first column holds the class label; the remaining columns are the features.

In [3]:
# Parse the comma-separated file into one numeric array.
dataset = np.loadtxt(dataset_path, delimiter=',')
# Column 0 is kept as the label vector, every remaining column as a feature.
target, data = dataset[:, 0], dataset[:, 1:]
print(f"data shape: {data.shape}")

data shape: (147, 4)


Create a noisy oracle to simulate a domain expert that makes some mistakes

In [9]:
# Build the simulated oracle from the labels in `target`, seeded with `querier_seed`.
# NOTE(review): 0.1 is presumably the noise probability and 100 the query
# budget; confirm against the ProbabilisticNoisyQuerier signature.
noisy_querier = ProbabilisticNoisyQuerier(
    None,
    target,
    0.1,
    100,
    random_seed=querier_seed,
)

Create a COBRAS object to cluster the data based on the supervision that the noisy querier gives:

In [10]:
# Noise-robust COBRAS configuration.
clusterer = COBRAS(
    noise_probability=0.1,
    minimum_approximation_order=3,
    maximum_approximation_order=6,
)
# Keep the first two values returned by fit() and drop any others.
# NOTE(review): the meaning of the positional -1 and None is not visible here; check COBRAS.fit.
all_clusters, runtimes, *_ = clusterer.fit(data, -1, None, noisy_querier)
# The last entry of each sequence is taken as the end result of the run.
best_clustering_robust, runtime_robust = all_clusters[-1], runtimes[-1]

Evaluate the clustering quality

In [11]:
# Compare the final noise-robust clustering against the ground-truth labels.
ARI_score_robust = adjusted_rand_score(
    labels_true=target,
    labels_pred=best_clustering_robust,
)
print(f"Clustering took {runtime_robust:0.3f}, ARI = {ARI_score_robust:0.3f}")

Clustering took 32.644, ARI = 0.882


For reference, also run COBRAS without any noise-handling mechanism

In [12]:
# Queriers are stateful, so never reuse one across runs: the baseline gets a
# fresh querier built with the same arguments and seed as the earlier one.
noisy_querier = ProbabilisticNoisyQuerier(
    None,
    target,
    0.1,
    100,
    random_seed=querier_seed,
)

# Fresh COBRAS instance with noise correction switched off.
clusterer = COBRAS(correct_noise=False)
# Keep the first two values returned by fit() and drop any others.
all_clusters, runtimes, *_ = clusterer.fit(data, -1, None, noisy_querier)
# The last entry of each sequence is taken as the end result of the run.
best_clustering, runtime = all_clusters[-1], runtimes[-1]

# Compare the final baseline clustering against the ground-truth labels.
ARI_score = adjusted_rand_score(labels_true=target, labels_pred=best_clustering)
print(f"Clustering took {runtime:0.3f}, ARI = {ARI_score:0.3f}")

Clustering took 1.092, ARI = 0.482


As you can see, nCOBRAS performs significantly better than COBRAS in the presence of noise (ARI of 0.882 versus 0.482 in the runs above).
However, reasoning about noisy constraints takes time, so nCOBRAS is also considerably slower than COBRAS (a reported runtime of 32.644 versus 1.092).
