In [3]:
from rac.experiment_data import ExperimentReader
# Reader constructed with the single metric "rand" (a later cell passes the
# same kind of list through the `metrics=` keyword).
exp_reader = ExperimentReader(["rand"])

# Datasets swept by this experiment.
datasets = ["20newsgroups", "cifar10", "mnist", "cardiotocography", "yeast", "breast_cancer", "ecoli", "forest_type_mapping", "mushrooms", "user_knowledge"]

# Index handed to generate_experiments below; it is overwritten with that
# call's return value.
start_index = 1
# Option groups forwarded as keyword arguments to generate_experiments.
# NOTE(review): list-valued options look like sweep axes and scalar options like
# fixed settings — confirm against ExperimentReader.generate_experiments.
config = {
    "general_options": {
        "experiment_name": "info_gain_exp",
        "num_repeats": 14,
        "n_workers": 14,
        "local": True,
        "verbose": False
    },
    "experiment_options": {
        "seed": 33,
        "num_feedback": 0.01,
        "noise_level": [0.4],
        "persistent_noise_level": 0.0,
        "save_matrix_data": False,
        "infer_sims": False,
        "predict_sims": False,
        "clustering_alg": "CC",
        "warm_start": [0, 0.01, 0.02, 0.05]
    },

    "sim_init_options": {
        "K_init": 10,
        "sim_init": [0.01],
        "sim_init_type": ["zeros"]
    },
    "query_strategy_options": {
        # Acquisition functions under comparison.
        "acq_fn": ["freq", "entropy", "info_gain_object", "cluster_incon", "maxexp"],
        "eps": 0.3,
        "beta": 8,
        "tau": [7],
        "alpha": 1,
        "num_maxmin_edges": -1,

        "use_power": [True],
        # NOTE(review): key is spelled "grumbel"; it must match whatever the
        # project code reads, so do not rename it here without checking.
        "use_grumbel": [True],
        "power_beta": [1],
        "sparse_sim_matrix": [True],
        "mean_field_beta": [20],
        "info_gain_lambda": [20],
        "running_avg": True,
        "num_edges_info_gain": [50],
        "U_size": [1],
        "G_size": [1],
        "info_gain_object_mode": ["uniform"],
        "info_gain_pair_mode": ["uniform", "entropy"],
        "mf_iterations": [50],
    },
    "dataset_options": {
        "dataset": datasets
    }
}

# Write the experiment configs under ../configs/info_gain_exp/.
# NOTE(review): the return value is stored back into start_index, presumably the
# next free experiment index — confirm.
start_index = exp_reader.generate_experiments(
    folder="../configs/info_gain_exp/", 
    options_to_keep=[],
    start_index=start_index,
    **config
)

In [5]:
from rac.experiment_data import ExperimentReader
# Metrics requested from the reader for the plots below.
metrics = ["rand", "ami"]
ac = ExperimentReader(metrics=metrics)

datasets = ["20newsgroups", "cifar10", "mnist", "cardiotocography", "yeast", "breast_cancer", "ecoli", "forest_type_mapping", "mushrooms", "user_knowledge"]

# Same option grid as the config-generation cell above; it is forwarded to
# generate_AL_curves as keyword arguments.
# NOTE(review): this is a literal copy of the earlier `config`; the two must be
# kept in sync by hand.
config = {
    "general_options": {
        "experiment_name": "info_gain_exp",
        "num_repeats": 14,
        "n_workers": 14,
        "local": True,
        "verbose": False
    },
    "experiment_options": {
        "seed": 33,
        "num_feedback": 0.01,
        "noise_level": [0.4],
        "persistent_noise_level": 0.0,
        "save_matrix_data": False,
        "infer_sims": False,
        "predict_sims": False,
        "clustering_alg": "CC",
        "warm_start": [0, 0.01, 0.02, 0.05]
    },

    "sim_init_options": {
        "K_init": 10,
        "sim_init": [0.01],
        "sim_init_type": ["zeros"]
    },
    "query_strategy_options": {
        "acq_fn": ["freq", "entropy", "info_gain_object", "cluster_incon", "maxexp"],
        "eps": 0.3,
        "beta": 8,
        "tau": [7],
        "alpha": 1,
        "num_maxmin_edges": -1,

        "use_power": [True],
        "use_grumbel": [True],
        "power_beta": [1],
        "sparse_sim_matrix": [True],
        "mean_field_beta": [20],
        "info_gain_lambda": [20],
        "running_avg": True,
        "num_edges_info_gain": [50],
        "U_size": [1],
        "G_size": [1],
        "info_gain_object_mode": ["uniform"],
        "info_gain_pair_mode": ["uniform", "entropy"],
        "mf_iterations": [50],
    },
    "dataset_options": {
        "dataset": datasets
    }
}


# Load the locally stored results, then draw the curves into ../plots/info_gain_exp/.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# run captured in this notebook did not finish.
data = ac.read_all_data(folder="../experiment_results_local/info_gain_exp/")
ac.generate_AL_curves(
    data,
    save_location="../plots/info_gain_exp/",
    # NOTE(review): presumably one figure per combination of the `categorize`
    # options with one curve per `compare` value — confirm in ExperimentReader.
    categorize=["dataset", "warm_start", "use_grumbel", "power_beta", "info_gain_pair_mode"],
    compare=["acq_fn"],
    vary=["x"],
    auc=True,
    summary_method="auc_max_ind",
    indices=[], 
    threshold=1,
    err_style="band",
    marker="o",
    markersize=6,
    capsize=6,
    linestyle="solid",
    **config
)

KeyboardInterrupt: 

In [1]:
import numpy as np
from scipy.stats import entropy as scipy_entropy
from scipy.special import softmax as scipy_softmax
def select_objects_info_gain(q, U_size, x, y, mode="uniform"):
    """Pick the auxiliary objects to use alongside the pair ``(x, y)``.

    Parameters
    ----------
    q : np.ndarray
        2-D array indexed by object along axis 0. Row ``i`` is multiplied
        element-wise with rows ``x`` and ``y`` and summed, so each row is
        presumably a cluster-assignment distribution — TODO confirm.
    U_size : int
        Number of objects to sample ("uniform") or to keep ("entropy").
    x, y : int
        Indices of the pair; they never appear in the result.
    mode : {"uniform", "entropy", "uniform_varying"}
        * ``"uniform"``: sample ``U_size`` distinct objects uniformly, then drop
          ``x`` and ``y`` (so fewer than ``U_size`` objects may be returned).
        * ``"entropy"``: keep the ``U_size`` objects whose relation to ``x`` and
          ``y`` has the highest average binary entropy.
        * ``"uniform_varying"``: return an empty array.

    Returns
    -------
    np.ndarray
        Sorted, unique object indices without ``x`` and ``y``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values.
    """
    N = q.shape[0]
    if mode == "uniform":
        return np.setdiff1d(np.random.choice(N, U_size, replace=False), [x, y])
    elif mode == "entropy":
        # Every object except the pair itself is a candidate.
        candidates = np.setdiff1d(np.arange(N), [x, y])

        def pair_entropy(anchor):
            # Sum over clusters of q[i, k] * q[anchor, k] for every candidate i.
            p_same = np.sum(q[candidates, :] * q[anchor, :], axis=1)
            # Guard against rounding pushing the value slightly outside [0, 1],
            # which would turn the entropy into -inf.
            p_same = np.clip(p_same, 0.0, 1.0)
            # scipy's entropy reduces along axis 0 by default, so the two
            # outcomes must be the rows and the candidates the columns to get
            # one entropy value per candidate.
            return scipy_entropy(np.stack((p_same, 1 - p_same), axis=0))

        avg_entropy = (pair_entropy(x) + pair_entropy(y)) / 2

        # Highest average entropy first; keep the top U_size candidates.
        ranked = np.argsort(avg_entropy)[::-1][:U_size]
        return np.setdiff1d(candidates[ranked], [x, y])
    elif mode == "uniform_varying":
        return np.array([])
    else:
        raise ValueError("Invalid mode (objects): {}".format(mode))

In [2]:
from scipy.stats import entropy as scipy_entropy
from scipy.special import softmax as scipy_softmax
import numpy as np
def update_mean_fields(q_0, h_0, S, x, y, lmbda, beta, L, U_size, G_size):
    """Run ``L`` full mean-field passes, updating the fields incrementally.

    The first pass recomputes the fields from scratch as ``h = -S.dot(q)``.
    Every later pass sets ``q = softmax(-beta * h)`` row-wise and then adds
    ``S.dot(q_prev - q)`` to ``h``, which keeps ``h`` equal to ``-S.dot(q)``
    without recomputing it. A final softmax over the rows in ``U_all``
    produces the returned ``q``.

    NOTE(review): a later cell of this notebook defines another function with
    the same name; whichever cell ran last is the one that gets called.

    Parameters
    ----------
    q_0 : np.ndarray
        Starting assignment matrix, one row per object; it is copied, not
        modified.
    h_0 : np.ndarray
        Starting fields. Only used when ``L == 0``; from the first pass on they
        are replaced by ``-S.dot(q)``.
    S : np.ndarray or scipy sparse matrix
        Square similarity matrix. Only ``S.dot`` is used, so dense and sparse
        inputs are treated identically.
    x, y : int
        Pair of object indices; they seed ``U_all``.
    lmbda, U_size, G_size
        Accepted for signature compatibility; not used by this variant.
    beta : float
        Scale applied to ``-h`` inside the softmax.
    L : int
        Number of passes.

    Returns
    -------
    (np.ndarray, np.ndarray)
        The updated ``q`` and ``U_all``, the indices whose rows were refreshed
        by the final softmax (``[x, y]`` if ``L < 2``, otherwise all objects).
    """
    q = np.copy(q_0)
    h = np.copy(h_0)
    q_prev = np.copy(q_0)
    N = q.shape[0]

    U_all = np.array([x, y])
    # This variant updates every object on each pass.
    U_t = np.arange(N)

    for t in range(1, L + 1):
        if t == 1:
            # S.dot (rather than np.dot) also works when S is a scipy sparse matrix.
            h = -S.dot(q)
            q_prev = np.copy(q)
        else:
            U_all = np.union1d(U_all, U_t).astype(int)

            q = scipy_softmax(-beta * h, axis=1)
            delta_q = q_prev - q

            # Incremental form of h = -S.dot(q).
            h += S.dot(delta_q)

            q_prev = np.copy(q)

    q[U_all] = scipy_softmax(-beta * h[U_all], axis=1)
    return q, U_all

In [399]:
def update_mean_fields(q_0, h_0, S, x, y, lmbda, beta, L, U_size, G_size):
    """Incrementally update fields ``h`` and assignments ``q`` around the pair ``(x, y)``.

    Works on copies of ``q_0`` and ``h_0``. The first pass adds
    ``(S[x, y] - lmbda) * q_0[y]`` to ``h[x]`` and the mirrored term to
    ``h[y]``. Each later pass recomputes a subset of the rows of ``q`` as
    ``softmax(-beta * h)`` and adds the resulting change ``q_prev - q``,
    weighted by ``S`` (and by ``lmbda`` for the pair itself), back into ``h``.

    NOTE(review): this redefines a function of the same name from an earlier
    cell of the notebook; the definition executed last is the one in effect.
    NOTE(review): ``lmbda`` is presumably the hypothetical new similarity of
    the pair (the notebook passes ``xi`` for it) — confirm.

    Parameters
    ----------
    q_0, h_0 : array-like
        Starting assignments and fields, one row per object.
    S : matrix
        Similarity matrix, indexed as ``S[x, y]``, ``S[x, G]`` and
        ``S[U_t][:, G_xy]``.
    x, y : int
        Indices of the pair.
    lmbda, beta : float
        Pair weight and softmax scale.
    L : int
        Number of passes.
    U_size, G_size
        Only referenced by commented-out code; unused here.

    Returns
    -------
    (q, U_all)
        The updated assignments and the indices whose rows were refreshed by
        the final softmax.
    """
    h = np.copy(h_0)
    q = np.copy(q_0)
    q_prev = np.copy(q_0)
    N = h.shape[0]

    #U_size = int(U_size * N)
    #G_size = int(G_size * self.ac.N)
    #G_size = U_size

    # Initialize U^0 as an empty set
    U_prev = np.array([])
    U_all = np.array([x, y])
    delta_q = np.zeros(h.shape)
    #U_t = select_objects_info_gain(mode="uniform", q=q, U_size=U_size, x=x, y=y)
    #U_t = np.setdiff1d(np.random.choice(N, U_size, replace=False), [x, y])
    # Every object except the pair is updated on each pass.
    U_t = np.setdiff1d(np.arange(N), [x, y])
    for t in range(1, L + 1):
        if t == 1:
            # First pass: only the fields of x and y change.
            h[x, :] += S[x, y] * q_0[y, :] - lmbda * q_0[y, :]
            h[y, :] += S[y, x] * q_0[x, :] - lmbda * q_0[x, :]
        else:
            #if self.ac.info_gain_object_mode == "uniform_varying":
            #U_t = np.setdiff1d(np.random.choice(N, U_size, replace=False), [x, y])
            #G = np.setdiff1d(np.random.choice(U_prev, np.minimum(G_size, len(U_prev)), replace=False), [x, y]).astype(int)
            # U_prev is only reassigned at the end of this branch, so G is empty
            # on pass 2 and equals U_t from pass 3 onwards.
            G = U_prev
            G = G.astype(int)
            U_all = np.union1d(U_all, U_t).astype(int)

            # Refresh q for G plus the pair, and record how much those rows moved.
            G_xy = np.append(G, [x, y]).astype(int)
            q[G_xy] = scipy_softmax(-beta*h[G_xy], axis=1)
            #q[[x, y]] = scipy_softmax(-self.ac.mean_field_beta*h[[x, y]], axis=1)
            delta_q[G_xy, :] = (q_prev[G_xy, :] - q[G_xy, :])

            # update for x and y
            h[x, :] += S[x, G].dot(delta_q[G]).reshape(h[x, :].shape)
            h[y, :] += S[y, G].dot(delta_q[G]).reshape(h[y, :].shape)
            h[x, :] += lmbda * (q_prev[y, :] - q[y, :])
            h[y, :] += lmbda * (q_prev[x, :] - q[x, :])

            # update for objects in U_t
            h[U_t, :] += S[U_t][:, G_xy].dot(delta_q[G_xy])


            q_prev = np.copy(q)
            U_prev = U_t

    # Final refresh of every row that was touched during the passes.
    q[U_all] = scipy_softmax(-beta*h[U_all], axis=1)
    return q, U_all

In [3]:

from rac.correlation_clustering import max_correlation, max_correlation_dynamic_K
from scipy.special import softmax as scipy_softmax
from scipy import sparse
def mean_field_clustering(S, K, betas, true_labels, max_iter=100, tol=1e-6, noise_level=0.0, is_sparse=False, predicted_labels=None, h=None, q=None):
    """Cluster objects by iterating the updates ``h = -S.dot(q)``, ``q = softmax(-beta * h)``.

    Parameters
    ----------
    S : np.ndarray or scipy sparse matrix
        Square similarity matrix. A dense array has its diagonal set to zero
        IN PLACE (as before); a sparse matrix is copied first and left
        untouched.
    K : int
        Passed to ``max_correlation_dynamic_K`` when neither ``h`` nor
        ``predicted_labels`` is supplied. The number of columns of ``q`` is
        otherwise taken from the labels (or from ``h``).
    betas : sequence of float
        Softmax scales; the fixed-point loop is run once per value, in order.
    true_labels
        Unused; kept for signature compatibility.
    max_iter : int
        Maximum number of iterations per beta.
    tol : float
        An inner loop stops once the Frobenius norm of the change in ``q``
        drops below this value.
    noise_level : float
        Unused; kept for signature compatibility.
    is_sparse : bool
        When true and ``S`` is dense, ``S`` is converted to CSR for the loop.
    predicted_labels : array-like, optional
        Initial hard clustering used to build the starting fields when ``h`` is
        not given. Label values may be arbitrary: column ``j`` of ``q``
        corresponds to the ``j``-th smallest distinct label.
    h : np.ndarray, optional
        Warm-start fields in the loop's convention (``h = -S.dot(q)``, lower
        is better). When given, label-based initialisation is skipped.
    q : np.ndarray, optional
        Warm-start assignments. If ``h`` is given without ``q``, ``q`` is
        derived as ``softmax(-betas[0] * h)``.

    Returns
    -------
    (labels, q, h)
        ``labels`` is the row-wise argmax of ``q``; ``h`` is the field from the
        last executed iteration.
    """
    if sparse.issparse(S):
        # np.fill_diagonal cannot write into a scipy sparse matrix; clear the
        # diagonal on a LIL copy so the caller's matrix is not modified.
        S = S.tolil(copy=True)
        S.setdiag(0)
        S = S.tocsr()
    else:
        np.fill_diagonal(S, 0)
    N = S.shape[0]
    beta = betas[0]

    if h is None:
        if predicted_labels is None:
            predicted_labels, _ = max_correlation_dynamic_K(S, K, 5)
        predicted_labels = np.asarray(predicted_labels)

        # Iterate over the labels that actually occur rather than range(K), so
        # non-contiguous or offset label values still map to their own column.
        cluster_ids = np.unique(predicted_labels)
        K = len(cluster_ids)

        # Initial field: total similarity of each object to each cluster
        # (higher is better here, hence the positive sign in the softmax below).
        h = np.zeros((N, K))
        for col, cluster_id in enumerate(cluster_ids):
            members = np.where(predicted_labels == cluster_id)[0]
            h[:, col] = np.asarray(S[:, members].sum(axis=1)).ravel()

        q = scipy_softmax(beta * h, axis=1)
    elif q is None:
        # A supplied h follows the loop's convention (h = -S.dot(q)), so the
        # matching assignment is softmax(-beta * h).
        q = scipy_softmax(-beta * h, axis=1)

    if is_sparse and not sparse.issparse(S):
        S = sparse.csr_matrix(S)

    for beta in betas:
        for iteration in range(max_iter):
            h = -S.dot(q)
            q_new = scipy_softmax(-beta * h, axis=1)

            # Check for convergence
            diff = np.linalg.norm(q_new - q)
            if diff < tol:
                print(f'Converged after {iteration} iterations')
                break

            q = q_new
    return np.argmax(q, axis=1), q, h

In [4]:
def generate_matrix(n_samples, n_features, n_clusters, gamma, random_state=None):
    """
    Generate a noisy ground-truth similarity matrix with a given number of clusters.

    Labels come from ``sklearn.datasets.make_classification``. The clean matrix
    has +1 for pairs with the same label, -1 for pairs with different labels
    and 0 on the diagonal. Each unordered pair is then, with probability
    ``gamma``, replaced by a value drawn uniformly from [-1, 1); the matrix
    stays symmetric.

    Parameters:
    n_samples (int): Number of samples in the dataset.
    n_features (int): Number of features in the dataset.
    n_clusters (int): Number of clusters (classes) in the dataset.
    gamma (float): Probability of noise injection, in the range [0, 1].
    random_state (int or np.random.RandomState, optional): Seed for both the
        label generation and the noise injection. With ``None`` the noise is
        drawn from NumPy's global random state, as before.

    Returns:
    tuple[np.ndarray, np.ndarray]: The (n_samples, n_samples) similarity matrix
        with noise, and the label vector it was built from.
    """
    from sklearn.datasets import make_classification
    import numpy as np

    # Only the labels are needed; the generated features are discarded.
    _, y = make_classification(n_samples=n_samples, n_features=n_features, 
                               n_clusters_per_class=1, n_classes=n_clusters, 
                               random_state=random_state, n_informative=n_features, n_redundant=0, n_repeated=0)

    # Create the initial similarity matrix based on labels
    S = np.where(y[:, None] == y, 1, -1).astype(float)  # Ensure it's a float matrix
    np.fill_diagonal(S, 0)

    # Draw the noise from a generator tied to random_state so a seeded call is
    # reproducible; fall back to the global state when no seed is given.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Inject noise into the strict lower triangle and mirror it.
    for i in range(n_samples):
        for j in range(i):
            if rng.rand() < gamma:
                noise = rng.uniform(-1, 1)
                S[i, j] = noise
                S[j, i] = noise

    return S, y

# Run the diagnostic version of the function




In [117]:
# Synthetic problem: 100 objects in 10 classes; gamma=0.7 is the per-pair
# probability with which generate_matrix replaces a similarity by noise.
n_clusters = 10
n_samples = 100
S, true_labels = generate_matrix(n_samples=n_samples, n_features=5, n_clusters=n_clusters, gamma=0.7, random_state=34)

In [118]:
# Baseline mean-field run from a cold start: no labels, h or q are supplied, so
# mean_field_clustering builds its own initial clustering.
beta = 1
clust_sol, q, h = mean_field_clustering(
    S, n_clusters, betas=[beta],
    true_labels=true_labels, max_iter=100, tol=1e-10, noise_level=0.0, 
    is_sparse=False, predicted_labels=None
)

In [119]:
# Reference clustering from the project's max_correlation_dynamic_K, called with
# the same arguments mean_field_clustering uses for its own initialisation.
predicted_labels, _ = max_correlation_dynamic_K(S, n_clusters, 5)

In [120]:
from sklearn.metrics import adjusted_rand_score
# Adjusted Rand index: mean-field solution vs. ground truth.
current_ari = adjusted_rand_score(clust_sol, true_labels)
print(current_ari)

0.4182636925332496


In [121]:
# Adjusted Rand index: max_correlation_dynamic_K solution vs. ground truth.
current_ari = adjusted_rand_score(predicted_labels, true_labels)
print(current_ari)

0.4841764841764842


In [122]:
# Agreement between the two solutions (max_correlation_dynamic_K vs. mean-field).
current_ari = adjusted_rand_score(predicted_labels, clust_sol)
print(current_ari)

0.6121424779388329


In [410]:
# Perturbation setup: (x, y) is the pair whose similarity is overwritten in the
# following cells and xi is the value written into S[x, y] / S[y, x].
beta = 1
x = 1
y = 5
xi = 100

In [411]:
# Reference result: temporarily set the pair's similarity to xi, rerun the full
# mean-field clustering warm-started from the previous h and q, then restore S.
old_val = S[x, y]
S[x, y] = xi
S[y, x] = xi
clust_sol_new, q_new, h_new = mean_field_clustering(
    S, n_clusters, betas=[beta],
    true_labels=true_labels, max_iter=100, tol=1e-10, noise_level=0.0, 
    is_sparse=True, predicted_labels=None, h=h, q=q
)
S[x, y] = old_val
S[y, x] = old_val

In [412]:
# How much the clustering changed after perturbing the pair (before vs. after).
current_ari = adjusted_rand_score(clust_sol, clust_sol_new)
print(current_ari)

0.8049240990838687


In [416]:
# Approximate result: incremental update from the unperturbed q and h, with xi
# passed as `lmbda` and 100 passes.
# NOTE(review): S is rebound to a CSR matrix here, so re-running earlier cells
# afterwards sees a sparse S instead of the dense one.
U_size = 1
G_size = 1
S = sparse.csr_matrix(S)
q_new_fast, U = update_mean_fields(q, h, S, x, y, xi, beta, 100, U_size, G_size)
clust_sol_fast = np.argmax(q_new_fast, axis=1)

In [417]:
# Agreement between the full recomputation and the incremental update.
current_ari = adjusted_rand_score(clust_sol_new, clust_sol_fast)
print(current_ari)

0.9626609965936589


In [418]:
# Agreement between the unperturbed solution and the incremental update.
current_ari = adjusted_rand_score(clust_sol, clust_sol_fast)
print(current_ari)

0.8185350832172447


In [409]:
# Show the raw cluster labels produced by the incremental update.
clust_sol_fast

array([12,  3,  9,  6, 11,  2,  2, 12,  1,  9,  9, 10,  8,  9,  8,  9,  5,
        8,  3,  5,  2, 10,  2,  8, 12,  9,  9,  4,  8,  1, 10,  8,  2,  5,
       12, 12,  1,  2, 12,  9,  8,  9,  3,  3,  3, 12, 12,  5,  4, 11,  9,
        9,  2,  4,  5,  4,  4,  1, 10,  2,  1,  8,  2, 10, 10,  5,  3,  3,
        9,  3,  1,  5,  9, 10,  8,  8, 12,  1,  5,  2, 11,  8,  5,  3, 10,
        8,  9,  9, 10, 10,  8,  8, 12, 12,  5, 11, 11, 10,  4,  2],
      dtype=int64)

In [7]:
# Start from random fields instead of a clustering-based initialisation; q uses
# the same softmax(-beta * h) convention as the mean-field loop.
# NOTE(review): np.random.rand is not seeded here, so this cell is not reproducible.
beta = 10
h = np.random.rand(n_samples, n_clusters)
q = scipy_softmax(-beta*h, axis=1)

In [8]:
# Pair to perturb and the (strongly negative) similarity to write into it.
# NOTE(review): y = 210 is out of range for the n_samples = 100 set in an earlier
# cell; this cell's execution count suggests it ran in a session with a larger
# problem — confirm before re-running top to bottom.
x = 50
y = 210
xi = -100

In [9]:
# Reference result for the random-field start: overwrite the pair with xi, run
# the full mean-field clustering from the supplied h and q, then restore S.
old_val = S[x, y]
S[x, y] = xi
S[y, x] = xi
clust_sol_new, q_new, h_new = mean_field_clustering(
    S, n_clusters, betas=[beta],
    true_labels=true_labels, max_iter=100, tol=1e-10, noise_level=0.0, 
    is_sparse=False, predicted_labels=None, h=h, q=q
)
S[x, y] = old_val
S[y, x] = old_val

Converged after 14 iterations


In [10]:
from sklearn.metrics import adjusted_rand_score
# Adjusted Rand index: full recomputation vs. ground truth.
current_ari = adjusted_rand_score(clust_sol_new, true_labels)
print(current_ari)

1.0


In [29]:
# Incremental update from the same random-field start, with xi passed as `lmbda`
# and 100 passes.
U_size = 1
G_size = 1
q_new_fast, U = update_mean_fields(q, h, S, x, y, xi, beta, 100, U_size, G_size)
clust_sol_fast = np.argmax(q_new_fast, axis=1)

In [30]:
from sklearn.metrics import adjusted_rand_score
# Adjusted Rand index: incremental update vs. ground truth.
current_ari = adjusted_rand_score(clust_sol_fast, true_labels)
print(current_ari)

0.1003578075282969


In [31]:
from sklearn.metrics import adjusted_rand_score
# Adjusted Rand index: incremental update vs. full recomputation.
current_ari = adjusted_rand_score(clust_sol_fast, clust_sol_new)
print(current_ari)

0.1003578075282969
