In [None]:
import numpy as np
import pandas as pd
from algorithms.state_of_the_art_algorithms import FuzzyDoubleKmeans, WFDK
from algorithms.kernel_fuzzy_coclustering import GKFDK, WGKFDK
from algorithms.double_subspace_coclustering import DWGKFDK
from sklearn.metrics  import adjusted_rand_score
from metrics import frigui_index, fuzzy_to_crisp
import warnings
warnings.filterwarnings('ignore')
import os
import pickle
import copy 

In [None]:
def get_metrics(y, U):
    """Score a fuzzy partition ``U`` against the reference labels ``y``.

    Returns a length-2 float array: position 0 holds the adjusted Rand index
    computed on the crisp labels obtained via ``fuzzy_to_crisp(U)``, position 1
    holds ``frigui_index(U, y)`` computed on the fuzzy partition itself.
    """
    crisp_labels = fuzzy_to_crisp(U)
    ari = adjusted_rand_score(y, crisp_labels)
    frg = frigui_index(U, y)

    scores = np.array([np.nan, np.nan])
    scores[0], scores[1] = ari, frg
    return scores


def apply_N_times(X, model, y , n = 100, path = None, random_state = 26):
    """Fit independent copies of ``model`` ``n`` times and collect each run's metrics.

    Parameters
    ----------
    X : data forwarded to ``model.fit`` as ``X``.
    model : estimator template. It is deep-copied before every run so that runs
        never share state; ``fit(X=..., random_state=...)`` must return an
        object exposing a ``U`` attribute.
    y : reference labels forwarded to ``get_metrics``.
    n : number of repetitions.
    path : optional output path without extension. When given, the table is
        also written as CSV to ``path + '.txt'``.
    random_state : base seed; run ``i`` is fitted with ``random_state + n + i``.

    Returns
    -------
    pandas.DataFrame
        One row per run with columns ``'ARI'`` and ``'FRG'``. The table is
        returned whether or not it was written to disk.
    """
    rows = []
    for i in range(n):
        mdl = copy.deepcopy(model)
        # Seed formula kept as-is so previously produced results stay reproducible.
        seed = random_state + n + i
        fitted_model = mdl.fit(X = X, random_state = seed) # training step
        rows.append(get_metrics(y=y, U=fitted_model.U))
        print(f"Iteração: {i+1}/{n}", end='\r')

    # reshape keeps the (n, 2) layout even when n == 0.
    rs = pd.DataFrame(np.asarray(rows, dtype=float).reshape(-1, 2),
                      columns=['ARI', 'FRG'])
    if path is not None:
        rs.to_csv(path + '.txt', header= True, index = False)
    return rs

def standardization(X):
    """Z-score every column of ``X`` and return the result as a DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame or any 2-D array-like accepted by ``pd.DataFrame``
        (NumPy arrays and their subclasses, nested lists, ...).

    Returns
    -------
    pandas.DataFrame
        Each column has its mean subtracted and is divided by its standard
        deviation as computed by ``np.std``. NaN values in the result are
        replaced by 0, so a constant column (zero standard deviation) becomes
        all zeros.
    """
    def _zscore(column):
        return (column - np.mean(column)) / (np.std(column))

    # Convert anything that is not already a DataFrame; the previous exact
    # type check missed ndarray subclasses and plain nested lists.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    X_scale = X.apply(_zscore, axis=0)
    # 0/0 from constant columns yields NaN; map those (and any other NaN) to 0.
    return X_scale.fillna(0)

def numerical_class(y):
    """Encode the labels in ``y`` as integer codes.

    Each entry is replaced by the position of its value among the sorted
    unique values of ``y`` (the inverse index reported by ``np.unique``).
    """
    _, codes = np.unique(y, return_inverse=True)
    return codes

def load_datasets(path):
    """Read the pickle file at ``path`` and return the object stored in it."""
    # NOTE(review): pickle executes arbitrary code on load; only use on trusted files.
    with open(path, 'rb') as stream:
        return pickle.load(stream)

In [None]:
# Names of the benchmark datasets. Each name is used as the file name under
# 'datasets/' and as the row label in the hyperparameter tables.
dtsets = [
    'wdbc',
    'vehicle',
    'chemical_composition',
    'vertebral_column_2C',
    'vertebral_column',
    'breast_tissue',
    'abalone',
    'fruit',
    'gtzan',
    'tox_171',
]

In [None]:
# Summary table: number of rows (N), columns (P) and distinct labels (K)
# for every dataset listed in `dtsets`.
rs = np.zeros((len(dtsets), 3))
for i, dataset_name in enumerate(dtsets):
    path = 'datasets/' + dataset_name
    X, y, _ = load_datasets(path)
    N, P = X.shape
    K = len(np.unique(y))
    rs[i] = (N, P, K)
rs = rs.astype('int')
pd.DataFrame(rs, index=dtsets, columns=['N', 'P', 'K'])


### FDK

In [None]:
# Run FuzzyDoubleKmeans 100 times on every dataset, using the `m` and `n`
# values stored per dataset in configurations/FDK.csv (rows indexed by name).
best_hyperparameters = pd.read_csv('configurations/FDK.csv', index_col=0)
path_final = 'results_real'
model = 'fdk'
for dt in dtsets:
    # One output folder per dataset; exist_ok makes re-running the cell safe.
    path1 = os.path.join(path_final, dt)
    os.makedirs(path1, exist_ok=True)
    path2 = os.path.join(path1, model)  # apply_N_times appends '.txt'

    m = best_hyperparameters.loc[dt, 'm']
    n = best_hyperparameters.loc[dt, 'n']

    X, y, _ = load_datasets('datasets/' + dt)
    X = standardization(X)
    y = numerical_class(y)

    # The number of distinct labels is used for both K and H.
    n_clusters = len(np.unique(y))
    mdl = FuzzyDoubleKmeans(K = n_clusters, H = n_clusters, m = m, n = n)
    print(f"Dataset: {dt}(K = H = {n_clusters})")
    apply_N_times(X = X, model = mdl, y = y, n = 100, path = path2 , random_state = 100)
    

### WFDK

In [None]:
# Run WFDK 100 times on every dataset, using the `m`, `n` and `gamma` values
# stored per dataset in configurations/WFDK.csv (rows indexed by name).
best_hyperparameters = pd.read_csv('configurations/WFDK.csv', index_col=0)
path_final = 'results_real'
model = 'wfdk'
for dt in dtsets:
    # One output folder per dataset; exist_ok makes re-running the cell safe.
    path1 = os.path.join(path_final, dt)
    os.makedirs(path1, exist_ok=True)
    path2 = os.path.join(path1, model)  # apply_N_times appends '.txt'

    m = best_hyperparameters.loc[dt, 'm']
    n = best_hyperparameters.loc[dt, 'n']
    gamma = best_hyperparameters.loc[dt, 'gamma']

    X, y, _ = load_datasets('datasets/' + dt)
    X = standardization(X)
    y = numerical_class(y)

    # The number of distinct labels is used for both K and H.
    n_clusters = len(np.unique(y))
    mdl = WFDK(K = n_clusters, H = n_clusters, m = m, n = n, gamma = gamma)
    print(f"Dataset: {dt}(K = H = {n_clusters})")
    apply_N_times(X = X, model = mdl, y = y, n = 100, path = path2 , random_state = 100)
    

### GKFDK

In [None]:
# Run GKFDK 100 times on every dataset. `sigma2` comes from
# configurations/caputo.csv and `m`, `n` from configurations/GKFDK.csv;
# both tables are indexed by dataset name.
caputo = pd.read_csv('configurations/caputo.csv', index_col=0)
best_hyperparameters = pd.read_csv('configurations/GKFDK.csv', index_col=0)
path_final = 'results_real'
model = 'gkfdk'
for dt in dtsets:
    # One output folder per dataset; exist_ok makes re-running the cell safe.
    path1 = os.path.join(path_final, dt)
    os.makedirs(path1, exist_ok=True)
    path2 = os.path.join(path1, model)  # apply_N_times appends '.txt'

    sig2 = caputo.loc[dt, 'sigma2']
    m = best_hyperparameters.loc[dt, 'm']
    n = best_hyperparameters.loc[dt, 'n']

    X, y, _ = load_datasets('datasets/' + dt)
    X = standardization(X)
    y = numerical_class(y)

    # The number of distinct labels is used for both K and H.
    n_clusters = len(np.unique(y))
    mdl = GKFDK(K = n_clusters, H = n_clusters, m = m, n = n,sigma2 = sig2, epsilon = 1e-5)
    print(f"Dataset: {dt}(K = H = {n_clusters})")
    apply_N_times(X = X, model = mdl, y = y, n = 100, path = path2 , random_state = 100)
    

### WGKFDK

In [None]:
# Run WGKFDK 100 times on every dataset. `sigma2` comes from
# configurations/caputo.csv and `m`, `n` from configurations/WGKFDK.csv;
# both tables are indexed by dataset name.
caputo = pd.read_csv('configurations/caputo.csv', index_col=0)
best_hyperparameters = pd.read_csv('configurations/WGKFDK.csv', index_col=0)
path_final = 'results_real'
model = 'wgkfdk'
for dt in dtsets:
    # One output folder per dataset; exist_ok makes re-running the cell safe.
    path1 = os.path.join(path_final, dt)
    os.makedirs(path1, exist_ok=True)
    path2 = os.path.join(path1, model)  # apply_N_times appends '.txt'

    sig2 = caputo.loc[dt, 'sigma2']
    m = best_hyperparameters.loc[dt, 'm']
    n = best_hyperparameters.loc[dt, 'n']

    X, y, _ = load_datasets('datasets/' + dt)
    X = standardization(X)
    y = numerical_class(y)

    # The number of distinct labels is used for both K and H.
    n_clusters = len(np.unique(y))
    mdl = WGKFDK(K = n_clusters, H = n_clusters, m = m, n = n,sigma2 = sig2, epsilon = 1e-5)
    print(f"Dataset: {dt}(K = H = {n_clusters})")
    apply_N_times(X = X, model = mdl, y = y, n = 100, path = path2 , random_state = 100)
    

### DWGKFDK

In [None]:
# Run DWGKFDK 100 times on every dataset. `sigma2` comes from
# configurations/caputo.csv and `m`, `n` from configurations/DWGKFDK.csv;
# both tables are indexed by dataset name.
caputo = pd.read_csv('configurations/caputo.csv', index_col=0)
best_hyperparameters = pd.read_csv('configurations/DWGKFDK.csv', index_col=0)
path_final = 'results_real'
model = 'dwgkfdk'
for dt in dtsets:
    # One output folder per dataset; exist_ok makes re-running the cell safe.
    path1 = os.path.join(path_final, dt)
    os.makedirs(path1, exist_ok=True)
    path2 = os.path.join(path1, model)  # apply_N_times appends '.txt'

    sig2 = caputo.loc[dt, 'sigma2']
    m = best_hyperparameters.loc[dt, 'm']
    n = best_hyperparameters.loc[dt, 'n']

    X, y, _ = load_datasets('datasets/' + dt)
    X = standardization(X)
    y = numerical_class(y)

    # The number of distinct labels is used for both K and H.
    n_clusters = len(np.unique(y))
    mdl = DWGKFDK(K = n_clusters, H = n_clusters, m = m, n = n,sigma2 = sig2, epsilon = 1e-5)
    print(f"Dataset: {dt}(K = H = {n_clusters})")
    apply_N_times(X = X, model = mdl, y = y, n = 100, path = path2 , random_state = 100)
    