In [1]:
import numpy as np
import pandas as pd
import itertools
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

# Normalización
from sklearn.preprocessing import StandardScaler

# Reducción de Dimensionalidad
from umap import UMAP
from sklearn.decomposition import PCA

# Fuzzy Clustering
import skfuzzy as fuzz

# Metricas
from sklearn.metrics import rand_score, adjusted_rand_score



2023-10-12 17:26:47.962131: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-12 17:26:47.996213: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-12 17:26:48.232339: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-12 17:26:48.234447: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def seleccionar_muestra(sample_length = None, input_folder="output/datasets/", prefijo="features_completos_", sufijo=".csv.gz"):
    """Build a class-stratified sample from the per-class feature files.

    For each of the five class names the file
    ``input_folder + prefijo + <class> + sufijo`` is read with ``pd.read_csv``
    and ``sample_length`` rows are drawn from it without replacement, using
    NumPy's global random state.

    Parameters
    ----------
    sample_length : int or None
        Number of rows to draw from each class file. ``None`` keeps every row
        of every file (no sampling).
    input_folder : str
        Folder prefix; it is concatenated as-is, so it must end with a path
        separator.
    prefijo, sufijo : str
        File name prefix and suffix placed around the class name.

    Returns
    -------
    pd.DataFrame
        The per-class samples stacked in class order. The index is reset
        inside each class chunk, so index labels repeat across classes.

    Raises
    ------
    ValueError
        If ``sample_length`` is larger than the number of rows of a file
        (raised by ``np.random.choice`` with ``replace=False``).
    """
    clases = ["Arborio", "Basmati", "Ipsala", "Jasmine", "Karacadag"]

    partes = []
    for clase in clases:
        path = input_folder + prefijo + clase + sufijo
        df = pd.read_csv(path)
        if sample_length is not None:
            # Draw positions from the rows actually present in the file rather
            # than from a hard-coded row count.
            ids = np.random.choice(len(df), size=sample_length, replace=False)
            df = df.iloc[ids]
        partes.append(df.reset_index(drop=True))

    # Concatenate once instead of growing a DataFrame inside the loop.
    sample_df = pd.concat(partes, axis=0)
    print(f"Stratified Sample of {sample_df.shape[0]}")
    return sample_df

def seleccionar_features(X:pd.DataFrame, features:str):
    """Split a sample into a feature table and its list of class labels.

    Parameters
    ----------
    X : pd.DataFrame
        Sample containing the 'image_id' and 'class_name' columns, the six
        morphological columns and the columns named '0' .. '4095'.
    features : str
        Which feature group to keep: 'morfologicos' (drops the '0'..'4095'
        columns), 'conv2d' (drops the morphological columns) or 'both'
        (drops only the identifier columns).

    Returns
    -------
    (pd.DataFrame, list)
        The remaining feature columns and the values of ``X.class_name``.

    Raises
    ------
    ValueError
        If ``features`` is not one of the three accepted values.
    """
    # Guard clause: reject unknown modes before touching the data.
    if features not in ("morfologicos", "conv2d", "both"):
        raise ValueError("El parámetro 'features' solo acepta 'morfologicos', 'conv2d', 'both'")

    id_cols = ['image_id', 'class_name']
    morph_cols = ['area', 'eccentricity', 'perimeter', 'orientation', 'axis_major_length', 'axis_minor_length']
    conv_cols = list(map(str, range(4096)))

    # The identifier columns are always removed; each mode removes the other group too.
    to_drop = list(id_cols)
    if features == "morfologicos":
        to_drop += conv_cols
    elif features == "conv2d":
        to_drop += morph_cols

    kept = X.drop(columns=to_drop)
    labels = X.class_name.to_list()
    return kept, labels

     
def hacer_reduccion(X:pd.DataFrame, normalizacion:bool, metodo:str, umap_params:dict):
    """Optionally standardize the features and reduce their dimensionality.

    Parameters
    ----------
    X : pd.DataFrame or array-like
        Feature matrix with one row per sample.
    normalizacion : bool
        When true, the data is passed through ``StandardScaler`` first.
    metodo : str
        'pca', 'umap' or 'both'. With 'both', PCA runs first and UMAP is then
        fitted on the PCA output.
    umap_params : dict or None
        ``{'n_neighbors': int, 'min_dist': float, 'n_components': int,
        'metric': str}``. A falsy value makes UMAP use its own defaults.

    Returns
    -------
    The transformed data as returned by the last reducer that ran.

    Raises
    ------
    ValueError
        If ``metodo`` is not one of the accepted values.
    KeyError
        If ``umap_params`` is non-empty but lacks one of the four keys.
    """
    # Fail fast, before doing any (potentially expensive) scaling.
    if metodo not in ("pca", "umap", "both"):
        raise ValueError("El parámetro 'metodo' solo acepta  'pca', 'umap', 'both'")

    if normalizacion:
        X = StandardScaler().fit_transform(X)

    if metodo in ("pca", "both"):
        print("Reduction Method PCA")
        DEFAULT_MIN_COMP = 100
        n_samples, n_features = X.shape[0], X.shape[1]
        # PCA cannot extract more components than min(n_samples, n_features).
        n_components = min(DEFAULT_MIN_COMP, n_samples, n_features)
        pca = PCA(n_components=n_components)
        X = pca.fit(X).transform(X)

    # Independent `if` (not `elif`) so that 'both' really chains PCA -> UMAP.
    if metodo in ("umap", "both"):
        print("Reduction Method UMAP")
        if umap_params:
            reducer = UMAP(n_neighbors=umap_params['n_neighbors'],
                           min_dist=umap_params['min_dist'],
                           n_components=umap_params['n_components'],
                           metric=umap_params['metric'])
        else:
            reducer = UMAP()
        X = reducer.fit_transform(X)

    return X
        

def obtener_fuzzy_clusters(X:pd.DataFrame, fuzzy_params:dict):
    """Run fuzzy c-means on the samples and return one hard cluster label per row.

    Parameters
    ----------
    X : array-like
        Data with one row per sample; it is transposed before being handed to
        ``fuzz.cluster.cmeans``.
    fuzzy_params : dict or None
        ``{'n_clusters': int, 'm': number}``. Missing keys, or a falsy
        ``fuzzy_params``, fall back to ``DEFAULT_N_CLUSTERS`` and ``DEFAULT_M``.

    Returns
    -------
    np.ndarray
        For every sample, the index of the cluster with the highest membership.
    """
    DEFAULT_ERROR = 0.001
    DEFAULT_MAX_ITER = 1000
    # Fallbacks so that an empty/None parameter dict no longer leaves
    # `n_clusters` and `m` unbound.
    DEFAULT_N_CLUSTERS = 5
    DEFAULT_M = 2

    params = fuzzy_params or {}
    n_clusters = params.get('n_clusters', DEFAULT_N_CLUSTERS)
    m = params.get('m', DEFAULT_M)

    # Only the membership matrix `u` (second return value) is used.
    _, u, _, _, _, _, _ = fuzz.cluster.cmeans(data=np.asarray(X).T,
                                              c=n_clusters,
                                              m=m,
                                              error=DEFAULT_ERROR,
                                              maxiter=DEFAULT_MAX_ITER,
                                              init=None,
                                              seed=123)

    # Harden the fuzzy partition: each sample goes to its highest-membership cluster.
    cluster_membership = np.argmax(u, axis=0)
    print("Fuzzy Clustering already finish!!!")
    return cluster_membership



def make_grid(*args):
    """Return the Cartesian product of the given iterables as a list of tuples."""
    combos = itertools.product(*args)
    return [combo for combo in combos]

def vanDongen(ct):
    """Compute the van Dongen score of a contingency table.

    With ``n`` the total count of the table, the value returned is::

        (2n - sum_i max_j ct[i, j] - sum_j max_i ct[i, j])
        -----------------------------------------------------
        (2n - max_i rowsum_i       - max_j colsum_j)

    Parameters
    ----------
    ct : pd.DataFrame or 2-D array-like
        Contingency table of counts (e.g. the output of ``pd.crosstab``).

    Returns
    -------
    float
        The ratio above. When the denominator is zero (all counts sit in a
        single cell, so the numerator is zero too) ``0.0`` is returned
        instead of NaN.
    """
    table = np.asarray(ct)
    n2 = 2 * table.sum()

    # Sum of the per-row maxima and of the per-column maxima.
    sumi = table.max(axis=1).sum()
    sumj = table.max(axis=0).sum()

    # Largest row total and largest column total.
    maxsumi = table.sum(axis=1).max()
    maxsumj = table.sum(axis=0).max()

    denominator = n2 - maxsumi - maxsumj
    if denominator == 0:
        # 0/0 case: avoid returning NaN for a single-cell table.
        return 0.0
    return (n2 - sumi - sumj) / denominator

def cross_tab(Labels_orig, Labels_clust):
    """Cross-tabulate cluster labels against true labels and score the match.

    Parameters
    ----------
    Labels_orig : sequence
        Original (true) labels.
    Labels_clust : sequence
        Labels assigned by the clustering algorithm.

    Returns
    -------
    (ct, rand, arand, vandon)
        ``ct`` is the crosstab with cluster labels as rows and original labels
        as columns; the rest are the Rand score, the adjusted Rand score and
        the value of ``vanDongen(ct)``. The scores are also printed.
    """
    paired = pd.DataFrame({'Labels_orig': Labels_orig, 'Labels_clust': Labels_clust})
    ct = pd.crosstab(index=paired['Labels_clust'], columns=paired['Labels_orig'])

    rand = rand_score(Labels_orig, Labels_clust)
    arand = adjusted_rand_score(Labels_orig, Labels_clust)
    vandon = vanDongen(ct)

    print(f'RAND score={rand:.4f}, Ajusted RAND={arand:.4f}, vanDongen={vandon:.4f} cantidad_de_muestras={len(Labels_orig):,d}')
    scores = (ct, rand, arand, vandon)
    return scores


def grid_search(grid_params:dict):
    """Run the sample -> features -> reduction -> fuzzy clustering pipeline over a grid.

    Every combination of the lists in ``grid_params`` is evaluated once and
    its scores and timings are collected.

    Parameters
    ----------
    grid_params : dict
        Must contain:
        ``'n_samples_list'``, ``'feature_selection_list'``,
        ``'reduction_method_list'``,
        ``'umap_params_grid'`` (dict with ``'n_neighbors'``, ``'min_dist'``,
        ``'n_components'``, ``'metric'`` lists) and
        ``'fuzzy_params_grid'`` (dict with ``'n_clusters'`` and ``'m'`` lists).

    Returns
    -------
    pd.DataFrame
        One row per combination with the parameters, the ``rand``, ``arand``
        and ``vandongen`` scores and three timings in seconds: total, excluding
        data loading, and excluding data loading plus feature selection.

    Raises
    ------
    KeyError
        If a required key is missing from ``grid_params``.
    """
    umap_grid = grid_params['umap_params_grid']
    fuzzy_grid = grid_params['fuzzy_params_grid']

    grid = make_grid(grid_params['n_samples_list'],
                     grid_params['feature_selection_list'],
                     grid_params['reduction_method_list'],
                     umap_grid['n_neighbors'], umap_grid['min_dist'],
                     umap_grid['n_components'], umap_grid['metric'],
                     fuzzy_grid['n_clusters'], fuzzy_grid['m'])

    entries = []
    for n, f, r, unn, umd, unc, um, fnc, m in grid:
        # The UMAP metric is part of the combination, so it is logged as well.
        print("Parámetros: ", n, f, r, unn, umd, unc, um, fnc, m)

        t_start = time.perf_counter()
        X = seleccionar_muestra(sample_length=n)
        t_loaded = time.perf_counter()
        sample_features, sample_labels = seleccionar_features(X, features=f)
        umap_params = {'n_neighbors': unn, 'min_dist': umd, 'n_components': unc, 'metric': um}
        fuzzy_params = {'n_clusters': fnc, 'm': m}
        t_selected = time.perf_counter()

        reduced_data = hacer_reduccion(X=sample_features, normalizacion=True, metodo=r, umap_params=umap_params)
        cluster_membership = obtener_fuzzy_clusters(X=reduced_data, fuzzy_params=fuzzy_params)
        _, rand, arand, vandongen = cross_tab(sample_labels, cluster_membership)

        # A single end timestamp keeps the three durations mutually consistent.
        t_end = time.perf_counter()
        elapsed_time = t_end - t_start
        time_excluding_data_loading = t_end - t_loaded
        time_excluding_feature_selection = t_end - t_selected

        # Report the real sample size instead of assuming `n` rows times five classes.
        n_samples = len(sample_labels)
        entries.append([n_samples, f, r, unn, umd, unc, um, fnc, m,
                        rand, arand, vandongen,
                        elapsed_time, time_excluding_data_loading, time_excluding_feature_selection])
        print(f"Elapsed time {elapsed_time}\n")

    output_cols = ['n_samples', 'feature_selection', 'reduction_method',
                   'umap_n_neighbors', 'umap_min_dist', 'umap_n_components', 'umap_metric',
                   'fuzzy_n_clusters', 'm', 'rand', 'arand', 'vandongen',
                   'elapsed_time', 'time_excluding_data_loading', 'time_excluding_feature_selection']
    return pd.DataFrame(entries, columns=output_cols)



In [None]:
# Some entries are set to quick test values; replace them with the
# commented-out values for the full run.
# NOTE(review): `grid_params` is not defined in any cell visible here; it must
# be defined before this call. grid_search reads the keys 'n_samples_list',
# 'feature_selection_list', 'reduction_method_list', 'umap_params_grid'
# ('n_neighbors', 'min_dist', 'n_components', 'metric') and
# 'fuzzy_params_grid' ('n_clusters', 'm').



result_grid_search = grid_search(grid_params=grid_params)