<a href="https://colab.research.google.com/github/Ignacio-Ibarra/unsupervised-rice-image-segmentation/blob/main/fuzzy_clust_threadings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import itertools
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

# Normalización
from sklearn.preprocessing import StandardScaler

# Reducción de Dimensionalidad
!pip install umap-learn
from umap import UMAP
from sklearn.decomposition import PCA

# Fuzzy Clustering
!pip install scikit-fuzzy
import skfuzzy as fuzz

# Metricas
from sklearn.metrics import rand_score, adjusted_rand_score



Collecting umap-learn
  Downloading umap-learn-0.5.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.4-py3-none-any.whl size=86770 sha256=c96951a11e6deff568f05890b6b0a993e90e9210566b53906f44be0465beb1f5
  Stored in directory: /root/.cache/pip/wheels/fb/66/29/199acf5784d0f7b8add6d466175ab45506c96e386ed5dd0633
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for py

In [3]:
# Mount Google Drive at /content/drive (requires the Colab runtime); the default
# dataset folder used by seleccionar_muestra lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
def seleccionar_muestra(sample_length = None, input_folder="/content/drive/MyDrive/Rice_Image_Dataset/", prefijo="features_completos_", sufijo=".csv.gz"):
    """Build a class-stratified sample from the per-class feature files.

    One CSV per rice class is read from ``input_folder + prefijo + <class> + sufijo``
    and ``sample_length`` rows are drawn from it without replacement, using
    NumPy's global random state.

    Parameters
    ----------
    sample_length : int or None
        Number of rows to draw from each class. ``None`` keeps every row of
        every class.
    input_folder, prefijo, sufijo : str
        Concatenated around the class name to form each file path.

    Returns
    -------
    pandas.DataFrame
        The per-class samples stacked in class order. Each class block keeps
        its own 0..k-1 index, so index labels repeat across classes.

    Raises
    ------
    ValueError
        If ``sample_length`` is larger than the number of rows of a class file.
    """
    clases = ["Arborio", "Basmati", "Ipsala", "Jasmine", "Karacadag"]

    # Collect the pieces and concatenate once instead of growing a frame per class.
    muestras = []
    for clase in clases:
        path = input_folder + prefijo + clase + sufijo
        df = pd.read_csv(path)

        if sample_length is None:
            muestra = df.reset_index(drop=True)
        else:
            # Sample positions from the rows actually present in the file rather
            # than from a hard-coded file length.
            if sample_length > len(df):
                raise ValueError(
                    f"sample_length={sample_length} exceeds the {len(df)} rows available for class '{clase}'"
                )
            ids = np.random.choice(len(df), size=sample_length, replace=False)
            muestra = df.iloc[ids].reset_index(drop=True)

        muestras.append(muestra)

    sample_df = pd.concat(muestras, axis=0)
    print(f"Stratified Sample of {sample_df.shape[0]}")
    return sample_df

def seleccionar_features(X:pd.DataFrame, features:str):
    """Split a sample into a feature table and its list of class labels.

    ``features`` selects which column family is kept:

    * ``'morfologicos'`` - drops the id columns and the columns named '0'..'4095'.
    * ``'conv2d'``       - drops the id columns and the six morphological columns.
    * ``'both'``         - drops only the id columns.

    Returns ``(sample_features, sample_labels)`` where the labels are the
    ``class_name`` column of ``X`` as a list. Any other ``features`` value
    raises ``ValueError``.
    """
    id_columns = ['image_id', 'class_name']

    if features == "both":
        discarded = id_columns
    elif features == "conv2d":
        discarded = id_columns + ['area', 'eccentricity', 'perimeter', 'orientation',
                                  'axis_major_length', 'axis_minor_length']
    elif features == "morfologicos":
        discarded = id_columns + [str(col) for col in range(4096)]
    else:
        raise ValueError("El parámetro 'features' solo acepta 'morfologicos', 'conv2d', 'both'")

    return X.drop(columns=discarded), X.class_name.to_list()


def hacer_reduccion(X:pd.DataFrame, normalizacion:bool, metodo:str, umap_params:dict=None):
    """Optionally standardize ``X`` and reduce its dimensionality.

    Parameters
    ----------
    X : 2-D table of samples (rows) by features (columns).
    normalizacion : bool
        When true, ``X`` is passed through ``StandardScaler`` first.
    metodo : str
        ``'pca'``, ``'umap'`` or ``'both'``. With ``'both'`` PCA runs first and
        UMAP is then applied to the PCA output.
    umap_params : dict or None
        ``{'n_neighbors': int, 'min_dist': float, 'n_components': int, 'metric': str}``.
        A falsy value builds ``UMAP()`` with the library defaults.

    Returns
    -------
    The reduced data as returned by the last reducer that ran.

    Raises
    ------
    ValueError
        If ``metodo`` is not one of the accepted values.
    """
    # Validate up front so a bad method name fails before any work is done.
    if metodo not in ("pca", "umap", "both"):
        raise ValueError("El parámetro 'metodo' solo acepta  'pca', 'umap', 'both'")

    if normalizacion:
        X = StandardScaler().fit_transform(X)

    if metodo in ("pca", "both"):
        print("Reduction Method PCA")
        DEFAULT_MIN_COMP = 100
        n_rows, n_cols = X.shape
        # PCA cannot extract more components than min(n_samples, n_features),
        # so cap by both dimensions (small samples of wide data would fail otherwise).
        n_components = int(min(DEFAULT_MIN_COMP, n_rows, n_cols))
        X = PCA(n_components=n_components).fit_transform(X)

    # Separate `if` (not `elif`): with 'both' this stage must run after PCA.
    if metodo in ("umap", "both"):
        print("Reduction Method UMAP")
        if umap_params:
            reducer = UMAP(n_neighbors=umap_params['n_neighbors'],
                           min_dist=umap_params['min_dist'],
                           n_components=umap_params['n_components'],
                           metric=umap_params['metric'])
        else:
            reducer = UMAP()
        X = reducer.fit_transform(X)

    return X


def obtener_fuzzy_clusters(X:pd.DataFrame, fuzzy_params:dict):
    """Run fuzzy c-means on ``X`` and return a hard cluster label per sample.

    Parameters
    ----------
    X : 2-D data with one sample per row; it is transposed before being handed
        to ``fuzz.cluster.cmeans``.
    fuzzy_params : dict or None
        ``{'n_clusters': int, 'm': number}``. Missing keys, or a falsy value,
        fall back to ``n_clusters=5`` and ``m=2``.

    Returns
    -------
    numpy.ndarray
        For every sample, the index of the cluster with the highest membership.
    """
    DEFAULT_ERROR = 0.001
    DEFAULT_MAX_ITER = 1000
    DEFAULT_N_CLUSTERS = 5
    DEFAULT_M = 2

    # Previously a falsy `fuzzy_params` left n_clusters / m unbound and crashed.
    fuzzy_params = fuzzy_params or {}
    n_clusters = fuzzy_params.get('n_clusters', DEFAULT_N_CLUSTERS)
    m = fuzzy_params.get('m', DEFAULT_M)

    # Only the membership matrix `u` (second returned value) is needed.
    _, u, _, _, _, _, _ = fuzz.cluster.cmeans(data = X.T,
                                              c = n_clusters,
                                              m = m,
                                              error=DEFAULT_ERROR,
                                              maxiter=DEFAULT_MAX_ITER,
                                              init=None,
                                              seed = 123)

    # Harden the fuzzy memberships: pick the strongest cluster along axis 0.
    cluster_membership = np.argmax(u, axis=0)
    print("Fuzzy Clustering already finish!!!")
    return cluster_membership



def make_grid(*args):
    """Return every combination (Cartesian product) of the given iterables as a list of tuples."""
    combinations = itertools.product(*args)
    return [combo for combo in combinations]

def vanDongen(ct):
    """Compute a normalized van Dongen-style disagreement score from a contingency table.

    ``ct`` is a DataFrame of counts. With ``n`` the grand total, the score is
    ``(2n - sum of row maxima - sum of column maxima) /
    (2n - largest row total - largest column total)``.
    """
    row_totals = ct.sum(axis=1)
    col_totals = ct.sum(axis=0)
    twice_total = 2 * row_totals.sum()

    # Mass that a best one-to-one reading of rows/columns would explain.
    matched_by_row = ct.max(axis=1).sum()
    matched_by_col = ct.max(axis=0).sum()

    numerator = twice_total - matched_by_row - matched_by_col
    denominator = twice_total - row_totals.max() - col_totals.max()
    return numerator / denominator

def cross_tab(Labels_orig, Labels_clust):
    """Compare cluster assignments against the reference labels.

    Labels_orig  : original (true) labels.
    Labels_clust : labels found by the clustering algorithm.

    Builds the contingency table (rows = cluster labels, columns = original
    labels), computes the Rand score, the adjusted Rand score and the
    vanDongen score, prints a one-line summary and returns
    ``(ct, rand, arand, vandon)``.
    """
    paired = pd.DataFrame({'Labels_orig': Labels_orig, 'Labels_clust': Labels_clust})
    ct = pd.crosstab(paired['Labels_clust'], paired['Labels_orig'])

    rand = rand_score(Labels_orig, Labels_clust)
    arand = adjusted_rand_score(Labels_orig, Labels_clust)
    vandon = vanDongen(ct)

    print(f'RAND score={rand:.4f}, Ajusted RAND={arand:.4f}, vanDongen={vandon:.4f} cantidad_de_muestras={len(Labels_orig):,d}')
    return ct, rand, arand, vandon


def get_grid_from_dict(grid_params:dict):
    """Expand the nested search configuration into a flat list of parameter tuples.

    Each tuple is ordered as: n_samples, feature_selection, reduction_method,
    umap n_neighbors, umap min_dist, umap n_components, umap metric,
    fuzzy n_clusters, fuzzy m.
    """
    general_axes = [grid_params[key] for key in
                    ('n_samples_list', 'feature_selection_list', 'reduction_method_list')]

    umap_grid = grid_params['umap_params_grid']
    umap_axes = [umap_grid[key] for key in
                 ('n_neighbors', 'min_dist', 'n_components', 'metric')]

    fuzzy_grid = grid_params['fuzzy_params_grid']
    fuzzy_axes = [fuzzy_grid[key] for key in ('n_clusters', 'm')]

    return make_grid(*general_axes, *umap_axes, *fuzzy_axes)

def grid_search(grid_params:dict):
    """Run the full pipeline sequentially for every parameter combination.

    For each tuple produced by ``get_grid_from_dict(grid_params)`` this samples
    the data, selects features, reduces dimensionality (always normalizing),
    clusters with fuzzy c-means and scores the result against the true labels.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the parameters, the three scores and three
        timings in seconds: total, excluding data loading, and excluding data
        loading plus feature selection.
    """
    output_cols = ['n_samples', 'feature_selection','reduction_method',
                   'umap_n_neighbors', 'umap_min_dist','umap_n_components','umap_metric',
                   'fuzzy_n_clusters','m', 'rand','arand', 'vandongen',
                   'elapsed_time', 'time_excluding_data_loading', 'time_excluding_feature_selection']

    entries = []
    # Reuse the shared grid builder instead of repeating the expansion here.
    for n, f, r, unn, umd, unc, um, fnc, m in get_grid_from_dict(grid_params):
        # Log every grid dimension, including the UMAP metric.
        print("Parámetros: ", n, f, r, unn, umd, unc, um, fnc, m)
        start_time1 = time.time()
        X = seleccionar_muestra(sample_length=n)
        start_time2 = time.time()
        sample_features, sample_labels = seleccionar_features(X, features=f)
        umap_params = {'n_neighbors':unn, 'min_dist':umd, 'n_components':unc, 'metric':um}
        fuzzy_params = {'n_clusters': fnc, 'm': m}
        start_time3 = time.time()
        reduced_data = hacer_reduccion(X = sample_features, normalizacion=True, metodo=r, umap_params = umap_params)
        cluster_membership = obtener_fuzzy_clusters(X = reduced_data, fuzzy_params= fuzzy_params)
        _, rand, arand, vandongen = cross_tab(sample_labels, cluster_membership)

        # One end timestamp so the three durations refer to the same instant.
        end_time = time.time()
        elapsed_time = end_time - start_time1
        time_excluding_data_loading = end_time - start_time2
        time_excluding_feature_selection = end_time - start_time3

        # Record the real sample size rather than assuming five classes (n*5).
        n_samples = len(sample_labels)
        entries.append([n_samples, f, r, unn, umd, unc, um, fnc, m, rand, arand, vandongen,
                        elapsed_time, time_excluding_data_loading, time_excluding_feature_selection])
        print(f"Elapsed time {elapsed_time}\n")

    return pd.DataFrame(entries, columns=output_cols)



In [18]:
# Some entries below are set to small test values; for the full experiment use
# the lists shown in the trailing comments instead.

grid_params = {
    'n_samples_list' : [5, 10, 100, 1000], # full run: [5, 10, 100, 1000, 5000, 10000]
    'feature_selection_list': ['morfologicos'], # full run: ['morfologicos', 'conv2d', 'both']
    'reduction_method_list': ['pca'], # full run: ['pca', 'umap', 'both']
    'umap_params_grid':{
        'n_neighbors':[15], # full run: [15, 30, 50, 100]
        'min_dist':[0.01], # full run: [0.0, 0.01, 0.1] (original note read "0,0"; presumably 0.0 - confirm)
        'n_components':[4], # full run: [2, 3, 4, 5]
        'metric': ['euclidean'],  # full run: ['euclidean', 'manhattan', 'chebyshev', 'cosine']
        },
    'fuzzy_params_grid': {
        'n_clusters': [5], # full run: [2, 3, 4, 5, 6, 7]
        'm': [2]
    }
}

# Flat list of parameter tuples; the threaded cells below index into it.
grid = get_grid_from_dict(grid_params)
# Sequential alternative to the threaded run:
# result_grid_search = grid_search(grid_params=grid_params)

In [19]:
# Number of worker threads. Any value >= 1 works; slices that would be empty
# (more threads than grid entries) are skipped.
n_threads = 4

# Base number of grid entries per thread, plus the leftover entries that do not
# divide evenly. Plain integer division alone would silently drop the leftover.
p, resto = divmod(len(grid), n_threads)

# Half-open [inicio, fin) index ranges over `grid`, one pair per thread.
inicios = []
fines = []
inicio = 0
fin = inicio

for i in range(n_threads):
    # The first `resto` threads take one extra entry so the whole grid is covered.
    fin = inicio + p + (1 if i < resto else 0)
    if fin > inicio:
        inicios.append(inicio)
        fines.append(fin)
    inicio = fin


In [22]:
# Shared result list: every worker thread appends its rows here. Re-run this
# cell before launching the threads again, otherwise old rows accumulate.
entries = []

from tqdm import tqdm

def worker_thread(inicio, fin):
    """Run the pipeline for the grid entries with index in ``[inicio, fin)``.

    For each entry this samples the data, selects features, reduces
    dimensionality (always normalizing), clusters with fuzzy c-means and scores
    the result. One row per entry is appended to the module-level ``entries``
    list, in the same column order used to build the results DataFrame.

    Returns the shared ``entries`` list (which also holds rows appended by
    other threads).
    """
    for i in tqdm(range(inicio, fin)):

        n, f, r, unn, umd, unc, um, fnc, m = grid[i]
        # Log every grid dimension, including the UMAP metric, so interleaved
        # output from different threads can be told apart.
        print("Parámetros: ", n, f, r, unn, umd, unc, um, fnc, m)
        start_time1 = time.time()
        X = seleccionar_muestra(sample_length=n, input_folder="/content/drive/MyDrive/Rice_Image_Dataset/")
        start_time2 = time.time()
        sample_features, sample_labels = seleccionar_features(X, features=f)
        umap_params = {'n_neighbors': unn, 'min_dist': umd, 'n_components': unc, 'metric': um}
        fuzzy_params = {'n_clusters': fnc, 'm': m}
        start_time3 = time.time()
        reduced_data = hacer_reduccion(X=sample_features, normalizacion=True, metodo=r, umap_params=umap_params)
        cluster_membership = obtener_fuzzy_clusters(X=reduced_data, fuzzy_params=fuzzy_params)
        _, rand, arand, vandongen = cross_tab(sample_labels, cluster_membership)

        # One end timestamp so the three durations refer to the same instant.
        end_time = time.time()
        elapsed_time = end_time - start_time1
        time_excluding_data_loading = end_time - start_time2
        time_excluding_feature_selection = end_time - start_time3

        # Record the real sample size rather than assuming five classes (n*5).
        n_samples = len(sample_labels)
        entry = [n_samples, f, r, unn, umd, unc, um, fnc, m, rand, arand, vandongen,
                 elapsed_time, time_excluding_data_loading, time_excluding_feature_selection]
        print(f"Elapsed time {elapsed_time}\n")
        entries.append(entry)

    return entries

In [23]:
import threading

# Start one thread per (inicio, fin) slice; each runs worker_thread over its
# part of the grid and appends its rows to the shared `entries` list.
threads=[]
for i in range(len(inicios)):
  t=threading.Thread(target=worker_thread, args=(inicios[i], fines[i],))
  threads.append(t)
  t.start()

# Wait for every thread to finish before `entries` is read.
for t in threads:
  t.join()


  0%|          | 0/1 [00:00<?, ?it/s][A

Parámetros:  10 morfologicos 

  0%|          | 0/1 [00:00<?, ?it/s]

pca 15 0.01 4 5 2




  0%|          | 0/1 [00:00<?, ?it/s][A[A


  0%|          | 0/1 [00:00<?, ?it/s][A[A[A

Parámetros:  5 morfologicos pca 15 0.01 4 5 2
Parámetros:  100 morfologicos pca 15 0.01 4 5 2
Parámetros:  1000 morfologicos pca 15 0.01 4 5 2




100%|██████████| 1/1 [05:44<00:00, 344.57s/it]

Stratified Sample of 500
Reduction Method PCA
Fuzzy Clustering already finish!!!
RAND score=0.8911, Ajusted RAND=0.6580, vanDongen=0.2194 cantidad_de_muestras=500
Elapsed time 344.5553729534149




100%|██████████| 1/1 [06:03<00:00, 363.58s/it]

Stratified Sample of 25
Reduction Method PCA
Fuzzy Clustering already finish!!!
RAND score=0.9467, Ajusted RAND=0.8080, vanDongen=0.1000 cantidad_de_muestras=25
Elapsed time 363.55637383461






Stratified Sample of 5000
Reduction Method PCA





100%|██████████| 1/1 [06:12<00:00, 372.41s/it]

Fuzzy Clustering already finish!!!
RAND score=0.8823, Ajusted RAND=0.6331, vanDongen=0.2710 cantidad_de_muestras=5,000
Elapsed time 372.40913462638855





100%|██████████| 1/1 [06:19<00:00, 379.48s/it]

Stratified Sample of 50
Reduction Method PCA
Fuzzy Clustering already finish!!!
RAND score=0.8759, Ajusted RAND=0.5932, vanDongen=0.2597 cantidad_de_muestras=50
Elapsed time 379.4719412326813






In [24]:
# Column names in the same order as the values of each `entry` row built by
# worker_thread.
output_cols = ['n_samples', 'feature_selection', 'reduction_method',
                   'umap_n_neighbors', 'umap_min_dist', 'umap_n_components', 'umap_metric',
                   'fuzzy_n_clusters', 'm', 'rand', 'arand', 'vandongen',
                   'elapsed_time', 'time_excluding_data_loading', 'time_excluding_feature_selection']
# Collect the rows gathered by the worker threads into a single results table.
df= pd.DataFrame(entries, columns=output_cols)

Unnamed: 0,n_samples,feature_selection,reduction_method,umap_n_neighbors,umap_min_dist,umap_n_components,umap_metric,fuzzy_n_clusters,m,rand,arand,vandongen,elapsed_time,time_excluding_data_loading,time_excluding_feature_selection
0,500,morfologicos,pca,15,0.01,4,euclidean,5,2,0.89111,0.65804,0.21942,344.555373,0.150544,0.143813
1,25,morfologicos,pca,15,0.01,4,euclidean,5,2,0.946667,0.808,0.1,363.556374,0.054778,0.044037
2,5000,morfologicos,pca,15,0.01,4,euclidean,5,2,0.882348,0.633119,0.270966,372.409135,0.834155,0.829433
3,50,morfologicos,pca,15,0.01,4,euclidean,5,2,0.875918,0.593228,0.25974,379.471941,0.025526,0.020998
