In [8]:
import itertools
import os
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

# Normalization
from sklearn.preprocessing import StandardScaler

# Dimensionality reduction
from umap import UMAP
from sklearn.decomposition import PCA

# Clustering (fuzzy c-means and k-means)
import skfuzzy as fuzz
from sklearn.cluster import KMeans

# Metrics
from sklearn.metrics import rand_score, adjusted_rand_score

# Random row selection
from sklearn.model_selection import train_test_split



In [13]:
def leo_y_consolido_features(input_path='./input/',
                             clases=('Arborio', 'Basmati', 'Ipsala', 'Jasmine', 'Karacadag')):
    """Read one feature CSV per class and stack them into a single array.

    For every name in ``clases`` the file ``features_<name>.csv`` is read from
    ``input_path``; the resulting frames are concatenated row-wise in the
    order given by ``clases``.

    Parameters
    ----------
    input_path : str
        Directory that holds the CSV files. A trailing path separator is
        optional (the path is built with ``os.path.join``).
    clases : sequence of str, optional
        Class names used to build the file names. The default reproduces the
        five files that were previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Row-wise concatenation of all CSV contents.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when a file is missing.
    """
    tablas = [
        pd.read_csv(os.path.join(input_path, f'features_{clase}.csv'))
        for clase in clases
    ]
    return pd.concat(tablas, axis=0).to_numpy()


def hacer_reduccion(X:pd.DataFrame, normalizacion:bool, metodo:str, umap_params:dict):
    """Reduce the dimensionality of ``X`` with PCA, UMAP, or PCA followed by UMAP.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix (a DataFrame or a NumPy array; only ``.shape`` and the
        reducers' own input handling are relied upon).
    normalizacion : bool
        When true, ``X`` is standardized with ``StandardScaler`` first.
    metodo : str
        ``'pca'``, ``'umap'`` or ``'both'``. With ``'both'`` PCA runs first
        and its output is fed to UMAP.
    umap_params : dict or None
        ``{'n_neighbors': int, 'min_dist': float, 'n_components': int,
        'metric': str}``; an optional ``'random_state'`` key is forwarded to
        UMAP when present. ``None`` or an empty dict uses UMAP's defaults.

    Returns
    -------
    The matrix produced by the last reducer that was applied.

    Raises
    ------
    ValueError
        If ``metodo`` is not one of the accepted values (checked before any
        computation is done).
    KeyError
        If ``umap_params`` is non-empty but lacks one of the four required keys.
    """
    if metodo not in ('pca', 'umap', 'both'):
        raise ValueError("El parámetro 'metodo' solo acepta  'pca', 'umap', 'both'")

    if normalizacion:
        X = StandardScaler().fit_transform(X)

    if metodo in ('pca', 'both'):
        print("Reduction Method PCA")
        DEFAULT_MIN_COMP = 100
        # Keep at most DEFAULT_MIN_COMP components, but never more than the
        # data allows: PCA is limited to min(n_samples, n_features).
        n_components = int(min(DEFAULT_MIN_COMP, *X.shape))

        pca = PCA(n_components=n_components)
        pca.fit(X)
        X = pca.transform(X)

    # Deliberately a separate `if` (not `elif`): with metodo='both' the PCA
    # output above must continue into UMAP instead of being returned as-is.
    if metodo in ('umap', 'both'):
        print("Reduction Method UMAP")
        if umap_params:
            opciones = {
                clave: umap_params[clave]
                for clave in ('n_neighbors', 'min_dist', 'n_components', 'metric')
            }
            # Optional: lets callers pin the embedding for reproducible runs.
            if 'random_state' in umap_params:
                opciones['random_state'] = umap_params['random_state']
            reducer = UMAP(**opciones)
        else:
            reducer = UMAP()
        X = reducer.fit_transform(X)

    return X
        

def obtener_fuzzy_clusters(X:pd.DataFrame, fuzzy_params:dict):
    """Run fuzzy c-means on ``X`` and return one hard cluster label per sample.

    Parameters
    ----------
    X : object exposing ``.T`` (DataFrame or 2-D array)
        Samples in rows; it is transposed before being handed to
        ``fuzz.cluster.cmeans``.
    fuzzy_params : dict
        ``{'n_clusters': int, 'm': int}`` - number of clusters and the
        fuzzifier exponent passed to ``cmeans``.

    Returns
    -------
    Array with, for every sample, the index of the cluster whose membership
    value is the largest.

    Notes
    -----
    Stopping tolerance (0.001), iteration cap (1000) and seed (123) are fixed
    inside the function. A progress message is printed when clustering ends.
    """
    TOLERANCIA = 0.001
    ITERACIONES_MAX = 1000

    ajustes = dict(
        c=fuzzy_params['n_clusters'],
        m=fuzzy_params['m'],
        error=TOLERANCIA,
        maxiter=ITERACIONES_MAX,
        init=None,
        seed=123,
    )
    resultado = fuzz.cluster.cmeans(data=X.T, **ajustes)

    # Second returned item is the membership matrix (clusters x samples);
    # the hard label of a sample is the row with the highest membership.
    pertenencias = resultado[1]
    etiquetas = np.argmax(pertenencias, axis=0)

    print("Fuzzy Clustering already finish!!!")
    return etiquetas

def vanDongen(ct):
    """Normalized van Dongen criterion for a contingency table.

    ``ct`` is a DataFrame of counts (e.g. the output of ``pd.crosstab``).
    With ``n`` the grand total, the value returned is

        (2n - sum of row maxima - sum of column maxima)
        / (2n - largest row total - largest column total)

    so 0 means the two partitions agree perfectly and larger values mean
    more disagreement.
    """
    tabla = ct.to_numpy()
    totales_fila = tabla.sum(axis=1)
    totales_columna = tabla.sum(axis=0)

    doble_n = 2 * totales_fila.sum()
    coincidencias = tabla.max(axis=1).sum() + tabla.max(axis=0).sum()
    cota = totales_fila.max() + totales_columna.max()

    return (doble_n - coincidencias) / (doble_n - cota)

def cross_tab(Labels_orig, Labels_clust):
    """Compare a clustering against reference labels.

    Parameters
    ----------
    Labels_orig : sequence
        Reference (true) label of each sample.
    Labels_clust : sequence
        Label assigned to each sample by the clustering algorithm.

    Returns
    -------
    tuple ``(ct, rand, arand, vandon)``
        ``ct`` is the contingency table (rows: cluster labels, columns:
        reference labels); the other three are the Rand score, the adjusted
        Rand score and the van Dongen criterion. The three scores and the
        sample count are also printed.
    """
    # Agreement scores are computed straight from the two label sequences.
    rand = rand_score(Labels_orig, Labels_clust)
    arand = adjusted_rand_score(Labels_orig, Labels_clust)

    # The contingency table feeds the van Dongen criterion.
    pares = pd.DataFrame({'Labels_orig': Labels_orig, 'Labels_clust': Labels_clust})
    ct = pd.crosstab(index=pares['Labels_clust'], columns=pares['Labels_orig'])
    vandon = vanDongen(ct)

    print(f'RAND score={rand:.4f}, Ajusted RAND={arand:.4f}, vanDongen={vandon:.4f} cantidad_de_muestras={len(Labels_orig):,d}')
    return ct, rand, arand, vandon

In [11]:
# Load the per-class feature files as one matrix.
X = leo_y_consolido_features('outputs/datasets/')

# The rows come stacked class by class, so the labels can be generated in the
# same order before anything is shuffled.
# NOTE(review): assumes every class file contributes exactly 15000 rows - confirm.
clases_dict = dict(enumerate(['Arborio', 'Basmati', 'Ipsala', 'Jasmine', 'Karacadag']))
lista_clases = [clase for clase in clases_dict for _ in range(15000)]

# Shuffle and keep only the 80% "train" side of the split (features and
# labels); the remaining 20% is discarded here.
X, _, true_labels, _ = train_test_split(
    X,
    lista_clases,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Replace the numeric class codes by the class names.
true_labels = [clases_dict[codigo] for codigo in true_labels]

In [29]:
# PCA + fuzzy c-means
#
# The PCA step keeps up to 100 components (the cap lives in hacer_reduccion).
fuzzy_params = dict(n_clusters=5, m=2)

Xpca = hacer_reduccion(X, normalizacion=True, metodo="pca", umap_params=None)
pca_fz_labels = obtener_fuzzy_clusters(Xpca, fuzzy_params)

# Score the clustering against the true classes.
ct, rand, arand, vandon = cross_tab(true_labels, pca_fz_labels)

Reduction Method PCA
Fuzzy Clustering already finish!!!
RAND score=0.6510, Ajusted RAND=0.3017, vanDongen=0.5312 cantidad_de_muestras=60,000


Labels_orig,Arborio,Basmati,Ipsala,Jasmine,Karacadag
Labels_clust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1797,12005,2015,11662,1
2,78,0,99,4,0
3,15,0,12,0,0
4,10113,0,9791,338,12070


In [21]:
# K-Means on the same PCA projection, scored the same way as the fuzzy
# c-means run so both clusterings can be compared.
# KMeans is imported in the import cell at the top of the notebook.
kmeans = KMeans(n_clusters=5, random_state=42, init='k-means++', n_init='auto')
kmeans.fit(Xpca)

pca_km_labels = kmeans.labels_

ct2, rand, arand, vandon = cross_tab(Labels_orig=true_labels, Labels_clust=pca_km_labels)

RAND score=0.8116, Ajusted RAND=0.4133, vanDongen=0.5264 cantidad_de_muestras=60,000


In [28]:
# Display the contingency table of the K-Means run on the PCA projection
# (rows: cluster labels, columns: true classes).
ct2

Labels_orig,Arborio,Basmati,Ipsala,Jasmine,Karacadag
Labels_clust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5577,15,5920,736,227
1,653,0,16,0,10696
2,34,5652,10,4249,0
3,118,6333,67,6701,0
4,5621,5,5904,318,1148


In [24]:
# NOTE(review): this repeats the previous cell's display of `ct2` (the saved
# output is identical); it looks like a leftover duplicate - confirm and remove.
ct2

Labels_orig,Arborio,Basmati,Ipsala,Jasmine,Karacadag
Labels_clust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5577,15,5920,736,227
1,653,0,16,0,10696
2,34,5652,10,4249,0
3,118,6333,67,6701,0
4,5621,5,5904,318,1148


In [25]:
# UMAP + fuzzy c-means
#
# Project the standardized features to 2 dimensions with UMAP, cluster the
# embedding and score the result against the true classes.
umap_params = dict(
    n_neighbors=20,
    min_dist=0.0,
    n_components=2,
    metric='euclidean',
)
fuzzy_params = dict(n_clusters=5, m=2)

Xumap = hacer_reduccion(X, normalizacion=True, metodo="umap", umap_params=umap_params)
umap_fz_labels = obtener_fuzzy_clusters(Xumap, fuzzy_params)

ct3, rand, arand, vandon = cross_tab(true_labels, umap_fz_labels)

Reduction Method UMAP
Fuzzy Clustering already finish!!!
RAND score=0.9843, Ajusted RAND=0.9509, vanDongen=0.0250 cantidad_de_muestras=60,000
