In [1]:
try:
    from ucimlrepo import fetch_ucirepo 
    import pandas as pd
    import numpy as np  
    from numpy.fft import fft
    import random 
    from scipy.stats import mode
    import glob
    import os 
    import matplotlib.pyplot as plt
    from rich import print as rprint
    import sklearn 
    import librosa 
    import json 
    from mpl_toolkits.mplot3d import Axes3D
    from sklearn.preprocessing import MinMaxScaler, StandardScaler
except ImportError:
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'ucimlrepo', 'pandas', 'numpy', 'scipy', 'matplotlib', 'librosa', 'rich', 'sklearn', 'librosa' ])
    # Una vez instaladas las bibliotecas, intenta importarlas nuevamente
    from ucimlrepo import fetch_ucirepo 
    import pandas as pd
    import numpy as np  
    from numpy.fft import fft
    from scipy.stats import mode
    import librosa 
    import json 

#Aumentar visualización de datos ya que notebok limita el numero de columnas a observar 
np.set_printoptions(threshold=np.inf, linewidth=np.inf)
#Aumentar visualización de datos ya que notebok limita el numero de columnas a observar 
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

In [2]:
data_wine_raw = pd.read_csv("./wine_data.csv")
data_wine_raw
#Extracción de caracteristicas
data_wine_nc = data_wine_raw.iloc[:,1:]
#Extracción de labels
labels = data_wine_raw.iloc[:,0]

In [7]:
import time 
def cal_PCA_percent(percent_query) -> tuple:
    #Datos

    #-Query
    nume_sample_query = int((len(data_wine_nc)) * percent_query/100)
    query_index_samples = random.sample(list(range(0,178)), nume_sample_query)
    query_index_samples.sort()
    query_samples = data_wine_nc.iloc[query_index_samples]

    #-Training samples
    training_samples = data_wine_nc.drop(query_index_samples)

    #-Labels 
    query_samples_labels = labels[query_index_samples]
    training_samples_labels = labels.drop(query_index_samples) 

    #Normalización 

    #Creamos una matriz (filas_training, 13)
    promedios_training = np.array(training_samples.mean())
    promedios_training_t = np.tile(promedios_training, (training_samples.shape[0], 1))
    promedios_training_q = np.tile(promedios_training, (query_samples.shape[0],1))

    #Creamos una matriz diagonal con los valores de las desviaciones std de cada fila 
    std_training = np.array(training_samples.std()) ** -1
    dig_std_training = np.eye(len(std_training))
    np.fill_diagonal(dig_std_training, std_training)

    #Normalización para training ---------------------------------------------
    #Aplicando primero -> x - µ = ß
    normal_training = training_samples - promedios_training_t
    #Luego ł = ß / σ
    #Para ello ł se multiplicara por una matriz diagonal en donde la diagonal contiene sus desviaciones std
    normal_training = (normal_training @ dig_std_training).T

    #Normalización para query ------------------------------------------------
    #Aplicando primero -> x - µ = ß
    normal_query = query_samples - promedios_training_q
    #Luego ł = ß / σ
    #Para ello ł se multiplicara por una matriz diagonal en donde la diagonal contiene sus desviaciones std
    normal_query = (normal_query @ dig_std_training)

    #PCA--------------------------------------------------------------------
    #Dimension wine_normal_query -> (35, 13)
    start = time.time()
    matrix_cov = np.cov(normal_training)
    eigen_val, eigen_vec = np.linalg.eig(matrix_cov)
        
    print("Time Consumed")
    print("% s seconds" % (time.time() - start))
    ind = np.argsort( eigen_val)[::-1]
    eigen_vec_ord = eigen_vec[:, ind]
    eigenvectores_visualizar = [0, 1, 2]
    matriz_de_transformacion = eigen_vec_ord[:,eigenvectores_visualizar]
    #La matriz_de_transformacion tiene las dimensiones de -> (13,3)
    data_reducido_q = normal_query @ matriz_de_transformacion
    normal_training = normal_training.T
    data_reducido_t = normal_training @ matriz_de_transformacion
    return data_reducido_q, query_samples_labels, data_reducido_t, training_samples_labels

In [4]:
def pruebaI(query_samples,training_samples,query_samples_label,training_samples_label, k_s = 3) -> float:
    k = k_s                               
    
    #Elementos en total de la query
    totalQ = len(query_samples)
    totalT = len(training_samples)
    size_q  = len(query_samples)                                                        #Size de query
    k_DI = []  
    cont_Error = 0
    #Algoritmo kNN --------------------------------  
    # query_samples_label.iloc[0]                                   #Aqui se almacenan las k distancias mas pequeñas                                                                          #

    for j in range(size_q):  
        #print(j)                                                        #iteramos por todos los elementos de la query
        Q = np.tile(query_samples.iloc[j,:],(training_samples.shape[0],1))              #Igual que 'repmat'
        Z = Q - training_samples                                                        
        
        S_with_garbage = Z @ Z.T                                                        #puede usar tambien np.dot(m1,m2)
        distancia_S = np.diag(S_with_garbage) ** (1 / 2)                                #Obtener la diagonal y elevar sus elem a 1/2
        
        sort_index = np.argsort(distancia_S)                                            #Obtener los indices ordenados
        
        k_index = sort_index[:k]                                                        #Seleccionamos los k primeros
        k_dis = distancia_S[k_index]                                                    #Almacenamos las k distancias minimas

        mode_k_dis = mode(k_dis)[0]                                                     #mode retorna (moda, repeticiones)
        index_mode = np.where(distancia_S == mode_k_dis)[0][0]                        #buscamos indice en distancia_S de mode_k_dis
        k_DI.append(index_mode) 
        if query_samples_label.iloc[j] != training_samples_label.iloc[index_mode]:
            #print("Error:------------------------")
            #print(query_samples_label.iloc[j],"--",training_samples_label.iloc[index_mode])
            cont_Error += 1
        #else:
            #print("Success:------------------------")
            #print(query_samples_label.iloc[j],"--",training_samples_label.iloc[index_mode])


    #print(cont_Error)
    percent_success = (totalQ - cont_Error) * (100 / totalQ)
    return percent_success
    #return (totalQ, totalT, k, cont_Error, percent_success)

In [5]:
#Cambiar el rango a n pruebas 
porcentajes = []
for _ in range(10):
    q, lbq, t, lbt = cal_PCA_percent(30)
    porcentajes.append(pruebaI(q, t, lbq, lbt))#q,t,ql,tl, k_s = 3

In [6]:
np.mean(porcentajes)

93.77358490566039

In [8]:
todo = cal_PCA_percent(10)
todo

Time Consumed
0.0014977455139160156 seconds


(            0         1         2
 0   -3.240281 -1.536973 -0.174601
 10  -3.413953 -1.448675 -0.484242
 36  -1.350165 -0.719553  0.474267
 41  -0.618913 -0.230936 -0.838152
 45  -1.039573 -1.821404 -0.006682
 54  -2.076537 -1.070059 -0.948497
 56  -2.665564 -1.513720 -0.635773
 61   1.833113  0.825954 -1.434020
 83   2.523412  0.064445  0.473256
 86   0.721429  2.175545  0.832163
 92   1.864677  1.523334 -0.032480
 124 -1.013152  1.378736  1.245827
 125  0.045033  2.066703  0.414619
 140  2.769369 -0.245669  0.667970
 148  2.836274 -1.521053 -0.493744
 150  2.371619 -2.255320  0.407115
 159  1.610361 -2.404901  0.447784,
 0      1
 10     1
 36     1
 41     1
 45     1
 54     1
 56     1
 61     2
 83     2
 86     2
 92     2
 124    2
 125    2
 140    3
 148    3
 150    3
 159    3
 Name: class_label, dtype: int64,
             0         1         2
 1   -2.150271  0.250461 -1.983409
 2   -2.490356 -1.156119  0.854712
 3   -3.648801 -2.874296 -0.272488
 4   -0.990048 -0.929837 