## Clasificador KNN

In [1]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix, accuracy_score 

In [2]:
# Read the three source CSV files; the paths are relative to the working directory.
obesity, parkinsons, diabetes = (
    pd.read_csv(ruta)
    for ruta in (
        'dataset/obesity_clean.csv',
        "dataset/parkinsons_disease_data.csv",
        "dataset/diabetes.csv",
    )
)

In [3]:
# Columns kept from each raw frame (predictor columns plus the class column).
columnas_obesity = ['Height', 'Weight', 'BMI', 'ObesityCategory_encoded']
columnas_diabetes = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'Pedigree', 'Age', 'Outcome',
]
columnas_parkinsons = [
    'UPDRS', 'FunctionalAssessment', 'MoCA', 'Tremor', 'Rigidity', 'Bradykinesia',
    'Age', 'DietQuality', 'CholesterolTotal', 'CholesterolLDL',
    'CholesterolTriglycerides', 'SystolicBP', 'CholesterolHDL',
    'AlcoholConsumption', 'Diagnosis',
]

# Selecting with a list of names yields a new DataFrame restricted to those columns.
featuresObesity = obesity[columnas_obesity]
featuresDiabetes = diabetes[columnas_diabetes]
featuresParkinsons = parkinsons[columnas_parkinsons]

## Métodos de validación

### Holdout

In [4]:
def holdout_validation(X, y, test_size=0.3, random_state=42):
    """Split ``X`` and ``y`` into one stratified train/test partition.

    Parameters
    ----------
    X : array-like
        Feature matrix; forwarded unchanged to ``train_test_split``.
    y : array-like
        Class labels. They are also used as the stratification target, so the
        class proportions are preserved in both partitions.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``; the default keeps the 70/30 split
        this function always used.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``; the default keeps the split
        reproducible across runs.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

### K-Fold Cross-Validation

In [5]:
def k_fold_cross_validation(X, y, K=10):
    """Build stratified, shuffled K-fold train/test partitions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns. Every column is converted with ``pd.to_numeric``;
        values that cannot be parsed become NaN.
    y : pandas.Series
        Class labels, converted with ``pd.to_numeric`` in the same way.
    K : int, default 10
        Number of folds.

    Returns
    -------
    list of tuple
        One ``(X_train, X_test, y_train, y_test)`` tuple of NumPy arrays per fold.
    """
    features = X.apply(pd.to_numeric, errors='coerce')
    labels = pd.to_numeric(y, errors='coerce')

    # Missing feature values are replaced by the column mean, computed over the
    # whole frame before it is split into folds.
    features = features.fillna(features.mean())

    # Fixed seed so the fold assignment is the same on every run.
    splitter = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

    return [
        (
            features.iloc[train_idx].values,
            features.iloc[test_idx].values,
            labels.iloc[train_idx].values,
            labels.iloc[test_idx].values,
        )
        for train_idx, test_idx in splitter.split(features, labels)
    ]


### Leave-One-Out

In [6]:
def leave_one_out_validation(X, y):
    """Build leave-one-out train/test partitions (one per sample).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns. Every column is converted with ``pd.to_numeric``
        (unparseable values become NaN) and NaNs are then replaced by the
        column mean.
    y : pandas.Series
        Class labels; used as given, without numeric conversion.

    Returns
    -------
    list of tuple
        One ``(X_train, X_test, y_train, y_test)`` tuple of NumPy arrays per
        split produced by ``LeaveOneOut``. Each tuple stores its own training
        array, so the list grows with the number of splits.
    """
    features = X.apply(pd.to_numeric, errors='coerce')
    features = features.fillna(features.mean())

    particiones = []
    for train_idx, test_idx in LeaveOneOut().split(features):
        # Store every partition as plain NumPy arrays.
        particion = (
            features.iloc[train_idx].values,
            features.iloc[test_idx].values,
            y.iloc[train_idx].values,
            y.iloc[test_idx].values,
        )
        particiones.append(particion)

    return particiones

## Clasificador KNN

In [7]:
def knn(X_train, y_train, X_test, k):
    """Predict a label for every row of ``X_test`` with a k-nearest-neighbours vote.

    For each test point the Euclidean distance to every training point is
    computed, the ``k`` closest training points are selected and the most
    frequent label among them is returned.

    Parameters
    ----------
    X_train : sequence of numeric sequences
        Training points, one row per sample (for example a 2-D NumPy array or
        a list of lists).
    y_train : sequence
        One hashable label per row of ``X_train``.
    X_test : sequence of numeric sequences
        Points to classify; each must have as many features as the training rows.
    k : int
        Number of neighbours that vote. Values larger than the training set
        are allowed and simply make every training point vote.

    Returns
    -------
    list
        Predicted label for each row of ``X_test``, in order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, the training set is empty, ``X_train`` and
        ``y_train`` differ in length, or a test point does not have the same
        number of features as the training rows.
    """
    # A non-positive k would otherwise be used as a slice bound: k == 0 selects
    # nobody and a negative k silently selects "all but the last |k|" points.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Materialise the labels so they are always addressed by position, even
    # when the caller passes an object whose [] operator is label-based.
    etiquetas = list(y_train)
    n_entrenamiento = len(X_train)
    if n_entrenamiento == 0:
        raise ValueError("X_train is empty; at least one training sample is required")
    if len(etiquetas) != n_entrenamiento:
        raise ValueError(
            f"X_train has {n_entrenamiento} rows but y_train has {len(etiquetas)} labels"
        )

    n_caracteristicas = len(X_train[0])

    clases = []
    for punto_prueba in X_test:
        # zip() below stops at the shorter vector, so a width mismatch must be
        # rejected explicitly instead of producing a distance over fewer features.
        if len(punto_prueba) != n_caracteristicas:
            raise ValueError(
                f"test point has {len(punto_prueba)} features, expected {n_caracteristicas}"
            )

        distancias = []
        for punto_entrenamiento, etiqueta in zip(X_train, etiquetas):
            # Euclidean distance between the training point and the test point.
            distancia = math.sqrt(
                sum((t - x) ** 2 for t, x in zip(punto_entrenamiento, punto_prueba))
            )
            distancias.append((distancia, etiqueta))

        # Stable sort on the distance only: equidistant points keep training order.
        distancias.sort(key=lambda par: par[0])
        k_cercanos = [etiqueta for _, etiqueta in distancias[:k]]

        # Majority vote among the selected neighbours.
        mas_comun = Counter(k_cercanos).most_common(1)[0][0]
        clases.append(mas_comun)

    return clases


In [8]:
# Obesity dataset: evaluate the KNN classifier with hold-out, K-fold and leave-one-out.

print("\n Validaciones para el Dataset Obesity \n")
X = featuresObesity[['Height', 'Weight', 'BMI']]
Y = featuresObesity['ObesityCategory_encoded']

k_obesity = 5

# Sorted class labels. They are passed explicitly to every per-split confusion
# matrix so that all of them have the same shape and row/column order, even
# when a split does not contain every class.
clases = np.unique(Y)
num_clases = len(clases)

# Hold-out validation
obesityX_train, obesityX_test, obesityY_train, obesityY_test = holdout_validation(X.values, Y.values)
predicciones_knn = knn(obesityX_train, obesityY_train, obesityX_test, k_obesity)
conf_matrix_knn = confusion_matrix(obesityY_test, predicciones_knn)
accuracy_knn = accuracy_score(obesityY_test, predicciones_knn)

print("    Hold-out")
print("Matriz de Confusión:")
print(conf_matrix_knn)
print("\nPrecisión", accuracy_knn)

# K-fold validation: sum the per-fold confusion matrices, average the per-fold accuracies
folds = k_fold_cross_validation(X, Y)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []

for obesityX_train, obesityX_test, obesityY_train, obesityY_test in folds:
    predicciones_knn = knn(obesityX_train, obesityY_train, obesityX_test, k_obesity)
    conf_matrix_kfold += confusion_matrix(obesityY_test, predicciones_knn, labels=clases)
    accuracy_kfold.append(accuracy_score(obesityY_test, predicciones_knn))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\n    K-Fold")
print("Matriz de Confusión:")
print(conf_matrix_kfold)
print("\nPrecisión:", average_accuracy_kfold)

# Leave-one-out validation
conf_matrix_leave_one_out = np.zeros((num_clases, num_clases))
accuracy_leave_one_out = []

# Each split tests a single sample, so the explicit labels are required to get
# a full-size confusion matrix from every split.
leaveVali = leave_one_out_validation(X, Y)
for obesityX_train, obesityX_test, obesityY_train, obesityY_test in leaveVali:
    predicciones_knn = knn(obesityX_train, obesityY_train, obesityX_test, k_obesity)
    conf_matrix_leave_one_out += confusion_matrix(obesityY_test, predicciones_knn, labels=clases)
    accuracy_leave_one_out.append(accuracy_score(obesityY_test, predicciones_knn))

average_accuracy_leave_one_out = np.mean(accuracy_leave_one_out)

print("\n    Leave-One-Out")
print("Matriz de Confusión:")
print(conf_matrix_leave_one_out)
print("\nPrecisión:", average_accuracy_leave_one_out)


 Validaciones para el Dataset Obesity 

    Hold-out
Matriz de Confusión:
[[109   0   2   0]
 [  0  55   2   0]
 [  3   0  86   0]
 [  2   0   0  41]]

Precisión 0.97

    K-Fold
Matriz de Confusión:
[[366.   0.   3.   2.]
 [  0. 181.  10.   0.]
 [  7.   5. 283.   0.]
 [  6.   0.   0. 137.]]

Precisión: 0.9670000000000002

    Leave-One-Out
Matriz de Confusión:
[[365.   0.   5.   1.]
 [  0. 182.   9.   0.]
 [  6.   4. 285.   0.]
 [  5.   0.   0. 138.]]

Precisión: 0.97


In [9]:
# Parkinson's dataset: evaluate the KNN classifier with hold-out, K-fold and leave-one-out.

print("\n Validaciones para el Dataset Parkinsons \n")
X = featuresParkinsons[['UPDRS','FunctionalAssessment','MoCA','Tremor','Rigidity','Bradykinesia','Age','DietQuality',
                        'CholesterolTotal','CholesterolLDL','CholesterolTriglycerides','SystolicBP','CholesterolHDL','AlcoholConsumption']]
Y = featuresParkinsons['Diagnosis']

k_parkinson = 5

# Sorted class labels. They are passed explicitly to every per-split confusion
# matrix so that all of them have the same shape and row/column order, even
# when a split does not contain every class.
clases = np.unique(Y)
num_clases = len(clases)

# Hold-out validation
parkinsonsX_train, parkinsonsX_test, parkinsonsY_train, parkinsonsY_test = holdout_validation(X, Y)
predicciones_knn = knn(parkinsonsX_train.values, parkinsonsY_train.values, parkinsonsX_test.values, k_parkinson)
conf_matrix_knn = confusion_matrix(parkinsonsY_test, predicciones_knn)
accuracy_knn = accuracy_score(parkinsonsY_test, predicciones_knn)

print("    Hold-out")
print("Matriz de Confusión:")
print(conf_matrix_knn)
print("\nPrecisión", accuracy_knn)

# K-fold validation: sum the per-fold confusion matrices, average the per-fold accuracies
folds = k_fold_cross_validation(X, Y)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []

for parkinsonsX_train, parkinsonsX_test, parkinsonsY_train, parkinsonsY_test in folds:
    predicciones_knn = knn(parkinsonsX_train, parkinsonsY_train, parkinsonsX_test, k_parkinson)
    conf_matrix_kfold += confusion_matrix(parkinsonsY_test, predicciones_knn, labels=clases)
    accuracy_kfold.append(accuracy_score(parkinsonsY_test, predicciones_knn))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\n    K-Fold")
print("Matriz de Confusión:")
print(conf_matrix_kfold)
print("\nPrecisión:", average_accuracy_kfold)

# Leave-one-out validation
conf_matrix_leave_one_out = np.zeros((num_clases, num_clases))
accuracy_leave_one_out = []

# Each split tests a single sample, so the explicit labels are required to get
# a full-size confusion matrix from every split.
leaveVali = leave_one_out_validation(X, Y)
for parkinsonsX_train, parkinsonsX_test, parkinsonsY_train, parkinsonsY_test in leaveVali:
    predicciones_knn = knn(parkinsonsX_train, parkinsonsY_train, parkinsonsX_test, k_parkinson)
    conf_matrix_leave_one_out += confusion_matrix(parkinsonsY_test, predicciones_knn, labels=clases)
    accuracy_leave_one_out.append(accuracy_score(parkinsonsY_test, predicciones_knn))

average_accuracy_leave_one_out = np.mean(accuracy_leave_one_out)

print("\n    Leave-One-Out")
print("Matriz de Confusión:")
print(conf_matrix_leave_one_out)
print("\nPrecisión:", average_accuracy_leave_one_out)


 Validaciones para el Dataset Parkinsons 

    Hold-out
Matriz de Confusión:
[[124 116]
 [ 67 325]]

Precisión 0.7104430379746836

    K-Fold
Matriz de Confusión:
[[ 403.  398.]
 [ 219. 1085.]]

Precisión: 0.7068742947415932

    Leave-One-Out
Matriz de Confusión:
[[ 411.  390.]
 [ 205. 1099.]]

Precisión: 0.7173396674584323


In [10]:
# Diabetes dataset: evaluate the KNN classifier with hold-out, K-fold and leave-one-out.

print("\n Validaciones para el Dataset Diabetes \n")
X = featuresDiabetes[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','Pedigree','Age']]
Y = featuresDiabetes['Outcome']

k_diabetes = 5

# Sorted class labels. They are passed explicitly to every per-split confusion
# matrix so that all of them have the same shape and row/column order, even
# when a split does not contain every class.
clases = np.unique(Y)
num_clases = len(clases)

# Hold-out validation
diabetesX_train, diabetesX_test, diabetesY_train, diabetesY_test = holdout_validation(X, Y)
predicciones_knn = knn(diabetesX_train.values, diabetesY_train.values, diabetesX_test.values, k_diabetes)
conf_matrix_knn = confusion_matrix(diabetesY_test, predicciones_knn)
accuracy_knn = accuracy_score(diabetesY_test, predicciones_knn)

print("    Hold-out")
print("Matriz de Confusión:")
print(conf_matrix_knn)
print("\nPrecisión", accuracy_knn)

# K-fold validation: sum the per-fold confusion matrices, average the per-fold accuracies
folds = k_fold_cross_validation(X, Y)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []

for diabetesX_train, diabetesX_test, diabetesY_train, diabetesY_test in folds:
    predicciones_knn = knn(diabetesX_train, diabetesY_train, diabetesX_test, k_diabetes)
    conf_matrix_kfold += confusion_matrix(diabetesY_test, predicciones_knn, labels=clases)
    accuracy_kfold.append(accuracy_score(diabetesY_test, predicciones_knn))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\n    K-Fold")
print("Matriz de Confusión:")
print(conf_matrix_kfold)
print("\nPrecisión:", average_accuracy_kfold)

# Leave-one-out validation
conf_matrix_leave_one_out = np.zeros((num_clases, num_clases))
accuracy_leave_one_out = []

# Each split tests a single sample, so the explicit labels are required to get
# a full-size confusion matrix from every split.
leaveVali = leave_one_out_validation(X, Y)
for diabetesX_train, diabetesX_test, diabetesY_train, diabetesY_test in leaveVali:
    predicciones_knn = knn(diabetesX_train, diabetesY_train, diabetesX_test, k_diabetes)
    conf_matrix_leave_one_out += confusion_matrix(diabetesY_test, predicciones_knn, labels=clases)
    accuracy_leave_one_out.append(accuracy_score(diabetesY_test, predicciones_knn))

average_accuracy_leave_one_out = np.mean(accuracy_leave_one_out)

print("\n    Leave-One-Out")
print("Matriz de Confusión:")
print(conf_matrix_leave_one_out)
print("\nPrecisión:", average_accuracy_leave_one_out)


 Validaciones para el Dataset Diabetes 

    Hold-out
Matriz de Confusión:
[[125  25]
 [ 40  41]]

Precisión 0.7186147186147186

    K-Fold
Matriz de Confusión:
[[407.  93.]
 [129. 139.]]

Precisión: 0.7108680792891319

    Leave-One-Out
Matriz de Confusión:
[[409.  91.]
 [128. 140.]]

Precisión: 0.71484375
