# Lab 6 Clasificadores de la distancia mínima y 1NN

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Load the three pre-cleaned CSV files, in the order iris / credit card / obesity.
iris, creditCard, obesity = (
    pd.read_csv(nombre_archivo)
    for nombre_archivo in (
        'iris_clean.csv',
        'creditCard_clean.csv',
        'obesity_clean.csv',
    )
)

In [3]:
# Columns kept per dataset; in each list the last entry is the column the
# evaluation cells use as the class label.
columnas_iris = [
    'sepal length', 'sepal width', 'petal length', 'petal width',
    'class_encoded',
]
columnas_credit = [
    'Debt', 'Married', 'Industry_encoded', 'YearsEmployed',
    'PriorDefault', 'Employed', 'CreditScore',
    'Approved',
]
columnas_obesity = ['Height', 'Weight', 'BMI', 'ObesityCategory_encoded']

featuresIris = iris[columnas_iris]
featuresCredit = creditCard[columnas_credit]
featuresObesity = obesity[columnas_obesity]

## Métodos de validación

### Holdout

In [4]:
def holdout_validation(X, y, test_size=0.3, random_state=42):
    """Split features and labels into a single stratified train/test partition.

    Args:
        X: Feature table, forwarded unchanged to ``train_test_split``.
        y: Class labels aligned with ``X``; also passed as ``stratify`` so the
            split is stratified on the labels.
        test_size: Share of the samples held out for testing. Defaults to
            0.3, the value that used to be hard-coded.
        random_state: Seed forwarded to ``train_test_split``. The default of
            42 reproduces the split produced before this parameter existed.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

### K-Fold Cross-Validation

In [5]:
def k_fold_cross_validation(X, y, K=10):
    """Build every train/test partition of a shuffled, stratified K-fold split.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Class labels aligned with ``X``; used for stratification and also
            sliced with ``.iloc``.
        K: Number of folds (``n_splits``).

    Returns:
        A list with one ``(X_train, X_test, y_train, y_test)`` tuple per fold.
    """
    # Fixed seed so repeated calls on the same data yield the same folds.
    particionador = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)
    return [
        (X.iloc[idx_train], X.iloc[idx_test], y.iloc[idx_train], y.iloc[idx_test])
        for idx_train, idx_test in particionador.split(X, y)
    ]

### Leave-One-Out

In [6]:
def leave_one_out_validation(X, y):
    """Build every train/test partition produced by ``LeaveOneOut`` on ``X``.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Class labels aligned with ``X``; sliced with the same indices.

    Returns:
        A list with one ``(X_train, X_test, y_train, y_test)`` tuple per split
        yielded by ``LeaveOneOut().split(X)``.
    """
    particionador = LeaveOneOut()
    return [
        (X.iloc[idx_train], X.iloc[idx_test], y.iloc[idx_train], y.iloc[idx_test])
        for idx_train, idx_test in particionador.split(X)
    ]

## Clasificador de distancia mínima

In [7]:
# Minimum-distance (nearest-centroid) classifier
class ClasificadorDistanciaMinima:
    """Classifier that assigns each sample to the class with the closest centroid.

    ``ajustar`` stores one centroid per class (the per-feature mean of that
    class's training samples) in ``self.centroides``; ``predecir`` returns,
    for every sample, the label whose centroid has the smallest Euclidean
    distance to it.
    """

    def __init__(self):
        # Maps each class label to its centroid, stored as a list of floats.
        self.centroides = {}

    def ajustar(self, X_entrenamiento, y_entrenamiento):
        """Compute and store the centroid of every class in the training set.

        Args:
            X_entrenamiento: 2-D numeric data (DataFrame, ndarray or nested
                sequence) with one row per sample.
            y_entrenamiento: Iterable with one class label per row of
                ``X_entrenamiento``.

        Raises:
            ValueError: If ``X_entrenamiento`` is not 2-D or the number of
                labels does not match the number of rows.
        """
        X = np.asarray(X_entrenamiento, dtype=float)
        etiquetas = list(y_entrenamiento)
        if X.ndim != 2:
            raise ValueError("X_entrenamiento must be 2-D (samples x features)")
        if len(etiquetas) != X.shape[0]:
            # The row/label pairing would otherwise be silently truncated.
            raise ValueError(
                "X_entrenamiento and y_entrenamiento must have the same length"
            )

        # Single pass over the labels instead of one full scan per class.
        filas_por_clase = {}
        for fila, etiqueta in enumerate(etiquetas):
            filas_por_clase.setdefault(etiqueta, []).append(fila)

        # Sorted labels make tie-breaking in ``predecir`` reproducible; labels
        # that cannot be ordered fall back to first-appearance order.
        try:
            clases = sorted(filas_por_clase)
        except TypeError:
            clases = list(filas_por_clase)

        self.centroides = {
            clase: X[filas_por_clase[clase]].mean(axis=0).tolist()
            for clase in clases
        }

    def predecir(self, X_prueba):
        """Return the label of the nearest centroid for every sample.

        Args:
            X_prueba: 2-D numeric data (DataFrame, ndarray or nested sequence)
                with the same number of features used in ``ajustar``.

        Returns:
            A list with one predicted class label per row of ``X_prueba``.
            When several centroids are equally close, the one that comes
            first in ``self.centroides`` wins.

        Raises:
            ValueError: If no centroids are available (``ajustar`` has not
                been called or was given no samples), if ``X_prueba`` is not
                2-D, or if its feature count differs from the centroids'.
        """
        X = np.asarray(X_prueba, dtype=float)
        if X.ndim >= 1 and X.shape[0] == 0:
            return []
        if X.ndim != 2:
            raise ValueError("X_prueba must be 2-D (samples x features)")

        centroides = getattr(self, "centroides", None)
        if not centroides:
            raise ValueError("No centroids available: call ajustar() first")

        clases = list(centroides)
        matriz = np.asarray([centroides[clase] for clase in clases], dtype=float)
        if X.shape[1] != matriz.shape[1]:
            raise ValueError(
                "X_prueba has %d features but the centroids have %d"
                % (X.shape[1], matriz.shape[1])
            )

        # Broadcast to (samples, classes, features). The square root is
        # skipped because it does not change which centroid is closest.
        diferencias = X[:, np.newaxis, :] - matriz[np.newaxis, :, :]
        distancias_cuadradas = (diferencias ** 2).sum(axis=2)
        mas_cercanos = distancias_cuadradas.argmin(axis=1)
        return [clases[indice] for indice in mas_cercanos]

In [8]:
# Instantiate the minimum-distance classifier; the evaluation cells below
# call ajustar() on this same instance for every dataset and partition.
clasificador = ClasificadorDistanciaMinima()

In [9]:
# Iris dataset: evaluate the minimum-distance classifier with holdout,
# stratified K-fold and leave-one-out validation.
print("\n--- Validaciones para el Dataset Iris ---")
X = featuresIris[['sepal length', 'sepal width', 'petal length', 'petal width']]
Y = featuresIris['class_encoded']

# Fix the label order once so every confusion matrix has the same shape and
# row/column meaning, even when a test partition does not contain every class.
etiquetas = sorted(set(Y))
num_clases = len(etiquetas)

# Holdout validation
irisX_train, irisX_test, irisY_train, irisY_test = holdout_validation(X, Y)
clasificador.ajustar(irisX_train, irisY_train)
predicciones_holdout = clasificador.predecir(irisX_test)
conf_matrix_holdout = confusion_matrix(irisY_test, predicciones_holdout, labels=etiquetas)
accuracy_holdout = accuracy_score(irisY_test, predicciones_holdout)

print("Matriz de Confusión - Holdout:")
print(conf_matrix_holdout)
print("Precisión - Holdout:", accuracy_holdout)

# K-fold validation: confusion matrices are summed, accuracies averaged.
folds = k_fold_cross_validation(X, Y)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []

for irisX_train, irisX_test, irisY_train, irisY_test in folds:
    clasificador.ajustar(irisX_train, irisY_train)
    predicciones_fold = clasificador.predecir(irisX_test)
    conf_matrix_kfold += confusion_matrix(irisY_test, predicciones_fold, labels=etiquetas)
    accuracy_kfold.append(accuracy_score(irisY_test, predicciones_fold))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\nMatriz de Confusión - K-Fold (Acumulada):")
print(conf_matrix_kfold)
print("Precisión Promedio - K-Fold:", average_accuracy_kfold)

# Leave-one-out validation: each test partition holds a single sample, so an
# explicit label list is required to obtain a full-size matrix.
conf_matrix_leave_one_out = np.zeros((num_clases, num_clases))
accuracy_leave_one_out = []

leaveVali = leave_one_out_validation(X, Y)
for irisX_train, irisX_test, irisY_train, irisY_test in leaveVali:
    clasificador.ajustar(irisX_train, irisY_train)
    predicciones_leave = clasificador.predecir(irisX_test)
    conf_matrix_leave_one_out += confusion_matrix(irisY_test, predicciones_leave, labels=etiquetas)
    accuracy_leave_one_out.append(accuracy_score(irisY_test, predicciones_leave))

average_accuracy_leave_one_out = np.mean(accuracy_leave_one_out)

print("\nMatriz de Confusión - Leave-One-Out (Acumulada):")
print(conf_matrix_leave_one_out)
print("Precisión Promedio - Leave-One-Out:", average_accuracy_leave_one_out)


--- Validaciones para el Dataset Iris ---
Matriz de Confusión - Holdout:
[[15  0  0]
 [ 0 14  1]
 [ 0  3 12]]
Precisión - Holdout: 0.9111111111111111

Matriz de Confusión - K-Fold (Acumulada):
[[50.  0.  0.]
 [ 0. 45.  5.]
 [ 0.  7. 43.]]
Precisión Promedio - K-Fold: 0.9200000000000002

Matriz de Confusión - Leave-One-Out (Acumulada):
[[50.  0.  0.]
 [ 0. 45.  5.]
 [ 0.  7. 43.]]
Precisión Promedio - Leave-One-Out: 0.92


In [10]:
# Credit Card dataset: evaluate the minimum-distance classifier with holdout,
# stratified K-fold and leave-one-out validation.
print("\n--- Validaciones para el Dataset Credit Card ---")
X = featuresCredit[['Debt', 'Married', 'Industry_encoded', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore']]
Y = featuresCredit['Approved']

# Fix the label order once so every confusion matrix has the same shape and
# row/column meaning, even when a test partition does not contain every class.
etiquetas = sorted(set(Y))
num_clases = len(etiquetas)

# Holdout validation
creditX_train, creditX_test, creditY_train, creditY_test = holdout_validation(X, Y)
clasificador.ajustar(creditX_train, creditY_train)
predicciones_holdout = clasificador.predecir(creditX_test)
conf_matrix_holdout = confusion_matrix(creditY_test, predicciones_holdout, labels=etiquetas)
accuracy_holdout = accuracy_score(creditY_test, predicciones_holdout)

print("Matriz de Confusión - Holdout:")
print(conf_matrix_holdout)
print("Precisión - Holdout:", accuracy_holdout)

# K-fold validation: confusion matrices are summed, accuracies averaged.
folds = k_fold_cross_validation(X, Y)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []

for creditX_train, creditX_test, creditY_train, creditY_test in folds:
    clasificador.ajustar(creditX_train, creditY_train)
    predicciones_fold = clasificador.predecir(creditX_test)
    conf_matrix_kfold += confusion_matrix(creditY_test, predicciones_fold, labels=etiquetas)
    accuracy_kfold.append(accuracy_score(creditY_test, predicciones_fold))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\nMatriz de Confusión - K-Fold (Acumulada):")
print(conf_matrix_kfold)
print("Precisión Promedio - K-Fold:", average_accuracy_kfold)

# Leave-one-out validation: each test partition holds a single sample, so an
# explicit label list is required to obtain a full-size matrix.
conf_matrix_leave_one_out = np.zeros((num_clases, num_clases))
accuracy_leave_one_out = []

leaveVali = leave_one_out_validation(X, Y)
for creditX_train, creditX_test, creditY_train, creditY_test in leaveVali:
    clasificador.ajustar(creditX_train, creditY_train)
    predicciones_leave = clasificador.predecir(creditX_test)
    conf_matrix_leave_one_out += confusion_matrix(creditY_test, predicciones_leave, labels=etiquetas)
    accuracy_leave_one_out.append(accuracy_score(creditY_test, predicciones_leave))

average_accuracy_leave_one_out = np.mean(accuracy_leave_one_out)

print("\nMatriz de Confusión - Leave-One-Out (Acumulada):")
print(conf_matrix_leave_one_out)
print("Precisión Promedio - Leave-One-Out:", average_accuracy_leave_one_out)


--- Validaciones para el Dataset Credit Card ---
Matriz de Confusión - Holdout:
[[101  14]
 [ 37  55]]
Precisión - Holdout: 0.7536231884057971

Matriz de Confusión - K-Fold (Acumulada):
[[331.  52.]
 [122. 185.]]
Precisión Promedio - K-Fold: 0.7478260869565218

Matriz de Confusión - Leave-One-Out (Acumulada):
[[332.  51.]
 [121. 186.]]
Precisión Promedio - Leave-One-Out: 0.7507246376811594


In [11]:
# Obesity dataset: evaluate the minimum-distance classifier with holdout,
# stratified K-fold and leave-one-out validation.
print("\n--- Validaciones para el Dataset Obesity ---")
X = featuresObesity[['Height', 'Weight', 'BMI']]
Y = featuresObesity['ObesityCategory_encoded']

# Fix the label order once so every confusion matrix has the same shape and
# row/column meaning, even when a test partition does not contain every class.
etiquetas = sorted(set(Y))
num_clases = len(etiquetas)

# Holdout validation
obesityX_train, obesityX_test, obesityY_train, obesityY_test = holdout_validation(X, Y)
clasificador.ajustar(obesityX_train, obesityY_train)
predicciones_holdout = clasificador.predecir(obesityX_test)
conf_matrix_holdout = confusion_matrix(obesityY_test, predicciones_holdout, labels=etiquetas)
accuracy_holdout = accuracy_score(obesityY_test, predicciones_holdout)

print("Matriz de Confusión - Holdout:")
print(conf_matrix_holdout)
print("Precisión - Holdout:", accuracy_holdout)

# K-fold validation: confusion matrices are summed, accuracies averaged.
folds = k_fold_cross_validation(X, Y)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []

for obesityX_train, obesityX_test, obesityY_train, obesityY_test in folds:
    clasificador.ajustar(obesityX_train, obesityY_train)
    predicciones_fold = clasificador.predecir(obesityX_test)
    conf_matrix_kfold += confusion_matrix(obesityY_test, predicciones_fold, labels=etiquetas)
    accuracy_kfold.append(accuracy_score(obesityY_test, predicciones_fold))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\nMatriz de Confusión - K-Fold (Acumulada):")
print(conf_matrix_kfold)
print("Precisión Promedio - K-Fold:", average_accuracy_kfold)

# Leave-one-out validation: each test partition holds a single sample, so an
# explicit label list is required to obtain a full-size matrix.
conf_matrix_leave_one_out = np.zeros((num_clases, num_clases))
accuracy_leave_one_out = []

leaveVali = leave_one_out_validation(X, Y)
for obesityX_train, obesityX_test, obesityY_train, obesityY_test in leaveVali:
    clasificador.ajustar(obesityX_train, obesityY_train)
    predicciones_leave = clasificador.predecir(obesityX_test)
    conf_matrix_leave_one_out += confusion_matrix(obesityY_test, predicciones_leave, labels=etiquetas)
    accuracy_leave_one_out.append(accuracy_score(obesityY_test, predicciones_leave))

average_accuracy_leave_one_out = np.mean(accuracy_leave_one_out)

print("\nMatriz de Confusión - Leave-One-Out (Acumulada):")
print(conf_matrix_leave_one_out)
print("Precisión Promedio - Leave-One-Out:", average_accuracy_leave_one_out)


--- Validaciones para el Dataset Obesity ---
Matriz de Confusión - Holdout:
[[102   0   5   4]
 [  0  48   9   0]
 [ 15   1  73   0]
 [  4   0   0  39]]
Precisión - Holdout: 0.8733333333333333

Matriz de Confusión - K-Fold (Acumulada):
[[306.   0.  41.  24.]
 [  0. 158.  33.   0.]
 [ 31.  11. 253.   0.]
 [ 12.   0.   0. 131.]]
Precisión Promedio - K-Fold: 0.8480000000000001

Matriz de Confusión - Leave-One-Out (Acumulada):
[[302.   0.  42.  27.]
 [  0. 158.  33.   0.]
 [ 31.  10. 254.   0.]
 [ 12.   0.   0. 131.]]
Precisión Promedio - Leave-One-Out: 0.845


## Clasificador 1NN

In [12]:
from sklearn.neighbors import KNeighborsClassifier
# Create the 1NN classifier (k-nearest neighbours with a single neighbour);
# the evaluation cells below refit this same instance on every partition.
knn = KNeighborsClassifier(n_neighbors=1)

In [13]:
# Iris dataset: evaluate the 1NN classifier with holdout, stratified K-fold
# and leave-one-out validation.
X = featuresIris[['sepal length','sepal width','petal length','petal width']]
Y = featuresIris['class_encoded']

# Fix the label order once so every confusion matrix has the same shape and
# row/column meaning, even when a test partition does not contain every class.
etiquetas = sorted(set(Y))
num_clases = len(etiquetas)

# Holdout validation
irisX_train, irisX_test, irisY_train, irisY_test = holdout_validation(X, Y)
knn.fit(irisX_train, irisY_train)
predictionsHold = knn.predict(irisX_test)

conf_matrix_holdout = confusion_matrix(irisY_test, predictionsHold, labels=etiquetas)
accuracy_holdout = accuracy_score(irisY_test, predictionsHold)

print("Matriz de Confusión - Holdout:")
print(conf_matrix_holdout)
print("Precisión - Holdout:", accuracy_holdout)

# K-fold validation: confusion matrices are summed, accuracies averaged.
folds = k_fold_cross_validation(X, Y)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []
for irisX_train, irisX_test, irisY_train, irisY_test in folds:
    knn.fit(irisX_train, irisY_train)
    predictionsFold = knn.predict(irisX_test)
    conf_matrix_kfold += confusion_matrix(irisY_test, predictionsFold, labels=etiquetas)
    accuracy_kfold.append(accuracy_score(irisY_test, predictionsFold))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\nMatriz de Confusión - K-Fold (Acumulada):")
print(conf_matrix_kfold)
print("Precisión Promedio - K-Fold:", average_accuracy_kfold)


# Leave-one-out validation: each test partition holds a single sample, so an
# explicit label list is required to obtain a full-size matrix.
conf_matrix_leave_one_out = np.zeros((num_clases, num_clases))
accuracy_leave_one_out = []

leaveVali = leave_one_out_validation(X, Y)
for irisX_train, irisX_test, irisY_train, irisY_test in leaveVali:
    knn.fit(irisX_train, irisY_train)
    predictionsLeave = knn.predict(irisX_test)
    conf_matrix_fold = confusion_matrix(irisY_test, predictionsLeave, labels=etiquetas)
    conf_matrix_leave_one_out += conf_matrix_fold
    accuracy_leave_one_out.append(accuracy_score(irisY_test, predictionsLeave))

average_accuracy_leave_one_out = np.mean(accuracy_leave_one_out)

print("\nMatriz de Confusión - Leave-One-Out (Acumulada):")
print(conf_matrix_leave_one_out)
print("Precisión Promedio - Leave-One-Out:", average_accuracy_leave_one_out)

Matriz de Confusión - Holdout:
[[15  0  0]
 [ 0 15  0]
 [ 0  3 12]]
Precisión - Holdout: 0.9333333333333333

Matriz de Confusión - K-Fold (Acumulada):
[[50.  0.  0.]
 [ 0. 47.  3.]
 [ 0.  3. 47.]]
Precisión Promedio - K-Fold: 0.9600000000000002

Matriz de Confusión - Leave-One-Out (Acumulada):
[[50.  0.  0.]
 [ 0. 47.  3.]
 [ 0.  3. 47.]]
Precisión Promedio - Leave-One-Out: 0.96


In [14]:
# Credit Card dataset: evaluate the 1NN classifier with holdout, stratified
# K-fold and leave-one-out validation.
X = featuresCredit[['Debt','Married','Industry_encoded','YearsEmployed','PriorDefault','Employed','CreditScore']]
Y = featuresCredit['Approved']

# Fix the label order once so every confusion matrix has the same shape and
# row/column meaning, even when a test partition does not contain every class.
etiquetas = sorted(set(Y))
num_clases = len(etiquetas)

# Holdout validation
creditX_train, creditX_test, creditY_train, creditY_test = holdout_validation(X, Y)
knn.fit(creditX_train, creditY_train)
predictionsHold = knn.predict(creditX_test)

conf_matrix_holdout = confusion_matrix(creditY_test, predictionsHold, labels=etiquetas)
accuracy_holdout = accuracy_score(creditY_test, predictionsHold)

print("Matriz de Confusión - Holdout:")
print(conf_matrix_holdout)
print("Precisión - Holdout:", accuracy_holdout)

# K-fold validation: confusion matrices are summed, accuracies averaged.
folds = k_fold_cross_validation(X, Y)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []
for creditX_train, creditX_test, creditY_train, creditY_test in folds:
    knn.fit(creditX_train, creditY_train)
    predictionsFold = knn.predict(creditX_test)
    conf_matrix_kfold += confusion_matrix(creditY_test, predictionsFold, labels=etiquetas)
    accuracy_kfold.append(accuracy_score(creditY_test, predictionsFold))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\nMatriz de Confusión - K-Fold (Acumulada):")
print(conf_matrix_kfold)
print("Precisión Promedio - K-Fold:", average_accuracy_kfold)


# Leave-one-out validation: each test partition holds a single sample, so an
# explicit label list is required to obtain a full-size matrix.
conf_matrix_leave_one_out = np.zeros((num_clases, num_clases))
accuracy_leave_one_out = []

leaveVali = leave_one_out_validation(X, Y)
for creditX_train, creditX_test, creditY_train, creditY_test in leaveVali:
    knn.fit(creditX_train, creditY_train)
    predictionsLeave = knn.predict(creditX_test)
    conf_matrix_fold = confusion_matrix(creditY_test, predictionsLeave, labels=etiquetas)
    conf_matrix_leave_one_out += conf_matrix_fold
    accuracy_leave_one_out.append(accuracy_score(creditY_test, predictionsLeave))

average_accuracy_leave_one_out = np.mean(accuracy_leave_one_out)

print("\nMatriz de Confusión - Leave-One-Out (Acumulada):")
print(conf_matrix_leave_one_out)
print("Precisión Promedio - Leave-One-Out:", average_accuracy_leave_one_out)

Matriz de Confusión - Holdout:
[[96 19]
 [28 64]]
Precisión - Holdout: 0.7729468599033816

Matriz de Confusión - K-Fold (Acumulada):
[[312.  71.]
 [ 94. 213.]]
Precisión Promedio - K-Fold: 0.7608695652173912

Matriz de Confusión - Leave-One-Out (Acumulada):
[[319.  64.]
 [ 90. 217.]]
Precisión Promedio - Leave-One-Out: 0.7768115942028986


In [15]:
# Obesity dataset: evaluate the 1NN classifier with holdout, stratified
# K-fold and leave-one-out validation.
X = featuresObesity[['Height','Weight','BMI']]
Y = featuresObesity['ObesityCategory_encoded']

# Fix the label order once so every confusion matrix has the same shape and
# row/column meaning, even when a test partition does not contain every class.
etiquetas = sorted(set(Y))
num_clases = len(etiquetas)

# Holdout validation
obesityX_train, obesityX_test, obesityY_train, obesityY_test = holdout_validation(X, Y)
knn.fit(obesityX_train, obesityY_train)
predictionsHold = knn.predict(obesityX_test)

conf_matrix_holdout = confusion_matrix(obesityY_test, predictionsHold, labels=etiquetas)
accuracy_holdout = accuracy_score(obesityY_test, predictionsHold)

print("Matriz de Confusión - Holdout:")
print(conf_matrix_holdout)
print("Precisión - Holdout:", accuracy_holdout)

# K-fold validation: confusion matrices are summed, accuracies averaged.
folds = k_fold_cross_validation(X, Y)
conf_matrix_kfold = np.zeros((num_clases, num_clases))
accuracy_kfold = []
for obesityX_train, obesityX_test, obesityY_train, obesityY_test in folds:
    knn.fit(obesityX_train, obesityY_train)
    predictionsFold = knn.predict(obesityX_test)
    conf_matrix_kfold += confusion_matrix(obesityY_test, predictionsFold, labels=etiquetas)
    accuracy_kfold.append(accuracy_score(obesityY_test, predictionsFold))

average_accuracy_kfold = np.mean(accuracy_kfold)

print("\nMatriz de Confusión - K-Fold (Acumulada):")
print(conf_matrix_kfold)
print("Precisión Promedio - K-Fold:", average_accuracy_kfold)


# Leave-one-out validation: each test partition holds a single sample, so an
# explicit label list is required to obtain a full-size matrix.
conf_matrix_leave_one_out = np.zeros((num_clases, num_clases))
accuracy_leave_one_out = []

leaveVali = leave_one_out_validation(X, Y)
for obesityX_train, obesityX_test, obesityY_train, obesityY_test in leaveVali:
    knn.fit(obesityX_train, obesityY_train)
    predictionsLeave = knn.predict(obesityX_test)
    conf_matrix_fold = confusion_matrix(obesityY_test, predictionsLeave, labels=etiquetas)
    conf_matrix_leave_one_out += conf_matrix_fold
    accuracy_leave_one_out.append(accuracy_score(obesityY_test, predictionsLeave))

average_accuracy_leave_one_out = np.mean(accuracy_leave_one_out)

print("\nMatriz de Confusión - Leave-One-Out (Acumulada):")
print(conf_matrix_leave_one_out)
print("Precisión Promedio - Leave-One-Out:", average_accuracy_leave_one_out)

Matriz de Confusión - Holdout:
[[109   0   1   1]
 [  0  54   3   0]
 [  2   0  87   0]
 [  1   0   0  42]]
Precisión - Holdout: 0.9733333333333334

Matriz de Confusión - K-Fold (Acumulada):
[[366.   0.   1.   4.]
 [  0. 188.   3.   0.]
 [  2.   5. 288.   0.]
 [  3.   0.   0. 140.]]
Precisión Promedio - K-Fold: 0.982

Matriz de Confusión - Leave-One-Out (Acumulada):
[[364.   0.   3.   4.]
 [  0. 188.   3.   0.]
 [  3.   5. 287.   0.]
 [  4.   0.   0. 139.]]
Precisión Promedio - Leave-One-Out: 0.978
