# Juan Felipe Osorio Franco


# Ejercicio:

- Consulte en qué consiste el clasificador de vecinos más cercanos en sklearn (ver [documentación knn](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) y [knn sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)).

- Realice una comparación entre los clasificadores SGD y RandomForest implementados en el cuaderno y KNN (un vecino, distancia euclídea) para el problema multiclase MNIST en términos de ACC, F1 y AUC.

In [2]:
import numpy as np
import tensorflow as tf
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize

# Load MNIST and stack the loader's two partitions into a single image array
# and a single label array.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X = np.concatenate((X_train, X_test))
y = np.concatenate((y_train, y_test))

# Flatten each image into one feature row, then rescale the pixel values by 1/255.
X = X.reshape(len(X), -1) / 255.

# The first 60000 rows become the training set; everything after is the test set.
X_train, X_test = np.split(X, [60000])
y_train, y_test = np.split(y, [60000])

# Standardize the features: the scaler's statistics come from the training rows
# only and are then re-used unchanged on the test rows.
# NOTE(review): the scaler sees the whole training set, so any later
# cross-validation on X_train_scaled shares scaling statistics across folds.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
X_test_scaled = scaler.transform(X_test.astype(np.float64))

# Models under comparison, keyed by the name shown in the final report.
classifiers = dict(
    # Linear model trained with stochastic gradient descent, capped at 5 epochs.
    SGD=SGDClassifier(random_state=42, max_iter=5),
    # Small forest of 10 trees; the seed is fixed for reproducibility.
    RandomForest=RandomForestClassifier(random_state=42, n_estimators=10),
    # Single nearest neighbour; the distance metric is left at the library
    # default (presumably Minkowski with p=2, i.e. Euclidean — see sklearn docs).
    KNN=KNeighborsClassifier(n_neighbors=1),
)

# Cross-validated Accuracy / F1 / AUC for every classifier, keyed by its name.
results = {}

# Loop-invariant inputs, computed once instead of once per classifier:
#   class_labels - the sorted distinct labels; cross_val_predict orders the
#                  columns of its score matrix the same way.
#   y_bin        - one-vs-rest indicator matrix of the true labels, used as the
#                  ground truth for the per-class ROC curves.
class_labels = np.unique(y_train)
y_bin = label_binarize(y_train, classes=class_labels)

for name, clf in classifiers.items():
    print(f"\nEvaluando {name}...")

    # Fit on the full training set. cross_val_predict works on clones and does
    # not need this; it is kept only so the estimators stored in `classifiers`
    # are left fitted after the loop.
    # NOTE(review): drop this call if nothing downstream uses the fitted models.
    clf.fit(X_train_scaled, y_train)

    # Prefer raw decision scores when the estimator exposes them, otherwise
    # fall back to class probabilities. Either works for ROC AUC, which only
    # depends on the ranking of the scores.
    if hasattr(clf, "decision_function"):
        score_method = "decision_function"
    else:
        score_method = "predict_proba"

    # Single 3-fold pass producing out-of-fold scores, one column per class.
    # The original code ran a second, separate 3-fold pass just to obtain the
    # predicted labels, doubling the (expensive) cross-validation cost.
    y_scores = cross_val_predict(
        clf, X_train_scaled, y_train, cv=3, method=score_method
    )

    # Predicted label = class with the highest score. For the estimators used
    # here (multiclass SGD, random forest, 1-nearest-neighbour) this is the
    # same rule their own predict() applies, so the labels match what a
    # dedicated method="predict" pass would return.
    # The former `y_scores.ndim == 1` branch was removed: it can only trigger
    # for binary problems, and `1 - score` is not a valid complement for
    # decision-function margins anyway.
    y_pred = class_labels[np.argmax(y_scores, axis=1)]

    acc = accuracy_score(y_train, y_pred)
    f1 = f1_score(y_train, y_pred, average='weighted')

    # Macro-averaged one-vs-rest AUC: one ROC curve per class, then the mean.
    auc = roc_auc_score(y_bin, y_scores, multi_class='ovr', average='macro')

    results[name] = {
        'Accuracy': acc,
        'F1-Score': f1,
        'AUC': auc
    }

# Report: a header and a 50-character rule, followed by one section per
# classifier listing each metric rounded to four decimals.
print("\nResultados de la comparación:", "-" * 50, sep="\n")
for clf_name in results:
    metrics = results[clf_name]
    print(f"\n{clf_name}:")
    for metric_name in metrics:
        value = metrics[metric_name]
        print(f"{metric_name}: {value:.4f}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
11490434/11490434 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step

Evaluando SGD...





Evaluando RandomForest...

Evaluando KNN...

Resultados de la comparación:
--------------------------------------------------

SGD:
Accuracy: 0.9085
F1-Score: 0.9083
AUC: 0.9820

RandomForest:
Accuracy: 0.9397
F1-Score: 0.9396
AUC: 0.9938

KNN:
Accuracy: 0.9386
F1-Score: 0.9385
AUC: 0.9654
