# Parte I. 
Implementa el método de SMOTE, descrito en: 
https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/

Prueba tu implementación con el dataset:
Glass

Aplica el clasificador Euclidiano y 1NN antes y después de aplicar el SMOTE

Valida el desempeño de los clasificadores con 
Hold-Out 
10-Fold Cross-Validation

Determina si hubo mejoras en el desempeño de los clasificadores. 

paquete para dataset glass

In [3]:
pip install ucimlrepo


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


importo el dataset 

In [4]:
from ucimlrepo import fetch_ucirepo  # Importar la función para descargar datasets

# Descargar el dataset Glass
glass_data = fetch_ucirepo(id=42)

# Extraer características y etiquetas
X = glass_data.data.features  # Características del dataset
y = glass_data.data.targets   # Etiquetas del dataset

# Mostrar información del dataset (opcional)
print(glass_data.metadata)     # Información general
print(glass_data.variables)    # Detalle de las variables


{'uci_id': 42, 'name': 'Glass Identification', 'repository_url': 'https://archive.ics.uci.edu/dataset/42/glass+identification', 'data_url': 'https://archive.ics.uci.edu/static/public/42/data.csv', 'abstract': 'From USA Forensic Science Service; 6 types of glass; defined in terms of their oxide content (i.e. Na, Fe, K, etc)', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 214, 'num_features': 9, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Type_of_glass'], 'index_col': ['Id_number'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1987, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5WW2P', 'creators': ['B. German'], 'intro_paper': None, 'additional_info': {'summary': 'Vina conducted a comparison test of her rule-based system, BEAGLE, the nearest-neighbor algorithm, and discriminant analysis.  BEAGLE is a product available through VRS Consulting, In

In [12]:
import numpy as np
from scipy.spatial import distance

# SMOTE manual
def smote(X, y, minority_class, k=5):
    np.random.seed(42)  # Para reproducibilidad

    # Asegurarse de que X e y son arreglos numpy
    X = np.array(X)
    y = np.array(y)

    # Filtrar datos de la clase minoritaria
    minority_indices = np.where(y == minority_class)[0]
    X_minority = X[minority_indices]
    
    synthetic_samples = []
    
    # Generar muestras sintéticas para cada punto de la clase minoritaria
    for sample in X_minority:
        # Calcular distancias a todos los puntos de la misma clase
        dists = distance.cdist([sample], X_minority, metric='euclidean')[0]
        nearest_neighbors = np.argsort(dists)[1:k+1]  # Excluir el propio punto
        
        # Crear muestras sintéticas
        for neighbor in nearest_neighbors:
            diff = X_minority[neighbor] - sample
            new_sample = sample + np.random.rand() * diff
            synthetic_samples.append(new_sample)
    
    # Convertir las muestras sintéticas a un arreglo de numpy
    synthetic_samples = np.array(synthetic_samples)
    
    # Concatenar los datos originales con los nuevos
    X_synthetic = np.vstack([X, synthetic_samples])
    y_synthetic = np.hstack([y.flatten(), np.full(synthetic_samples.shape[0], minority_class)])
    
    return X_synthetic, y_synthetic




# Clasificador Euclidiano
def euclidean_classifier(X_train, y_train, X_test):
    predictions = []
    for test_sample in X_test:
        dists = distance.cdist([test_sample], X_train, metric='euclidean')[0]
        closest_idx = np.argmin(dists)
        predictions.append(y_train[closest_idx])
    return np.array(predictions)

# Clasificador 1NN (idéntico al Euclidiano, pero explícito para nombrar la comparación)
def one_nn_classifier(X_train, y_train, X_test):
    return euclidean_classifier(X_train, y_train, X_test)

# Validación Hold-Out
def hold_out_validation(X, y, classifier, test_size=0.3):
    np.random.seed(42)
    indices = np.arange(len(y))
    np.random.shuffle(indices)
    split_idx = int(len(y) * (1 - test_size))
    
    X_train, X_test = X[indices[:split_idx]], X[indices[split_idx:]]
    y_train, y_test = y[indices[:split_idx]], y[indices[split_idx:]]
    
    y_pred = classifier(X_train, y_train, X_test)
    accuracy = np.mean(y_pred == y_test)
    return accuracy

# Validación 10-Fold Cross-Validation
def cross_validation(X, y, classifier, k=10):
    n_samples = len(y)
    fold_size = n_samples // k
    indices = np.arange(n_samples)
    np.random.seed(42)
    np.random.shuffle(indices)
    accuracies = []
    
    for i in range(k):
        start, end = i * fold_size, (i + 1) * fold_size
        test_indices = indices[start:end]
        train_indices = np.hstack([indices[:start], indices[end:]])
        
        # Asegurarse de que no haya índices vacíos
        if len(test_indices) == 0 or len(train_indices) == 0:
            continue

        X_test, y_test = X[test_indices], y[test_indices]
        X_train, y_train = X[train_indices], y[train_indices]

        y_pred = classifier(X_train, y_train, X_test)
        accuracies.append(np.mean(y_pred == y_test))
    
    if len(accuracies) == 0:
        raise ValueError("No se pudieron generar pliegues válidos para la validación cruzada.")
    
    return np.mean(accuracies)


# Dataset Glass con ucimlrepo
from ucimlrepo import fetch_ucirepo
glass_data = fetch_ucirepo(id=42)
X = np.array(glass_data.data.features)
y = np.array(glass_data.data.targets)

# Identificar clase minoritaria
classes, counts = np.unique(y, return_counts=True)
minority_class = classes[np.argmin(counts)]

# Resultados sin SMOTE
print("\nResultados sin SMOTE:")
hold_out_acc_euclidean = hold_out_validation(X, y, euclidean_classifier)
hold_out_acc_1nn = hold_out_validation(X, y, one_nn_classifier)
cv_acc_euclidean = cross_validation(X, y, euclidean_classifier)
cv_acc_1nn = cross_validation(X, y, one_nn_classifier)

print(f"Hold-Out Euclidean Accuracy: {hold_out_acc_euclidean:.4f}")
print(f"Hold-Out 1NN Accuracy: {hold_out_acc_1nn:.4f}")
print(f"10-Fold CV Euclidean Accuracy: {cv_acc_euclidean:.4f}")
print(f"10-Fold CV 1NN Accuracy: {cv_acc_1nn:.4f}")

# Aplicar SMOTE
X_smote, y_smote = smote(X, y, minority_class)

# Resultados con SMOTE
print("\nResultados con SMOTE:")
hold_out_acc_euclidean_smote = hold_out_validation(X_smote, y_smote, euclidean_classifier)
hold_out_acc_1nn_smote = hold_out_validation(X_smote, y_smote, one_nn_classifier)
cv_acc_euclidean_smote = cross_validation(X_smote, y_smote, euclidean_classifier)
cv_acc_1nn_smote = cross_validation(X_smote, y_smote, one_nn_classifier)

print(f"Hold-Out Euclidean Accuracy (SMOTE): {hold_out_acc_euclidean_smote:.4f}")
print(f"Hold-Out 1NN Accuracy (SMOTE): {hold_out_acc_1nn_smote:.4f}")
print(f"10-Fold CV Euclidean Accuracy (SMOTE): {cv_acc_euclidean_smote:.4f}")
print(f"10-Fold CV 1NN Accuracy (SMOTE): {cv_acc_1nn_smote:.4f}")



Resultados sin SMOTE:
Hold-Out Euclidean Accuracy: 0.7231
Hold-Out 1NN Accuracy: 0.7231
10-Fold CV Euclidean Accuracy: 0.7286
10-Fold CV 1NN Accuracy: 0.7286

Resultados con SMOTE:
Hold-Out Euclidean Accuracy (SMOTE): 0.7692
Hold-Out 1NN Accuracy (SMOTE): 0.7692
10-Fold CV Euclidean Accuracy (SMOTE): 0.7960
10-Fold CV 1NN Accuracy (SMOTE): 0.7960
