# SMOTE y Perceptron Simple

## SMOTE

In [1]:
import numpy as np
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from ucimlrepo import fetch_ucirepo

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

class EuclideanClassifier:
    def __init__(self):
        self.centroids = {}
        
    def fit(self, X, y):
        classes = np.unique(y)
        for c in classes:
            # Calcular el centroide para cada clase
            self.centroids[c] = np.mean(X[y == c], axis=0)
            
    def predict(self, X):
        predictions = []
        for x in X:
            distances = {c: euclidean_distance(x, centroid) 
                        for c, centroid in self.centroids.items()}
            predictions.append(min(distances.items(), key=lambda x: x[1])[0])
        return np.array(predictions)

class KNNClassifier:
    def __init__(self, k=1):
        self.k = k
        self.X_train = None
        self.y_train = None
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        
    def predict(self, X):
        predictions = []
        for x in X:
            distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            most_common = np.bincount(k_nearest_labels).argmax()
            predictions.append(most_common)
        return np.array(predictions)

def smote(X, y, k=5, minority_target=None, n_synthetic=None):
    """
    Implementación de SMOTE (Synthetic Minority Over-sampling Technique)
    """
    # Identificar la clase minoritaria si no se especifica
    if minority_target is None:
        counts = Counter(y)
        minority_target = min(counts, key=counts.get)
    
    # Obtener ejemplos de la clase minoritaria
    minority_indices = np.where(y == minority_target)[0]
    X_minority = X[minority_indices]
    
    if n_synthetic is None:
        # Calcular cuántas muestras sintéticas generar
        counts = Counter(y)
        n_synthetic = max(counts.values()) - counts[minority_target]
    
    synthetic_samples = []
    
    # Generar muestras sintéticas
    for _ in range(n_synthetic):
        # Seleccionar un ejemplo de la clase minoritaria al azar
        idx = np.random.randint(0, len(X_minority))
        point = X_minority[idx]
        
        # Encontrar k vecinos más cercanos
        distances = [euclidean_distance(point, p) for p in X_minority]
        sorted_indices = np.argsort(distances)[1:k+1]  # Excluir el punto mismo
        
        # Seleccionar un vecino al azar
        neighbor_idx = np.random.choice(sorted_indices)
        neighbor = X_minority[neighbor_idx]
        
        # Generar punto sintético
        diff = neighbor - point
        gap = np.random.random()
        synthetic_point = point + gap * diff
        
        synthetic_samples.append(synthetic_point)
    
    # Combinar datos originales con sintéticos
    X_synthetic = np.vstack((X, synthetic_samples))
    y_synthetic = np.hstack((y, [minority_target] * n_synthetic))
    
    return X_synthetic, y_synthetic

def evaluate_classifier(clf, X, y, test_size=0.2, cv_folds=10):
    """
    Evalúa el clasificador usando both hold-out y k-fold cross-validation
    """
    results = {}
    
    # Hold-out validation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    holdout_accuracy = np.mean(y_pred == y_test)
    results['holdout'] = holdout_accuracy
    
    # K-fold cross-validation
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_scores = []
    
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        cv_scores.append(np.mean(y_pred == y_test))
    
    results['cv_mean'] = np.mean(cv_scores)
    results['cv_std'] = np.std(cv_scores)
    
    return results

# Cargar y preparar los datos
glass_identification = fetch_ucirepo(id=42)
X = glass_identification.data.features.values
y = glass_identification.data.targets.values.ravel()

# Crear instancias de los clasificadores
euclidean_clf = EuclideanClassifier()
knn_clf = KNNClassifier(k=1)

# Evaluar antes de SMOTE
print("Resultados antes de SMOTE:")
print("\nClasificador Euclidiano:")
euclidean_results_before = evaluate_classifier(euclidean_clf, X, y)
print(f"Hold-out accuracy: {euclidean_results_before['holdout']:.4f}")
print(f"CV accuracy: {euclidean_results_before['cv_mean']:.4f} ± {euclidean_results_before['cv_std']:.4f}")

print("\n1-NN:")
knn_results_before = evaluate_classifier(knn_clf, X, y)
print(f"Hold-out accuracy: {knn_results_before['holdout']:.4f}")
print(f"CV accuracy: {knn_results_before['cv_mean']:.4f} ± {knn_results_before['cv_std']:.4f}")

# Aplicar SMOTE
X_balanced, y_balanced = smote(X, y)

# Evaluar después de SMOTE
print("\nResultados después de SMOTE:")
print("\nClasificador Euclidiano:")
euclidean_results_after = evaluate_classifier(euclidean_clf, X_balanced, y_balanced)
print(f"Hold-out accuracy: {euclidean_results_after['holdout']:.4f}")
print(f"CV accuracy: {euclidean_results_after['cv_mean']:.4f} ± {euclidean_results_after['cv_std']:.4f}")

print("\n1-NN:")
knn_results_after = evaluate_classifier(knn_clf, X_balanced, y_balanced)
print(f"Hold-out accuracy: {knn_results_after['holdout']:.4f}")
print(f"CV accuracy: {knn_results_after['cv_mean']:.4f} ± {knn_results_after['cv_std']:.4f}")

Resultados antes de SMOTE:

Clasificador Euclidiano:
Hold-out accuracy: 0.5581
CV accuracy: 0.4522 ± 0.0737

1-NN:
Hold-out accuracy: 0.8372
CV accuracy: 0.7195 ± 0.0947

Resultados después de SMOTE:

Clasificador Euclidiano:
Hold-out accuracy: 0.6140
CV accuracy: 0.5622 ± 0.0736

1-NN:
Hold-out accuracy: 0.8772
CV accuracy: 0.8042 ± 0.0922


## Perceptron Simple

In [5]:
import numpy as np
from ucimlrepo import fetch_ucirepo

class SimplePerceptron:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        # Inicializar pesos con valores aleatorios pequeños
        self.weights = np.random.randn(n_features) * 0.01
        self.bias = 0
        
        # Historial de errores para monitoreo
        self.errors = []
        
        # Entrenamiento
        for _ in range(self.n_iterations):
            errors = 0
            
            for idx in range(n_samples):
                linear_output = np.dot(X[idx], self.weights) + self.bias
                prediction = self.activation_function(linear_output)
                error = y[idx] - prediction
                
                # Actualizar pesos y bias
                self.weights += self.learning_rate * error * X[idx]
                self.bias += self.learning_rate * error
                errors += abs(error)
            
            self.errors.append(errors)
            if errors == 0:
                break
    
    def predict(self, X):
        linear_output = np.dot(X, self.weights) + self.bias
        return np.array([self.activation_function(output) for output in linear_output])
    
    def activation_function(self, x):
        return 1 if x >= 0 else 0

def calculate_metrics(y_true, y_pred):
    """Calcula métricas de evaluación manualmente"""
    if len(y_true) == 0:
        return 0, 0, 0, 0, 0, 0, 0, 0
    
    # Calcular matriz de confusión
    tn = sum((yt == 0 and yp == 0) for yt, yp in zip(y_true, y_pred))
    fp = sum((yt == 0 and yp == 1) for yt, yp in zip(y_true, y_pred))
    fn = sum((yt == 1 and yp == 0) for yt, yp in zip(y_true, y_pred))
    tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred))
    
    # Calcular métricas
    accuracy = (tp + tn) / len(y_true) if len(y_true) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return accuracy, precision, recall, f1, tp, tn, fp, fn

# Cargar y preparar los datos
iris = fetch_ucirepo(id=53)
X_raw = iris.data.features.values
y_raw = iris.data.targets.values.ravel()

# Convertir etiquetas string a números
y_numeric = np.zeros_like(y_raw, dtype=int)
y_numeric[y_raw == 'Iris-setosa'] = 0
y_numeric[y_raw == 'Iris-virginica'] = 2

# Seleccionar solo setosa (0) y virginica (2)
mask = np.isin(y_numeric, [0, 2])
X = X_raw[mask]
y = y_numeric[mask]

# Convertir virginica (2) a 1 para clasificación binaria
y = (y == 2).astype(int)

# Normalizar características manualmente
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)
X = (X - X_mean) / X_std

# Verificar los datos procesados
print("Forma de X:", X.shape)
print("Forma de y:", y.shape)
print("Valores únicos en y:", np.unique(y))
print("Distribución de clases:", np.bincount(y))

# División Hold-out (70/30)
np.random.seed(42)
indices = np.random.permutation(len(X))
split_point = int(len(X) * 0.7)

X_train = X[indices[:split_point]]
X_test = X[indices[split_point:]]
y_train = y[indices[:split_point]]
y_test = y[indices[split_point:]]

# Verificar conjuntos de entrenamiento y prueba
print("\nConjuntos de entrenamiento y prueba:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Entrenar el perceptrón
perceptron = SimplePerceptron(learning_rate=0.1, n_iterations=1000)
perceptron.fit(X_train, y_train)

# Realizar predicciones
y_train_pred = perceptron.predict(X_train)
y_test_pred = perceptron.predict(X_test)

# Calcular y mostrar métricas para conjunto de entrenamiento
train_metrics = calculate_metrics(y_train, y_train_pred)
print("\nResultados en conjunto de entrenamiento:")
print(f"Accuracy: {train_metrics[0]:.4f}")
print(f"Precision: {train_metrics[1]:.4f}")
print(f"Recall: {train_metrics[2]:.4f}")
print(f"F1-Score: {train_metrics[3]:.4f}")
print(f"Matriz de Confusión:")
print(f"TP: {train_metrics[4]}, TN: {train_metrics[5]}")
print(f"FP: {train_metrics[6]}, FN: {train_metrics[7]}")

# Calcular y mostrar métricas para conjunto de prueba
test_metrics = calculate_metrics(y_test, y_test_pred)
print("\nResultados en conjunto de prueba:")
print(f"Accuracy: {test_metrics[0]:.4f}")
print(f"Precision: {test_metrics[1]:.4f}")
print(f"Recall: {test_metrics[2]:.4f}")
print(f"F1-Score: {test_metrics[3]:.4f}")
print(f"Matriz de Confusión:")
print(f"TP: {test_metrics[4]}, TN: {test_metrics[5]}")
print(f"FP: {test_metrics[6]}, FN: {test_metrics[7]}")

print("\nPesos finales:", perceptron.weights)
print("Bias final:", perceptron.bias)
print("Iteraciones hasta convergencia:", len(perceptron.errors))

Forma de X: (150, 4)
Forma de y: (150,)
Valores únicos en y: [0 1]
Distribución de clases: [100  50]

Conjuntos de entrenamiento y prueba:
X_train shape: (105, 4)
X_test shape: (45, 4)
y_train shape: (105,)
y_test shape: (45,)

Resultados en conjunto de entrenamiento:
Accuracy: 0.9524
Precision: 0.9655
Recall: 0.8750
F1-Score: 0.9180
Matriz de Confusión:
TP: 28, TN: 72
FP: 1, FN: 4

Resultados en conjunto de prueba:
Accuracy: 0.9556
Precision: 1.0000
Recall: 0.8889
F1-Score: 0.9412
Matriz de Confusión:
TP: 16, TN: 27
FP: 0, FN: 2

Pesos finales: [-0.48193779 -0.25098156  1.88421894  1.00312574]
Bias final: -2.0000000000000004
Iteraciones hasta convergencia: 1000
