# Trabajo 3: Clasificación de Imágenes Médicas (Pneumonia Detection)
## Parte 2: Clasificación con Descriptores Clásicos y CNN
**Objetivo:** Entrenar y evaluar múltiples clasificadores utilizando descriptores clásicos (HOG) y una red neuronal convolucional (CNN).

**Tareas:**
1. Crear matriz de características (HOG).
2. Normalización y Reducción de Dimensionalidad (PCA).
3. Entrenar clasificadores: SVM, Random Forest, k-NN, Logistic Regression.
4. Entrenar una CNN básica.
5. Evaluación comparativa.

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from skimage.feature import hog
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc, classification_report
import tensorflow as tf
from tensorflow.keras import layers, models

%matplotlib inline
# Default figure size (width, height) applied to plots that do not set their own.
plt.rcParams['figure.figsize'] = (10, 6)

### 1. Carga y Preprocesamiento de Datos
Cargamos las imágenes, las convertimos a escala de grises, redimensionamos a 224x224 y aplicamos CLAHE.

In [None]:
# Root of the dataset, relative to the notebook's working directory.
BASE_DIR = Path('../data')
# One sub-directory per split, all hanging off BASE_DIR.
TRAIN_DIR, TEST_DIR, VAL_DIR = (
    BASE_DIR / split_name for split_name in ('train', 'test', 'val')
)

# Target size handed to cv2.resize for every loaded image.
IMG_SIZE = (224, 224)

def load_and_preprocess_data(directory):
    """Load every image of a dataset split as a preprocessed grayscale array.

    ``directory`` must contain one sub-folder per class ('NORMAL' and
    'PNEUMONIA'); a missing sub-folder only produces a warning. Each readable
    ``*.jpeg`` / ``*.jpg`` / ``*.png`` file is read in grayscale, resized to
    ``IMG_SIZE`` and contrast-equalised with CLAHE.

    Args:
        directory: ``pathlib.Path`` of the split to load.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays built from parallel lists.
        ``labels`` holds 0 for NORMAL and 1 for PNEUMONIA. Both arrays are
        empty when no image could be loaded.
    """
    images = []
    labels = []
    classes = ['NORMAL', 'PNEUMONIA']
    skipped = 0

    # The CLAHE parameters are identical for every image, so the operator is
    # created once instead of once per image.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

    for label, class_name in enumerate(classes):
        path = directory / class_name
        if not path.exists():
            print(f'Warning: Path {path} does not exist.')
            continue

        # Collect the common image extensions. glob() order depends on the
        # filesystem, so sort to get the same sample order on every run.
        files = sorted(
            img_path
            for pattern in ('*.jpeg', '*.jpg', '*.png')
            for img_path in path.glob(pattern)
        )

        for img_path in files:
            # Read as single-channel grayscale; imread returns None on failure.
            img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
            if img is None:
                skipped += 1
                continue

            img = cv2.resize(img, IMG_SIZE)
            images.append(clahe.apply(img))
            labels.append(label)

    # Surface unreadable files instead of dropping them silently.
    if skipped:
        print(f'Warning: {skipped} file(s) under {directory} could not be read and were skipped.')

    return np.array(images), np.array(labels)

# Load the three dataset splits; each call returns an (images, labels) pair.
print('Cargando datos de entrenamiento...')
X_train_raw, y_train = load_and_preprocess_data(TRAIN_DIR)
print('Cargando datos de validación...')
X_val_raw, y_val = load_and_preprocess_data(VAL_DIR)
print('Cargando datos de prueba...')
X_test_raw, y_test = load_and_preprocess_data(TEST_DIR)

# Print the image-array shapes as a quick check that every split was loaded.
print(f'Train shape: {X_train_raw.shape}')
print(f'Val shape: {X_val_raw.shape}')
print(f'Test shape: {X_test_raw.shape}')

### 2. Extracción de Características (HOG)
Utilizamos Histogram of Oriented Gradients (HOG) como descriptor clásico.

In [None]:
def extract_hog_features(images):
    """Compute one HOG descriptor per image and return them as a NumPy array.

    Args:
        images: Sized iterable of 2-D grayscale images.

    Returns:
        ``np.ndarray`` built from the list of per-image HOG feature vectors
        (an empty array when ``images`` is empty).
    """
    print('Extrayendo características HOG... (esto puede tardar un poco)')
    total = len(images)
    descriptors = []

    for idx, image in enumerate(images):
        # Progress message every 500 images.
        if idx % 500 == 0:
            print(f'Procesando imagen {idx}/{total}')

        # 16x16-pixel cells keep the descriptor length manageable.
        descriptors.append(
            hog(
                image,
                orientations=9,
                pixels_per_cell=(16, 16),
                cells_per_block=(2, 2),
                block_norm='L2-Hys',
                visualize=False,
            )
        )

    return np.array(descriptors)

# Compute HOG descriptors for each split with the same extractor settings.
X_train_hog = extract_hog_features(X_train_raw)
X_val_hog = extract_hog_features(X_val_raw)
X_test_hog = extract_hog_features(X_test_raw)

# Shape of the training feature array (one HOG vector per training image).
print(f'HOG Train shape: {X_train_hog.shape}')

### 3. Normalización y Reducción de Dimensionalidad (PCA)
Combinamos Train y Val para el entrenamiento de modelos clásicos (para tener más datos en CV), normalizamos y aplicamos PCA para reducir el número de features.

In [None]:
# Combine Train and Val. A split whose folders were missing comes back from
# the loader as an empty 1-D array, which np.concatenate cannot join with the
# 2-D feature matrix (and an empty label array would turn the labels into
# floats), so empty splits are dropped before concatenating.
X_full_hog = np.concatenate(
    [feats for feats in (X_train_hog, X_val_hog) if feats.size]
)
y_full = np.concatenate(
    [split_labels for split_labels in (y_train, y_val) if split_labels.size]
)

# Standardisation: statistics are fitted on train+val; test is only transformed.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_full_hog)
X_test_scaled = scaler.transform(X_test_hog)

# PCA, fitted on train+val and then applied unchanged to test.
print('Aplicando PCA...')
pca = PCA(n_components=0.95)  # keep enough components to explain 95% of the variance
X_pca = pca.fit_transform(X_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f'Original features: {X_scaled.shape[1]}')
print(f'Reduced features (95% variance): {X_pca.shape[1]}')

### 4. Entrenamiento y Evaluación de Clasificadores Clásicos
Probamos SVM, Random Forest, k-NN y Regresión Logística con Validación Cruzada.

In [None]:
# Classical models to compare. probability=True makes SVC expose predict_proba,
# which the ROC section needs.
classifiers = {
    'SVM (Linear)': SVC(kernel='linear', probability=True, random_state=42),
    'SVM (RBF)': SVC(kernel='rbf', probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'k-NN': KNeighborsClassifier(n_neighbors=5),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

# Per-model test metrics plus raw predictions/probabilities for later plots.
results = {}

# The splitter is seeded, so a single instance gives every classifier the
# same five stratified folds.
# NOTE(review): X_pca was scaled and projected using all train+val samples
# before this cross-validation, so the CV scores are slightly optimistic;
# wrap scaler + PCA + classifier in a Pipeline if unbiased CV is required.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, clf in classifiers.items():
    print(f'\nEntrenando {name}...')
    # Cross-validated F1 on the combined train+val features.
    cv_scores = cross_val_score(clf, X_pca, y_full, cv=cv, scoring='f1')
    print(f'  CV F1-Score: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})')

    # Refit on the full train+val set before touching the test set.
    clf.fit(X_pca, y_full)

    # Predict on Test; column 1 of predict_proba is the positive class (label 1).
    y_pred = clf.predict(X_test_pca)
    y_prob = clf.predict_proba(X_test_pca)[:, 1] if hasattr(clf, 'predict_proba') else None

    # Test-set metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1,
        'y_pred': y_pred,
        'y_prob': y_prob
    }

    print(f'  Test F1-Score: {f1:.4f}')

    # Feature importance (only available for the Random Forest).
    if name == 'Random Forest':
        importances = clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        # PCA may keep fewer than 20 components; cap the bar count so the
        # bar positions and heights always have matching lengths.
        top_k = min(20, len(importances))
        plt.figure(figsize=(10, 4))
        plt.title(f'Feature Importances (Top {top_k} PCA Components)')
        plt.bar(range(top_k), importances[indices[:top_k]], align='center')
        plt.xticks(range(top_k), indices[:top_k])
        plt.xlim([-1, top_k])
        plt.show()

### 5. Convolutional Neural Network (CNN)
Entrenamos una CNN básica directamente sobre las imágenes preprocesadas (escala de grises, 224x224, CLAHE), sin extraer descriptores manuales.

In [None]:
print('Preparando datos para CNN...')
# Seed TensorFlow's global RNG, matching the random_state=42 used for the
# classical models. NOTE(review): batch shuffling and some GPU kernels may
# still cause small run-to-run differences.
tf.random.set_seed(42)

# Reshape for CNN: (N, 224, 224, 1), scaled to [0, 1]. Cast to float32 first:
# dividing the uint8 images by 255.0 directly would allocate float64 arrays,
# doubling memory for no benefit.
X_train_cnn = X_train_raw.reshape(-1, 224, 224, 1).astype('float32') / 255.0
X_val_cnn = X_val_raw.reshape(-1, 224, 224, 1).astype('float32') / 255.0
X_test_cnn = X_test_raw.reshape(-1, 224, 224, 1).astype('float32') / 255.0

# Three convolutional stages followed by a small dense head; the single
# sigmoid unit outputs the probability of label 1.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

print('Entrenando CNN...')
# The validation split is only monitored during training here; unlike the
# classical models, the CNN is fitted on the train split alone.
history = model.fit(X_train_cnn, y_train, epochs=10,
                    validation_data=(X_val_cnn, y_val), batch_size=32)

# Evaluate CNN: threshold the sigmoid output at 0.5 to get hard labels.
y_prob_cnn = model.predict(X_test_cnn)
y_pred_cnn = (y_prob_cnn > 0.5).astype(int).flatten()

# Stored under the same keys as the classical models so the comparison
# cells can treat every entry alike.
results['CNN'] = {
    'Accuracy': accuracy_score(y_test, y_pred_cnn),
    'Precision': precision_score(y_test, y_pred_cnn),
    'Recall': recall_score(y_test, y_pred_cnn),
    'F1-Score': f1_score(y_test, y_pred_cnn),
    'y_pred': y_pred_cnn,
    'y_prob': y_prob_cnn.flatten()
}

### 6. Comparación de Resultados
Visualizamos las métricas y curvas ROC de todos los modelos.

In [None]:
# Compare Metrics
# `results` mixes scalar metrics with prediction arrays, so the DataFrame is
# built with object dtype. Cast the selected metric columns to float so that
# the heatmap receives numeric data.
metrics_df = (
    pd.DataFrame(results)
    .T[['Accuracy', 'Precision', 'Recall', 'F1-Score']]
    .astype(float)
)
print(metrics_df)

plt.figure(figsize=(12, 6))
sns.heatmap(metrics_df, annot=True, cmap='viridis', fmt='.4f')
plt.title('Comparación de Modelos')
plt.show()

# ROC Curves: one curve per model that provides positive-class probabilities.
plt.figure(figsize=(10, 8))
for name, res in results.items():
    if res['y_prob'] is not None:
        fpr, tpr, _ = roc_curve(y_test, res['y_prob'])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Confusion matrices, one panel per model.
# Size the grid from the number of models instead of a fixed 2x3 layout, so
# no model is silently dropped when more than six are present.
n_cols = 3
n_rows = max(1, -(-len(results) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows))
axes = axes.flatten()

for ax, (name, res) in zip(axes, results.items()):
    cm = confusion_matrix(y_test, res['y_pred'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix: {name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

# Hide any panels left over when the model count does not fill the grid.
for ax in axes[len(results):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()