# Baseline básico (Dummy + kNN)

Este notebook entrena dos baselines:
- **DummyClassifier (majority class)**
- **kNN (k=5)** con escalado

Los resultados se guardan en `logs/metrics_baseline.txt`; la última celda muestra el contenido de ese archivo si ya existe.


In [None]:
import os, logging
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

# Minimal logging setup.
# basicConfig opens the log file immediately, so the target directory must
# exist beforehand; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("../logs", exist_ok=True)
logging.basicConfig(
    filename="../logs/pipeline.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def load_data():
    """Load the customers dataset, preferring the processed CSV.

    Looks for ``../data/processed/customers_clean.csv`` first and reads it
    as-is. When that file is missing, falls back to
    ``../data/raw/customers.csv``, drops the rows whose ``edad`` value is
    missing and casts ``edad`` to ``int``.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If neither CSV file exists.
    """
    clean_csv = Path("../data/processed/customers_clean.csv")
    if clean_csv.exists():
        logging.info("Notebook: cargando dataset procesado (customers_clean.csv)")
        return pd.read_csv(clean_csv)

    raw_csv = Path("../data/raw/customers.csv")
    if not raw_csv.exists():
        raise FileNotFoundError("No se encontró data/processed/customers_clean.csv ni data/raw/customers.csv")

    logging.info("Notebook: cargando dataset crudo (customers.csv) y aplicando preprocesamiento mínimo")
    customers = pd.read_csv(raw_csv)
    # Rows without an age cannot be cast to int, so they are dropped first.
    customers = customers.dropna(subset=["edad"]).copy()
    customers["edad"] = customers["edad"].astype(int)
    return customers

# Load the dataset once; the following cells read it through `df`.
df = load_data()
# Last expression of the cell: the notebook displays the first rows.
df.head()

In [None]:
# Feature matrix (age and amount) and target vector (fraud label).
feature_columns = ["edad", "monto"]
X = df[feature_columns].values
y = df["fraude"].values

# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Dummy baseline: always predicts the most frequent class seen in training.
# fit() returns the estimator itself, so construction and fitting are chained.
dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)

# kNN baseline on standardized features.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test split are used for scaling.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_std, y_train)
y_pred_knn = knn.predict(X_test_std)

# Evaluate both baselines on the held-out split.
# classification_report and confusion_matrix are already imported in the
# first cell, so they are not re-imported here.
# zero_division=0: the majority-class dummy never predicts the other class,
# which makes that class's precision undefined; setting the value explicitly
# reports it as 0 (same number as the default) without UndefinedMetricWarning.
rep_dummy = classification_report(y_test, y_pred_dummy, digits=4, zero_division=0)
rep_knn = classification_report(y_test, y_pred_knn, digits=4, zero_division=0)
# Plain nested lists print and serialize more compactly than numpy arrays.
cm_dummy = confusion_matrix(y_test, y_pred_dummy).tolist()
cm_knn = confusion_matrix(y_test, y_pred_knn).tolist()

# Show each model's report followed by its confusion matrix.
# The kNN heading starts with a newline to leave a blank line between models.
for heading, report_text, matrix in (
    ("=== Dummy (majority class) ===", rep_dummy, cm_dummy),
    ("\n=== kNN (k=5) ===", rep_knn, cm_knn),
):
    print(heading)
    print(report_text)
    print("Confusion matrix:", matrix)

# Save the reports to logs/metrics_baseline.txt (overwriting any previous run).
os.makedirs("../logs", exist_ok=True)
metrics_sections = [
    "=== Dummy (majority class) ===\n",
    rep_dummy + "\n",
    f"Confusion matrix: {cm_dummy}\n\n",
    "=== kNN (k=5) ===\n",
    rep_knn + "\n",
    f"Confusion matrix: {cm_knn}\n",
]
with open("../logs/metrics_baseline.txt", "w", encoding="utf-8") as metrics_file:
    metrics_file.writelines(metrics_sections)

## Vista rápida del archivo `logs/metrics_baseline.txt` (si existe)

In [None]:
# Print the saved metrics file, or a hint when it has not been generated yet.
path = "../logs/metrics_baseline.txt"
if not os.path.exists(path):
    print("Aún no existe metrics_baseline.txt. Ejecuta las celdas anteriores primero.")
else:
    with open(path, "r", encoding="utf-8") as metrics_file:
        print(metrics_file.read())