# Práctica 1: Aprendizaje Supervisado

Ejecución desde cero siguiendo `01-aprendizaje-supervisado.md`, usando el dataset de práctica 0 y el mismo preprocesado.


## 1. Imports y carga de datos


In [18]:
from pathlib import Path
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Anchor every path on the resolved current working directory.
BASE_DIR = Path('.').resolve()

# 'images' directory under the working directory; created up front, and the
# call is a no-op when the directory already exists.
IMAGES_DIR = BASE_DIR.joinpath('images')
IMAGES_DIR.mkdir(parents=True, exist_ok=True)

# The CSV is read from the sibling 'practica_0/data' folder.
DATA_PATH = BASE_DIR.parent.joinpath(
    'practica_0', 'data', 'ObesityDataSet_raw_and_data_sinthetic.csv'
)

df = pd.read_csv(DATA_PATH)
print('Shape original:', df.shape)


Shape original: (2111, 17)


## 2. Transformaciones de datos

Bloque único con las transformaciones de práctica 0 (solo transformaciones, sin imágenes).


In [None]:
# Data transformations (taken from practice 0).
target_col = 'NObeyesdad'
# Captured before any encoding so that the post-split Z-score filter only
# looks at the columns that were numeric in the raw data.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()


def _iqr_bounds(series, k=1.5):
    """Return the (lower, upper) fences of *series* at k times its IQR."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr


# 1) Remove Weight and Height outliers (IQR rule).
# The fences of every column are computed on the same unfiltered frame and the
# per-column masks are AND-ed, so a row is kept only if it is an inlier in all
# of the listed columns.
cols_to_clean = ['Weight', 'Height']
mask_clean = pd.Series(True, index=df.index)
for col in cols_to_clean:
    lower, upper = _iqr_bounds(df[col])
    mask_clean &= (df[col] >= lower) & (df[col] <= upper)
df = df.loc[mask_clean].reset_index(drop=True)

# 2) Box-Cox transform on Age + outlier removal after the transform.
# With no lambda supplied, stats.boxcox returns the transformed values together
# with the fitted lambda; Age is shifted by 1 before the transform
# (presumably to guarantee strictly positive input - confirm against practice 0).
df['Age'], lambda_age = stats.boxcox(df['Age'] + 1)
lower, upper = _iqr_bounds(df['Age'])
df = df[(df['Age'] >= lower) & (df['Age'] <= upper)].reset_index(drop=True)

# 3) Split features/target and encode the target as integer labels.
X = df.drop(columns=[target_col]).copy()
y = df[target_col].copy()
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)
# String class names, in the order of the encoded labels.
class_names = [str(c) for c in le_target.classes_]

# 4) Encode the categorical features.
# Columns in binary_cols are label-encoded in place (skipped if absent);
# the multi-category columns are one-hot encoded, dropping the first level.
X_processed = X.copy()
binary_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
for col in binary_cols:
    if col in X_processed.columns:
        le = LabelEncoder()
        X_processed[col] = le.fit_transform(X_processed[col])
multi_cat_cols = ['CAEC', 'CALC', 'MTRANS']
X_processed = pd.get_dummies(X_processed, columns=multi_cat_cols, drop_first=True)

# 5) Stratified train-test split (80/20, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

# 6) Post-split Z-score outlier filter on train only (originally numeric
#    variables). The scaler is fitted on the training rows and the test set is
#    left untouched; a training row is kept only if every numeric feature has
#    |z| <= 3. The same boolean mask is applied to y_train to keep it aligned.
scaler_temp = StandardScaler()
X_train_z = scaler_temp.fit_transform(X_train[numeric_cols])
mask_inliers = (np.abs(X_train_z) <= 3).all(axis=1)
X_train = X_train.loc[mask_inliers].reset_index(drop=True)
y_train = y_train[mask_inliers]

print('Shape transformado:', df.shape)
print('Shape X procesado:', X_processed.shape)
print('Train/Test:', X_train.shape, X_test.shape)
# Report the fitted Box-Cox lambda: it is required to apply or invert the same
# Age transform later, and was previously computed but never shown.
print(f'Lambda Age Box-Cox: {lambda_age:.6f}')


Shape transformado: (2107, 17)
Shape X procesado: (2107, 23)
Train/Test: (1685, 23) (422, 23)
Lambda Age Box-Cox: -1.641515
