In [6]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.model_selection import train_test_split

In [2]:
# Load the processed dataset into `df` from a hardcoded absolute workspace path.
# NOTE(review): this path only resolves inside that specific workspace layout —
# consider a configurable or repository-relative path.
df = pd.read_csv("/workspaces/Final_Project_DataScient/data/processed/datafinal.csv")
# Bare last expression so the notebook renders the frame.
# NOTE(review): the rendered output shows an "Unnamed: 0" column, presumably the
# index that was saved with the CSV rather than a real variable — confirm.
df

Unnamed: 0,HHADULT,SEXVAR,MEDCOST1,SLEPTIM1,CVDSTRK3,ADDEPEV3,DIABETE4,MARITAL,RENTHOM1,VETERAN3,...,_ASTHMS1,_DRDXAR2,_AGEG5YR,_BMI5CAT,_CHLDCNT,_EDUCAG,_INCOMG1,_RFBING6,_AIDTST4,tobacco_use
0,2,1.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,...,3.0,0.0,8.0,1.0,0.0,3.0,4.0,0.0,0.0,0.0
1,4,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,8.0,2.0,0.0,3.0,-1.0,0.0,0.0,0.0
2,2,1.0,0.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,...,3.0,1.0,6.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0
3,2,0.0,0.0,6.0,0.0,0.0,3.0,0.0,0.0,0.0,...,3.0,0.0,6.0,3.0,0.0,2.0,5.0,0.0,0.0,0.0
4,2,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,8.0,3.0,0.0,2.0,3.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109974,1,0.0,0.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,...,3.0,1.0,8.0,2.0,0.0,2.0,5.0,1.0,0.0,1.0
109975,4,0.0,0.0,7.0,1.0,0.0,3.0,0.0,1.0,0.0,...,3.0,1.0,9.0,2.0,0.0,3.0,5.0,0.0,1.0,0.0
109976,2,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,6.0,2.0,1.0,3.0,4.0,0.0,1.0,0.0
109977,1,1.0,1.0,5.0,0.0,0.0,0.0,0.0,2.0,0.0,...,3.0,0.0,9.0,0.0,0.0,1.0,-1.0,-1.0,0.0,2.0


In [3]:
# Relative frequency of each value in ADDEPEV3 (normalize=True returns
# proportions instead of raw counts), used to gauge class balance.
df["ADDEPEV3"].value_counts(normalize=True)

ADDEPEV3
0.0    0.793233
1.0    0.206767
Name: proportion, dtype: float64

# 🏁 Modelado Predictivo

## 🎯 Introducción
Este notebook tiene como objetivo construir y evaluar un modelo de aprendizaje automático para **predecir la variable objetivo** (`ADDEPEV3`) a partir de un conjunto de datos inicial. El dataset utilizado presenta desafíos típicos en ciencia de datos, como:

- **Clases desbalanceadas** (aproximadamente 79 % de la clase `0` frente a 21 % de la clase `1` en `ADDEPEV3`), lo que exige evaluar diferentes estrategias de balanceo.
- Gran cantidad de **características**, para las que es clave evaluar métodos de selección y reducción de variables.

A lo largo del notebook se realizan pruebas para:

- Comparar diferentes modelos base (Logistic Regression, Random Forest, XGBoost, entre otros).
- Evaluar distintos enfoques para imputación de nulos (`NaN → -1` vs eliminación de registros).
- Aplicar diferentes estrategias de balance de clases (undersampling, oversampling, class weighting).
- Analizar el impacto de diversos métodos de selección de características (`SelectKBest`, importancia de características en Random Forest/XGBoost).
- Identificar y justificar el modelo final seleccionado para la tarea.

## 🔍 Resultado Esperado
Al finalizar, se espera contar con:

✅ Una comparación clara de rendimientos para cada estrategia implementada.  
✅ El modelo final seleccionado junto con su justificación.  


In [5]:
def show_metrics(y_test, y_pred_test, y_train, y_pred_train):
    """Print a side-by-side table of classification metrics for test and train.

    Accuracy, F1, precision and recall are each computed on the test pair
    and then on the train pair, so the two columns can be compared to spot
    over- or under-fitting. Every scikit-learn metric is called with its
    default arguments.

    Args:
        y_test: True labels of the test split.
        y_pred_test: Predicted labels for the test split.
        y_train: True labels of the train split.
        y_pred_train: Predicted labels for the train split.

    Returns:
        None. The table is written to standard output.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("F1 Score", f1_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    )

    # Compute every score before printing anything, so a failing metric
    # never leaves a half-printed table behind.
    table = [
        (label, scorer(y_test, y_pred_test), scorer(y_train, y_pred_train))
        for label, scorer in scorers
    ]

    heavy_rule = "=" * 50
    light_rule = "-" * 50

    print("\nRESULTADOS DEL MODELO")
    print(heavy_rule)
    print(f"{'Métrica':<15} {'Test':<10} {'Train':<10}")
    print(light_rule)
    for label, test_value, train_value in table:
        print(f"{label:<15} {test_value:<10.2f} {train_value:<10.2f}")
    print(heavy_rule)

## ⚡️ Fase 1: Pruebas Iniciales con el Dataset Original

### 🎯 Descripción
En esta primera fase, trabajamos con la **data tal cual está**:
- Sin eliminar variables.
- Conservando los valores "faltantes" representados como `-1`.
- Sin aplicar ninguna técnica de imputación o balanceo de clases.

### 🔍 Objetivo
Obtener una línea base para evaluar:
- El rendimiento inicial de diferentes modelos.
- La importancia de procesar o no los datos antes de continuar.

### 🛠️ Modelos Evaluados
- Logistic Regression
- Random Forest
- XGBoost
- Otros modelos base relevantes.

### 📊 Resultados
*(Se incluirán métricas alcanzadas por cada modelo para establecer una línea base.)*


In [None]:
# Split the data into train and test sets.
# "Unnamed: 0" appears to be the row counter pandas creates when a CSV is
# saved with its index and read back without index_col; it is not a survey
# variable, so it is kept out of the feature matrix. errors="ignore" keeps
# this cell working if the CSV is later re-exported without that column.
X = df.drop(columns="ADDEPEV3").drop(columns="Unnamed: 0", errors="ignore")
y = df["ADDEPEV3"]

# Stratify on y so both splits keep the target's class proportions; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# ESCALADO DE VARIABLES PARA MODELOS LINEALES O BASADOS EN DISTANCIAS
