# CAPÍTULO 1: CONFIGURACIÓN Y PREPARACIÓN


In [None]:
# Import the shared project configuration.
# The parent directory is appended to the module search path so that the
# `config` module can be found, then all of its public names are pulled into
# this notebook's namespace.
# NOTE(review): RAW_DATA_DIR, used later to build DATA_PATH, is not defined in
# any visible cell, so it presumably comes from this wildcard import —
# confirm against config.py.
import sys
sys.path.append('..')
from config import *


<a id='11-imports'></a>
## 1.1 Importación de Librerías

En esta sección se importan todas las librerías necesarias para el desarrollo del proyecto, organizadas por categorías funcionales para facilitar su comprensión y mantenimiento.


In [1]:
# ========================================================================================
# LIBRARY IMPORTS
# ========================================================================================
# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Class-imbalance handling
from imblearn.over_sampling import SMOTE

# Feature selection
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Metrics
from sklearn.metrics import (
accuracy_score,precision_score,recall_score,confusion_matrix,f1_score,roc_auc_score,roc_curve,
classification_report
)

# Hyperparameter optimization
from sklearn.model_selection import RandomizedSearchCV
import optuna

# Interpretability
import shap

# Utilities
from tqdm import tqdm
import warnings
import joblib
from datetime import datetime
# Silences every warning category from this point on.
# NOTE(review): this also hides deprecation and convergence warnings —
# confirm that a blanket filter is intended.
warnings.filterwarnings("ignore")

print("Librerias importadas correctamente")

Librerias importadas correctamente


<a id='12-setup'></a>
## 1.2 Configuración del Entorno
Se establecen los parámetros de configuración necesarios para garantizar la reproducibilidad del proyecto y optimizar la visualización de resultados.

In [2]:
# ============================================================================
# ENVIRONMENT CONFIGURATION
# ============================================================================

# ----------------------------------------------------------------------------
# 1. REPRODUCIBILITY SEED
# ----------------------------------------------------------------------------
import random

# Single seed value reused throughout the notebook.
RANDOM_STATE = 42

# Seed the two global generators this cell controls: Python's `random`
# module and NumPy's global generator. No other library is seeded here.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

print(f"✓ Semilla de reproducibilidad establecida: {RANDOM_STATE}")

# ----------------------------------------------------------------------------
# 2. PLOTTING CONFIGURATION
# ----------------------------------------------------------------------------

# Figure defaults
FIGURE_SIZE = (12, 6)           # Default figure size
DPI_DISPLAY = 100               # Resolution for on-screen rendering
DPI_SAVE = 300                  # Resolution for saved figures (high, for PDF)

# Base matplotlib style and seaborn context. The palette is set once, in
# section 3 below; an earlier "husl" palette call was removed because it was
# overridden before any figure was drawn.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_context("notebook", font_scale=1.1)

# Explicit overrides, applied after the style/context calls so these values
# are the ones left in rcParams.
plt.rcParams.update({
    'figure.figsize': FIGURE_SIZE,
    'figure.dpi': DPI_DISPLAY,
    'savefig.dpi': DPI_SAVE,
    'font.size': 10,
    'axes.labelsize': 11,
    'axes.titlesize': 13,
    'legend.fontsize': 10,
})

print("✓ Configuración de visualización aplicada")

# ----------------------------------------------------------------------------
# 3. CUSTOM COLOR PALETTE
# ----------------------------------------------------------------------------
COLOR_PALETTE = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#6A994E']
sns.set_palette(COLOR_PALETTE)

print("✓ Paleta de colores definida")

# ----------------------------------------------------------------------------
# 4. WORKING DIRECTORIES
# ----------------------------------------------------------------------------
import os


def ensure_directory(path):
    """Create *path* if it is missing and print which case applied.

    Args:
        path: Directory to create; missing parent directories are created too.

    Returns:
        True if the directory did not exist before this call, False if it
        was already present.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    already_present = os.path.isdir(path)
    # exist_ok=True keeps the call safe if the directory appears between the
    # check above and the creation, and still raises when a regular file
    # occupies the path.
    os.makedirs(path, exist_ok=True)
    if already_present:
        print(f"✓ Directorio '{path}' ya existe")
    else:
        print(f"✓ Directorio '{path}' creado")
    return not already_present


# Directory for generated outputs (relative to the working directory)
OUTPUT_DIR = 'outputs'
ensure_directory(OUTPUT_DIR)

# Directory for saved models (relative to the working directory)
MODELS_DIR = 'models'
ensure_directory(MODELS_DIR)

# ----------------------------------------------------------------------------
# 5. PROJECT-WIDE PARAMETERS
# ----------------------------------------------------------------------------

# Data split
TEST_SIZE = 0.2                 # 80% train, 20% test
VALIDATION_FOLDS = 5            # Folds for cross-validation

# Hyperparameter optimization
N_ITER_RANDOM_SEARCH = 50       # Iterations for RandomizedSearchCV
CV_FOLDS = 3                    # Folds for CV during tuning (fewer, to run faster)

# Model configuration
N_JOBS = -1                     # Use all available cores

print("✓ Parámetros globales configurados")

# ----------------------------------------------------------------------------
# 6. ENVIRONMENT SUMMARY
# ----------------------------------------------------------------------------
# Prints a summary of the run: timestamp, interpreter version, working
# directory and the global parameters defined earlier in this cell.
import sys
from datetime import datetime

print("\n" + "="*80)
print("INFORMACIÓN DEL ENTORNO")
print("="*80)
# datetime.now() without a tz argument gives naive local time.
print(f"Fecha y hora de ejecución: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Python version: {sys.version.split()[0]}")
print(f"Working directory: {os.getcwd()}")
print(f"Random state: {RANDOM_STATE}")
print(f"Test size: {TEST_SIZE}")
# Two fold counts exist; report both so "CV folds" is not mistaken for the
# tuning value.
print(f"CV folds: {VALIDATION_FOLDS}")
print(f"Tuning CV folds: {CV_FOLDS}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Models directory: {MODELS_DIR}")
print("="*80)
print("\n✅ Configuración del entorno completada\n")

✓ Semilla de reproducibilidad establecida: 42
✓ Configuración de visualización aplicada
✓ Paleta de colores definida
✓ Directorio 'outputs' ya existe
✓ Directorio 'models' ya existe
✓ Parámetros globales configurados

INFORMACIÓN DEL ENTORNO
Fecha y hora de ejecución: 2025-12-14 15:39:07
Python version: 3.11.13
Working directory: /mnt/shared/cursos/michelle/TFM/src
Random state: 42
Test size: 0.2
CV folds: 5
Output directory: outputs
Models directory: models

✅ Configuración del entorno completada



<a id='13-carga'></a>
## 1.3 Carga del Dataset

Se procede a cargar el dataset Heart Disease desde el archivo CSV y realizar una primera inspección de los datos.

In [3]:
# ============================================================================
# DATASET LOADING
# ============================================================================

# ----------------------------------------------------------------------------
# 1. FILE PATH
# ----------------------------------------------------------------------------
# NOTE(review): RAW_DATA_DIR is not defined in this cell; presumably it comes
# from the shared `config` import — confirm.
DATA_PATH = os.path.join(RAW_DATA_DIR, 'heart.csv')

print("Cargando dataset...")
print("="*80)

# ----------------------------------------------------------------------------
# 2. LOAD DATA
# ----------------------------------------------------------------------------
# Only the read itself sits inside `try`; both handlers print a message and
# re-raise, so a failed load still stops the cell.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"❌ Error: No se encontró el archivo '{DATA_PATH}'")
    # Point at the directory that was actually searched. The working
    # directory is only the right hint when RAW_DATA_DIR is empty or '.'.
    print(f"   Verifica que el archivo esté en: {os.path.dirname(os.path.abspath(DATA_PATH))}")
    raise
except Exception as e:
    print(f"❌ Error al cargar el dataset: {e}")
    raise
else:
    print(f"✓ Dataset cargado exitosamente desde: {DATA_PATH}")

# ----------------------------------------------------------------------------
# 3. BACKUP OF THE ORIGINAL DATA
# ----------------------------------------------------------------------------
# Independent copy, so later changes to `df` do not affect the loaded data.
df_original = df.copy()
print("✓ Backup de datos originales creado")

# ----------------------------------------------------------------------------
# 4. BASIC DATASET INFORMATION
# ----------------------------------------------------------------------------
n_rows, n_cols = df.shape
# deep=True also counts the contents of object columns; bytes -> KB.
memory_kb = df.memory_usage(deep=True).sum() / 1024

print("\n" + "="*80)
print("INFORMACIÓN BÁSICA DEL DATASET")
print("="*80)
print(f"Dimensiones del dataset: {n_rows} filas × {n_cols} columnas")
print(f"Número de observaciones: {n_rows}")
print(f"Número de variables: {n_cols}")
print(f"Memoria utilizada: {memory_kb:.2f} KB")
print("="*80)

# ----------------------------------------------------------------------------
# 5. FIRST ROWS OF THE DATASET
# ----------------------------------------------------------------------------
# `display` is not imported in any visible cell; it is the rich-output
# helper that IPython/Jupyter makes available in notebook sessions.
print("\n📊 Primeras 5 filas del dataset:\n")
display(df.head())

# ----------------------------------------------------------------------------
# 6. LAST ROWS OF THE DATASET
# ----------------------------------------------------------------------------
print("\n📊 Últimas 5 filas del dataset:\n")
display(df.tail())

# ----------------------------------------------------------------------------
# 7. RANDOM SAMPLE
# ----------------------------------------------------------------------------
# Seeded with RANDOM_STATE so the same five rows are shown on every run.
print("\n📊 Muestra aleatoria de 5 filas:\n")
display(df.sample(5, random_state=RANDOM_STATE))

# ----------------------------------------------------------------------------
# 8. COLUMN NAMES
# ----------------------------------------------------------------------------
# Numbered list (starting at 1) of every column in the dataframe.
print("\n📋 Variables en el dataset:")
print("-"*80)
for i, col in enumerate(df.columns, 1):
    print(f"{i:2d}. {col}")
print("-"*80)
print(f"Total: {len(df.columns)} variables\n")

print("✅ Carga del dataset completada\n")

Cargando dataset...
✓ Dataset cargado exitosamente desde: heart.csv
✓ Backup de datos originales creado

INFORMACIÓN BÁSICA DEL DATASET
Dimensiones del dataset: 303 filas × 14 columnas
Número de observaciones: 303
Número de variables: 14
Memoria utilizada: 33.27 KB

📊 Primeras 5 filas del dataset:



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0



📊 Últimas 5 filas del dataset:



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,3.0,0



📊 Muestra aleatoria de 5 filas:



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
179,53.0,1.0,3.0,130.0,246.0,1.0,2.0,173.0,0.0,0.0,1.0,3.0,3.0,0
228,54.0,1.0,4.0,110.0,206.0,0.0,2.0,108.0,1.0,0.0,2.0,1.0,3.0,3
111,56.0,1.0,4.0,125.0,249.0,1.0,2.0,144.0,1.0,1.2,2.0,1.0,3.0,1
246,58.0,1.0,4.0,100.0,234.0,0.0,0.0,156.0,0.0,0.1,1.0,1.0,7.0,2
60,51.0,0.0,4.0,130.0,305.0,0.0,0.0,142.0,1.0,1.2,2.0,0.0,7.0,2



📋 Variables en el dataset:
--------------------------------------------------------------------------------
 1. age
 2. sex
 3. cp
 4. trestbps
 5. chol
 6. fbs
 7. restecg
 8. thalach
 9. exang
10. oldpeak
11. slope
12. ca
13. thal
14. num
--------------------------------------------------------------------------------
Total: 14 variables

✅ Carga del dataset completada

