In [1]:
# Importación de librerías
import pandas as pd
from pathlib import Path
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    ConfusionMatrixDisplay, roc_auc_score,
    roc_curve, auc
)
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


In [2]:
# Locate the project root relative to the notebook's working directory
# (two levels up) and build the path to the raw CSV.
current_dir = Path.cwd()
project_root = current_dir.parent.parent
data_path = project_root / "data" / "raw" / "heart-disease-dataset.csv"

# Load the raw data and discard the columns this analysis does not use.
unused_columns = ['exercise_angina', 'oldpeak', 'st_slope']
heart = pd.read_csv(data_path).drop(columns=unused_columns)

heart.head()

Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,target
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0
4,54.0,1.0,3.0,150.0,195.0,0.0,0.0,122.0,0.0


In [3]:
# Number of distinct values per column (used below to tell categorical
# columns apart from continuous ones).
heart.nunique()

age                     50
sex                      2
chest_pain_type          4
resting_bp_s            67
cholesterol            222
fasting_blood_sugar      2
resting_ecg              3
max_heart_rate         119
target                   2
dtype: int64

In [4]:
# Percentage distribution of every low-cardinality column
# (columns with at most 10 distinct values are treated as categorical).
low_cardinality_cols = [name for name in heart.columns if heart[name].nunique() <= 10]
for col in low_cardinality_cols:
    share_pct = heart[col].value_counts(normalize=True).round(3) * 100
    print(f"\n📊 Distribución en '{col}':")
    print(share_pct)


📊 Distribución en 'sex':
sex
1.0    76.4
0.0    23.6
Name: proportion, dtype: float64

📊 Distribución en 'chest_pain_type':
chest_pain_type
4.0    52.5
3.0    23.8
2.0    18.2
1.0     5.5
Name: proportion, dtype: float64

📊 Distribución en 'fasting_blood_sugar':
fasting_blood_sugar
0.0    78.7
1.0    21.3
Name: proportion, dtype: float64

📊 Distribución en 'resting_ecg':
resting_ecg
0.0    57.5
2.0    27.3
1.0    15.2
Name: proportion, dtype: float64

📊 Distribución en 'target':
target
1.0    52.9
0.0    47.1
Name: proportion, dtype: float64


#### Análisis de columnas categóricas

| Variable              | Distribución destacada                      | Observación                                            |
| --------------------- | ------------------------------------------- | ------------------------------------------------------ |
| `sex`                 | 76.4% hombres, 23.6% mujeres                | 🔸 Desbalance moderado (3:1). Puede afectar al modelo. |
| `chest_pain_type`     | Muy dominada por clase 4.0 (52.5%)          | 🔸 Clase 1.0 (5.5%) tiene **muy pocos casos**.         |
| `fasting_blood_sugar` | 78.7% bajo (0), 21.3% alto (1)              | 🔹 Desbalance moderado, pero aceptable.                |
| `resting_ecg`         | Distribución variada: 57.5% / 27.3% / 15.2% | ✅ Sin problemas graves.                                |
| `target`              | 52.9% positivos, 47.1% negativos            | ✅ Bien balanceado.                                     |


In [5]:
# Percentage split of `target` among rows with sex == 0 (women)
heart.loc[heart['sex'] == 0, 'target'].value_counts(normalize=True).round(3) * 100

target
0.0    75.1
1.0    24.9
Name: proportion, dtype: float64

In [6]:
# Percentage split of `target` among rows with sex == 1 (men)
heart.loc[heart['sex'] == 1, 'target'].value_counts(normalize=True).round(3) * 100


target
1.0    61.5
0.0    38.5
Name: proportion, dtype: float64

In [7]:
# (rows, columns) of the dataset after dropping the unused columns
heart.shape

(1190, 9)

In [8]:
# Absolute number of rows per value of `sex`
heart['sex'].value_counts()

sex
1.0    909
0.0    281
Name: count, dtype: int64

#### CSV solo de sex == 0 (mujer) para generar datos sintéticos en MOSTLY.AI

In [5]:
# Keep only the rows with sex == 0 (the female subset exported below)
heart_females = heart[heart['sex'] == 0]

In [6]:
# Female rows whose target is 1.
# NOTE(review): this variable is not referenced again in the visible cells —
# confirm it is still needed.
heart_females_positive = heart_females[heart_females['target'] == 1]

In [7]:
# Make sure the 'data/synthetic' output folder exists
output_path = project_root / "data" / "synthetic"
output_path.mkdir(parents=True, exist_ok=True)

# Save the female-only subset; this CSV is the input for the synthetic data
# generation step.
heart_females.to_csv(output_path / "heart_females.csv", index=False)
print("✅ Archivo guardado en: data/synthetic/heart_females.csv")

# Keep a second copy one directory above the working directory.
# Built with pathlib instead of the literal "..\heart_females.csv": "\h" is an
# invalid escape sequence, and a backslash is only a path separator on Windows
# (elsewhere it would create a file literally named "..\heart_females.csv").
heart_females.to_csv(Path("..") / "heart_females.csv", index=False)

✅ Archivo guardado en: data/synthetic/heart_females.csv


In [8]:
# Reload the female-only subset from the CSV written to data/synthetic.
# Note: this rebinds both `data_path` and `heart_females`.
data_path = project_root / "data" / "synthetic" / "heart_females.csv"

heart_females = pd.read_csv(data_path)


In [9]:
# Display the reloaded female subset (pandas truncates long frames in the output)
heart_females

Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,target
0,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,1.0
1,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0
2,45.0,0.0,2.0,130.0,237.0,0.0,0.0,170.0,0.0
3,48.0,0.0,2.0,120.0,284.0,0.0,0.0,120.0,0.0
4,37.0,0.0,3.0,130.0,211.0,0.0,0.0,142.0,0.0
...,...,...,...,...,...,...,...,...,...
276,58.0,0.0,4.0,170.0,225.0,1.0,2.0,146.0,1.0
277,55.0,0.0,2.0,132.0,342.0,0.0,0.0,166.0,0.0
278,63.0,0.0,4.0,124.0,197.0,0.0,0.0,136.0,1.0
279,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0


_

### Comprobación de datos sintéticos generados

In [10]:
# Load the synthetic female records from data/synthetic
data_path = project_root / "data" / "synthetic" / "heart_females_synthetic.csv"
synthetic = pd.read_csv(data_path)

# Sanity check: list the distinct `sex` values present in the synthetic data
synthetic_sex_values = synthetic['sex'].unique()
print("Valores únicos en 'sex' (sintético):", synthetic_sex_values)


Valores únicos en 'sex' (sintético): [0.]


In [11]:
# Percentage split of `target` in the synthetic data
synthetic_target_pct = synthetic['target'].value_counts(normalize=True).round(3) * 100
print("\n📊 Distribución de target (sintético):")
print(synthetic_target_pct)



📊 Distribución de target (sintético):
target
0.0    74.8
1.0    25.2
Name: proportion, dtype: float64


In [12]:
# Column dtypes and non-null counts of the synthetic data as loaded
synthetic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  500 non-null    int64  
 1   sex                  500 non-null    float64
 2   chest_pain_type      500 non-null    float64
 3   resting_bp_s         500 non-null    int64  
 4   cholesterol          500 non-null    int64  
 5   fasting_blood_sugar  500 non-null    float64
 6   resting_ecg          500 non-null    float64
 7   max_heart_rate       500 non-null    int64  
 8   target               500 non-null    float64
dtypes: float64(5), int64(4)
memory usage: 35.3 KB


In [13]:
# Column dtypes and non-null counts of the original data, for comparison
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   float64
 1   sex                  1190 non-null   float64
 2   chest_pain_type      1190 non-null   float64
 3   resting_bp_s         1190 non-null   float64
 4   cholesterol          1190 non-null   float64
 5   fasting_blood_sugar  1190 non-null   float64
 6   resting_ecg          1190 non-null   float64
 7   max_heart_rate       1190 non-null   float64
 8   target               1190 non-null   float64
dtypes: float64(9)
memory usage: 83.8 KB


In [14]:
# Cast every synthetic column to float64 so the dtypes match the original
# dataset before the two frames are concatenated.
synthetic = synthetic.astype('float64')

In [15]:
# Re-check dtypes after the float64 cast
synthetic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  500 non-null    float64
 1   sex                  500 non-null    float64
 2   chest_pain_type      500 non-null    float64
 3   resting_bp_s         500 non-null    float64
 4   cholesterol          500 non-null    float64
 5   fasting_blood_sugar  500 non-null    float64
 6   resting_ecg          500 non-null    float64
 7   max_heart_rate       500 non-null    float64
 8   target               500 non-null    float64
dtypes: float64(9)
memory usage: 35.3 KB


In [16]:
# Stack the original rows and the synthetic rows into one frame with a fresh index
heart_augmented = pd.concat([heart, synthetic], ignore_index=True)

# Report the shape of each piece and of the combined result
print("\n✅ Dataset combinado:")
shape_report = (
    ("Original:", heart),
    ("Sintético:", synthetic),
    ("Total:", heart_augmented),
)
for label, frame in shape_report:
    print(label, frame.shape)


✅ Dataset combinado:
Original: (1190, 9)
Sintético: (500, 9)
Total: (1690, 9)


In [17]:
# Number of distinct values per column in the combined dataset
heart_augmented.nunique()

age                     50
sex                      2
chest_pain_type          4
resting_bp_s            81
cholesterol            307
fasting_blood_sugar      2
resting_ecg              3
max_heart_rate         120
target                   2
dtype: int64

In [18]:
# Make sure the 'data/synthetic' output folder exists
output_path = project_root / "data" / "synthetic"
output_path.mkdir(parents=True, exist_ok=True)

# Save the combined (original + synthetic) dataset
heart_augmented.to_csv(output_path / "heart-disease-dataset_augmented.csv", index=False)
print("✅ Archivo guardado en: data/synthetic/heart-disease-dataset_augmented.csv")

# Keep a second copy one directory above the working directory.
# Fixes two defects of the previous version: it wrote `heart_females` (not the
# combined frame) under the augmented file name, and it used the literal
# "..\heart-disease-dataset_augmented.csv", whose "\h" is an invalid escape
# sequence and whose backslash is only a path separator on Windows.
heart_augmented.to_csv(Path("..") / "heart-disease-dataset_augmented.csv", index=False)

✅ Archivo guardado en: data/synthetic/heart-disease-dataset_augmented.csv


In [19]:
# Percentage distribution of every low-cardinality column of the combined
# dataset (columns with at most 10 distinct values are treated as categorical).
augmented_low_card_cols = [
    name for name in heart_augmented.columns if heart_augmented[name].nunique() <= 10
]
for col in augmented_low_card_cols:
    share_pct = heart_augmented[col].value_counts(normalize=True).round(3) * 100
    print(f"\n📊 Distribución en '{col}':")
    print(share_pct)


📊 Distribución en 'sex':
sex
1.0    53.8
0.0    46.2
Name: proportion, dtype: float64

📊 Distribución en 'chest_pain_type':
chest_pain_type
4.0    47.9
3.0    24.6
2.0    20.5
1.0     7.0
Name: proportion, dtype: float64

📊 Distribución en 'fasting_blood_sugar':
fasting_blood_sugar
0.0    81.9
1.0    18.1
Name: proportion, dtype: float64

📊 Distribución en 'resting_ecg':
resting_ecg
0.0    55.0
2.0    29.9
1.0    15.0
Name: proportion, dtype: float64

📊 Distribución en 'target':
target
0.0    55.3
1.0    44.7
Name: proportion, dtype: float64


In [20]:
# Same percentage breakdown for the original dataset, printed again so it can
# be compared side by side with the combined dataset's distribution.
original_low_card_cols = [name for name in heart.columns if heart[name].nunique() <= 10]
for col in original_low_card_cols:
    share_pct = heart[col].value_counts(normalize=True).round(3) * 100
    print(f"\n📊 Distribución en '{col}':")
    print(share_pct)


📊 Distribución en 'sex':
sex
1.0    76.4
0.0    23.6
Name: proportion, dtype: float64

📊 Distribución en 'chest_pain_type':
chest_pain_type
4.0    52.5
3.0    23.8
2.0    18.2
1.0     5.5
Name: proportion, dtype: float64

📊 Distribución en 'fasting_blood_sugar':
fasting_blood_sugar
0.0    78.7
1.0    21.3
Name: proportion, dtype: float64

📊 Distribución en 'resting_ecg':
resting_ecg
0.0    57.5
2.0    27.3
1.0    15.2
Name: proportion, dtype: float64

📊 Distribución en 'target':
target
1.0    52.9
0.0    47.1
Name: proportion, dtype: float64
