## Dependencias 

In [1]:
import numpy as np 
import pandas as pd 
from tpot import TPOTClassifier
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from functools import reduce 
# Display option: never truncate the column list when a DataFrame is rendered.
pd.set_option('display.max_columns',None)

  import pkg_resources


## Lectura de datos 

In [2]:
# Load the tab-separated heart-disease table into a DataFrame.
data = pd.read_csv(
    '../../data/heart_disease.tab',
    sep='\t',
)
# Cell output: (number of rows, number of columns).
data.shape

(303, 14)

In [3]:
# Peek at five randomly drawn rows.
data.sample(n=5)

Unnamed: 0,diameter narrowing,age,gender,chest pain,rest SBP,cholesterol,fasting blood sugar > 120,rest ECG,max HR,exerc ind ang,ST by exercise,slope peak exc ST,major vessels colored,thal
209,1,62,female,asymptomatic,150,244,0,normal,154,1,1.4,flat,0.0,normal
234,0,54,female,non-anginal,160,201,0,normal,163,0,0.0,upsloping,1.0,normal
148,0,45,male,atypical ang,128,308,0,left vent hypertrophy,170,0,0.0,upsloping,0.0,normal
111,1,56,male,asymptomatic,125,249,1,left vent hypertrophy,144,1,1.2,flat,1.0,normal
162,0,54,female,non-anginal,110,214,0,normal,158,0,1.6,flat,0.0,normal


## Clasificación de variables 

In [4]:
# Continuous predictors (coerced to numeric in the next cell).
varc = [
    'age',
    'rest SBP',
    'cholesterol',
    'max HR',
    'ST by exercise',
    'major vessels colored',
]
# Discrete predictors (one-hot encoded later on).
vard = [
    'gender',
    'chest pain',
    'fasting blood sugar > 120',
    'rest ECG',
    'exerc ind ang',
    'slope peak exc ST',
    'thal',
]
# Target column, kept as a one-element list.
vart = ['diameter narrowing']

In [5]:
# Continuous columns: parse as numbers; anything unparseable becomes NaN.
data[varc] = data[varc].apply(pd.to_numeric, errors='coerce')
# Discrete columns: give missing values their own explicit 'MISSING' level.
data[vard] = data[vard].fillna('MISSING')

## Partición 

In [6]:
# Hold out 20% of the rows for validation. The fixed seed makes the partition
# (and every metric computed from it) identical across kernel restarts.
train, valid = train_test_split(data, test_size=0.2, random_state=42)
# Rebind rather than mutate in place, so re-running the cell is idempotent and
# pandas has no reason to emit copy-related warnings on the split frames.
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
train.shape, valid.shape

((242, 14), (61, 14))

### Matriz de predictoras $\mathcal{X}$

In [7]:
# One-hot encode the discrete predictors; binary features keep a single column.
# handle_unknown='ignore': a level seen only outside the training split is
# encoded as all zeros instead of making oh.transform(...) raise on new data.
oh = OneHotEncoder(
    drop='if_binary',
    sparse_output=False,
    handle_unknown='ignore',
)
# Fit on the training split only, so no category information leaks from validation.
oh.fit(train[vard])
# Names of the generated dummy columns, in the encoder's output order.
varoh = list(oh.get_feature_names_out())
varoh

['gender_male',
 'chest pain_asymptomatic',
 'chest pain_atypical ang',
 'chest pain_non-anginal',
 'chest pain_typical ang',
 'fasting blood sugar > 120_1',
 'rest ECG_ST-T abnormal',
 'rest ECG_left vent hypertrophy',
 'rest ECG_normal',
 'exerc ind ang_1',
 'slope peak exc ST_downsloping',
 'slope peak exc ST_flat',
 'slope peak exc ST_upsloping',
 'thal_MISSING',
 'thal_fixed defect',
 'thal_normal',
 'thal_reversable defect']

In [8]:
# Predictor matrix: continuous columns followed by the one-hot block
# (column order is varc + varoh).
X = train[varc].copy()
X[varoh] = oh.transform(train[vard])
# Median imputation fitted on the training predictors; the fitted imputer `im`
# is reused later to transform the validation set.
im = SimpleImputer(strategy='median')
# Keep the original index so X stays row-aligned with the response taken from `train`.
X = pd.DataFrame(im.fit_transform(X), columns=X.columns, index=X.index)
# Last expression, so the preview is actually displayed.
X.head()

### Vector respuesta $\vec{y}$

In [9]:
# Response vector: the training column named by the first entry of `vart`.
y = train.loc[:, vart[0]]
y.head()

0    1
1    0
2    0
3    1
4    1
Name: diameter narrowing, dtype: int64

## Aprendizaje 

In [10]:
# Full predictor list: continuous names first, then the one-hot column names.
var = [*varc, *varoh]

In [11]:
from joblib import Parallel, delayed

In [12]:
# joblib Parallel object configured with n_jobs=-1.
# NOTE(review): no visible cell references `parallel_backend`; confirm whether it is still needed.
parallel_backend = Parallel(n_jobs=-1)

In [13]:

# TPOT pipeline search with a small budget (5 generations, population of 20).
# random_state pins the search so a re-run explores the same candidates.
# NOTE(review): the saved output of this cell is
# `TimeoutError: No valid workers found`; the parallel setup (n_jobs=-1) is a
# plausible suspect. Confirm against the installed TPOT version.
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
tpot.fit(X, y)

# Retrieve the pipeline found by TPOT
best_pipeline = tpot.fitted_pipeline_

# Optionally, save the pipeline (uncomment to use)
# tpot.export('tpot_best_pipeline.py')

# Names consumed by later cells:
#   best -> ordered list of predictor columns fed to the model
#   mod  -> the fitted pipeline
#   lr   -> alias of the same pipeline; later cells call `lr.predict_proba`,
#           so without this binding they fail with NameError.
best = var
mod = best_pipeline
lr = best_pipeline




TimeoutError: No valid workers found

## Generalización del modelo (qué tan bien se comporta en datos que nunca vio; es importante aplicar TODAS las transformaciones previas)

In [None]:
# Validation predictors: reuse the encoder and imputer fitted on the training
# split (transform only, nothing is refitted here).
Xv = pd.concat(
    [
        valid[varc],
        pd.DataFrame(oh.transform(valid[vard]), columns=varoh, index=valid.index),
    ],
    axis=1,
)
# Select the model's columns in order, then impute.
Xv = pd.DataFrame(im.transform(Xv[best]), columns=best)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# The fitted TPOT pipeline is bound to `mod` in the training cell; the name
# `lr` used here previously is never assigned and raised NameError.

# Training ROC and AUC
y_train_proba = mod.predict_proba(X)[:, 1]
fpr_train, tpr_train, _ = roc_curve(y, y_train_proba)
auc_train = auc(fpr_train, tpr_train)

# Validation ROC and AUC
y_valid = valid['diameter narrowing']
y_valid_proba = mod.predict_proba(Xv)[:, 1]
fpr_valid, tpr_valid, _ = roc_curve(y_valid, y_valid_proba)
auc_valid = auc(fpr_valid, tpr_valid)

# Overlay both curves; a large train/validation gap would indicate overfitting.
plt.figure(figsize=(9, 7))
plt.plot(fpr_train, tpr_train, color='#1f77b4', lw=3, label=f'Train ROC (AUC = {auc_train:.3f})')
plt.plot(fpr_valid, tpr_valid, color='#ff7f0e', lw=3, label=f'Validation ROC (AUC = {auc_valid:.3f})')
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', label='Random Guess')

# Title names the model actually plotted (a TPOT pipeline, not a decision tree).
plt.title('ROC Curve for TPOT Pipeline', fontsize=20, fontweight='bold', color='#333333', pad=20)
plt.xlabel('False Positive Rate', fontsize=16, fontweight='bold')
plt.ylabel('True Positive Rate', fontsize=16, fontweight='bold')
plt.legend(fontsize=14, loc='lower right', frameon=True, fancybox=True, shadow=True)
plt.grid(True, linestyle=':', linewidth=1.2, alpha=0.7)
plt.gca().set_facecolor('#f7f7f7')
plt.tick_params(axis='both', which='major', labelsize=13)
plt.tight_layout()
plt.show()

## Uso del modelo 

In [None]:

# Generate predicted probabilities for both training and validation sets using
# the fitted TPOT pipeline (`mod`); the previously used name `lr` is never
# assigned in this notebook.
y_train_proba = mod.predict_proba(X)[:, 1]
y_valid_proba = mod.predict_proba(Xv)[:, 1]

# Six bin edges from 0 to 1 (width 0.2). linspace fixes the number of edges and
# the end point exactly, unlike a float-step arange.
prob_bins = np.linspace(0, 1, 6)  # [0. , 0.2, 0.4, ..., 1.0]
prob_labels = [f"{prob_bins[i]:.1f}-{prob_bins[i+1]:.1f}" for i in range(len(prob_bins)-1)]

# Bin the probabilities (include_lowest so a probability of exactly 0 is kept)
train_prob_range = pd.cut(y_train_proba, bins=prob_bins, labels=prob_labels, include_lowest=True)
valid_prob_range = pd.cut(y_valid_proba, bins=prob_bins, labels=prob_labels, include_lowest=True)

# Build DataFrames for train and valid
df_train_probs = pd.DataFrame({
    'target': y.values,
    'probability': y_train_proba,
    'prob_range': train_prob_range,
    'sample': 'train'
})

# `y_valid` is the validation target defined in the ROC cell above.
df_valid_probs = pd.DataFrame({
    'target': y_valid.values,
    'probability': y_valid_proba,
    'prob_range': valid_prob_range,
    'sample': 'valid'
})

# Concatenate into a single DataFrame
df_probs = pd.concat([df_train_probs, df_valid_probs], ignore_index=True)

# Show the first few rows
df_probs.head()



In [None]:
# Number of rows (train and valid together) in each probability bin.
df_probs.loc[:, 'prob_range'].value_counts()

In [None]:
# Count rows per (target, probability bin, sample) and copy the resulting table
# to the system clipboard (side effect only; the cell displays nothing).
# observed=False is spelled out because `prob_range` is categorical: it keeps
# empty bins in the table and avoids relying on the deprecated implicit default.
(
    df_probs
    .drop('probability', axis=1)
    .assign(n=1)
    .groupby(['target', 'prob_range', 'sample'], observed=False)
    .count()
    .to_clipboard()
)

In [None]:
# Print the Python version via the shell.
# NOTE(review): `!python` is resolved through the shell PATH, which may not be
# the interpreter running this kernel. Confirm, or report sys.version instead.
!python -V