## <font color='navy'> Examen 2. Laboratorio de Aprendizaje Estadístico.
    
### <font color='navy'> Modelos de Clasificación

    
Luis Fernando Márquez Bañuelos

In [21]:
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn import discriminant_analysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score 
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

## <font color='navy'> Importación de datos

In [6]:
# Read the raw diabetes dataset from a CSV file given by relative path.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,diabetes
0,Female,80.0,0,1,never,25.19,0
1,Female,54.0,0,0,No Info,27.32,0
2,Male,28.0,0,0,never,27.32,0
3,Female,36.0,0,0,current,23.45,0
4,Male,76.0,1,1,current,20.14,0


## <font color='navy'> Explicación de variables

1. **`gender`** (Categórica):  
   - **Descripción**: Género del paciente.  
   - **Valores posibles**: `Female` (mujer), `Male` (hombre), `Other` (otro).  

2. **`age`** (Numérica):  
   - **Descripción**: Edad del paciente en años.  

3. **`hypertension`** (Binaria):  
   - **Descripción**: Indica si el paciente tiene hipertensión.  
   - **Valores**: `0` (no), `1` (sí).  

4. **`heart_disease`** (Binaria):  
   - **Descripción**: Indica si el paciente tiene enfermedades cardíacas.  
   - **Valores**: `0` (no), `1` (sí).  

5. **`smoking_history`** (Categórica):  
   - **Descripción**: Historial de tabaquismo del paciente.  
   - **Valores posibles**:  
     - `never`: Nunca ha fumado.  
     - `former`: Exfumador.  
     - `current`: Fumador actual.  
     - `No Info`: Datos no proporcionados.  
     - `ever`: Alguna vez ha fumado (poco común).  
     - `not current`: No fuma actualmente.  

6. **`bmi`** (Numérica):  
   - **Descripción**: Índice de Masa Corporal (BMI, por sus siglas en inglés).  

7. **`diabetes`** (Binaria):  
   - **Descripción**: Diagnóstico de diabetes (variable objetivo).  
   - **Valores**: `0` (no tiene diabetes), `1` (tiene diabetes).  

## <font color='navy'> Análisis de Datos

In [7]:
# Name of the response column; later cells use it to separate features from labels.
target = 'diabetes'
# One-hot encode the two text-valued columns; all other columns pass through unchanged.
data = pd.get_dummies(data=data, columns=['gender', 'smoking_history'])
data.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,0,True,False,False,False,False,False,False,True,False
1,54.0,0,0,27.32,0,True,False,False,True,False,False,False,False,False
2,28.0,0,0,27.32,0,False,True,False,False,False,False,False,True,False
3,36.0,0,0,23.45,0,True,False,False,False,True,False,False,False,False
4,76.0,1,1,20.14,0,False,True,False,False,True,False,False,False,False


In [8]:
# Summarise the encoded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   age                          100000 non-null  float64
 1   hypertension                 100000 non-null  int64  
 2   heart_disease                100000 non-null  int64  
 3   bmi                          100000 non-null  float64
 4   diabetes                     100000 non-null  int64  
 5   gender_Female                100000 non-null  bool   
 6   gender_Male                  100000 non-null  bool   
 7   gender_Other                 100000 non-null  bool   
 8   smoking_history_No Info      100000 non-null  bool   
 9   smoking_history_current      100000 non-null  bool   
 10  smoking_history_ever         100000 non-null  bool   
 11  smoking_history_former       100000 non-null  bool   
 12  smoking_history_never        100000 non-null  bool   
 13  

## <font color='navy'> Modelos Base

In [10]:
# Labels are the target column; features are every remaining column.
# `drop` returns a new frame, so `data` itself is left untouched.
y = data[target]
X = data.drop(columns=[target])

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# KNN is distance-based, so features are standardised before reaching the classifier.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Search space, keyed as <step>__<param> so values are routed to the right pipeline step.
params_knn = dict(
    knn__n_neighbors=range(1, 30),
    knn__weights=['uniform', 'distance'],
    knn__metric=['euclidean', 'manhattan'],
)

In [18]:
# Sample 15 candidates from the KNN search space and score each one by
# 5-fold cross-validated ROC AUC, using all available cores.
random_search = RandomizedSearchCV(
    pipeline,
    params_knn,
    n_iter=15,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

In [19]:
# Show the hyperparameter combination with the best cross-validated score.
random_search.best_params_

{'knn__weights': 'uniform', 'knn__n_neighbors': 27, 'knn__metric': 'manhattan'}

In [20]:
# Score the held-out set with the refitted best pipeline; the search object
# delegates predict_proba to it. The second column is kept as the score.
random_knn = random_search.predict_proba(X_test)[:, 1]
# ROC AUC on the test split (arguments are y_true, y_score).
roc_auc_score(y_test, random_knn)

0.8103687365920834

In [25]:
# Logistic regression on polynomially expanded, then standardised, features.
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),  # degree is chosen by the hyperparameter search
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=10000, random_state=42))
])

# The space is split into two sub-grids because C only matters when a penalty
# is applied: with penalty=None scikit-learn ignores C, so crossing the two
# would spend search iterations on duplicate unregularised models.
# This leaves 3 + 9 = 12 distinct candidates.
params_log = [
    {
        'poly__degree': [1, 2, 3],
        'logreg__penalty': [None],
    },
    {
        'poly__degree': [1, 2, 3],
        'logreg__penalty': ['l2'],
        'logreg__C': [0.01, 0.1, 1],
    },
]

In [26]:
# Sample up to 15 candidates from the logistic-regression search space and
# score each one by 5-fold cross-validated ROC AUC, using all available cores.
random_search = RandomizedSearchCV(
    pipeline,
    params_log,
    n_iter=15,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)



In [27]:
# Show the hyperparameter combination with the best cross-validated score.
random_search.best_params_

{'poly__degree': 2, 'logreg__penalty': 'l2', 'logreg__C': 1}

In [28]:
# Score the held-out set with the refitted best pipeline; the search object
# delegates predict_proba to it. The second column is kept as the score.
random_log = random_search.predict_proba(X_test)[:, 1]
# ROC AUC on the test split (arguments are y_true, y_score).
roc_auc_score(y_test, random_log)

0.8385362120654222