# Paso 1: Importar las librerías y cargar los datos del archivo

In [1]:
# Importando todas las librerías que se van a utilizar
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.utils import shuffle, resample

In [2]:
# Load the churn dataset from CSV and preview its last 10 rows.
# NOTE(review): the path is absolute, so the notebook only runs where
# /datasets/Churn.csv exists — consider a configurable data directory.
df = pd.read_csv("/datasets/Churn.csv")
df.tail(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9990,9991,15798964,Nkemakonam,714,Germany,Male,33,3.0,35016.6,1,1,0,53667.08,0
9991,9992,15769959,Ajuluchukwu,597,France,Female,53,4.0,88381.21,1,1,0,69384.71,1
9992,9993,15657105,Chukwualuka,726,Spain,Male,36,2.0,0.0,1,1,0,195192.4,0
9993,9994,15569266,Rahman,644,France,Male,28,7.0,155060.41,1,1,0,29179.52,0
9994,9995,15719294,Wood,800,France,Female,29,2.0,0.0,2,0,0,167773.55,0
9995,9996,15606229,Obijiaku,771,France,Male,39,5.0,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10.0,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7.0,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3.0,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,,130142.79,1,1,0,38190.78,0


In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,9091.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,4.99769,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.894723,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,2.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [4]:
# Column dtypes and non-null counts; used to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [None]:
# Normalize the header: lowercase every column name and turn spaces into underscores
df.columns = [column.lower().replace(' ', '_') for column in df.columns]
df.columns

Index(['rownumber', 'customerid', 'surname', 'creditscore', 'geography',
       'gender', 'age', 'tenure', 'balance', 'numofproducts', 'hascrcard',
       'isactivemember', 'estimatedsalary', 'exited'],
      dtype='object')

In [None]:
# Lowercasing alone cannot recover word boundaries, so the multi-word
# column names get their underscores through an explicit mapping.
snake_case_names = {
    'rownumber': 'row_number',
    'customerid': 'customer_id',
    'creditscore': 'credit_score',
    'numofproducts': 'num_of_products',
    'hascrcard': 'has_credit_card',
    'isactivemember': 'is_active_member',
    'estimatedsalary': 'estimated_salary',
}
df = df.rename(columns=snake_case_names)
df.columns

Index(['row_number', 'customer_id', 'surname', 'credit_score', 'geography',
       'gender', 'age', 'tenure', 'balance', 'num_of_products',
       'has_credit_card', 'is_active_member', 'estimated_salary', 'exited'],
      dtype='object')

In [7]:
# Only `tenure` has missing values (9091 non-null out of 10000 rows), so fill
# that column explicitly with 0 instead of zero-filling every column of the frame.
df['tenure'] = df['tenure'].fillna(0)

# Fail loudly if any other column unexpectedly contains nulls, rather than
# silently replacing them with 0 as a blanket fillna would.
assert not df.isna().any().any(), "unexpected missing values remain after filling tenure"

Se sustituyeron los valores nulos (NaN) de la columna `tenure` por 0: se asumió que en esas 909 filas no hay un plazo fijo de depósito, pues seguramente también existe la opción de no utilizar plazo fijo.

In [8]:
# Preview the first 10 rows
df.head(10)

Unnamed: 0,row_number,customer_id,surname,credit_score,geography,gender,age,tenure,balance,num_of_products,has_credit_card,is_active_member,estimated_salary,exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8.0,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7.0,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4.0,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4.0,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2.0,134603.88,1,1,1,71725.73,0


In [9]:
# Re-check dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   row_number        10000 non-null  int64  
 1   customer_id       10000 non-null  int64  
 2   surname           10000 non-null  object 
 3   credit_score      10000 non-null  int64  
 4   geography         10000 non-null  object 
 5   gender            10000 non-null  object 
 6   age               10000 non-null  int64  
 7   tenure            10000 non-null  float64
 8   balance           10000 non-null  float64
 9   num_of_products   10000 non-null  int64  
 10  has_credit_card   10000 non-null  int64  
 11  is_active_member  10000 non-null  int64  
 12  estimated_salary  10000 non-null  float64
 13  exited            10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


# Paso 2: Codificación y estandarización

In [10]:
# Ordinal-encode the text (object-dtype) columns so the estimators receive numbers.
# NOTE(review): `surname` looks like an identifier-like, high-cardinality column and
# `geography` has no natural order — dropping the former and one-hot encoding the
# latter may suit the linear model better. The encoder is also fitted on all rows,
# before the train/validation/test split. Confirm these choices are intended.
categorical_columns = ['surname', 'geography', 'gender']
encoder = OrdinalEncoder()
x_enc = encoder.fit_transform(df[categorical_columns])

# Assign the encoded array straight back by column name. Replacing the columns
# (instead of writing through .loc with a new DataFrame) gives them the encoder's
# float dtype and does not rely on index alignment.
df[categorical_columns] = x_enc
df.head(10)

Unnamed: 0,row_number,customer_id,surname,credit_score,geography,gender,age,tenure,balance,num_of_products,has_credit_card,is_active_member,estimated_salary,exited
0,1,15634602,1115.0,619,0.0,0.0,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,1177.0,608,2.0,0.0,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,2040.0,502,0.0,0.0,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,289.0,699,0.0,0.0,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,1822.0,850,2.0,0.0,43,2.0,125510.82,1,1,1,79084.1,0
5,6,15574012,537.0,645,2.0,1.0,44,8.0,113755.78,2,1,0,149756.71,1
6,7,15592531,177.0,822,0.0,1.0,50,7.0,0.0,2,1,1,10062.8,0
7,8,15656148,2000.0,376,1.0,0.0,29,4.0,115046.74,4,1,0,119346.88,1
8,9,15792365,1146.0,501,0.0,1.0,44,4.0,142051.07,2,0,1,74940.5,0
9,10,15592389,1081.0,684,0.0,1.0,27,2.0,134603.88,1,1,1,71725.73,0


Se codificaron las variables de tipo 'object'.

In [11]:
# Standardize the model inputs with StandardScaler.
# NOTE(review): the scaler is fitted on every row, before the train/validation/test
# split performed further down, so validation and test rows contribute to the
# fitted statistics. Consider fitting it on the training rows only.
columns_to_scale = [
    'surname', 'credit_score', 'geography', 'gender', 'age', 'tenure',
    'balance', 'num_of_products', 'has_credit_card', 'is_active_member',
    'estimated_salary',
]
scaler = StandardScaler()
x_scaler = scaler.fit_transform(df[columns_to_scale])

# Replace the columns outright: several of them are int64, and writing floats
# into them through .loc relies on implicit upcasting.
df[columns_to_scale] = x_scaler
df

Unnamed: 0,row_number,customer_id,surname,credit_score,geography,gender,age,tenure,balance,num_of_products,has_credit_card,is_active_member,estimated_salary,exited
0,1,15634602,-0.464183,-0.326221,-0.901886,-1.095988,0.293517,-0.817441,-1.225848,-0.911583,0.646092,0.970243,0.021886,1
1,2,15647311,-0.390911,-0.440036,1.515067,-1.095988,0.198164,-1.138838,0.117350,-0.911583,-1.547768,0.970243,0.216534,0
2,3,15619304,0.628988,-1.536794,-0.901886,-1.095988,0.293517,1.110941,1.333053,2.527057,0.646092,-1.030670,0.240687,1
3,4,15701354,-1.440356,0.501521,-0.901886,-1.095988,0.007457,-1.138838,-1.225848,0.807737,-1.547768,-1.030670,-0.108918,0
4,5,15737888,0.371354,2.063884,1.515067,-1.095988,0.388871,-0.817441,0.785728,-0.911583,0.646092,0.970243,-0.365276,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,0.580534,1.246488,-0.901886,0.912419,0.007457,0.146750,-1.225848,0.807737,0.646092,-1.030670,-0.066419,0
9996,9997,15569892,-0.203004,-1.391939,-0.901886,0.912419,-0.373958,1.753735,-0.306379,-0.911583,0.646092,0.970243,0.027988,0
9997,9998,15584532,0.073539,0.604988,-0.901886,-1.095988,-0.278604,0.789544,-1.225848,-0.911583,-1.547768,0.970243,-1.008643,1
9998,9999,15682355,0.989439,1.256835,0.306591,0.912419,0.293517,-0.496044,-0.022608,0.807737,0.646092,-1.030670,-0.125231,1


# Paso 3: Segmentación de los datos en un conjunto de entrenamiento, uno de validación y uno de prueba

In [12]:
# The target is the `exited` column; the predictors are every other column
# except the two identifier columns.
# NOTE(review): the splits are not stratified on the target — confirm this is intended.
target = df['exited']
features = df.drop(columns=['exited', 'customer_id', 'row_number'])

# First cut: hold out 30% of the rows (validation + test), keep 70% for training
features_train, features_temp, target_train, target_temp = train_test_split(
    features, target, random_state=42, test_size=0.3)

# Second cut: halve the hold-out into validation and test (15% of the data each)
features_valid, features_test, target_valid, target_test = train_test_split(
    features_temp, target_temp, random_state=42, test_size=0.5)

# Sanity check on the partition sizes
print("Tamaño entrenamiento:", len(features_train))
print("Tamaño validación:", len(features_valid))
print("Tamaño prueba:", len(features_test))

Tamaño entrenamiento: 7000
Tamaño validación: 1500
Tamaño prueba: 1500


# CASO: SIN CONSIDERAR EL DESEQUILIBRIO DE LAS CLASES

# Paso 4: Evaluando la calidad de diferentes modelos cambiando los hiperparámetros

**Arbol de decisión**

In [13]:
# Sweep the tree depth on the original (imbalanced) training data and score each
# fitted tree on the validation split.
best_f1_ad, best_depth_ad = -1.0, None  # best validation F1 seen so far and its depth
for depth in range(1, 11):
    model = DecisionTreeClassifier(random_state=42, max_depth=depth)
    model.fit(features_train, target_train)

    # F1 is computed from hard class predictions
    predictions_valid = model.predict(features_valid)
    f1_ad = f1_score(target_valid, predictions_valid)

    # AUC-ROC needs scores rather than labels: use the predicted probability of class 1
    probabilities_valid = model.predict_proba(features_valid)[:, 1]
    auc_roc = roc_auc_score(target_valid, probabilities_valid)

    print(f"max_depth={depth}: F1={f1_ad:.3f}, AUC-ROC={auc_roc:.3f}")

    if f1_ad > best_f1_ad:
        best_f1_ad, best_depth_ad = f1_ad, depth

# Report the winner from code so the written summary cannot drift from the log
print(f"Mejor F1: {best_f1_ad:.3f} (max_depth={best_depth_ad})")

max_depth=1: F1=0.000, AUC-ROC=0.674
max_depth=2: F1=0.482, AUC-ROC=0.721
max_depth=3: F1=0.509, AUC-ROC=0.779
max_depth=4: F1=0.486, AUC-ROC=0.799
max_depth=5: F1=0.450, AUC-ROC=0.814
max_depth=6: F1=0.504, AUC-ROC=0.819
max_depth=7: F1=0.512, AUC-ROC=0.816
max_depth=8: F1=0.492, AUC-ROC=0.792
max_depth=9: F1=0.508, AUC-ROC=0.770
max_depth=10: F1=0.515, AUC-ROC=0.761


- Mejor F1: 0.515 (max_depth=10)
- Mejor AUC-ROC: 0.819 (max_depth=6)

El árbol de decisión muestra que, con una profundidad media-alta, logra un equilibrio decente entre F1 y AUC-ROC. Sin embargo, presenta fluctuaciones, lo que indica que no es el modelo más estable.

**Bosque aleatorio**

In [14]:
# Sweep the number of trees on the original (imbalanced) training data and score
# each fitted forest on the validation split.
best_f1_ba, best_n_ba = -1.0, None  # best validation F1 seen so far and its n_estimators
for n in range(10, 101, 10):
    model = RandomForestClassifier(random_state=42, n_estimators=n)
    model.fit(features_train, target_train)

    # F1 is computed from hard class predictions
    predictions_valid = model.predict(features_valid)
    f1_ba = f1_score(target_valid, predictions_valid)

    # AUC-ROC is computed from the predicted probability of class 1
    probabilities_valid = model.predict_proba(features_valid)[:, 1]
    auc_roc = roc_auc_score(target_valid, probabilities_valid)

    print(f"n_estimators={n}: F1={f1_ba:.3f}, AUC-ROC={auc_roc:.3f}")

    if f1_ba > best_f1_ba:
        best_f1_ba, best_n_ba = f1_ba, n

# Report the winner from code so the written summary cannot drift from the log
print(f"Mejor F1: {best_f1_ba:.3f} (n_estimators={best_n_ba})")

n_estimators=10: F1=0.526, AUC-ROC=0.797
n_estimators=20: F1=0.544, AUC-ROC=0.824
n_estimators=30: F1=0.560, AUC-ROC=0.836
n_estimators=40: F1=0.550, AUC-ROC=0.845
n_estimators=50: F1=0.545, AUC-ROC=0.849
n_estimators=60: F1=0.554, AUC-ROC=0.850
n_estimators=70: F1=0.560, AUC-ROC=0.851
n_estimators=80: F1=0.553, AUC-ROC=0.851
n_estimators=90: F1=0.569, AUC-ROC=0.854
n_estimators=100: F1=0.558, AUC-ROC=0.853


- Mejor F1: 0.569 (n_estimators=90)
- Mejor AUC-ROC: 0.854 (n_estimators=90)

El bosque aleatorio es el mejor de los tres. Consistentemente tiene un buen F1 y el mejor AUC-ROC, lo cual es excelente.

**Regresión logística**

In [15]:
# Fit a logistic regression on the original (imbalanced) training data and
# score it on the validation split.
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(features_train, target_train)

# Hard labels feed F1; the class-1 probability column feeds AUC-ROC
predictions_valid = model.predict(features_valid)
probabilities_valid = model.predict_proba(features_valid)[:, 1]

f1_rl = f1_score(target_valid, predictions_valid)
auc_roc = roc_auc_score(target_valid, probabilities_valid)

print(f"Regresión Logística - F1={f1_rl:.4f}, AUC-ROC={auc_roc:.4f}")

Regresión Logística - F1=0.2826, AUC-ROC=0.7537


- F1: 0.283
- AUC-ROC: 0.754

La regresión logística es claramente la peor opción. Aunque su AUC-ROC es decente, su F1 es muy bajo.

**Conclusiones:** 
- Si el F1 es bajo pero el AUC-ROC es alto, el modelo distingue bien entre clases en general, pero su desempeño específico en el umbral elegido (0.5) no es ideal. Este es el caso de varios de los modelos obtenidos.
- En el caso del bosque aleatorio, se puede notar que es la mejor opción. El modelo tiene F1 más altos que los otros modelos y también en AUC-ROC. 
- Entiendo que las instrucciones del proyecto le dan más importancia al F1 porque es un indicador más crítico y sensible al desbalance de clases. A continuación repetiré el mismo procedimiento considerando el desequilibrio de clases; con las técnicas de sobremuestreo y submuestreo seguramente se verá una mejora en el F1. Hasta el momento, ningún modelo ha alcanzado el 0.59, que es el mínimo para aprobar la revisión.

# CASO: CONSIDERANDO EL DESEQUILIBRIO DE LAS CLASES

In [None]:
# Relative frequency of each class in the target, to quantify the class imbalance
df['exited'].value_counts(normalize=True)

0    0.7963
1    0.2037
Name: exited, dtype: float64

# Paso 4: Evaluando la calidad de diferentes modelos cambiando los hiperparámetros

**Submuestreo**

In [17]:
# Rejoin predictors and label so that rows are sampled as whole records
train_data = pd.concat([features_train, target_train], axis=1)

# One frame per class
class_1 = train_data[train_data['exited'] == 1]
class_0 = train_data[train_data['exited'] == 0]

# Undersample: draw as many class-0 rows as there are class-1 rows (without replacement)
class_0_downsampled = class_0.sample(n=len(class_1), random_state=42)

# Stack both classes, then shuffle so the rows are not grouped by class
train_data_balanced = shuffle(
    pd.concat([class_0_downsampled, class_1]),
    random_state=42,
)

# Split back into label and predictors
target_train_balanced_sub = train_data_balanced['exited']
features_train_balanced_sub = train_data_balanced.drop(columns='exited')

**Arbol de decisión**

In [18]:
# Sweep the tree depth on the undersampled training data and score each fitted
# tree on the (untouched) validation split.
best_f1_ad, best_depth_ad = -1.0, None  # best validation F1 seen so far and its depth
for depth in range(1, 11):
    model = DecisionTreeClassifier(random_state=42, max_depth=depth)
    model.fit(features_train_balanced_sub, target_train_balanced_sub)

    # F1 is computed from hard class predictions
    predictions_valid = model.predict(features_valid)
    f1_ad = f1_score(target_valid, predictions_valid)

    # AUC-ROC needs scores rather than labels: use the predicted probability of class 1
    probabilities_valid = model.predict_proba(features_valid)[:, 1]
    auc_roc = roc_auc_score(target_valid, probabilities_valid)

    print(f"max_depth={depth}: F1={f1_ad:.3f}, AUC-ROC={auc_roc:.3f}")

    if f1_ad > best_f1_ad:
        best_f1_ad, best_depth_ad = f1_ad, depth

# Report the winner from code instead of reading it off the log by eye
print(f"Mejor F1: {best_f1_ad:.3f} (max_depth={best_depth_ad})")

max_depth=1: F1=0.470, AUC-ROC=0.692
max_depth=2: F1=0.494, AUC-ROC=0.744
max_depth=3: F1=0.494, AUC-ROC=0.792
max_depth=4: F1=0.516, AUC-ROC=0.810
max_depth=5: F1=0.567, AUC-ROC=0.825
max_depth=6: F1=0.522, AUC-ROC=0.818
max_depth=7: F1=0.529, AUC-ROC=0.819
max_depth=8: F1=0.511, AUC-ROC=0.798
max_depth=9: F1=0.493, AUC-ROC=0.779
max_depth=10: F1=0.493, AUC-ROC=0.741


**Bosque aleatorio**

In [19]:
# Sweep the number of trees on the undersampled training data and score each
# fitted forest on the (untouched) validation split.
best_f1_ba, best_n_ba = -1.0, None  # best validation F1 seen so far and its n_estimators
for n in range(10, 101, 10):
    model = RandomForestClassifier(random_state=42, n_estimators=n)
    model.fit(features_train_balanced_sub, target_train_balanced_sub)

    # F1 is computed from hard class predictions
    predictions_valid = model.predict(features_valid)
    f1_ba = f1_score(target_valid, predictions_valid)

    # AUC-ROC is computed from the predicted probability of class 1
    probabilities_valid = model.predict_proba(features_valid)[:, 1]
    auc_roc = roc_auc_score(target_valid, probabilities_valid)

    print(f"n_estimators={n}: F1={f1_ba:.3f}, AUC-ROC={auc_roc:.3f}")

    if f1_ba > best_f1_ba:
        best_f1_ba, best_n_ba = f1_ba, n

# Report the winner from code instead of reading it off the log by eye
print(f"Mejor F1: {best_f1_ba:.3f} (n_estimators={best_n_ba})")

n_estimators=10: F1=0.528, AUC-ROC=0.815
n_estimators=20: F1=0.538, AUC-ROC=0.833
n_estimators=30: F1=0.538, AUC-ROC=0.837
n_estimators=40: F1=0.534, AUC-ROC=0.838
n_estimators=50: F1=0.541, AUC-ROC=0.839
n_estimators=60: F1=0.537, AUC-ROC=0.839
n_estimators=70: F1=0.538, AUC-ROC=0.840
n_estimators=80: F1=0.550, AUC-ROC=0.843
n_estimators=90: F1=0.544, AUC-ROC=0.844
n_estimators=100: F1=0.542, AUC-ROC=0.846


**Regresión logística**

In [20]:
# Fit a logistic regression on the undersampled training data and score it on
# the validation split.
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(features_train_balanced_sub, target_train_balanced_sub)

# Hard labels feed F1; the class-1 probability column feeds AUC-ROC
predictions_valid = model.predict(features_valid)
probabilities_valid = model.predict_proba(features_valid)[:, 1]

f1_rl = f1_score(target_valid, predictions_valid)
auc_roc = roc_auc_score(target_valid, probabilities_valid)

print(f"Regresión Logística - F1={f1_rl:.4f}, AUC-ROC={auc_roc:.4f}")

Regresión Logística - F1=0.4447, AUC-ROC=0.7552


**Sobremuestreo**

In [21]:
# Rejoin predictors and label so that rows are sampled as whole records
train_data = pd.concat([features_train, target_train], axis=1)

# One frame per class
class_1 = train_data[train_data['exited'] == 1]
class_0 = train_data[train_data['exited'] == 0]

# Oversample: draw class-1 rows with replacement until they match the class-0 count
class_1_upsampled = resample(
    class_1,
    n_samples=len(class_0),
    replace=True,
    random_state=42,
)

# Stack both classes, then shuffle so the rows are not grouped by class
train_data_balanced = shuffle(
    pd.concat([class_0, class_1_upsampled]),
    random_state=42,
)

# Split back into label and predictors
target_train_balanced_sobre = train_data_balanced['exited']
features_train_balanced_sobre = train_data_balanced.drop(columns='exited')

**Arbol de decisión**

In [22]:
# Sweep the tree depth on the oversampled training data and score each fitted
# tree on the (untouched) validation split.
best_f1_ad, best_depth_ad = -1.0, None  # best validation F1 seen so far and its depth
for depth in range(1, 11):
    model = DecisionTreeClassifier(random_state=42, max_depth=depth)
    model.fit(features_train_balanced_sobre, target_train_balanced_sobre)

    # F1 is computed from hard class predictions
    predictions_valid = model.predict(features_valid)
    f1_ad = f1_score(target_valid, predictions_valid)

    # AUC-ROC needs scores rather than labels: use the predicted probability of class 1
    probabilities_valid = model.predict_proba(features_valid)[:, 1]
    auc_roc = roc_auc_score(target_valid, probabilities_valid)

    print(f"max_depth={depth}: F1={f1_ad:.3f}, AUC-ROC={auc_roc:.3f}")

    if f1_ad > best_f1_ad:
        best_f1_ad, best_depth_ad = f1_ad, depth

# Report the winner from code instead of reading it off the log by eye
print(f"Mejor F1: {best_f1_ad:.3f} (max_depth={best_depth_ad})")

max_depth=1: F1=0.437, AUC-ROC=0.672
max_depth=2: F1=0.453, AUC-ROC=0.727
max_depth=3: F1=0.453, AUC-ROC=0.775
max_depth=4: F1=0.527, AUC-ROC=0.799
max_depth=5: F1=0.522, AUC-ROC=0.813
max_depth=6: F1=0.546, AUC-ROC=0.822
max_depth=7: F1=0.526, AUC-ROC=0.806
max_depth=8: F1=0.508, AUC-ROC=0.789
max_depth=9: F1=0.483, AUC-ROC=0.773
max_depth=10: F1=0.495, AUC-ROC=0.750


**Bosque aleatorio**

In [23]:
# Sweep the number of trees on the oversampled training data and score each
# fitted forest on the (untouched) validation split.
best_f1_ba, best_n_ba = -1.0, None  # best validation F1 seen so far and its n_estimators
for n in range(10, 101, 10):
    model = RandomForestClassifier(random_state=42, n_estimators=n)
    model.fit(features_train_balanced_sobre, target_train_balanced_sobre)

    # F1 is computed from hard class predictions
    predictions_valid = model.predict(features_valid)
    f1_ba = f1_score(target_valid, predictions_valid)

    # AUC-ROC is computed from the predicted probability of class 1
    probabilities_valid = model.predict_proba(features_valid)[:, 1]
    auc_roc = roc_auc_score(target_valid, probabilities_valid)

    print(f"n_estimators={n}: F1={f1_ba:.3f}, AUC-ROC={auc_roc:.3f}")

    if f1_ba > best_f1_ba:
        best_f1_ba, best_n_ba = f1_ba, n

# Report the winner from code so the choice used in the final test is traceable
print(f"Mejor F1: {best_f1_ba:.3f} (n_estimators={best_n_ba})")

n_estimators=10: F1=0.564, AUC-ROC=0.827
n_estimators=20: F1=0.542, AUC-ROC=0.829
n_estimators=30: F1=0.568, AUC-ROC=0.840
n_estimators=40: F1=0.575, AUC-ROC=0.842
n_estimators=50: F1=0.584, AUC-ROC=0.846
n_estimators=60: F1=0.578, AUC-ROC=0.848
n_estimators=70: F1=0.593, AUC-ROC=0.849
n_estimators=80: F1=0.586, AUC-ROC=0.851
n_estimators=90: F1=0.584, AUC-ROC=0.851
n_estimators=100: F1=0.575, AUC-ROC=0.852


**Regresión logística**

In [24]:
# Fit a logistic regression on the oversampled training data and score it on
# the validation split.
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(features_train_balanced_sobre, target_train_balanced_sobre)

# Hard labels feed F1; the class-1 probability column feeds AUC-ROC
predictions_valid = model.predict(features_valid)
probabilities_valid = model.predict_proba(features_valid)[:, 1]

f1_rl = f1_score(target_valid, predictions_valid)
auc_roc = roc_auc_score(target_valid, probabilities_valid)

print(f"Regresión Logística - F1={f1_rl:.4f}, AUC-ROC={auc_roc:.4f}")

Regresión Logística - F1=0.4434, AUC-ROC=0.7559


Después de aplicar las técnicas para corregir el desbalance de clases, se determinó que el sobremuestreo es la que mejor se adapta al modelo, ya que obtiene el mejor F1.

**Prueba final**

In [27]:
# Refit the configuration selected on validation: a random forest with 70 trees
# trained on the oversampled training data.
best_model = RandomForestClassifier(n_estimators=70, random_state=42)
best_model.fit(features_train_balanced_sobre, target_train_balanced_sobre)

# Score the held-out test split: hard labels for F1, class-1 probabilities for AUC-ROC
predictions_test = best_model.predict(features_test)
probabilities_test = best_model.predict_proba(features_test)[:, 1]

f1_test = f1_score(target_test, predictions_test)
auc_roc_test = roc_auc_score(target_test, probabilities_test)

print(f"F1 final en prueba: {f1_test:.3f}")
print(f"AUC-ROC final en prueba: {auc_roc_test:.3f}")

F1 final en prueba: 0.602
AUC-ROC final en prueba: 0.848


El modelo alcanzó una métrica de F1 superior a 0.59, que era el umbral establecido. Esta métrica es muy útil en situaciones con clases desbalanceadas, ya que considera tanto la precisión como la exhaustividad (recall) del modelo.