In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [4]:
# Cargando el df

df = pd.read_csv('Churn_Modelling.csv')

df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
print(f'El dataset cuenta con {df.shape[0]} filas y {df.shape[1]} columnas')

El dataset cuenta con 10000 filas y 14 columnas


# **PRE PROCESAMIENTO DE DATOS**

In [6]:
# Revisando si la cantidad de customers únicos es igual a la del dataframe
len(df['CustomerId'].unique())

10000

### Eliminando columnas irrelevantes

In [7]:
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], axis=1)

#revisando la eliminación
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


- A primera vista no hay datos nulos
- Existen 2 columnas objects que son categoricas `Gender` y `Age`
- Todas las demás features son numéricas

In [9]:
print(f'Cantidad de datos duplicados {df.duplicated().sum()}') 

Cantidad de datos duplicados 0


In [10]:
categorical_columns = df.select_dtypes(include='object')
numeric_columns = df.select_dtypes(include=['int', 'float'])

for col in categorical_columns:
    
    print(df[col].value_counts())
    print('==='*60)


Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64
Gender
Male      5457
Female    4543
Name: count, dtype: int64


- No se observa problemas en cuanto a que existan categorias con problemas de escritura u otro tipo que no nos permitan hacer un label encoding

In [11]:
labeler = LabelEncoder()
df['Gender'] = labeler.fit_transform(df['Gender'])

In [12]:
onehot = OneHotEncoder(sparse_output=False)
encoder_geo = onehot.fit_transform(df[['Geography']])

In [13]:
column_names = onehot.get_feature_names_out(['Geography'])
column_names

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [14]:
df[column_names] = encoder_geo

In [15]:
df = df.drop(columns='Geography', axis=1)
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [16]:
# Guarda objetos
def saving(path, object):
    with open(path, 'wb') as file:
        pickle.dump(object, file)
    return object

In [17]:
label_encoder_save = saving('label_encoder.pkl', labeler)
onehot_encoder_save = saving('onehot_encoder.pkl', onehot)

In [18]:
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

umbral = 10 
vars_continuas = [col for col in numeric_columns if df[col].nunique() > umbral]

vars_discretas = [col for col in numeric_columns if df[col].nunique() <= umbral]

In [19]:
df[numeric_columns].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CreditScore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
Gender,10000.0,0.5457,0.497932,0.0,0.0,1.0,1.0,1.0
Age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
Tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
Balance,10000.0,76485.889288,62397.405202,0.0,0.0,97198.54,127644.24,250898.09
NumOfProducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
HasCrCard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
IsActiveMember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,10000.0,100090.239881,57510.492818,11.58,51002.11,100193.915,149388.2475,199992.48
Exited,10000.0,0.2037,0.402769,0.0,0.0,0.0,0.0,1.0


In [20]:
df['Exited'].value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

## **ENTRENAMIENTO**

In [21]:
X = df.drop(columns='Exited', axis=1)
y= df['Exited']

In [22]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

In [23]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

In [24]:
scaler_filer = saving('scaler_features.pkl', scaler)

In [25]:
import tensorflow as tf

2025-02-18 22:01:57.772256: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-18 22:01:57.882712: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-18 22:01:58.033090: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739926918.207326  136942 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739926918.242844  136942 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-18 22:01:58.488441: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime


In [27]:
model = Sequential(
   [ Dense(64, activation='relu', input_shape=(X_train.shape[1], )),
    
    Dense(32, activation='relu'),

    Dense(1, activation='sigmoid')
]

)

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
W0000 00:00:1739926924.339098  136942 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [28]:
from tensorflow.keras.metrics import F1Score, Accuracy

opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [29]:
os.makedirs('log/fit', exist_ok=True)
log_dir = "log/fit" + datetime.datetime.now().strftime('%d-%m-%Y - %H:%M:%S')

In [30]:
tensorflow_callback = TensorBoard(log_dir = log_dir, histogram_freq=1)

In [31]:
early = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [32]:
history = model.fit(
    X_train, y_train,
    validation_data = (X_test, y_test),
    epochs=100,
    callbacks= [tensorflow_callback, early]
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.8114 - loss: 0.4379 - val_accuracy: 0.8475 - val_loss: 0.3622
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8460 - loss: 0.3644 - val_accuracy: 0.8565 - val_loss: 0.3511
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8507 - loss: 0.3605 - val_accuracy: 0.8615 - val_loss: 0.3455
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8593 - loss: 0.3378 - val_accuracy: 0.8530 - val_loss: 0.3466
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8621 - loss: 0.3341 - val_accuracy: 0.8600 - val_loss: 0.3399
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8668 - loss: 0.3321 - val_accuracy: 0.8665 - val_loss: 0.3393
Epoch 7/100
[1m250/25

In [33]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)

clasif = classification_report(y_pred_binary, y_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


In [34]:
print(clasif)

              precision    recall  f1-score   support

           0       0.95      0.89      0.92      1716
           1       0.52      0.75      0.61       284

    accuracy                           0.87      2000
   macro avg       0.74      0.82      0.77      2000
weighted avg       0.89      0.87      0.88      2000



In [63]:
from imblearn.combine import SMOTETomek

smotetomek = SMOTETomek(random_state=42)
X_train_res, y_train_res = smotetomek.fit_resample(X_train, y_train)


In [150]:
model2 = Sequential(
   [ Dense(64, activation='relu', input_shape=(X_train.shape[1], )),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
]

)

model2.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [151]:
from tensorflow.keras.metrics import F1Score, Accuracy


opt2 = tf.keras.optimizers.Adam(learning_rate=0.001)
model2.compile(
    loss='binary_crossentropy',
    optimizer=opt2, 
    metrics=['accuracy'])

In [152]:
from sklearn.utils import class_weight
import numpy as np
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights))
print("Class weights:", class_weights)

history2 = model2.fit(
    X_train, y_train,
    validation_data = (X_test, y_test),
    epochs=100,
    callbacks= [tensorflow_callback, early],
    class_weight=class_weights
)

Class weights: {0: np.float64(0.6279434850863422), 1: np.float64(2.4539877300613497)}
Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6562 - loss: 0.6679 - val_accuracy: 0.6630 - val_loss: 0.6046
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7012 - loss: 0.5643 - val_accuracy: 0.7260 - val_loss: 0.5298
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7195 - loss: 0.5372 - val_accuracy: 0.7665 - val_loss: 0.4888
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7555 - loss: 0.5042 - val_accuracy: 0.7460 - val_loss: 0.5070
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7614 - loss: 0.4928 - val_accuracy: 0.8020 - val_loss: 0.4330
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 

In [153]:
from sklearn.metrics import classification_report

y_pred = model2.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)

clasif = classification_report(y_pred_binary, y_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


In [154]:
print(clasif)

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1425
           1       0.72      0.51      0.60       575

    accuracy                           0.80      2000
   macro avg       0.77      0.71      0.73      2000
weighted avg       0.79      0.80      0.79      2000



In [None]:
model.save('model.h5')

