In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from tensorflow import keras
from keras.models import Model, load_model, save_model
from keras.layers import Input, Dense
!pip install scikeras
from scikeras.wrappers import KerasClassifier, BaseWrapper
from sklearn.metrics import accuracy_score, roc_auc_score

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
churn = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CustomerChurn_dataset.csv')

In [6]:
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [8]:
# Getting the categorical columns
categorical = churn.select_dtypes(include=['object']).columns.tolist()
categorical

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'TotalCharges',
 'Churn']

In [9]:
# Encoding the object columns using the LabelEncoder
label_encoder = LabelEncoder()
for column in categorical:
    churn[column] = label_encoder.fit_transform(churn[column])

In [10]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   int64  
 1   gender            7043 non-null   int64  
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   int64  
 4   Dependents        7043 non-null   int64  
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   int64  
 7   MultipleLines     7043 non-null   int64  
 8   InternetService   7043 non-null   int64  
 9   OnlineSecurity    7043 non-null   int64  
 10  OnlineBackup      7043 non-null   int64  
 11  DeviceProtection  7043 non-null   int64  
 12  TechSupport       7043 non-null   int64  
 13  StreamingTV       7043 non-null   int64  
 14  StreamingMovies   7043 non-null   int64  
 15  Contract          7043 non-null   int64  
 16  PaperlessBilling  7043 non-null   int64  


In [11]:
# Defining independent and dependent variables
X = churn.drop(columns=['Churn','customerID'])
y = churn['Churn']

In [12]:
# Scaling the data
scaler = StandardScaler()
scaler.fit_transform(X)

array([[-1.00955867, -0.43991649,  1.03453023, ...,  0.39855772,
        -1.16032292, -0.39860759],
       [ 0.99053183, -0.43991649, -0.96662231, ...,  1.33486261,
        -0.25962894, -0.94876238],
       [ 0.99053183, -0.43991649, -0.96662231, ...,  1.33486261,
        -0.36266036, -1.64188328],
       ...,
       [-1.00955867, -0.43991649,  1.03453023, ...,  0.39855772,
        -1.1686319 , -0.13968008],
       [ 0.99053183,  2.27315869,  1.03453023, ...,  1.33486261,
         0.32033821, -0.31653445],
       [ 0.99053183, -0.43991649, -0.96662231, ..., -1.47405205,
         1.35896134,  1.13801338]])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Creating a Random Forest classifier
rfc = RandomForestClassifier(n_estimators=110,max_depth=20,criterion='entropy')

# Training the model
rfc.fit(X_train, y_train)

# Making predictions on the test set
y_pred = rfc.predict(X_test)

# Calculating accuracy for reference
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Getting feature importances
feature_importances = rfc.feature_importances_

# Creating a DataFrame to display feature importances
feature_importance_df = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": feature_importances}
)

# Sorting features by importance
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)

# Displaying the feature importance DataFrame
print("\nFeature Importance: \n")
print(feature_importance_df)

Accuracy: 0.7970191625266146

Feature Importance: 

             Feature  Importance
17    MonthlyCharges    0.175839
18      TotalCharges    0.169461
4             tenure    0.169256
14          Contract    0.088663
16     PaymentMethod    0.053381
8     OnlineSecurity    0.042744
11       TechSupport    0.042494
0             gender    0.028325
9       OnlineBackup    0.026516
15  PaperlessBilling    0.025991
6      MultipleLines    0.024893
2            Partner    0.023434
7    InternetService    0.023428
10  DeviceProtection    0.021507
3         Dependents    0.020345
13   StreamingMovies    0.019482
1      SeniorCitizen    0.019298
12       StreamingTV    0.018755
5       PhoneService    0.006188


In [17]:
# Creating the model using a function
def create_model(neurons=10, activation='relu'):
    input_layer = Input(shape=(X_train.shape[1],))
    hidden_layer1 = Dense(neurons, activation=activation)(input_layer)
    hidden_layer2 = Dense(neurons, activation=activation)(hidden_layer1)
    hidden_layer3 = Dense(neurons, activation=activation)(hidden_layer2)
    hidden_layer4 = Dense(neurons, activation=activation)(hidden_layer3)
    output_layer = Dense(1, activation='sigmoid')(hidden_layer4)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Creating a KerasClassifier object
model = KerasClassifier(model=create_model, neurons=64, verbose=0)

# Defining the grid search parameters
param_grid = {
    'neurons': [64, 32, 16, 8],
    'epochs': [10, 15, 20],
    'batch_size': [16, 32, 64],
    'validation_split': [0.1, 0.2, 0.3]
}

# Using GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train, y_train)

# Printing the best parameters and corresponding accuracy
print(f"Best Parameters: {grid_result.best_params_}")
print(f"Best Accuracy: {grid_result.best_score_}")


Best Parameters: {'batch_size': 16, 'epochs': 20, 'neurons': 8, 'validation_split': 0.1}
Best Accuracy: 0.7854100106496272


In [30]:
# Best parameters from the grid search
best_params = {'batch_size': 16, 'epochs': 20, 'neurons': 8, 'validation_split': 0.1}

# Instantiating the model with the best parameters
best_model = create_model(neurons=best_params['neurons'])

# Compiling the model
best_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training the model with the best parameters
best_model.fit(X_train, y_train, epochs=best_params['epochs'], batch_size=best_params['batch_size'], validation_split=best_params['validation_split'])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x78df76a41ab0>

In [31]:
# Making predictions on the test set
y_pred = best_model.predict(X_test)

# Converting predictions to binary (0 or 1)
y_pred_binary = (y_pred > 0.5).astype(int)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Accuracy: {accuracy}")

# Calculating ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC AUC Score: {roc_auc}")


Accuracy: 0.8041163946061036
ROC AUC Score: 0.84650827579782


In [33]:
# Saving the scaler
joblib.dump(scaler, 'scaler.pkl')

# Saving the model
best_model.save('best_model.h5')

In [34]:
from google.colab import files

# Download the saved model
files.download('scaler.pkl')
files.download('best_model.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>