Extracting the relevant features that can define a customer churn.

In [25]:
import numpy as np
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
dataframe=pd.read_csv('/content/drive/My Drive/CustomerChurn_dataset.csv')

In [27]:
dataframe

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


Processing the dataframe

In [29]:
dataframe['TotalCharges'] = pd.to_numeric(dataframe['TotalCharges'],errors='coerce').fillna(0)
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [30]:
# Isolating all numerical & all categorical dataframe
numerical_data = dataframe.select_dtypes(exclude=['object'])
categorical_data= dataframe.select_dtypes(include=['object'])

# factorizing categorical values (turning them into numbers)
categorical_columns = list(categorical_data.columns.values)
for col in categorical_columns:
  categorical_data[col], b = pd.factorize(categorical_data[col])

#  Filling  in the blanks in the numerical columns that is imputing
numerical_data.fillna(numerical_data.mean(), inplace =True)


In [31]:
numerical_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SeniorCitizen   7043 non-null   int64  
 1   tenure          7043 non-null   int64  
 2   MonthlyCharges  7043 non-null   float64
 3   TotalCharges    7043 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 220.2 KB


In [32]:
categorical_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   customerID        7043 non-null   int64
 1   gender            7043 non-null   int64
 2   Partner           7043 non-null   int64
 3   Dependents        7043 non-null   int64
 4   PhoneService      7043 non-null   int64
 5   MultipleLines     7043 non-null   int64
 6   InternetService   7043 non-null   int64
 7   OnlineSecurity    7043 non-null   int64
 8   OnlineBackup      7043 non-null   int64
 9   DeviceProtection  7043 non-null   int64
 10  TechSupport       7043 non-null   int64
 11  StreamingTV       7043 non-null   int64
 12  StreamingMovies   7043 non-null   int64
 13  Contract          7043 non-null   int64
 14  PaperlessBilling  7043 non-null   int64
 15  PaymentMethod     7043 non-null   int64
 16  Churn             7043 non-null   int64
dtypes: int64(17)
memory usage: 935.5 

In [57]:
# Putting in the new dataset together as one
new_categorical_data = categorical_data.reset_index(drop=True)
new_numerical_data = numerical_data.reset_index(drop=True)

new_data = pd.concat([categorical_data, numerical_data], axis=1)


In [34]:
new_data

Unnamed: 0,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,29.85,29.85
1,1,1,1,0,1,1,0,1,1,1,...,0,0,1,1,1,0,0,34,56.95,1889.50
2,2,1,1,0,1,1,0,1,0,0,...,0,0,0,0,1,1,0,2,53.85,108.15
3,3,1,1,0,0,0,0,1,1,1,...,0,0,1,1,2,0,0,45,42.30,1840.75
4,4,0,1,0,1,1,1,0,1,0,...,0,0,0,0,0,1,0,2,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,7038,1,0,1,1,2,0,1,1,1,...,1,1,1,0,1,0,0,24,84.80,1990.50
7039,7039,0,0,1,1,2,1,0,0,1,...,1,1,1,0,3,0,0,72,103.20,7362.90
7040,7040,0,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,11,29.60,346.45
7041,7041,1,0,0,1,2,1,0,1,0,...,0,0,0,0,1,1,1,4,74.40,306.60


In [99]:
# Selecting the target variable
y = new_data['Churn']
len(y)

# Extracting the features of X
X = new_data.drop('Churn', axis=1)
scaler = StandardScaler()
X1 = scaler.fit_transform(X)
X = pd.DataFrame(X1, columns =X.columns )

In [100]:
X.head()

Unnamed: 0,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,-1.731805,-1.009559,-1.03453,-0.654012,-3.05401,-2.061669,-1.183234,-0.903589,-1.180777,-0.998016,-0.909172,-1.071457,-1.07821,-0.828207,-0.829798,-1.145198,-0.439916,-1.277445,-1.160323,-0.992611
1,-1.731313,0.990532,0.966622,-0.654012,0.327438,-0.505697,-1.183234,0.351386,0.173655,0.286059,-0.909172,-1.071457,-1.07821,0.371271,1.205113,-0.274744,-0.439916,0.066327,-0.259629,-0.172165
2,-1.730821,0.990532,0.966622,-0.654012,0.327438,-0.505697,-1.183234,0.351386,-1.180777,-0.998016,-0.909172,-1.071457,-1.07821,-0.828207,-0.829798,-0.274744,-0.439916,-1.236724,-0.36266,-0.958066
3,-1.730329,0.990532,0.966622,-0.654012,-3.05401,-2.061669,-1.183234,0.351386,0.173655,0.286059,0.347362,-1.071457,-1.07821,0.371271,1.205113,0.595711,-0.439916,0.514251,-0.746535,-0.193672
4,-1.729837,-1.009559,0.966622,-0.654012,0.327438,-0.505697,0.17225,-0.903589,0.173655,-0.998016,-0.909172,-1.071457,-1.07821,-0.828207,-0.829798,-1.145198,-0.439916,-1.236724,0.197365,-0.938874


In [59]:
len(y)

7043

Feature Selection

In [101]:
#importing a model to help in finding the best features
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd

In [102]:
# showing or printing the selected features
print(selected_features_val)

Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'OnlineSecurity_No',
       'TechSupport_No', 'Contract_Month-to-month', 'Contract_Two year',
       'PaymentMethod_Electronic check'],
      dtype='object')


In [73]:
#Importing the models needed for training
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score


In [103]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   float64
 1   gender            7043 non-null   float64
 2   Partner           7043 non-null   float64
 3   Dependents        7043 non-null   float64
 4   PhoneService      7043 non-null   float64
 5   MultipleLines     7043 non-null   float64
 6   InternetService   7043 non-null   float64
 7   OnlineSecurity    7043 non-null   float64
 8   OnlineBackup      7043 non-null   float64
 9   DeviceProtection  7043 non-null   float64
 10  TechSupport       7043 non-null   float64
 11  StreamingTV       7043 non-null   float64
 12  StreamingMovies   7043 non-null   float64
 13  Contract          7043 non-null   float64
 14  PaperlessBilling  7043 non-null   float64
 15  PaymentMethod     7043 non-null   float64
 16  SeniorCitizen     7043 non-null   float64


In [104]:
# Spliting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [87]:
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 5634 entries, 2142 to 860
Series name: Churn
Non-Null Count  Dtype 
--------------  ----- 
5634 non-null   object
dtypes: object(1)
memory usage: 88.0+ KB


In [105]:
# Defining the input layer
input_layer = Input(shape=(X_train.shape[1],))

In [106]:
# Defining the architecture of the MLP model
hidden_layer1 = Dense(60, activation='relu')(input_layer)
hidden_layer2 = Dense(40, activation='relu')(hidden_layer1)
output_layer = Dense(1, activation='sigmoid')(hidden_layer2)

In [107]:
# Creating the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [108]:
# Training the model more times
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7c51185ffd90>

In [109]:
# Predict probabilities on the test set
y_pred_probs = model.predict(X_test)

# Convert probabilities to binary predictions (0 or 1)
y_pred = (y_pred_probs > 0.5).astype(int)



In [110]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 81.12%


In [111]:
# Calculate AUC score
auc_score = roc_auc_score(y_test, y_pred_probs)
print(f'AUC Score: {auc_score:.4f}')

AUC Score: 0.8483
