In [70]:
# Create an environment to reading, understanding, and visualization

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
import missingno as msno
from scipy import stats 

import warnings
warnings.filterwarnings('ignore') 

In [71]:
path = r"G:\Github-2025\customer_churn_ML_ANN\data_set\clean_df.csv"
sm_df = pd.read_csv(path)
sm_df.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,No,Yes,No,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,No,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,No,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,No,No,No,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,No,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [72]:
col_convert_yes_no = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling']
for col in col_convert_yes_no:
       sm_df[col].replace({'Yes':1, 'No':0}, inplace=True)

In [73]:
df1 = pd.get_dummies(data=sm_df, columns=['InternetService','Contract','PaymentMethod'])

In [74]:
# Convert boolean values to integers (0 or 1)
df1 = df1.astype(int)
df1

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,0,0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1,0,0,0,1,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
2,0,0,0,1,0,1,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,1,0,1,1,0,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,1,1,1,1,1,0,1,1,1,...,1,0,0,0,1,0,0,0,0,1
7028,0,1,1,1,1,0,1,1,0,1,...,0,1,0,0,1,0,0,1,0,0
7029,0,1,1,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
7030,1,1,0,1,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1


In [75]:
imbalance_df = df1.copy()
imbalance_df.to_csv(r"G:\Github-2025\customer_churn_ML_ANN\data_set\imbalance_df.csv", index=False)

# Imbalance dataset to balance

In [76]:
from sklearn.model_selection import train_test_split

# Now, you can proceed with SMOTE
X = df1.drop('Churn', axis=1)
y = df1['Churn']

from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.25, random_state=42)


In [77]:
y_sm.value_counts()

Churn
0    5163
1    5163
Name: count, dtype: int64

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

numeric_features = ['TotalCharges','MonthlyCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ]
    )

# LogisticRegression

In [79]:
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Fit the pipeline on your data and target variable
pipeline.fit(X, y)

In [80]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predictions on train and test data
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

# Calculate accuracy and confusion matrix
accuracy_train_lgr = accuracy_score(y_train, y_pred_train)
accuracy_test_lgr = accuracy_score(y_test, y_pred_test)
cm = confusion_matrix(y_test, y_pred_test)

# Print results
print("Accuracy on train set (Logistic Regression):", accuracy_train_lgr)
print("Accuracy on test set (Logistic Regression):", accuracy_test_lgr)
print("Confusion Matrix (Logistic Regression):\n", cm)
print("Classification Report (Logistic Regression):\n", classification_report(y_test, y_pred_test))

Accuracy on train set (Logistic Regression): 0.6774276859504132
Accuracy on test set (Logistic Regression): 0.6847405112316034
Confusion Matrix (Logistic Regression):
 [[1175  129]
 [ 685  593]]
Classification Report (Logistic Regression):
               precision    recall  f1-score   support

           0       0.63      0.90      0.74      1304
           1       0.82      0.46      0.59      1278

    accuracy                           0.68      2582
   macro avg       0.73      0.68      0.67      2582
weighted avg       0.73      0.68      0.67      2582



In [81]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

# Print the cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [0.7960199  0.77896233 0.76671408 0.7859175  0.79871977]
Mean Accuracy: 0.7852667166100001


# RandomForestClassifier

In [82]:
from sklearn.ensemble import RandomForestClassifier
# Create a pipeline with ColumnTransformer and a Random Forest classifier
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))
])

# Fit the pipeline on your data and target variable
rf_pipeline.fit(X, y)

In [83]:
y_pred_train_rf = rf_pipeline.predict(X_train)
y_pred_test_rf = rf_pipeline.predict(X_test)

accuracy_train_rf = accuracy_score(y_train,y_pred_train_rf)
accuracy_test_rf = accuracy_score(y_test,y_pred_test_rf)
cm = confusion_matrix(y_test,y_pred_test_rf)

print("Accuracy on train-rf :", accuracy_train_rf)
print("Accuracy on test-rf :", accuracy_test_rf)
print("Confusion Matrix-rf :\n", cm)
print("For RandomForestClassifier:", classification_report(y_test,y_pred_test_rf))

Accuracy on train-rf : 0.8548553719008265
Accuracy on test-rf : 0.8621223857474826
Confusion Matrix-rf :
 [[1284   20]
 [ 336  942]]
For RandomForestClassifier:               precision    recall  f1-score   support

           0       0.79      0.98      0.88      1304
           1       0.98      0.74      0.84      1278

    accuracy                           0.86      2582
   macro avg       0.89      0.86      0.86      2582
weighted avg       0.88      0.86      0.86      2582



In [84]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(rf_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [0.76261549 0.75977257 0.73968706 0.75391181 0.75248933]
Mean Accuracy: 0.7536952506316213


# DecessionTrees

In [85]:
from sklearn.tree import DecisionTreeClassifier

# Create a pipeline with preprocessing and the classifier
dc_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=10, min_samples_leaf=10))
])

# Fit the pipeline on the training data
dc_pipeline.fit(X_train, y_train)


In [86]:
y_pred_train_dc = dc_pipeline.predict(X_train)
y_pred_test_dc = dc_pipeline.predict(X_test)

accuracy_train_dc = accuracy_score(y_train,y_pred_train_dc)
accuracy_test_dc = accuracy_score(y_test,y_pred_test_dc)
cm_dc = confusion_matrix(y_test,y_pred_test_dc)

print("Accuracy on train-dc :", accuracy_train_dc)
print("Accuracy on test-dc :", accuracy_test_dc)
print("Confusion Matrix-dc :\n", cm_dc)
print("For dc:", classification_report(y_test,y_pred_test_dc))

Accuracy on train-dc : 0.8019111570247934
Accuracy on test-dc : 0.7536793183578622
Confusion Matrix-dc :
 [[ 937  367]
 [ 269 1009]]
For SVM:               precision    recall  f1-score   support

           0       0.78      0.72      0.75      1304
           1       0.73      0.79      0.76      1278

    accuracy                           0.75      2582
   macro avg       0.76      0.75      0.75      2582
weighted avg       0.76      0.75      0.75      2582



In [87]:
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(dc_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [0.78464819 0.77683014 0.76386913 0.75604552 0.7859175 ]
Mean Accuracy: 0.7734620941219528


# SVC

In [88]:
from sklearn.svm import SVC

# Create a pipeline with preprocessing and the classifier
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifer', SVC(kernel='linear',C=1.0, random_state =42))
])

# Fit the pipeline on the training data
svm_pipeline.fit(X_train, y_train)

In [89]:
y_pred_train_svm = svm_pipeline.predict(X_train)
y_pred_test_svm = svm_pipeline.predict(X_test)

accuracy_train_svm = accuracy_score(y_train,y_pred_train_svm)
accuracy_test_svm = accuracy_score(y_test,y_pred_test_svm)
cm_svm = confusion_matrix(y_test,y_pred_test_svm)

print("Accuracy on train-svm :", accuracy_train_svm)
print("Accuracy on test-svm :", accuracy_test_svm)
print("Confusion Matrix-svm :\n", cm_svm)
print("For SVM:", classification_report(y_test,y_pred_test_svm))

Accuracy on train-svm : 0.7181043388429752
Accuracy on test-svm : 0.7184353214562355
Confusion Matrix-svm :
 [[949 355]
 [372 906]]
For SVM:               precision    recall  f1-score   support

           0       0.72      0.73      0.72      1304
           1       0.72      0.71      0.71      1278

    accuracy                           0.72      2582
   macro avg       0.72      0.72      0.72      2582
weighted avg       0.72      0.72      0.72      2582



In [90]:
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(svm_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [0.79673063 0.77754087 0.76742532 0.77667141 0.7972973 ]
Mean Accuracy: 0.7831331050498372


# XGBClassifier

In [91]:
from xgboost import XGBClassifier
# Create XGBoost pipeline
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=500, learning_rate=0.01))
])

# Fit the pipeline
xgb_pipeline.fit(X_train, y_train)

In [92]:
y_pred_train_xgb = xgb_pipeline.predict(X_train)
y_pred_test_xgb = xgb_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming y_pred_train and y_pred_test are your predicted values
accuracy_train_xgb = accuracy_score(y_train, y_pred_train_xgb)
accuracy_test_xgb = accuracy_score(y_test, y_pred_test_xgb)

print(f"Accuracy_train_XGB: {accuracy_train_xgb}\nAccuracy_test_Xgb: {accuracy_test_xgb}")

cm = confusion_matrix(y_test, y_pred_test_xgb)
print(f"Confusion_matrix:\n {cm}")

# Rename the variable to avoid conflicts with the function name
classification_report_xgb = classification_report(y_test, y_pred_test_xgb)

print(f"Classification_report:\n{classification_report_xgb}")

Accuracy_train_XGB: 0.7839617768595041
Accuracy_test_Xgb: 0.7556158017041054
Confusion_matrix:
 [[ 901  403]
 [ 228 1050]]
Classification_report:
              precision    recall  f1-score   support

           0       0.80      0.69      0.74      1304
           1       0.72      0.82      0.77      1278

    accuracy                           0.76      2582
   macro avg       0.76      0.76      0.75      2582
weighted avg       0.76      0.76      0.75      2582



In [93]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(xgb_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))

Cross-validated Accuracy: 0.79 (+/- 0.00)


# GradientBoostingClassifier

In [94]:
from sklearn.ensemble import GradientBoostingClassifier

gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=90, learning_rate=0.01))
])

gb_pipeline.fit(X_train, y_train)

In [95]:
y_pred_train_gb = gb_pipeline.predict(X_train)
y_pred_test_gb = gb_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming y_pred_train and y_pred_test are your predicted values
accuracy_train_gb = accuracy_score(y_train, y_pred_train_gb)
accuracy_test_gb = accuracy_score(y_test, y_pred_test_gb)

print(f"Accuracy_train_GB: {accuracy_train_gb}\nAccuracy_test_gb: {accuracy_test_gb}")

cm = confusion_matrix(y_test, y_pred_test_gb)
print(f"Confusion_matrix:\n {cm}")

# Rename the variable to avoid conflicts with the function name
classification_report_gb = classification_report(y_test, y_pred_test_gb)

print(f"Classification_report:\n{classification_report_gb}")

Accuracy_train_GB: 0.7521952479338843
Accuracy_test_gb: 0.7401239349341595
Confusion_matrix:
 [[954 350]
 [321 957]]
Classification_report:
              precision    recall  f1-score   support

           0       0.75      0.73      0.74      1304
           1       0.73      0.75      0.74      1278

    accuracy                           0.74      2582
   macro avg       0.74      0.74      0.74      2582
weighted avg       0.74      0.74      0.74      2582



In [96]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(gb_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))

Cross-validated Accuracy: 0.77 (+/- 0.00)
