In [21]:
# Create an environment to reading, understanding, and visualization

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
import missingno as msno
from scipy import stats 

import warnings
warnings.filterwarnings('ignore') 

In [22]:
path = r"G:\Github-2025\customer_churn_ML_ANN\data_set\imbalance_df.csv"
un_df = pd.read_csv(path)
un_df.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,0,0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1,0,0,0,1,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
2,0,0,0,1,0,1,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,1,0,1,1,0,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


# Imbalance dataset to balance

In [23]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

# Step 1: Split the Dataset
X = un_df.drop('Churn', axis=1)
y = un_df['Churn']

# Step 2: Undersampling with RandomUnderSampler
undersampler = RandomUnderSampler(sampling_strategy='majority')
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Step 3: Split the Data into Train and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)

In [24]:
y_resampled.value_counts()

Churn
0    1869
1    1869
Name: count, dtype: int64

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

numeric_features = ['TotalCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ]
    )

In [26]:
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Fit the pipeline on your data and target variable
pipeline.fit(X, y)

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predictions on train and test data
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

# Calculate accuracy and confusion matrix
accuracy_train_lgr = accuracy_score(y_train, y_pred_train)
accuracy_test_lgr = accuracy_score(y_test, y_pred_test)
cm = confusion_matrix(y_test, y_pred_test)

# Print results
print("Accuracy on train set (Logistic Regression):", accuracy_train_lgr)
print("Accuracy on test set (Logistic Regression):", accuracy_test_lgr)
print("Confusion Matrix (Logistic Regression):\n", cm)
print("Classification Report (Logistic Regression):\n", classification_report(y_test, y_pred_test))

Accuracy on train set (Logistic Regression): 0.4948269711023903
Accuracy on test set (Logistic Regression): 0.5155080213903743
Confusion Matrix (Logistic Regression):
 [[482   0]
 [453   0]]
Classification Report (Logistic Regression):
               precision    recall  f1-score   support

           0       0.52      1.00      0.68       482
           1       0.00      0.00      0.00       453

    accuracy                           0.52       935
   macro avg       0.26      0.50      0.34       935
weighted avg       0.27      0.52      0.35       935



In [28]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

# Print the cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [0.73418621 0.73418621 0.73470839 0.73399716 0.73399716]
Mean Accuracy: 0.7342150252598014


In [29]:
from sklearn.ensemble import RandomForestClassifier
# Create a pipeline with ColumnTransformer and a Random Forest classifier
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))
])

# Fit the pipeline on your data and target variable
rf_pipeline.fit(X, y)

In [30]:
y_pred_train_rf = rf_pipeline.predict(X_train)
y_pred_test_rf = rf_pipeline.predict(X_test)

accuracy_train_rf = accuracy_score(y_train,y_pred_train_rf)
accuracy_test_rf = accuracy_score(y_test,y_pred_test_rf)
cm = confusion_matrix(y_test,y_pred_test_rf)

print("Accuracy on train-rf :", accuracy_train_rf)
print("Accuracy on test-rf :", accuracy_test_rf)
print("Confusion Matrix-rf :\n", cm)
print("For RandomForestClassifier:", classification_report(y_test,y_pred_test_rf))

Accuracy on train-rf : 0.7791651801641098
Accuracy on test-rf : 0.7732620320855615
Confusion Matrix-rf :
 [[455  27]
 [185 268]]
For RandomForestClassifier:               precision    recall  f1-score   support

           0       0.71      0.94      0.81       482
           1       0.91      0.59      0.72       453

    accuracy                           0.77       935
   macro avg       0.81      0.77      0.76       935
weighted avg       0.81      0.77      0.77       935



In [31]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(rf_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [0.65813788 0.65742715 0.67140825 0.6571835  0.66500711]
Mean Accuracy: 0.6618327788005713


In [32]:
from sklearn.svm import SVC

# Create a pipeline with preprocessing and the classifier
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifer', SVC(kernel='linear',C=1.0, random_state =42))
])

# Fit the pipeline on the training data
svm_pipeline.fit(X_train, y_train)

In [33]:
y_pred_train_svm = svm_pipeline.predict(X_train)
y_pred_test_svm = svm_pipeline.predict(X_test)

accuracy_train_svm = accuracy_score(y_train,y_pred_train_svm)
accuracy_test_svm = accuracy_score(y_test,y_pred_test_svm)
cm_svm = confusion_matrix(y_test,y_pred_test_svm)

print("Accuracy on train-svm :", accuracy_train_svm)
print("Accuracy on test-svm :", accuracy_test_svm)
print("Confusion Matrix-svm :\n", cm_svm)
print("For SVM:", classification_report(y_test,y_pred_test_svm))

Accuracy on train-svm : 0.5918658580092758
Accuracy on test-svm : 0.5957219251336898
Confusion Matrix-svm :
 [[190 292]
 [ 86 367]]
For SVM:               precision    recall  f1-score   support

           0       0.69      0.39      0.50       482
           1       0.56      0.81      0.66       453

    accuracy                           0.60       935
   macro avg       0.62      0.60      0.58       935
weighted avg       0.62      0.60      0.58       935



In [34]:
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(svm_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [0.73418621 0.73418621 0.73470839 0.73399716 0.73399716]
Mean Accuracy: 0.7342150252598014


In [35]:
from xgboost import XGBClassifier
# Create XGBoost pipeline
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=500, learning_rate=0.01))
])

# Fit the pipeline
xgb_pipeline.fit(X_train, y_train)

In [36]:
y_pred_train_xgb = xgb_pipeline.predict(X_train)
y_pred_test_xgb = xgb_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming y_pred_train and y_pred_test are your predicted values
accuracy_train_xgb = accuracy_score(y_train, y_pred_train_xgb)
accuracy_test_xgb = accuracy_score(y_test, y_pred_test_xgb)

print(f"Accuracy_train_XGB: {accuracy_train_xgb}\nAccuracy_test_Xgb: {accuracy_test_xgb}")

cm = confusion_matrix(y_test, y_pred_test_xgb)
print(f"Confusion_matrix:\n {cm}")

# Rename the variable to avoid conflicts with the function name
classification_report_xgb = classification_report(y_test, y_pred_test_xgb)

print(f"Classification_report:\n{classification_report_xgb}")

Accuracy_train_XGB: 0.6660720656439529
Accuracy_test_Xgb: 0.6064171122994653
Confusion_matrix:
 [[313 169]
 [199 254]]
Classification_report:
              precision    recall  f1-score   support

           0       0.61      0.65      0.63       482
           1       0.60      0.56      0.58       453

    accuracy                           0.61       935
   macro avg       0.61      0.61      0.60       935
weighted avg       0.61      0.61      0.61       935



In [37]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(xgb_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))

Cross-validated Accuracy: 0.75 (+/- 0.00)


In [38]:
from sklearn.ensemble import GradientBoostingClassifier

gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=90, learning_rate=0.01))
])

gb_pipeline.fit(X_train, y_train)

In [39]:
y_pred_train_gb = gb_pipeline.predict(X_train)
y_pred_test_gb = gb_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming y_pred_train and y_pred_test are your predicted values
accuracy_train_gb = accuracy_score(y_train, y_pred_train_gb)
accuracy_test_gb = accuracy_score(y_test, y_pred_test_gb)

print(f"Accuracy_train_GB: {accuracy_train_gb}\nAccuracy_test_gb: {accuracy_test_gb}")

cm = confusion_matrix(y_test, y_pred_test_gb)
print(f"Confusion_matrix:\n {cm}")

# Rename the variable to avoid conflicts with the function name
classification_report_gb = classification_report(y_test, y_pred_test_gb)

print(f"Classification_report:\n{classification_report_gb}")

Accuracy_train_GB: 0.62575811630396
Accuracy_test_gb: 0.6192513368983957
Confusion_matrix:
 [[297 185]
 [171 282]]
Classification_report:
              precision    recall  f1-score   support

           0       0.63      0.62      0.63       482
           1       0.60      0.62      0.61       453

    accuracy                           0.62       935
   macro avg       0.62      0.62      0.62       935
weighted avg       0.62      0.62      0.62       935



In [40]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(gb_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))

Cross-validated Accuracy: 0.75 (+/- 0.00)
