In [20]:
# Create an environment to reading, understanding, and visualization

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
import missingno as msno
from scipy import stats 

import warnings
warnings.filterwarnings('ignore') 

In [21]:
path = r"G:\Github-2025\customer_churn_ML_ANN\data_set\clean_df.csv"
direct_df = pd.read_csv(path)
direct_df.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn
0,No,Yes,No,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,0
1,No,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,1889.5,0
2,No,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,108.15,1
3,No,No,No,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),1840.75,0
4,No,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,151.65,1


In [22]:
cat_features = direct_df.select_dtypes(include='object').columns
num_features = direct_df.select_dtypes(include='number').columns

print(f'Categorical_feature={cat_features} \n Numerical_features={num_features}')

Categorical_feature=Index(['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod'],
      dtype='object') 
 Numerical_features=Index(['TotalCharges', 'Churn'], dtype='object')


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline

# Assuming df, numeric_features, categorical_features, and ordinal_feature are defined

X = direct_df.drop(['Churn'], axis=1)
y = direct_df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

numeric_features = ['TotalCharges']
categorical_features = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
        ]
    )

In [24]:
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Fit the pipeline on your data and target variable
pipeline.fit(X, y)

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predictions on train and test data
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

# Calculate accuracy and confusion matrix
accuracy_train_lgr = accuracy_score(y_train, y_pred_train)
accuracy_test_lgr = accuracy_score(y_test, y_pred_test)
cm = confusion_matrix(y_test, y_pred_test)

# Print results
print("Accuracy on train set (Logistic Regression):", accuracy_train_lgr)
print("Accuracy on test set (Logistic Regression):", accuracy_test_lgr)
print("Confusion Matrix (Logistic Regression):\n", cm)
print("Classification Report (Logistic Regression):\n", classification_report(y_test, y_pred_test))

Accuracy on train set (Logistic Regression): 0.798445202882063
Accuracy on test set (Logistic Regression): 0.8065984072810012
Confusion Matrix (Logistic Regression):
 [[1186  125]
 [ 215  232]]
Classification Report (Logistic Regression):
               precision    recall  f1-score   support

           0       0.85      0.90      0.87      1311
           1       0.65      0.52      0.58       447

    accuracy                           0.81      1758
   macro avg       0.75      0.71      0.73      1758
weighted avg       0.80      0.81      0.80      1758



In [26]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

# Print the cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [0.7960199  0.80170576 0.78947368 0.79587482 0.814367  ]
Mean Accuracy: 0.7994882324811625


In [27]:
from sklearn.ensemble import RandomForestClassifier
# Create a pipeline with ColumnTransformer and a Random Forest classifier
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))
])

# Fit the pipeline on your data and target variable
rf_pipeline.fit(X, y)

In [28]:
y_pred_train_rf = rf_pipeline.predict(X_train)
y_pred_test_rf = rf_pipeline.predict(X_test)

accuracy_train_rf = accuracy_score(y_train,y_pred_train_rf)
accuracy_test_rf = accuracy_score(y_test,y_pred_test_rf)
cm = confusion_matrix(y_test,y_pred_test_rf)

print("Accuracy on train-rf :", accuracy_train_rf)
print("Accuracy on test-rf :", accuracy_test_rf)
print("Confusion Matrix-rf :\n", cm)
print("For RandomForestClassifier:", classification_report(y_test,y_pred_test_rf))

Accuracy on train-rf : 0.9965870307167235
Accuracy on test-rf : 0.9982935153583617
Confusion Matrix-rf :
 [[1309    2]
 [   1  446]]
For RandomForestClassifier:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1311
           1       1.00      1.00      1.00       447

    accuracy                           1.00      1758
   macro avg       1.00      1.00      1.00      1758
weighted avg       1.00      1.00      1.00      1758



In [29]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(rf_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [0.77540867 0.78322672 0.76031294 0.76458037 0.78520626]
Mean Accuracy: 0.7737469935427517


In [30]:
from sklearn.svm import SVC

# Create a pipeline with preprocessing and the classifier
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifer', SVC(kernel='linear',C=1.0, random_state =42))
])

# Fit the pipeline on the training data
svm_pipeline.fit(X_train, y_train)

In [31]:
y_pred_train_svm = svm_pipeline.predict(X_train)
y_pred_test_svm = svm_pipeline.predict(X_test)

accuracy_train_svm = accuracy_score(y_train,y_pred_train_svm)
accuracy_test_svm = accuracy_score(y_test,y_pred_test_svm)
cm_svm = confusion_matrix(y_test,y_pred_test_svm)

print("Accuracy on train-svm :", accuracy_train_svm)
print("Accuracy on test-svm :", accuracy_test_svm)
print("Confusion Matrix-svm :\n", cm_svm)
print("For SVM:", classification_report(y_test,y_pred_test_svm))

Accuracy on train-svm : 0.7938945771710277
Accuracy on test-svm : 0.7969283276450512
Confusion Matrix-svm :
 [[1174  137]
 [ 220  227]]
For SVM:               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1311
           1       0.62      0.51      0.56       447

    accuracy                           0.80      1758
   macro avg       0.73      0.70      0.71      1758
weighted avg       0.79      0.80      0.79      1758



In [32]:
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(svm_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the cross-validation results
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

Cross-Validation Scores: [0.80454869 0.79246624 0.77809388 0.7916074  0.80725462]
Mean Accuracy: 0.7947941657289654


In [33]:
from xgboost import XGBClassifier
# Create XGBoost pipeline
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=500, learning_rate=0.01))
])

# Fit the pipeline
xgb_pipeline.fit(X_train, y_train)

In [34]:
y_pred_train_xgb = xgb_pipeline.predict(X_train)
y_pred_test_xgb = xgb_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming y_pred_train and y_pred_test are your predicted values
accuracy_train_xgb = accuracy_score(y_train, y_pred_train_xgb)
accuracy_test_xgb = accuracy_score(y_test, y_pred_test_xgb)

print(f"Accuracy_train_XGB: {accuracy_train_xgb}\nAccuracy_test_Xgb: {accuracy_test_xgb}")

cm = confusion_matrix(y_test, y_pred_test_xgb)
print(f"Confusion_matrix:\n {cm}")

# Rename the variable to avoid conflicts with the function name
classification_report_xgb = classification_report(y_test, y_pred_test_xgb)

print(f"Classification_report:\n{classification_report_xgb}")

Accuracy_train_XGB: 0.8371255214258627
Accuracy_test_Xgb: 0.8043230944254836
Confusion_matrix:
 [[1189  122]
 [ 222  225]]
Classification_report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1311
           1       0.65      0.50      0.57       447

    accuracy                           0.80      1758
   macro avg       0.75      0.71      0.72      1758
weighted avg       0.79      0.80      0.80      1758



In [35]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(xgb_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))

Cross-validated Accuracy: 0.80 (+/- 0.01)


In [36]:
from sklearn.ensemble import GradientBoostingClassifier

gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=90, learning_rate=0.01))
])

gb_pipeline.fit(X_train, y_train)

In [37]:
y_pred_train_gb = gb_pipeline.predict(X_train)
y_pred_test_gb = gb_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming y_pred_train and y_pred_test are your predicted values
accuracy_train_gb = accuracy_score(y_train, y_pred_train_gb)
accuracy_test_gb = accuracy_score(y_test, y_pred_test_gb)

print(f"Accuracy_train_GB: {accuracy_train_gb}\nAccuracy_test_gb: {accuracy_test_gb}")

cm = confusion_matrix(y_test, y_pred_test_gb)
print(f"Confusion_matrix:\n {cm}")

# Rename the variable to avoid conflicts with the function name
classification_report_gb = classification_report(y_test, y_pred_test_gb)

print(f"Classification_report:\n{classification_report_gb}")

Accuracy_train_GB: 0.7497155858930603
Accuracy_test_gb: 0.7639362912400455
Confusion_matrix:
 [[1306    5]
 [ 410   37]]
Classification_report:
              precision    recall  f1-score   support

           0       0.76      1.00      0.86      1311
           1       0.88      0.08      0.15       447

    accuracy                           0.76      1758
   macro avg       0.82      0.54      0.51      1758
weighted avg       0.79      0.76      0.68      1758



In [38]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(gb_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))

Cross-validated Accuracy: 0.76 (+/- 0.01)
