In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("cust_churn_preprocessed_data.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,0,1,29.85,29.85,0,1,0,0,1,...,0,0,0,0,0,1,0,1,0,0
1,1,0,34,56.95,1889.5,1,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
2,2,0,2,53.85,108.15,1,0,0,1,0,...,0,0,0,0,0,1,0,0,1,1
3,3,0,45,42.3,1840.75,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,4,0,2,70.7,151.65,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,1


In [4]:
df = df.drop(columns=["Unnamed: 0"])

In [5]:
# splitting the features and target
X = df.drop(columns=["Churn_Yes"])
y = df["Churn_Yes"]

In [6]:
# split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(y_train_smote.shape)
y_train_smote.value_counts()

(7176,)


Churn_Yes
0    3588
1    3588
Name: count, dtype: int64

In [8]:
df.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'Churn_Yes'],
      dtype='object')

In [9]:
# dictionary of models
models = {
    "Logistic Regression": LogisticRegression(random_state=0, max_iter=500),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

In [10]:
# Feature selection using SelectKBest
k = 5 # Number of top features to select
selector = SelectKBest(score_func=chi2, k=k)
kbest = selector.fit_transform(X_train_smote, y_train_smote)
selected_columns = X.columns[selector.get_support(indices=True)]

# Print selected columns
print("Selected Columns for Feature Selection:", selected_columns)

Selected Columns for Feature Selection: Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'TechSupport_Yes',
       'Contract_Two year'],
      dtype='object')


In [11]:

# Train and evaluate each classifier
results = {}
selected_kbest_features = {}

for name, clf in models.items():
    # Train the model
    clf.fit(X_train_smote, y_train_smote)
    
    # Predict on test set
    y_pred = clf.predict(X_test)

    # Evaluate performance
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=1)
    
    # Store results
    results[name] = {
        "Confusion Matrix": cm,
        "Accuracy": accuracy,
        "Classification Report": report,
    }
    # Store the selected columns for this model
    selected_kbest_features[name] = selected_columns.tolist()

# Print results for each classifier
for name, result in results.items():
    print(f"\nClassifier: {name}")
    print("Confusion Matrix:\n", result["Confusion Matrix"])
    print("Accuracy:", result["Accuracy"])
    print("Classification Report:\n", result["Classification Report"])
    print("Selected Columns for this Model:", selected_kbest_features[name])


Classifier: Logistic Regression
Confusion Matrix:
 [[750 152]
 [ 92 183]]
Accuracy: 0.7926932880203909
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.83      0.86       902
           1       0.55      0.67      0.60       275

    accuracy                           0.79      1177
   macro avg       0.72      0.75      0.73      1177
weighted avg       0.81      0.79      0.80      1177

Selected Columns for this Model: ['tenure', 'MonthlyCharges', 'TotalCharges', 'TechSupport_Yes', 'Contract_Two year']

Classifier: K-Nearest Neighbors
Confusion Matrix:
 [[647 255]
 [ 97 178]]
Accuracy: 0.7009345794392523
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.72      0.79       902
           1       0.41      0.65      0.50       275

    accuracy                           0.70      1177
   macro avg       0.64      0.68      0.64      1177
weighted avg       0.76    

In [12]:
# Find the classifier with the highest accuracy
best_classifier_name = max(results, key=lambda x: results[x]["Accuracy"])
best_classifier_accuracy = results[best_classifier_name]["Accuracy"]

# Print the best classifier and its accuracy
print(f"\nBest Classifier: {best_classifier_name}")
print(f"Accuracy: {best_classifier_accuracy:.4f}")


Best Classifier: Logistic Regression
Accuracy: 0.7927


In [13]:
from sklearn.model_selection import GridSearchCV
param_grid = {'criterion':['gini', 'entropy', 'log_loss'],
              'max_features': [None,'sqrt','log2'],
              'n_estimators':[10,100]} 

grid=GridSearchCV(RandomForestClassifier(),param_grid,refit = True, verbose = 3,n_jobs=-1)
grid.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [14]:
# evaluate on test data
y_test_pred = grid.predict(X_test)

print("Accuracy Score:\n", accuracy_score(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))

Accuracy Score:
 0.7960917587085812
Confusion Matrix:
 [[780 122]
 [118 157]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.86      0.87       902
           1       0.56      0.57      0.57       275

    accuracy                           0.80      1177
   macro avg       0.72      0.72      0.72      1177
weighted avg       0.80      0.80      0.80      1177



In [15]:
# save the trained model as a pickle file
model_data = {"model": grid, "features_names": selected_columns}


with open("customer_churn_model.pkl", "wb") as f:
  pickle.dump(model_data, f)

In [16]:
model_data

{'model': GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
              param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                          'max_features': [None, 'sqrt', 'log2'],
                          'n_estimators': [10, 100]},
              verbose=3),
 'features_names': Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'TechSupport_Yes',
        'Contract_Two year'],
       dtype='object')}

In [17]:
# load teh saved model and the feature names

with open("customer_churn_model.pkl", "rb") as f:
  model_data = pickle.load(f)

loaded_model = model_data["model"]
feature_names = model_data["features_names"]

In [18]:
# Create the encoders dictionary
encoders = {}

# Loop through each column and fit a LabelEncoder for each
for column in selected_columns:
    encoders[column] = column  # Save the encoder to the dictionary

# Save the encoders dictionary using pickle
with open('encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)

In [19]:
encoders

{'tenure': 'tenure',
 'MonthlyCharges': 'MonthlyCharges',
 'TotalCharges': 'TotalCharges',
 'TechSupport_Yes': 'TechSupport_Yes',
 'Contract_Two year': 'Contract_Two year'}

In [20]:
input_data = {
    'SeniorCitizen':0, 
    'tenure':1, 
    'MonthlyCharges':28.5,
    'TotalCharges':28.5, 
    'gender_Male':0, 
    'Partner_Yes':1, 
    'Dependents_Yes':1,
    'PhoneService_Yes':1, 
    'MultipleLines_No phone service':0,
    'MultipleLines_Yes':0, 
    'InternetService_Fiber optic':1,
    'InternetService_No':1,
    'OnlineSecurity_No internet service':0,
    'OnlineSecurity_Yes':0, 
    'OnlineBackup_No internet service':0,
    'OnlineBackup_Yes':1, 
    'DeviceProtection_No internet service':1,
    'DeviceProtection_Yes':1, 
    'TechSupport_No internet service':1,
    'TechSupport_Yes':0, 
    'StreamingTV_No internet service':0, 
    'StreamingTV_Yes':1,
    'StreamingMovies_No internet service':1, 
    'StreamingMovies_Yes':0,
    'Contract_One year':0, 
    'Contract_Two year':0, 
    'PaperlessBilling_Yes':0,
    'PaymentMethod_Credit card (automatic)':1,
    'PaymentMethod_Electronic check':1,
    'PaymentMethod_Mailed check':1
}

In [21]:
input_data_df = pd.DataFrame([input_data])

# make a prediction
prediction = loaded_model.predict(input_data_df)
pred_prob = loaded_model.predict_proba(input_data_df)

print(prediction)

# results
print(f"Prediction: {'Churn' if prediction[0] == 1 else 'No Churn'}")
print(f"Prediciton Probability: {pred_prob}")

[0]
Prediction: No Churn
Prediciton Probability: [[0.5 0.5]]
