In [2]:
# Import Neccessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import time

In [3]:
# To display all the columns
pd.set_option("display.max_columns", None)

In [4]:
# Read the data
df = pd.read_csv(r"C:\Users\manju\DS_&_DA\Telecom_churn_prediction - Copy\Telco_Customer_Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### DATA EXPLORATION & CLEANING

In [56]:
# get Column information

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [57]:
# Get columns statistics summary

df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [58]:
# Data rows and columns count

df.shape

(7043, 21)

In [59]:
from evidently import Report
from evidently.presets import DataDriftPreset

report = Report(metrics=[DataDriftPreset()])
html_report = report.run(reference_data=df[:5000], current_data=df[5001:])
html_report.save_json(r"C:\Users\manju\DS_&_DA\Telecom_churn_prediction - Copy\let's_see.json")

#### HANDLING MISSING VALUES

In [5]:
# Removing all the rows where atleast one of the column values is blank

df1 = df[~df.apply(lambda row: (row == " ").any(), axis=1)]

In [61]:
# Check data rows and columns count after removal of blank values rows

df1.shape

(7032, 21)

In [6]:
# Dropping irrelavant feature

df1.drop("customerID", axis=1, inplace=True)

#### FIXING DATA TYPE

In [7]:
# Data Type Correction for TotalCharges

df1["TotalCharges"] = df1["TotalCharges"].astype("float64")
# df1["TotalCharges"] = pd.to_numeric(df1["TotalCharges"], errors="coerce")    # handles any other values other than numeric values too 

In [8]:
df1.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7032.0,7032.0,7032.0,7032.0
mean,0.1624,32.421786,64.798208,2283.300441
std,0.368844,24.54526,30.085974,2266.771362
min,0.0,1.0,18.25,18.8
25%,0.0,9.0,35.5875,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.8625,3794.7375
max,1.0,72.0,118.75,8684.8


#### HANDLING IMBALANCED DATA

In [64]:
x = df1.drop(columns=["Churn"])
y = df1["Churn"]

In [65]:
# Using SMOTE to handle the under sampled column by over sampling

from imblearn.over_sampling import SMOTENC

cat_cols = x.select_dtypes(include=["object"]).columns
categorical_features = [x.columns.get_loc(col) for col in cat_cols]  # Identify the index positions of categorical columns

smote_nc = SMOTENC(categorical_features=categorical_features, random_state=42)
x_resampled, y_resampled = smote_nc.fit_resample(x, y)

#### SCALING NUMERICAL FEATURES

In [66]:
# Scale numerical columns

from sklearn.preprocessing import StandardScaler

num_cols = x_resampled.select_dtypes(exclude=["object"]).columns 
ss = StandardScaler()
x_resampled[num_cols] = ss.fit_transform(x_resampled[num_cols])

#### ENCODING CATEGORICAL FEATURES

In [67]:
# label Encode independent catgorical columns through onehot encodinng
from sklearn.preprocessing import OneHotEncoder

cat_cols = x_resampled.select_dtypes(include=["object"]).columns
ohe = OneHotEncoder(drop="first", sparse_output=False)
arr = ohe.fit_transform(x_resampled[cat_cols])
encoded_df = pd.DataFrame(arr, columns=ohe.get_feature_names_out(cat_cols), index=x_resampled.index)
x_resampled = x_resampled.drop(columns=cat_cols)
x_resampled = pd.concat([x_resampled, encoded_df], axis=1)
x_resampled

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-0.402233,-1.117435,-1.339351,-0.927945,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-0.402233,0.262654,-0.393639,-0.074956,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-0.402233,-1.075615,-0.501820,-0.892030,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-0.402233,0.722684,-0.904882,-0.097317,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.402233,-1.075615,0.086197,-0.872078,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10321,2.486123,1.140893,1.143819,1.585514,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
10322,-0.402233,-0.699226,1.006801,-0.424360,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
10323,-0.402233,-0.950152,-1.697725,-0.884579,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10324,-0.402233,-1.075615,0.401576,-0.872140,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [68]:
# Label Encode Target Feature
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)

#### SPLITTING TRAINING & TESTING DATA

In [69]:
# Import libraries required for the model training

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score

In [70]:
# Split data into train and test data

x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)

#### MODEL BUILDING

In [None]:
# Dictionary data with models and it's evaluation parameters

models_and_params = {
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "solver": ["liblinear", "lbfgs"],
            "penalty": ["l2"]
        }
    },
    "Decision Tree Classifier": {
        "model": DecisionTreeClassifier(),
        "params": {
            "max_depth": [3, 5, 10, None],
            "min_samples_split": [2, 5, 10],
            "criterion": ["gini", "entropy"]
        }
    },
    "RandomForestClassifier": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [10, 20, 30],
            "min_samples_split": [2, 5, 7],
            "max_features": ["sqrt", "log2"]
        }
    },
    "GradientBoostingClassifier": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.01, 0.1],
            "max_depth": [3, 5]
        }
    },
    "AdaBoostClassifier": {
        "model": AdaBoostClassifier(),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 1.0]
        }
    },
    "SVC": {
        "model": SVC(),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"],
            "gamma": ["scale", "auto"]
        }
    },
    "KNeighborsClassifier": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [3, 5, 7],
            "weights": ["uniform", "distance"],
            "metric": ["minkowski", "euclidean"]
        }
    },
    "GaussianNB": {
        "model": GaussianNB(),
        "params": {}
    },
    "XGBClassifier": {
        "model": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        "params": {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.01, 0.1],
            "max_depth": [3, 6, 9],
            "subsample": [0.8, 1.0, 2.0]
        }
    },
    "LGBMClassifier": {
        "model": LGBMClassifier(),
        "params": {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.01, 0.1],
            "num_leaves": [31, 50, 70],
            "boosting_type": ["gbdt", "dart"]
        }
    },
    "CatBoostClassifier": {
        "model": CatBoostClassifier(verbose=0),
        "params": {
            "iterations": [100, 300, 500],
            "learning_rate": [0.01, 0.1],
            "depth": [4, 6, 10]
        }
    }
}

In [None]:
# Train all the models and get the top 4 best accurate and fast models

model_accuracies = {}

for model_name, mp in models_and_params.items():
    model = mp["model"]                                                # Best model after verifying
    start = time.time()
    model.fit(x_train, y_train)
    end = time.time()
    training_time = end - start

    scores = cross_val_score(model, x_train, y_train, cv=5)                  
    print(f" {model_name} Mean Accuracy = {scores.mean():.4f}")         # Checking mean accuracy 
    print("Time taken for training the data is: ", training_time)
    model_accuracies[model_name] = float(scores.mean())

top_models = sorted(model_accuracies.items(), key=lambda x: x[1], reverse=True)[:4]
top_models = dict(top_models)
top_models

 Logistic Regression Mean Accuracy = 0.8073
Time taken for training the data is:  0.10788559913635254
 Decision Tree Classifier Mean Accuracy = 0.7857
Time taken for training the data is:  0.054117441177368164
 RandomForestClassifier Mean Accuracy = 0.8425
Time taken for training the data is:  1.028700590133667
 GradientBoostingClassifier Mean Accuracy = 0.8236
Time taken for training the data is:  1.2687022686004639
 AdaBoostClassifier Mean Accuracy = 0.8087
Time taken for training the data is:  0.3853113651275635
 SVC Mean Accuracy = 0.8161
Time taken for training the data is:  1.9775948524475098
 KNeighborsClassifier Mean Accuracy = 0.8071
Time taken for training the data is:  0.007148027420043945
 GaussianNB Mean Accuracy = 0.7758
Time taken for training the data is:  0.0
 XGBClassifier Mean Accuracy = 0.8389
Time taken for training the data is:  0.11150002479553223
[LightGBM] [Info] Number of positive: 4134, number of negative: 4126
[LightGBM] [Info] Auto-choosing row-wise multi-t

{'RandomForestClassifier': 0.8424939467312349,
 'XGBClassifier': 0.8388619854721548,
 'CatBoostClassifier': 0.836682808716707,
 'LGBMClassifier': 0.8346246973365616}

#### MODEL EVOLUTION

In [81]:
# Import required model evaluation libraries  

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier

In [82]:
# Tuning best models with their hyperparameters

best_tuned_models = []

# Loop to tune each model
for model_name, mp in models_and_params.items():
    if model_name in top_models.keys():
        print(f"Tuning {model_name}...")
        
        model = mp["model"]
        params = mp["params"]
        
        if params:
            search = GridSearchCV(estimator=model,
                                  param_grid=params,
                                  cv=5,
                                  scoring="accuracy",
                                  n_jobs=-1,
                                  verbose=0)
            
            search.fit(x_train, y_train)
            best_model = search.best_estimator_
            print(f"Best {model_name} Params:", search.best_params_)
            print("Best Cross-Val Score:", search.best_score_)
            
        else:
            best_model = model.fit(x_train, y_train)
            print("No hyperparameters to tune.")
        
        best_tuned_models.append((model_name, best_model))

# Using Stacking Classifier
meta_learner = LogisticRegression()

stacking_clf = StackingClassifier(
    estimators=best_tuned_models,
    final_estimator=meta_learner,
    passthrough=False,
    cv=5
)

stacking_clf.fit(x_train, y_train)
print("Stacking Classifier Test Accuracy:", stacking_clf.score(x_test, y_test))

# Evaluation
y_pred = stacking_clf.predict(x_test)
roc_auc = roc_auc_score(y_test, y_pred)
print("\nStacking Classifier Evaluation")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC: ", roc_auc)


Tuning RandomForestClassifier...
Best RandomForestClassifier Params: {'max_depth': 20, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 200}
Best Cross-Val Score: 0.845278450363196
Tuning XGBClassifier...
Best XGBClassifier Params: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.8}
Best Cross-Val Score: 0.8421307506053269
Tuning LGBMClassifier...
[LightGBM] [Info] Number of positive: 4134, number of negative: 4126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 638
[LightGBM] [Info] Number of data points in the train set: 8260, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500484 -> initscore=0.001937
[LightGBM] [Info] Start training from score 0.001937
Best LGBMClassifier Params: {'boosting_type': 'dart', 'le

In [75]:
# Create a input data 

data = ["Male", 0, "No", "No", 1, "Yes", "No", "DSL", "No", "No", "No", "No", "No", "Yes", "Month-to-month", "Yes", "Mailed check", 55.7, 55.7]
# data = ["Male",	0, "Yes", "No", 12,	"Yes", "No", "No", "No internet service", "No internet service", "No internet service", "No internet service",	
  #      "No internet service", "No internet service", "One year", "No", "Bank transfer (automatic)", 19.8, 202.25] 
cols = df1.drop("Churn", axis=1).columns
input_df = pd.DataFrame([data], columns=cols)
input_cat_cols = input_df.select_dtypes(include=["object"]).columns
en_data = pd.get_dummies(data=input_df, columns=cat_cols, prefix=cat_cols, sparse=False)

# Re-index input dataframe columns according to the input data columns
en_data = en_data.reindex(columns=x_train.columns, fill_value=0)

In [76]:
# Predict the input data

if stacking_clf.predict(en_data)[0] == 0:
    print("The user with en_data is not churned")
else:
    print("The user with en_data is churned")

The user with en_data is not churned
