In [2]:
import pandas as pd
# Read the Telco churn CSV (path is relative to the notebook's working directory)
# and preview the first rows.
DATA_PATH = "Telco-Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Parse TotalCharges as numbers. errors='coerce' turns anything unparseable into NaN,
# so the per-column null count below reveals how many entries failed to parse.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric
df.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [4]:
# The 11 TotalCharges entries that failed to parse are imputed with the column median.
# NOTE(review): the median is computed over every row, i.e. before any train/test split,
# and the describe() output further down shows tenure has a minimum of 0 — confirm
# whether these rows are brand-new customers for whom 0 would be a more faithful fill.
total_charges_median = df['TotalCharges'].median()
df['TotalCharges'] = df['TotalCharges'].fillna(total_charges_median)
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
# Encode the target variable: 'Yes' -> 1, 'No' -> 0.
# The mapping is applied only while the column still holds labels. Without this guard,
# running the cell a second time would push the 0/1 values through the string-keyed
# dict and silently turn the entire target into NaN.
churn_codes = {'Yes': 1, 'No': 0}
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map(churn_codes)
df['Churn'].value_counts()
# The counts show that the classes are imbalanced (class 0 clearly outnumbers class 1).

Churn
0    5174
1    1869
Name: count, dtype: int64

In [6]:
# customerID only identifies the row, so it has no predictive value; remove it.
# drop() returns a new DataFrame, which is why the result is assigned back to df
# (inplace=True is not used here).
df = df.drop(columns=['customerID'])

In [7]:
# Partition the columns by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(exclude='object').columns

for label, cols in (("Categorical columns:", categorical_cols),
                    ("Numerical columns:", numerical_cols)):
    print(label, cols)

# The recorded output lists 15 categorical columns and 5 numerical columns
# (the numerical group includes the already-encoded target, Churn).

Categorical columns: Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')
Numerical columns: Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn'], dtype='object')


In [8]:
# One-hot encode the categorical columns.
# drop_first=True drops one indicator per feature, which avoids the dummy-variable trap
# (indicator columns that are perfectly collinear with each other).
df_enc = pd.get_dummies(df, drop_first=True)
print(df_enc)

# Shapes before and after encoding, in that order.
for frame in (df, df_enc):
    print(frame.shape)

      SeniorCitizen  tenure  MonthlyCharges  TotalCharges  Churn  gender_Male  \
0                 0       1           29.85         29.85      0        False   
1                 0      34           56.95       1889.50      0         True   
2                 0       2           53.85        108.15      1         True   
3                 0      45           42.30       1840.75      0         True   
4                 0       2           70.70        151.65      1        False   
...             ...     ...             ...           ...    ...          ...   
7038              0      24           84.80       1990.50      0         True   
7039              0      72          103.20       7362.90      0        False   
7040              0      11           29.60        346.45      0        False   
7041              1       4           74.40        306.60      1         True   
7042              0      66          105.65       6844.50      0         True   

      Partner_Yes  Dependen

In [9]:
# Dtypes and non-null counts of the encoded frame, followed by a preview of its first rows.
df_enc.info()
df_enc.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   tenure                                 7043 non-null   int64  
 2   MonthlyCharges                         7043 non-null   float64
 3   TotalCharges                           7043 non-null   float64
 4   Churn                                  7043 non-null   int64  
 5   gender_Male                            7043 non-null   bool   
 6   Partner_Yes                            7043 non-null   bool   
 7   Dependents_Yes                         7043 non-null   bool   
 8   PhoneService_Yes                       7043 non-null   bool   
 9   MultipleLines_No phone service         7043 non-null   bool   
 10  MultipleLines_Yes                      7043 non-null   bool   
 11  Inte

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


In [10]:
# Sanity check after encoding: print the total number of missing cells, then show the
# shape of the object-dtype subset — zero columns there means nothing is left to encode.
remaining_nulls = df_enc.isna().sum().sum()
print(remaining_nulls)
df_enc.select_dtypes(include=['object']).shape


0


(7043, 0)

In [11]:
# Target vector, and the feature matrix made of every other column.
y = df_enc['Churn']
X = df_enc.drop(columns=['Churn'])

# Columns treated as continuous numeric features; summarised below.
num_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
X[num_features].describe()


Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0
mean,32.371149,64.761692,2281.916928
std,24.559481,30.090047,2265.270398
min,0.0,18.25,18.8
25%,9.0,35.5,402.225
50%,29.0,70.35,1397.475
75%,55.0,89.85,3786.6
max,72.0,118.75,8684.8


In [12]:
# Installs scikit-learn into the running kernel's environment; no version is pinned here.
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [13]:

from sklearn.preprocessing import StandardScaler

# Standardise only the columns listed in num_features; all other columns are copied
# over unchanged.
# NOTE(review): the scaler is fitted on every row of X. If X_scaled is later split into
# train/test sets, the test rows have already influenced the scaling statistics.
scaler = StandardScaler()
scaler.fit(X[num_features])

X_scaled = X.copy()
X_scaled[num_features] = scaler.transform(X[num_features])

X_scaled[num_features].describe()



Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0
mean,-2.421273e-17,-6.406285e-17,-1.488074e-17
std,1.000071,1.000071,1.000071
min,-1.318165,-1.54586,-0.9991203
25%,-0.9516817,-0.9725399,-0.8298459
50%,-0.1372744,0.1857327,-0.3904632
75%,0.9214551,0.8338335,0.6642871
max,1.613701,1.794352,2.826743


In [14]:
# Print the numeric feature columns before scaling, then after scaling.
for frame in (X, X_scaled):
    print(frame[num_features])


      tenure  MonthlyCharges  TotalCharges
0          1           29.85         29.85
1         34           56.95       1889.50
2          2           53.85        108.15
3         45           42.30       1840.75
4          2           70.70        151.65
...      ...             ...           ...
7038      24           84.80       1990.50
7039      72          103.20       7362.90
7040      11           29.60        346.45
7041       4           74.40        306.60
7042      66          105.65       6844.50

[7043 rows x 3 columns]
        tenure  MonthlyCharges  TotalCharges
0    -1.277445       -1.160323     -0.994242
1     0.066327       -0.259629     -0.173244
2    -1.236724       -0.362660     -0.959674
3     0.514251       -0.746535     -0.194766
4    -1.236724        0.197365     -0.940470
...        ...             ...           ...
7038 -0.340876        0.665992     -0.128655
7039  1.613701        1.277533      2.243151
7040 -0.870241       -1.168632     -0.854469
7041 -1.1

In [15]:

# Compare shapes: the scaled frame is printed first, then the original.
for frame in (X_scaled, X):
    print(frame.shape)


(7043, 30)
(7043, 30)


Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features. Splitting a frame whose scaler was fitted on all rows
# would let test-set means/variances leak into the training features.
# stratify=y keeps the churn ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply that same transform to the
# test rows. Copies are taken so the assignments never write into views of X.
split_scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[num_features] = split_scaler.fit_transform(X_train[num_features])
X_test[num_features] = split_scaler.transform(X_test[num_features])

# Shapes of the train and test sets.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Verify that the churn ratio is the same in the train and test sets.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

# Matching ratios mean the stratified split preserved the class proportions.
# The classes themselves are still imbalanced; stratification does not rebalance them.

(5634, 30) (1409, 30) (5634,) (1409,)
Churn
0    0.734647
1    0.265353
Name: proportion, dtype: float64
Churn
0    0.734564
1    0.265436
Name: proportion, dtype: float64


Logistic Regression with Balanced Class Weights

In [17]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with class_weight='balanced', fitted on the training split.
LogR = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of the positive class
# (column 1 of predict_proba) for the test rows.
y_pred = LogR.predict(X_test)
test_class_probs = LogR.predict_proba(X_test)
y_pred_prob = test_class_probs[:, 1]

In [18]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Evaluate the logistic-regression predictions on the test split.
# ROC AUC is computed from the positive-class probabilities; every other metric uses
# the hard class predictions.
logr_metrics = [
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("ROC AUC Score:", roc_auc_score(y_test, y_pred_prob)),
    ("Classification Report:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Recall:", recall_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred)),
]
for metric_label, metric_value in logr_metrics:
    print(metric_label, metric_value)

# Because the classes are imbalanced, accuracy alone is a misleading summary.
# In the recorded run the model catches most churners (recall on class 1 is high) at
# the cost of low precision, so read recall, precision and ROC AUC alongside accuracy.




Accuracy: 0.7388218594748048
ROC AUC Score: 0.8417318969748636
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.72      0.80      1035
           1       0.51      0.78      0.61       374

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.80      0.74      0.75      1409

Confusion Matrix:
 [[748 287]
 [ 81 293]]
Recall: 0.7834224598930482
Precision: 0.5051724137931034


Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with balanced class weights. Tree depth and the minimum split size are
# capped; random_state is fixed so the fit is repeatable.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 10,
    'class_weight': 'balanced',
    'random_state': 42,
}
rf = RandomForestClassifier(**rf_params)

rf.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",10
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",10
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [20]:
# Random-forest predictions for the test rows: hard labels, and the predicted
# probability of the positive class (column 1 of predict_proba).
rf_y_pred = rf.predict(X_test)
rf_class_probs = rf.predict_proba(X_test)
rf_y_pred_prob = rf_class_probs[:, 1]

In [21]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    recall_score,
    roc_auc_score,
)

# Evaluate the random-forest predictions on the test split.
# ROC AUC uses the positive-class probabilities; the other metrics use the hard labels.
rf_metrics = [
    ("Accuracy:", accuracy_score(y_test, rf_y_pred)),
    ("ROC AUC Score:", roc_auc_score(y_test, rf_y_pred_prob)),
    ("Classification Report:\n", classification_report(y_test, rf_y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, rf_y_pred)),
    ("Recall:", recall_score(y_test, rf_y_pred)),
]
for metric_label, metric_value in rf_metrics:
    print(metric_label, metric_value)


Accuracy: 0.7650816181689141
ROC AUC Score: 0.842275698158051
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.77      0.83      1035
           1       0.54      0.74      0.63       374

    accuracy                           0.77      1409
   macro avg       0.72      0.76      0.73      1409
weighted avg       0.80      0.77      0.78      1409

Confusion Matrix:
 [[800 235]
 [ 96 278]]
Recall: 0.7433155080213903


In [22]:
import pandas as pd

# Pair each training column with the forest's importance score, sort from most to
# least important, and show the ten highest.
importance_by_feature = pd.Series(data=rf.feature_importances_, index=X_train.columns)
feature_importances = importance_by_feature.sort_values(ascending=False)
feature_importances.head(10)

tenure                                  0.174934
TotalCharges                            0.139192
Contract_Two year                       0.106338
MonthlyCharges                          0.103020
InternetService_Fiber optic             0.070595
PaymentMethod_Electronic check          0.050967
Contract_One year                       0.042941
OnlineSecurity_Yes                      0.039387
TechSupport_Yes                         0.025534
DeviceProtection_No internet service    0.020766
dtype: float64

XGBoost

In [23]:
# Installs xgboost into the running kernel's environment; no version is pinned here.
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [24]:
from xgboost import XGBClassifier

# Class counts in the training labels. Their ratio (negatives / positives) is passed
# as scale_pos_weight to account for the class imbalance.
n_negative = len(y_train[y_train == 0])
n_positive = len(y_train[y_train == 1])

xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=n_negative / n_positive,
    eval_metric='logloss',
    random_state=42,
)

xgb.fit(X_train, y_train)

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'binary:logistic'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [25]:
# XGBoost predictions for the test rows: hard labels, and the predicted probability
# of the positive class (column 1 of predict_proba).
y_pred_xgb = xgb.predict(X_test)
xgb_class_probs = xgb.predict_proba(X_test)
y_prob_xgb = xgb_class_probs[:, 1]


In [27]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    recall_score,
    roc_auc_score,
)

# Evaluate the XGBoost predictions on the test split.
# ROC AUC uses the positive-class probabilities; the other metrics use the hard labels.
xgb_metrics = [
    ("Accuracy:", accuracy_score(y_test, y_pred_xgb)),
    ("ROC AUC Score:", roc_auc_score(y_test, y_prob_xgb)),
    ("Classification Report:\n", classification_report(y_test, y_pred_xgb)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb)),
    ("Recall:", recall_score(y_test, y_pred_xgb)),
]
for metric_label, metric_value in xgb_metrics:
    print(metric_label, metric_value)

Accuracy: 0.7551454932576295
ROC AUC Score: 0.8427562065669482
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.74      0.82      1035
           1       0.53      0.79      0.63       374

    accuracy                           0.76      1409
   macro avg       0.72      0.77      0.72      1409
weighted avg       0.81      0.76      0.77      1409

Confusion Matrix:
 [[770 265]
 [ 80 294]]
Recall: 0.786096256684492


Threshold tuning

In [34]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Positive-class probabilities from the logistic-regression model for the test rows.
y_prob = LogR.predict_proba(X_test)[:, 1]

# Re-label the test rows at several probability cut-offs and report the confusion
# matrix and classification report for each one.
# NOTE(review): the cut-offs are compared on X_test / y_test, the same rows used for the
# reported metrics — consider choosing the threshold on a separate validation split.
thresholds = [0.35, 0.4, 0.5]
for t in thresholds:
    y_pred_thresh = (y_prob >= t).astype(int)
    matrix_at_t = confusion_matrix(y_test, y_pred_thresh)
    report_at_t = classification_report(y_test, y_pred_thresh)
    print(f"Threshold: {t}")
    print("Confusion Matrix:\n", matrix_at_t)
    print("Classification Report:\n", report_at_t)




Threshold: 0.35
Confusion Matrix:
 [[616 419]
 [ 36 338]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.60      0.73      1035
           1       0.45      0.90      0.60       374

    accuracy                           0.68      1409
   macro avg       0.70      0.75      0.66      1409
weighted avg       0.81      0.68      0.70      1409

Threshold: 0.4
Confusion Matrix:
 [[664 371]
 [ 50 324]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.64      0.76      1035
           1       0.47      0.87      0.61       374

    accuracy                           0.70      1409
   macro avg       0.70      0.75      0.68      1409
weighted avg       0.81      0.70      0.72      1409

Threshold: 0.5
Confusion Matrix:
 [[748 287]
 [ 81 293]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.72      0.80    