## ML Model usage

### Importing ML Libraries

In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
ml = pd.read_csv('Customer_churn_Data_Cleaning_Analysis.csv')

In [4]:
ml.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tengrp,churn,mcgrp,tcgrp
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,Month-to-month,Yes,Electronic check,29.85,29.85,No,1 year,0,18.26 to 35.5,18.8 to 402.225
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,One year,No,Mailed check,56.95,1889.5,No,3 year,0,35.5 to 70.35,1397.475 to 3786.6
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1 year,1,35.5 to 70.35,18.8 to 402.225
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,One year,No,Bank transfer (automatic),42.3,1840.75,No,4 year,0,35.5 to 70.35,1397.475 to 3786.6
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1 year,1,70.35 to 89.85,18.8 to 402.225


In [5]:
ml.drop(columns=(['customerID','churn','mcgrp','tcgrp']), axis=1, inplace=True)

In [6]:
ml.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tengrp
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,1 year
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,3 year
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1 year
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,4 year
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1 year


## Preprocessing

## Categorical cols
#### gender --> male/female --> Label Encoding
#### SeniorCitizen --> 0/1 --> already encoded
#### Partner --> Yes/No --> Label Encoding
#### Dependents --> Yes/No --> Label Encoding
#### PhoneService --> Yes/No --> Label Encoding
#### PaperlessBilling --> Yes/No --> Label Encoding
#### Churn --> Yes/No --> Label Encoding

In [7]:
le = LabelEncoder()
cat_cols = ['gender','Partner','Dependents','PhoneService','PaperlessBilling','Churn']
for i in cat_cols:
    ml[i] = le.fit_transform(ml[i])

In [8]:
ml.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tengrp
0,0,0,1,0,1,0,No phone service,DSL,No,Yes,...,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0,1 year
1,1,0,0,0,34,1,No,DSL,Yes,No,...,No,No,No,One year,0,Mailed check,56.95,1889.5,0,3 year
2,1,0,0,0,2,1,No,DSL,Yes,Yes,...,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1,1 year
3,1,0,0,0,45,0,No phone service,DSL,Yes,No,...,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0,4 year
4,0,0,0,0,2,1,No,Fiber optic,No,No,...,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1,1 year


## Multi Categorical Cols -- unordered
#### MultipleLines: ['No phone service' 'No' 'Yes'] --> one-hot encoding
#### InternetService: ['DSL' 'Fiber optic' 'No']
#### OnlineSecurity: ['No' 'Yes' 'No internet service']
#### OnlineBackup: ['Yes' 'No' 'No internet service']
#### DeviceProtection: ['No' 'Yes' 'No internet service']
#### TechSupport: ['No' 'Yes' 'No internet service']
#### StreamingTV: ['No' 'Yes' 'No internet service']
#### StreamingMovies: ['No' 'Yes' 'No internet service']
#### Contract: ['Month-to-month' 'One year' 'Two year'] 
#### PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)''Credit card (automatic)']

In [17]:
multi_cat_cols = ['MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies',
                 'Contract','PaymentMethod']

ml = pd.get_dummies(ml, columns=multi_cat_cols, drop_first=False)

In [19]:
ml['tengrp'] = ml['tengrp'].map({'1 year':1, '2 year':2, '3 year':3, '4 year':4, '5 year':5, '6+ year':6})

In [20]:
ml.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tengrp,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,1,29.85,29.85,0,1,False,True,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,False
1,1,0,0,0,34,1,0,56.95,1889.5,0,3,True,False,False,True,False,False,False,False,True,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,True
2,1,0,0,0,2,1,1,53.85,108.15,1,1,True,False,False,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True


In [21]:
ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 42 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   gender                                   7043 non-null   int64  
 1   SeniorCitizen                            7043 non-null   int64  
 2   Partner                                  7043 non-null   int64  
 3   Dependents                               7043 non-null   int64  
 4   tenure                                   7043 non-null   int64  
 5   PhoneService                             7043 non-null   int64  
 6   PaperlessBilling                         7043 non-null   int64  
 7   MonthlyCharges                           7043 non-null   float64
 8   TotalCharges                             7043 non-null   float64
 9   Churn                                    7043 non-null   int64  
 10  tengrp                                   7043 no

In [23]:
ml[ml.select_dtypes('bool').columns] = ml.select_dtypes('bool').astype(int)

In [24]:
ml.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,tengrp,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,1,29.85,29.85,0,1,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,34,1,0,56.95,1889.5,0,3,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1
2,1,0,0,0,2,1,1,53.85,108.15,1,1,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1
3,1,0,0,0,45,0,0,42.3,1840.75,0,4,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0,2,1,1,70.7,151.65,1,1,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0


In [25]:
ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 42 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   gender                                   7043 non-null   int64  
 1   SeniorCitizen                            7043 non-null   int64  
 2   Partner                                  7043 non-null   int64  
 3   Dependents                               7043 non-null   int64  
 4   tenure                                   7043 non-null   int64  
 5   PhoneService                             7043 non-null   int64  
 6   PaperlessBilling                         7043 non-null   int64  
 7   MonthlyCharges                           7043 non-null   float64
 8   TotalCharges                             7043 non-null   float64
 9   Churn                                    7043 non-null   int64  
 10  tengrp                                   7043 no

In [27]:
x = ml.drop(columns=('Churn'))
y = ml[['Churn']]

In [28]:
x.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,tengrp,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,1,29.85,29.85,1,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,34,1,0,56.95,1889.5,3,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1


In [29]:
y.head(2)

Unnamed: 0,Churn
0,0
1,0


In [30]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.05, random_state=42)

In [32]:
for i in (LogisticRegression(),KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),AdaBoostClassifier(),
         BaggingClassifier(), GradientBoostingClassifier(), CatBoostClassifier(), XGBClassifier(), XGBRFClassifier(), 
         LGBMClassifier()):
    i.fit(xtrain, ytrain)
    ml_preds = i.predict(xtest)
    ml_acc = accuracy_score(ytest, ml_preds)
    ml_conf = confusion_matrix(ytest, ml_preds)
    ml_class = classification_report(ytest, ml_preds)
    print(i)
    print('Accuracy score: ', ml_acc)
    print('Confusion Matrix: ','\n', ml_conf)
    print('Classification Report: ', ml_class)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return self._fit(X, y)


LogisticRegression()
Accuracy score:  0.8328611898016998
Confusion Matrix:  
 [[239  25]
 [ 34  55]]
Classification Report:                precision    recall  f1-score   support

           0       0.88      0.91      0.89       264
           1       0.69      0.62      0.65        89

    accuracy                           0.83       353
   macro avg       0.78      0.76      0.77       353
weighted avg       0.83      0.83      0.83       353

KNeighborsClassifier()
Accuracy score:  0.8016997167138811
Confusion Matrix:  
 [[238  26]
 [ 44  45]]
Classification Report:                precision    recall  f1-score   support

           0       0.84      0.90      0.87       264
           1       0.63      0.51      0.56        89

    accuracy                           0.80       353
   macro avg       0.74      0.70      0.72       353
weighted avg       0.79      0.80      0.79       353

DecisionTreeClassifier()
Accuracy score:  0.7337110481586402
Confusion Matrix:  
 [[215  49]
 

  return fit_method(estimator, *args, **kwargs)


RandomForestClassifier()
Accuracy score:  0.8130311614730878
Confusion Matrix:  
 [[244  20]
 [ 46  43]]
Classification Report:                precision    recall  f1-score   support

           0       0.84      0.92      0.88       264
           1       0.68      0.48      0.57        89

    accuracy                           0.81       353
   macro avg       0.76      0.70      0.72       353
weighted avg       0.80      0.81      0.80       353



  y = column_or_1d(y, warn=True)


AdaBoostClassifier()
Accuracy score:  0.8016997167138811
Confusion Matrix:  
 [[238  26]
 [ 44  45]]
Classification Report:                precision    recall  f1-score   support

           0       0.84      0.90      0.87       264
           1       0.63      0.51      0.56        89

    accuracy                           0.80       353
   macro avg       0.74      0.70      0.72       353
weighted avg       0.79      0.80      0.79       353



  y = column_or_1d(y, warn=True)


BaggingClassifier()
Accuracy score:  0.8016997167138811
Confusion Matrix:  
 [[244  20]
 [ 50  39]]
Classification Report:                precision    recall  f1-score   support

           0       0.83      0.92      0.87       264
           1       0.66      0.44      0.53        89

    accuracy                           0.80       353
   macro avg       0.75      0.68      0.70       353
weighted avg       0.79      0.80      0.79       353



  y = column_or_1d(y, warn=True)


GradientBoostingClassifier()
Accuracy score:  0.8045325779036827
Confusion Matrix:  
 [[241  23]
 [ 46  43]]
Classification Report:                precision    recall  f1-score   support

           0       0.84      0.91      0.87       264
           1       0.65      0.48      0.55        89

    accuracy                           0.80       353
   macro avg       0.75      0.70      0.71       353
weighted avg       0.79      0.80      0.79       353

Learning rate set to 0.023195
0:	learn: 0.6779510	total: 11.3ms	remaining: 11.3s
1:	learn: 0.6620844	total: 25.4ms	remaining: 12.7s
2:	learn: 0.6481037	total: 38.4ms	remaining: 12.7s
3:	learn: 0.6343226	total: 52ms	remaining: 12.9s
4:	learn: 0.6216135	total: 63.8ms	remaining: 12.7s
5:	learn: 0.6094262	total: 77.6ms	remaining: 12.9s
6:	learn: 0.5975662	total: 90.9ms	remaining: 12.9s
7:	learn: 0.5881793	total: 104ms	remaining: 12.9s
8:	learn: 0.5792838	total: 118ms	remaining: 13s
9:	learn: 0.5697597	total: 131ms	remaining: 13s
10:	learn

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


LGBMClassifier()
Accuracy score:  0.7903682719546742
Confusion Matrix:  
 [[238  26]
 [ 48  41]]
Classification Report:                precision    recall  f1-score   support

           0       0.83      0.90      0.87       264
           1       0.61      0.46      0.53        89

    accuracy                           0.79       353
   macro avg       0.72      0.68      0.70       353
weighted avg       0.78      0.79      0.78       353



### Logistic regression seem to do better when compared to other models