In [10]:
import pandas as pd
import numpy as np

In [12]:
df = pd.read_csv("Telco_CustomerChurn_Kaggle.csv")

In [14]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [16]:
df.shape

(7043, 21)

In [18]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [19]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [22]:
df.duplicated().sum()

0

In [24]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [26]:
df = df.drop('customerid', axis = 1)

In [28]:
from sklearn.preprocessing import LabelEncoder
# encoding
le = LabelEncoder()

In [30]:
# binary encoding
binary_encode = ['gender', 'partner', 'dependents', 'phoneservice', 'paperlessbilling', 'churn']
for i in binary_encode:
    df[i] = le.fit_transform(df[i])  #female = 0, male = 1, no = 0, yes = 1

In [32]:
# multi-category encoding
multi_encode = ['multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paymentmethod']
for i in multi_encode:
    df[i] = le.fit_transform(df[i]) 

In [34]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors = 'coerce').fillna(0)

In [36]:
df.dtypes

gender                int64
seniorcitizen         int64
partner               int64
dependents            int64
tenure                int64
phoneservice          int64
multiplelines         int64
internetservice       int64
onlinesecurity        int64
onlinebackup          int64
deviceprotection      int64
techsupport           int64
streamingtv           int64
streamingmovies       int64
contract              int64
paperlessbilling      int64
paymentmethod         int64
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [40]:
# Logistic Regression
x = df.drop('churn', axis = 1)
y = df['churn']

In [42]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 2)

In [44]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(5634, 19)
(1409, 19)
(5634,)
(1409,)


In [46]:
churn_model = LogisticRegression(max_iter=1000)

In [48]:
churn_model.fit(x_train, y_train)

In [50]:
y_pred = churn_model.predict(x_test)

In [52]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix: {conf_matrix}")
print(f"Classification Report:\n{classification_report_str}")

Accuracy: 0.8034066713981547
Confusion Matrix: [[946 115]
 [162 186]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1061
           1       0.62      0.53      0.57       348

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.80      0.80      0.80      1409



In [61]:
y_pred_train = churn_model.predict(x_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f"Accuracy Train: {accuracy_train}")

Accuracy Train: 0.8047568335108272


In [63]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(churn_model, x, y, cv=kfold, scoring='accuracy')

print("Cross-validation accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std()))

Cross-validation accuracy: 0.80 (+/- 0.02)
