# **Telecom Customer Churn Prediction**

In [None]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# read dataset
df = pd.read_csv('TelecomCustomerChurn.csv')

# Feature Engineering

In [3]:
df.head()

Unnamed: 0,customerID,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Monthly,Yes,Manual,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Manual,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Monthly,Yes,Manual,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Monthly,Yes,Manual,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   Gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   Tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
df.describe()

Unnamed: 0,SeniorCitizen,Tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [6]:
df.isna().sum()

customerID          0
Gender              0
SeniorCitizen       0
Partner             0
Dependents          0
Tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
# define y and X
y = df['Churn']
X = df.drop(['customerID','Churn'],axis=1)

In [9]:
y.value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [None]:
ros = RandomOverSampler()
X,y = ros.fit_resample(X,y)



In [13]:
y.value_counts()

Churn
No     5174
Yes    5174
Name: count, dtype: int64

In [14]:
# ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

In [15]:
oe = OrdinalEncoder()

In [16]:
X = oe.fit_transform(X)

In [None]:
# train test split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=2529)

# Logistic Regression for Churn Prediction

In [21]:
lr = LogisticRegression()

# train model
lr.fit(X_train, y_train)

# predict
y_pred_lr = lr.predict(X_test)

# evaluate
# print(classification_report(y_test, y_pred_logisticRegression))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))


Accuracy: 0.7572477773482799


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# RandomForest Model for Churn Prediction

In [None]:
rfc = RandomForestClassifier()

# train model
rfc.fit(X_train,y_train)

# predict
y_pred_rfc = rfc.predict(X_test)

# evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_rfc))


Accuracy: 0.8793969849246231


# Gradient Boosting Machine (GBM) for Churn Prediction

In [None]:
# select mode
gbc = GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42
)

# train model
gbc.fit(X_train,y_train)

# predict
y_pred_gbc = gbc.predict(X_test)

# evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_gbc))


Accuracy: 0.7607267104754541
