# **Telecom Customer Churn Prediction**

In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# read dataset
df = pd.read_csv('https://github.com/YBIFoundation/Dataset/raw/main/TelecomCustomerChurn.csv')

In [11]:
df.head()

Unnamed: 0,customerID,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Monthly,Yes,Manual,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Manual,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Monthly,Yes,Manual,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Monthly,Yes,Manual,70.7,151.65,Yes


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   Gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   Tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [13]:
df.describe()

Unnamed: 0,SeniorCitizen,Tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [14]:
df.isna().sum()

Unnamed: 0,0
customerID,0
Gender,0
SeniorCitizen,0
Partner,0
Dependents,0
Tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [15]:
df.duplicated().sum()

np.int64(0)

In [16]:
df.columns

Index(['customerID', 'Gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'Tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [17]:
# define y and X
y = df['Churn']
X = df.drop(['customerID','Churn'],axis=1)

In [12]:
y.value_counts()

Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
No,5174
Yes,1869


In [18]:
# sample oversampling
from imblearn.over_sampling import RandomOverSampler

In [19]:
ros = RandomOverSampler()
print(ros)

RandomOverSampler()


In [20]:
X,y = ros.fit_resample(X,y)
print(X,y)

       Gender  SeniorCitizen Partner Dependents  Tenure PhoneService  \
0      Female              0     Yes         No       1           No   
1        Male              0      No         No      34          Yes   
2        Male              0      No         No       2          Yes   
3        Male              0      No         No      45           No   
4      Female              0      No         No       2          Yes   
...       ...            ...     ...        ...     ...          ...   
10343    Male              0      No         No       1          Yes   
10344  Female              0     Yes         No       7          Yes   
10345    Male              1      No         No      22          Yes   
10346  Female              0     Yes         No      38          Yes   
10347  Female              0      No         No       7          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0                No             DSL             No          Yes   
1

In [21]:
y.value_counts()

Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
No,5174
Yes,5174


In [22]:
# ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

In [23]:
oe = OrdinalEncoder()

In [24]:
X = oe.fit_transform(X)

In [25]:
# train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=2529)

In [26]:
# select mode
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [38]:
knn = KNeighborsClassifier()
log_reg = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence
decision_tree = DecisionTreeClassifier()
svm = SVC()
models = {
    'K-Nearest Neighbors': knn,
    'Logistic Regression': log_reg,
    'Decision Tree': decision_tree,
    'Support Vector Machine': svm
}
accuracies = {}
classification_reports = {}

# Train and evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report_classification = classification_report(y_test, y_pred)
    accuracies[model_name] = accuracy
    classification_reports[model_name] = report_classification

# Print results in a clean format
print("Model Accuracies:")
print("-----------------")
for model_name, accuracy in accuracies.items():
    print(f"{model_name}: {accuracy:.4f}")

print("\nClassification Reports:")
print("----------------------")
for model_name, report in classification_reports.items():
    print(f"\n{model_name}:")
    print(report)
    print("-" * 50)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Accuracies:
-----------------
K-Nearest Neighbors: 0.7449
Logistic Regression: 0.7530
Decision Tree: 0.8489
Support Vector Machine: 0.6575

Classification Reports:
----------------------

K-Nearest Neighbors:
              precision    recall  f1-score   support

          No       0.78      0.68      0.73      1290
         Yes       0.72      0.81      0.76      1297

    accuracy                           0.74      2587
   macro avg       0.75      0.74      0.74      2587
weighted avg       0.75      0.74      0.74      2587

--------------------------------------------------

Logistic Regression:
              precision    recall  f1-score   support

          No       0.79      0.69      0.74      1290
         Yes       0.73      0.81      0.77      1297

    accuracy                           0.75      2587
   macro avg       0.76      0.75      0.75      2587
weighted avg       0.76      0.75      0.75      2587

--------------------------------------------------

Decisi

In [28]:
rfc = RandomForestClassifier()

In [29]:
# train model
rfc.fit(X_train,y_train)

In [30]:
# predict
y_pred = rfc.predict(X_test)

In [31]:
# evaluate
from sklearn.metrics import classification_report

In [32]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          No       0.95      0.82      0.88      1290
         Yes       0.84      0.96      0.90      1297

    accuracy                           0.89      2587
   macro avg       0.90      0.89      0.89      2587
weighted avg       0.90      0.89      0.89      2587

