Customer Churn Prediction

1.Exploratory Data Analysis (EDA)

In [3]:
#import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


->LabelEncoder: Encodes categorical variables into numeric values.
->StandardScaler: Standardizes numerical values to have a mean of 0 and a standard deviation of 1.
->train_test_split: Splits the dataset into training and testing sets.
->RandomForestClassifier: A machine learning model used for classification.
->accuracy_score, precision_score, recall_score, f1_score: Used to evaluate the performance of the model.

In [6]:
# Read the churn dataset from a CSV file (path is relative to the working directory)
csv_path = 'telco_churn.csv'
data = pd.read_csv(csv_path)

In [7]:
# Preview the first rows of the loaded frame (head() defaults to five rows)
data.head()

Unnamed: 0.1,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,7590-VHVEG,Female,False,True,False,1,False,,DSL,...,False,False,False,False,Month-to-month,True,Electronic check,29.85,29.850000381469727,False
1,1,5575-GNVDE,Male,False,False,False,34,True,False,DSL,...,True,False,False,False,One year,False,Mailed check,56.950001,1889.5,False
2,2,3668-QPYBK,Male,False,False,False,2,True,False,DSL,...,False,False,False,False,Month-to-month,True,Mailed check,53.849998,108.1500015258789,True
3,3,7795-CFOCW,Male,False,False,False,45,False,,DSL,...,True,True,False,False,One year,False,Bank transfer (automatic),42.299999,1840.75,False
4,4,9237-HQITU,Female,False,False,False,2,True,False,Fiber optic,...,False,False,False,False,Month-to-month,True,Electronic check,70.699997,151.64999389648438,True


In [8]:
# (number of rows, number of columns) of the loaded frame
data.shape

(5043, 22)

In [9]:
# Per-column summary: non-null count and dtype
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        5043 non-null   int64  
 1   customerID        5043 non-null   object 
 2   gender            5043 non-null   object 
 3   SeniorCitizen     5043 non-null   object 
 4   Partner           5043 non-null   object 
 5   Dependents        5043 non-null   object 
 6   tenure            5043 non-null   int64  
 7   PhoneService      5043 non-null   object 
 8   MultipleLines     4774 non-null   object 
 9   InternetService   5043 non-null   object 
 10  OnlineSecurity    4392 non-null   object 
 11  OnlineBackup      4392 non-null   object 
 12  DeviceProtection  4392 non-null   object 
 13  TechSupport       4392 non-null   object 
 14  StreamingTV       4392 non-null   object 
 15  StreamingMovies   4392 non-null   object 
 16  Contract          5043 non-null   object 


In [10]:
# Count present (False) vs. missing (True) entries in every column.
# pd.Series.value_counts is used instead of the top-level pd.value_counts,
# which is deprecated and emits a FutureWarning on recent pandas versions.
data.isna().apply(pd.Series.value_counts)

  data.isna().apply(pd.value_counts)


Unnamed: 0.1,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
False,5043.0,5043.0,5043.0,5043.0,5043.0,5043.0,5043.0,5043.0,4774,5043.0,...,4392,4392,4392,4392,5043.0,5043.0,5043.0,5043.0,5038,5042
True,,,,,,,,,269,,...,651,651,651,651,,,,,5,1


In [11]:
# Number of missing values in each column
data.isnull().sum()

Unnamed: 0            0
customerID            0
gender                0
SeniorCitizen         0
Partner               0
Dependents            0
tenure                0
PhoneService          0
MultipleLines       269
InternetService       0
OnlineSecurity      651
OnlineBackup        651
DeviceProtection    651
TechSupport         651
StreamingTV         651
StreamingMovies     651
Contract              0
PaperlessBilling      0
PaymentMethod         0
MonthlyCharges        0
TotalCharges          5
Churn                 1
dtype: int64

2.Preprocessing

In [13]:
# Drop the columns that are not used as features, modifying `data` in place
# ("Unnamed: 0" is presumably a saved row index; verify against the CSV)
columns_to_drop = ["Unnamed: 0", "customerID"]
data.drop(columns_to_drop, axis=1, inplace=True)

In [14]:
# Convert the TotalCharges column to a numeric dtype;
# errors="coerce" turns entries that cannot be parsed into NaN
total_charges_raw = data["TotalCharges"]
data["TotalCharges"] = pd.to_numeric(total_charges_raw, errors="coerce")

3.Handling missing values and categorical values

In [16]:
# Handle the missing values
# Rows without a Churn value are removed from `data`.
data.dropna(subset=["Churn"], inplace=True)

# Missing entries in these categorical columns are replaced with "No".
fill_columns = ["MultipleLines", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
                "TechSupport", "StreamingTV", "StreamingMovies"]
data[fill_columns] = data[fill_columns].fillna("No")

# Missing TotalCharges values are replaced with 0. The filled column is assigned
# back rather than calling fillna(inplace=True) on data["TotalCharges"]: that
# chained in-place form raises a FutureWarning and will no longer update `data`
# in pandas 3.0, because the intermediate object behaves as a copy.
data["TotalCharges"] = data["TotalCharges"].fillna(0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["TotalCharges"].fillna(0, inplace=True)


-> Missing values in the categorical service columns are replaced with "No", and missing values in the numerical TotalCharges column are replaced with 0.

In [17]:
# Names of the columns that are label-encoded (the Churn target is included)
categorical_values = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "Churn",
]

In [18]:
# Replace each categorical column with integer codes and keep the fitted
# encoder for that column in `label_encoders`, keyed by column name
label_encoders = {}
for column_name in categorical_values:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(data[column_name])
    data[column_name] = encoded_values
    label_encoders[column_name] = encoder

-> Label encoders are used to convert the categorical values into numerical codes.

4.Feature scaling

In [20]:
# Standardize the numerical feature columns and write the result back into `data`
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook — confirm this is intended.
numeric_columns = ["tenure", "MonthlyCharges", "TotalCharges"]
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

5.Train classification model

In [22]:
# Separate the target column from the feature columns
target_column = "Churn"
y = data[target_column]
X = data.drop(columns=[target_column])

In [23]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Train a Random Forest classifier (100 trees, fixed seed) on the training split
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [25]:
# Predict labels for the held-out test features
y_pred = model.predict(X_test)

In [26]:
# Mean accuracy of the fitted model on the test split
score=model.score(X_test,y_test)

In [27]:
# Display the test-set score
print(score)

0.8087215064420218


6.Model performance

In [29]:
# Evaluate the test-set predictions. Precision, recall and F1 are computed per
# class and averaged with weights proportional to each class's support.
# NOTE(review): support-weighted recall is mathematically equal to accuracy, so
# it says nothing extra about the churn class — consider per-class metrics.
weighted = {"average": "weighted"}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

In [30]:
# Report each metric with four decimal places, one metric per line
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-score: {f1:.4f}",
    sep="\n",
)

Accuracy: 0.8087
Precision: 0.8031
Recall: 0.8087
F1-score: 0.8053
