## **Importing Libraries**

In [138]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV

## **Importing Dataset**

In [139]:
df = pd.read_csv('/content/customer_churn_.csv')

In [140]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


## **customerID: A unique identifier for each customer**
## **gender: The gender of the customer (Male/Female)**
## **SeniorCitizen: Indicates if the customer is a senior citizen (1 = Yes, 0 = No)**
## **Partner: Indicates if the customer has a partner (Yes/No)**
## **Dependents: Indicates if the customer has dependents (Yes/No)**
## **tenure: Number of months the customer has stayed with the company**
## **PhoneService: Indicates if the customer has a phone service (Yes/No)**
## **MultipleLines: Indicates if the customer has multiple lines (Yes/No/No phone service)**
## **InternetService: Type of internet service (DSL, Fiber optic, None)**
## **OnlineSecurity: Indicates if the customer has online security add-ons (Yes/No/No internet service)**
## **DeviceProtection: Indicates if the customer has device protection add-ons (Yes/No/No internet service)**
## **TechSupport: Indicates if the customer has tech support add-ons (Yes/No/No internet service)**
## **StreamingTV: Indicates if the customer streams TV services (Yes/No/No internet service)**
## **StreamingMovies: Indicates if the customer streams movies (Yes/No/No internet service)**
## **Contract: Type of contract (Month-to-month, One year, Two year)**
## **PaperlessBilling: Indicates if the customer uses paperless billing (Yes/No)**
## **PaymentMethod: The payment method used (e.g., Electronic check, Mailed check, Bank transfer, Credit card)**
## **MonthlyCharges: Monthly charges for the customer**
## **TotalCharges: Total charges billed to the customer**
## **Churn: Indicates if the customer has churned (Yes/No)**

# **EDA**

In [141]:
df.shape

(7043, 21)

In [142]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [143]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],  errors = 'coerce') # replace " " values as null

In [144]:
df['MonthlyCharges'] = df['MonthlyCharges'].astype(float)

In [145]:
df.drop('customerID', axis =1, inplace=True)

In [146]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [147]:
df.isnull().sum()

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0
OnlineBackup,0


In [148]:
df.dropna(inplace=True)

In [149]:
df.isnull().sum()

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0
OnlineBackup,0


In [150]:
df.duplicated().sum()

np.int64(22)

## **Label Encoding**

In [151]:
le = LabelEncoder()
for col in df.columns:
  if df[col].dtypes == 'object':
    df[col] = le.fit_transform(df[col])

In [152]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   int64  
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   int64  
 3   Dependents        7032 non-null   int64  
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   int64  
 6   MultipleLines     7032 non-null   int64  
 7   InternetService   7032 non-null   int64  
 8   OnlineSecurity    7032 non-null   int64  
 9   OnlineBackup      7032 non-null   int64  
 10  DeviceProtection  7032 non-null   int64  
 11  TechSupport       7032 non-null   int64  
 12  StreamingTV       7032 non-null   int64  
 13  StreamingMovies   7032 non-null   int64  
 14  Contract          7032 non-null   int64  
 15  PaperlessBilling  7032 non-null   int64  
 16  PaymentMethod     7032 non-null   int64  
 17  

## **Model Building**

In [153]:
X = df.drop('Churn', axis=1)
y = df['Churn']

In [154]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=50)

In [155]:
X_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
6403,1,0,1,1,72,1,2,1,2,2,2,2,2,2,2,1,0,116.60,8337.45
1677,1,0,1,1,6,1,2,2,1,1,1,1,1,1,2,0,3,25.40,153.30
2723,1,0,0,0,21,1,0,2,1,1,1,1,1,1,0,0,1,20.50,402.85
1192,1,0,0,0,7,1,2,1,0,0,2,0,2,0,0,1,2,90.45,593.45
3638,0,1,1,0,2,1,0,0,0,2,0,0,0,0,0,0,1,50.15,115.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,1,0,1,0,67,1,2,1,0,2,2,0,2,2,1,0,2,104.10,7040.85
6262,0,0,0,0,45,1,2,1,2,2,0,0,0,2,1,0,0,96.75,4442.75
1936,0,0,1,0,51,1,0,0,2,2,0,2,0,0,1,1,0,60.50,3145.15
5609,1,0,1,0,29,1,2,1,0,2,0,0,0,0,0,1,0,80.15,2265.25


## **Random forest**

In [156]:
model_rf = RandomForestClassifier(n_estimators=50, random_state=20)
model_rf.fit(X_train, y_train)

In [157]:
y_pred_rf = model_rf.predict(X_test)

## **Accuracy score of Random forest**

In [158]:
accuracy_score(y_test, y_pred_rf)*100

80.66808813077469

# **Decision Tree**

### **Hyperparameter tuning**

In [159]:
for i in range(1,11):
  temp_model = DecisionTreeClassifier(max_depth=i)
  temp_model.fit(X_train, y_train)
  temp_y_pred = temp_model.predict(X_test)
  acc = accuracy_score(y_test, temp_y_pred)*100
  print(f'The accuracy score of max depth {i} is {acc}')

The accuracy score of max depth 1 is 75.26652452025586
The accuracy score of max depth 2 is 75.12437810945273
The accuracy score of max depth 3 is 79.03340440653874
The accuracy score of max depth 4 is 80.09950248756219
The accuracy score of max depth 5 is 79.53091684434968
The accuracy score of max depth 6 is 79.45984363894812
The accuracy score of max depth 7 is 79.38877043354655
The accuracy score of max depth 8 is 78.67803837953092
The accuracy score of max depth 9 is 76.61691542288557
The accuracy score of max depth 10 is 76.61691542288557


In [160]:
model_dt = DecisionTreeClassifier(max_depth=5)
model_dt.fit(X_train, y_train)
temp_y_pred = model_dt.predict(X_test)

## **Accuracy score of Decision Tree**

In [161]:
accuracy_score(y_test, temp_y_pred)*100

79.53091684434968

## **GridSearchCV**

In [162]:
base_model=RandomForestClassifier(random_state=23)

In [163]:
param_grid={
    'n_estimators':[100,150,200],
    'max_depth':[1,5,10],
    'min_samples_split':[2,5,7],
    'min_samples_leaf':[1,2,4],
    'criterion':['gini','entropy']
}

In [164]:
grid_search=GridSearchCV(estimator=base_model,param_grid=param_grid)

In [166]:
grid_search.fit(X_train, y_train)

In [167]:
print(grid_search.best_params_)

{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}


In [172]:
final_model = RandomForestClassifier(criterion= 'entropy', max_depth= 10, min_samples_leaf= 1, min_samples_split= 5, n_estimators= 150)

In [173]:
final_model.fit(X_train, y_train)

In [174]:
y_pred_final = final_model.predict(X_test)

## **GridSearchCV accuracy score**

In [175]:
accuracy_score(y_test,y_pred_final)*100

81.4498933901919