In [1]:
# Import of initial Libraries
import numpy as np
import pandas as pd

In [2]:
# Loading the dataset
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Numbers of rows and columns in the dataset
df.shape

(7043, 21)

In [4]:
# Import libraries for preprocessing and Classification
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
# converting TotalCharges column data type to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors ='coerce')

In [7]:
# Check for null values 
df['TotalCharges'].isna().sum()

11

In [8]:
# Filling all the null values in the TotalCharges column with 0
df['TotalCharges'].fillna(0,inplace= True)

In [9]:
# Converting the 'Churn' column to binary values. Map 'No' to 0 and 'Yes' to 1.
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == "Yes" else 0)

In [10]:
df['Churn'].unique()

array([0, 1], dtype=int64)

In [11]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity',
               'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']

numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [15]:
# Scaling the numerical features
sc = StandardScaler()
num_sc = sc.fit_transform(df[numerical])
num_df = pd.DataFrame(num_sc, columns=numerical)
num_df

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.992611
1,0.066327,-0.259629,-0.172165
2,-1.236724,-0.362660,-0.958066
3,0.514251,-0.746535,-0.193672
4,-1.236724,0.197365,-0.938874
...,...,...,...
7038,-0.340876,0.665992,-0.127605
7039,1.613701,1.277533,2.242606
7040,-0.870241,-1.168632,-0.852932
7041,-1.155283,0.320338,-0.870513


In [16]:
# Using OneHotEncoder to transform the categorical features.
ohe = OneHotEncoder(sparse_output= False)
enc_cat = ohe.fit_transform(df[categorical])
enc_cat_df = pd.DataFrame(enc_cat)
enc_cat_df.columns = ohe.get_feature_names_out()

In [17]:
enc_cat_df.head()

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [18]:
# concatenate the two DataFrame 
df_combined = pd.concat([num_df, enc_cat_df], axis= 1)

In [19]:
# calling the combined DataFrame
df_combined

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.992611,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.066327,-0.259629,-0.172165,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.236724,-0.362660,-0.958066,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.514251,-0.746535,-0.193672,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-1.236724,0.197365,-0.938874,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,-0.340876,0.665992,-0.127605,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7039,1.613701,1.277533,2.242606,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7040,-0.870241,-1.168632,-0.852932,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7041,-1.155283,0.320338,-0.870513,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [20]:
# Dataset Splitting
X = df_combined
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size= 0.2, random_state= 1)

In [21]:
# Initializing the Classification models 
Randf = RandomForestClassifier(random_state=1)
ExtTC = ExtraTreesClassifier(random_state=1)
XgbC = XGBClassifier(random_state=1)
LgbC = LGBMClassifier(random_state=1)

In [22]:
#Question 14
# Random Forest Model 
Randf.fit(X_train, y_train)
y_pred1 = Randf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred1)}")

Accuracy: 0.7906316536550745


In [23]:
# ExtraTreesClassifier Model
ExtTC.fit(X_train, y_train)
y_pred2 = ExtTC.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred2)}")

Accuracy: 0.7700496806245565


In [24]:
# XGB  Model
XgbC.fit(X_train, y_train)
y_pred3 = XgbC.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred3)}")

Accuracy: 0.7934705464868701


In [25]:
# LGB Model
LgbC.fit(X_train, y_train)
y_pred4 = LgbC.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred4)}")

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Accuracy: 0.8133427963094393


In [26]:
# Initialize hyperparameters
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

In [27]:
#Hyparameter grid
hyperparameter_grid = {'n_estimators' : n_estimators, 'min_samples_leaf': min_samples_leaf, 'min_samples_split' : min_samples_split,
                      'max_features' : max_features}
random_search = RandomizedSearchCV(ExtTC, hyperparameter_grid, cv=5, n_iter= 10, scoring= 'accuracy', n_jobs= -1, verbose= 1, random_state=1)

In [28]:
# Fitting the ExtraTreesClassifier 
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\envs\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\envs\venv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\ProgramData\anaconda3\envs\venv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\ProgramData\anaconda3\envs\venv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter

In [29]:
# The best parameter from Randomized_CV
random_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 9,
 'min_samples_leaf': 8,
 'max_features': 'sqrt'}

In [31]:
# Accuracy of the best estimator (ExtraTreeClassifier)
accuracy = accuracy_score(y_test,random_search.best_estimator_.predict(X_test))
accuracy

0.8041163946061036

In [32]:
#Feature Importance
feature_names = list(df_combined.columns)
feature_importances = random_search.best_estimator_.feature_importances_
sorted_features = sorted(zip(feature_importances, feature_names), reverse=True)

# Printing the top two important features
top_two_features = [feature_name for importance, feature_name in sorted_features[:2]]
top_two_features

['Contract_Month-to-month', 'tenure']