In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msn
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 30)

import matplotlib.style as style
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.sample(n=5, random_state=42).T

Unnamed: 0,185,2715,3825,1807,132
customerID,1024-GUALD,0484-JPBRU,3620-EHIMZ,6910-HADCM,8587-XYZSF
gender,Female,Male,Female,Female,Male
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,Yes,No,No
Dependents,No,No,Yes,No,No
tenure,1,41,52,1,67
PhoneService,No,Yes,Yes,Yes,Yes
MultipleLines,No phone service,Yes,No,No,No
InternetService,DSL,No,No,Fiber optic,DSL
OnlineSecurity,No,No internet service,No internet service,No,No


In [3]:
# change column name to lower case

df.columns = [col.lower() for col in df.columns]

# **Preprocessing**

In [4]:
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

df.totalcharges.fillna(-1, inplace=True)

df['churn'] = df.churn.map({'No' : 0, 'Yes': 1})

# Feature Engineering

In [5]:
from sklearn.model_selection import train_test_split

categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

# Features Selection
features = df[categorical + numerical]
target = df['churn']

# Split the data to train set and test set
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2,
                                                    random_state=1)

In [6]:
# Scaling numerical features using standard scaler
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train[numerical]), columns = X_train[numerical].columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test[numerical]), columns = X_test[numerical].columns)

# Encoding categorical features using OneHotEncoder
encoder = OneHotEncoder(sparse=False)
X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train[categorical]))
X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical]))

# Combining numerical and categorical features
X_train = pd.concat([X_train_scaled, X_train_encoded], axis=1)
X_test = pd.concat([X_test_scaled, X_test_encoded], axis=1)

X_train.columns = [str(col) for col in X_train.columns] # to convert column name to string
X_test.columns = [str(col) for col in X_test.columns]



# Model Training

In [7]:
rf_model = RandomForestClassifier(random_state=1)
et_model = ExtraTreesClassifier(random_state=1)
xgb_model = XGBClassifier(random_state=1)
lgbm_model = LGBMClassifier(random_state=1)

In [14]:
for model in [rf_model, et_model, xgb_model, lgbm_model]: #loop through to fit the models at once
  model.fit(X_train, y_train)
  preds = model.predict(X_test)
  print(f"{model.__class__.__name__} accuracy: {accuracy_score(y_test, preds)}")

RandomForestClassifier accuracy: 0.794889992902768
ExtraTreesClassifier accuracy: 0.7835344215755855
XGBClassifier accuracy: 0.7998580553584103
[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000751 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 958
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 190
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LGBMClassifier accuracy: 0.8069552874378992


In [16]:
# Define the hyperparameters
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

# Initialize the classifier
classifier = ExtraTreesClassifier(random_state=1)

# Initialize the RandomizedSearchCV
random_cv = RandomizedSearchCV(estimator=classifier,
                               param_distributions=hyperparameter_grid,
                               cv=5, n_iter=10, scoring='accuracy',
                               n_jobs=-1, verbose=1, random_state=1)

random_cv.fit(X_train, y_train)
# Get the best parameters
best_params = random_cv.best_params_

print(f"The best hyperparameters from RandomizedSearchCV are: {best_params}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  warn(


The best hyperparameters from RandomizedSearchCV are: {'n_estimators': 50, 'min_samples_split': 7, 'min_samples_leaf': 6, 'max_features': 'auto'}


In [18]:
classifier.fit(X_test, y_test)  # Fit the model with the optimal hyperparameters

# Get feature importances
importances = classifier.feature_importances_

# Get the indices of the features sorted by importance
indices = np.argsort(importances)[::-1]

# Get the names of the features sorted by importance
features_sorted = X_test.columns[indices]

# Print the two most important features
print(f"The two most important features are: {features_sorted[0]} and {features_sorted[1]}")


The two most important features are: totalcharges and monthlycharges
