In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier


In [4]:
# Load the Telco customer-churn CSV (relative path, resolved against the
# current working directory) into the frame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [5]:
# Show the loaded frame as the cell output for a quick visual check.
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [6]:

# Clean the raw frame, pick the model features, and create the train/test split.
# `train_test_split` is already imported in the notebook's import cell, so it is
# not re-imported here.

# Convert 'TotalCharges' to numeric; values that cannot be parsed become NaN
# (errors='coerce') and are then filled with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

# Map 'Churn' to binary values only while it still holds the raw 'No'/'Yes'
# labels. Without the guard, re-running this cell would map the already
# converted 0/1 values through the dict again and turn every label into NaN.
churn_labels = {'No': 0, 'Yes': 1}
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map(churn_labels)
assert df['Churn'].isin([0, 1]).all(), "unexpected or missing values in 'Churn'"

# Define categorical and numerical features
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Split the data into 80-20 train-test split (fixed seed for reproducibility)
X = df[categorical + numerical]
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [9]:
# Feature Engineering: standardise the numerical columns and one-hot encode the
# categorical ones, fitting on the training split only and re-using the fitted
# transformers on the test split.
scaler = StandardScaler()

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# the old name was removed in 1.4, where it raises TypeError. Prefer the new
# keyword and fall back to the old one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Fit on the training set, then apply the same transformation to the test set
X_train_numerical = scaler.fit_transform(X_train[numerical])
X_test_numerical = scaler.transform(X_test[numerical])

X_train_categorical = encoder.fit_transform(X_train[categorical])
X_test_categorical = encoder.transform(X_test[categorical])

# Prefix the encoded column names with 'cat_' so they are unique
column_names = encoder.get_feature_names_out(categorical)
categorical_columns = [f'cat_{name}' for name in column_names]

# Wrap the arrays in DataFrames that keep the original row index, so the
# column-wise concat below aligns rows correctly.
X_train_categorical = pd.DataFrame(data=X_train_categorical, columns=categorical_columns, index=X_train.index)
X_test_categorical = pd.DataFrame(data=X_test_categorical, columns=categorical_columns, index=X_test.index)

X_train_combined = pd.concat([pd.DataFrame(data=X_train_numerical, columns=numerical, index=X_train.index), X_train_categorical], axis=1)
X_test_combined = pd.concat([pd.DataFrame(data=X_test_numerical, columns=numerical, index=X_test.index), X_test_categorical], axis=1)



In [10]:
# Build the four tree-based classifiers with the same fixed seed.
rf = RandomForestClassifier(random_state=1)
et = ExtraTreesClassifier(random_state=1)
xgb = XGBClassifier(random_state=1)
lgbm = LGBMClassifier(random_state=1)

# Fit every model on the combined (scaled + encoded) training features,
# in the same order as before: rf, et, xgb, lgbm.
classifiers = (rf, et, xgb, lgbm)
for classifier in classifiers:
    classifier.fit(X_train_combined, y_train)

# Score each fitted model on the held-out test features (mean accuracy).
rf_accuracy, et_accuracy, xgb_accuracy, lgbm_accuracy = [
    classifier.score(X_test_combined, y_test) for classifier in classifiers
]

# Report the test accuracies.
print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Extra Trees Accuracy: {et_accuracy}")
print(f"XGBoost Accuracy: {xgb_accuracy}")
print(f"LightGBM Accuracy: {lgbm_accuracy}")

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Random Forest Accuracy: 0.7913413768630234
Extra Trees Accuracy: 0.7672107877927609
XGBoost Accuracy: 0.7934705464868701
LightGBM Accuracy: 0.8034066713981547


In [7]:
# List the column names of the loaded frame. The cell previously referenced
# `data`, which no visible cell defines, so it failed with NameError on a
# fresh kernel; `df` is the frame read from the CSV.
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [15]:
# Applying one-hot encoding for the 'gender' feature only.
# Reading from `df` instead of re-assigning `data` from itself keeps the cell
# re-runnable on a fresh kernel (`data` is not defined by any visible cell),
# and restricting `columns` stops get_dummies from expanding every other
# string column (such as 'customerID') into dummy columns.
data = pd.get_dummies(df, columns=['gender'])


In [None]:
# Hyperparameters for the tuned ExtraTreesClassifier.
# NOTE(review): these are described as the best parameters found by
# RandomizedSearchCV, but no search is run in the visible cells — confirm
# where the values came from.
best_params = {
    'n_estimators': 100,
    'min_samples_split': 5,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': 20,
    'bootstrap': False
}

# Initialize the ExtraTreesClassifier with the best parameters
etc_best = ExtraTreesClassifier(random_state=1, **best_params)

# Fit on the scaled + one-hot-encoded training matrix. The raw `X` still holds
# string-valued categorical columns, which the estimator cannot convert to
# float, so fitting on it fails.
etc_best.fit(X_train_combined, y_train)

In [None]:
# Train the ExtraTreesClassifier on the scaled + one-hot-encoded training
# matrix. The raw `X` contains string-valued categorical columns that the
# estimator cannot fit on, and its column names would not line up with the
# encoded features.
etc_best.fit(X_train_combined, y_train)

# Get feature importances (one value per column of the fitted matrix)
importances = etc_best.feature_importances_

# Pair every importance with the name of the column it was computed for.
# Categorical features appear as individual 'cat_<feature>_<level>' columns.
feature_importances = list(zip(X_train_combined.columns, importances))

# Sort feature importances in descending order
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)

# Display the top two most important features
top_features = feature_importances[:2]
print("Two most important features:")
for feature, importance in top_features:
    print(f"{feature}: {importance}")