In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [12]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].dtype

dtype('float64')

In [11]:
df['TotalCharges'].isnull().sum()

0

In [10]:
df['TotalCharges'].fillna(value=0, inplace=True)

In [13]:
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

In [16]:
df['Churn'].head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Select Features
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
               'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [17]:
# to split the data into features and target
X = df[categorical + numerical]
y = df['Churn']

In [18]:
# to split the data into training and testing sets with an 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [19]:
#  to scale Numerical Features
scaler = StandardScaler()
X_train_numerical = pd.DataFrame(scaler.fit_transform(X_train[numerical]), columns=numerical, index=X_train.index)
X_test_numerical = pd.DataFrame(scaler.transform(X_test[numerical]), columns=numerical, index=X_test.index)

In [20]:
# One-Hot Encode Categorical Features
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_train_categorical = pd.DataFrame(encoder.fit_transform(X_train[categorical]), 
                                   columns=encoder.get_feature_names_out(categorical), 
                                   index=X_train.index)
X_test_categorical = pd.DataFrame(encoder.transform(X_test[categorical]), 
                                  columns=encoder.get_feature_names_out(categorical), 
                                  index=X_test.index)

In [21]:
# to combine Scaled Numerical and One-Hot Encoded Categorical Features
X_train_final = pd.concat([X_train_numerical, X_train_categorical], axis=1)
X_test_final = pd.concat([X_test_numerical, X_test_categorical], axis=1)

In [22]:

#  to verify the shapes
print("Training Features Shape:", X_train_final.shape)
print("Testing Features Shape:", X_test_final.shape)

print("\nTraining Features Head:")
print(X_train_final.head())

print("\nTesting Features Head:")
print(X_test_final.head())

Training Features Shape: (5634, 30)
Testing Features Shape: (1409, 30)

Training Features Head:
        tenure  MonthlyCharges  TotalCharges  gender_Male  SeniorCitizen_1  \
1814 -0.825884       -1.497530     -0.890947          1.0              0.0   
5946  0.395961        0.302996      0.389693          0.0              0.0   
3881  1.577078        0.012320      1.060945          1.0              0.0   
2389  1.577078        0.686687      1.775397          1.0              0.0   
3676 -0.092777        0.186726     -0.102671          1.0              0.0   

      Partner_Yes  Dependents_Yes  PhoneService_Yes  \
1814          1.0             1.0               1.0   
5946          0.0             0.0               1.0   
3881          1.0             0.0               1.0   
2389          1.0             1.0               1.0   
3676          0.0             0.0               1.0   

      MultipleLines_No phone service  MultipleLines_Yes  ...  \
1814                             0.0    

In [24]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [25]:
#  to initialize classifiers
models = {
    "RandomForest": RandomForestClassifier(random_state=1),
    "ExtraTrees": ExtraTreesClassifier(random_state=1),
    "XGBoost": XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(random_state=1)
}

# to train and evaluate the models
results = {}
for name, model in models.items():
    model.fit(X_train_final, y_train)
    y_pred = model.predict(X_test_final)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

#  to print results
for name, metrics in results.items():
    print(f"\n{name} Results:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002780 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 638
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785

RandomForest Results:
Accuracy: 0.8006
Precision: 0.6057
Recall: 0.5517
F1 Score: 0.5774

ExtraTrees Results:
Accuracy: 0.7828
Precision: 0.5673
Recall: 0.5086
F1 Score: 0.5364

XGBoost Results:
Accuracy: 0.7991
Precision: 0.5988
Recall: 0.5661
F1 Score: 0.5820

LightGBM Results:
Accuracy: 0.8148
Precision: 0.6390
Recall: 0.5747
F1 Score: 0.6051


In [30]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

# defining the hyperparameters
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 

# creating the hyperparameter grid
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

# to initialize the ExtraTreesClassifier
extra_trees = ExtraTreesClassifier(random_state=1)

# Initialize RandomizedSearchCV
randomized_search = RandomizedSearchCV(estimator=extra_trees,
                                       param_distributions=hyperparameter_grid,
                                       n_iter=10,
                                       scoring='accuracy',
                                       cv=5,
                                       n_jobs=-1,
                                       verbose=1,
                                       random_state=1,
                                       error_score='raise')

# to fit the RandomizedSearchCV object to the data
randomized_search.fit(X_train, y_train)  # Assuming X_train and y_train are defined

# Retrieve the best hyperparameters
best_hyperparameters = randomized_search.best_params_
print("Best hyperparameters:", best_hyperparameters)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


ValueError: could not convert string to float: 'Female'