In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Convert 'TotalCharges' to numeric values and fill missing values with 0
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(0, inplace=True)

In [5]:
# Convert 'Churn' column to binary values
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

In [6]:
# Split the data into features and target variable
X = df.drop(columns=['customerID', 'Churn'])
y = df['Churn']

In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
# Define the features
categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [9]:
# Preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [10]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse=False))
])

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [12]:
# Combine numerical and categorical transformers into a single pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state=1))])

In [13]:
# Train the model
clf.fit(X_train, y_train)



## The accuracy on the test set using the random forest classifier

In [14]:
# Evaluate the model on the test set
accuracy = clf.score(X_test, y_test)
print(f'Random Forest Classifier Accuracy: {accuracy:.4f}')

Random Forest Classifier Accuracy: 0.7913


## The accuracy on the test set using the xgboost classifier

In [17]:
# Train the XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_classifier.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on the test set: {accuracy:.4f}')

Accuracy on the test set: 0.7970


## The accuracy on the test set using the LGBM classifier

In [18]:
# Train the LGBM classifier
lgbm_classifier = LGBMClassifier(random_state=1)
lgbm_classifier.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000873 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [19]:
# Make predictions on the test set
y_pred = lgbm_classifier.predict(X_test)

In [20]:
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on the test set: {accuracy:.4f}')

Accuracy on the test set: 0.8062


## Finding the best hyperparameters from the randomized search CV

In [21]:
# Define the hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features
}

In [22]:
# Initialize the ExtraTreesClassifier
etc = ExtraTreesClassifier(random_state=1)

In [23]:
# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=etc,
    param_distributions=hyperparameter_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=1
)

In [24]:
# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Administrator\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Administrator\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\Administrator\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Administrator\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constr

In [25]:
# Output the best hyperparameters
best_hyperparameters = random_search.best_params_
print("Best Hyperparameters:", best_hyperparameters)

Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2'}


## Checking whether the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning

In [26]:
# Initial ExtraTreesClassifier model with default hyperparameters
initial_et_classifier = ExtraTreesClassifier(random_state=1)
initial_et_classifier.fit(X_train, y_train)
initial_y_pred = initial_et_classifier.predict(X_test)
initial_accuracy = accuracy_score(y_test, initial_y_pred)
print(f'Initial model accuracy: {initial_accuracy:.4f}')

Initial model accuracy: 0.7857


In [27]:
# Get the best hyperparameters
best_params = random_search.best_params_
print(f'Best hyperparameters: {best_params}')

Best hyperparameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2'}


In [28]:
# Train a new ExtraTreesClassifier model with the best hyperparameters
optimal_et_classifier = ExtraTreesClassifier(**best_params, random_state=1)
optimal_et_classifier.fit(X_train, y_train)
optimal_y_pred = optimal_et_classifier.predict(X_test)
optimal_accuracy = accuracy_score(y_test, optimal_y_pred)
print(f'Optimal model accuracy: {optimal_accuracy:.4f}')

Optimal model accuracy: 0.8148


In [29]:
# Compare the accuracies
if optimal_accuracy > initial_accuracy:
    print('The optimal model has a higher accuracy.')
else:
    print('The initial model has a higher or equal accuracy.')

The optimal model has a higher accuracy.


## Finding the feature importance using the optimal ExtraTreesClassifier model

In [30]:
# Train a new ExtraTreesClassifier model with the best hyperparameters
optimal_et_classifier = ExtraTreesClassifier(**best_params, random_state=1)
optimal_et_classifier.fit(X_train, y_train)

In [31]:
# Get feature importances
feature_importances = optimal_et_classifier.feature_importances_

In [32]:
# Create a DataFrame for feature importances
features = X.columns
feature_importances_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})

In [33]:
# Sort the DataFrame by importance
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

# Print the two most important features
print(feature_importances_df.head(2))

     Feature  Importance
14  Contract    0.202584
4     tenure    0.119416
