In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score


In [5]:
# Specify the file path in Colab where you uploaded the dataset
file_path = '/content/WA_Fn-UseC_-Telco-Customer-Churn.csv'  # Adjust the path as needed

# Load the dataset
data = pd.read_csv(file_path)


In [6]:
# Convert 'TotalCharges' to numeric and fill missing values with 0
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(0, inplace=True)

# Convert 'Churn' to binary values
data['Churn'] = (data['Churn'] == 'Yes').astype(int)


In [8]:
# Define the categorical and numerical features
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
               'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']


In [9]:
# Split the data into train and test sets
X = data[categorical + numerical]
y = data['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [11]:
# Scale numerical features using StandardScaler
scaler = StandardScaler()
X_train[numerical] = scaler.fit_transform(X_train[numerical])
X_test[numerical] = scaler.transform(X_test[numerical])

# One-hot encode categorical features
encoder = OneHotEncoder(sparse=False)
X_train_encoded = encoder.fit_transform(X_train[categorical])
X_test_encoded = encoder.transform(X_test[categorical])

# Get feature names using get_feature_names_out
feature_names = encoder.get_feature_names_out(input_features=categorical)
X_train_encoded = pd.DataFrame(X_train_encoded, columns=feature_names)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=feature_names)




In [12]:
X_train = pd.concat([X_train_encoded, X_train[numerical]], axis=1)
X_test = pd.concat([X_test_encoded, X_test[numerical]], axis=1)


In [14]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Create and train the HistGradientBoostingClassifier
hgb_model = HistGradientBoostingClassifier(random_state=1)
hgb_model.fit(X_train_encoded, y_train)




In [15]:
# Initialize and train the models
models = {
    'Extra Trees': ExtraTreesClassifier(random_state=1),
    'XGBoost': XGBClassifier(random_state=1),
    'LightGBM': LGBMClassifier(random_state=1),
    'HistGradientBoosting': HistGradientBoostingClassifier(random_state=1)  # Add HistGradientBoostingClassifier
}

results = {}

for name, model in models.items():
    model.fit(X_train_encoded, y_train)  # Use X_train_encoded
    y_pred = model.predict(X_test_encoded)  # Use X_test_encoded
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

# Print the results
for name, accuracy in results.items():
    print(f'{name} Accuracy: {accuracy:.4f}')


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Extra Trees Accuracy: 0.7566
XGBoost Accuracy: 0.7658
LightGBM Accuracy: 0.7885
HistGradientBoosting Accuracy: 0.7899


In [16]:
import warnings

# To ignore all warnings:
warnings.filterwarnings("ignore")

from sklearn.model_selection import cross_val_score, StratifiedKFold, LeaveOneOut
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Specify the cross-validation method (e.g., StratifiedKFold, LeaveOneOut)
cross_val_method = StratifiedKFold(n_splits=5)  # Use StratifiedKFold as an example

for name, model in models.items():
    # Cross-validation for the model
    cv_scores = cross_val_score(model, X_train_encoded, y_train, cv=cross_val_method, scoring='accuracy')

    # Calculate additional metrics
    cv_precision = cross_val_score(model, X_train_encoded, y_train, cv=cross_val_method, scoring='precision')
    cv_recall = cross_val_score(model, X_train_encoded, y_train, cv=cross_val_method, scoring='recall')
    cv_f1 = cross_val_score(model, X_train_encoded, y_train, cv=cross_val_method, scoring='f1')

    # Print the results
    print(f'{name} Cross-Validation Accuracy: {cv_scores.mean():.4f}')
    print(f'{name} Cross-Validation Precision: {cv_precision.mean():.4f}')
    print(f'{name} Cross-Validation Recall: {cv_recall.mean():.4f}')
    print(f'{name} Cross-Validation F1 Score: {cv_f1.mean():.4f}')


Extra Trees Cross-Validation Accuracy: 0.7503
Extra Trees Cross-Validation Precision: 0.5474
Extra Trees Cross-Validation Recall: 0.4326
Extra Trees Cross-Validation F1 Score: 0.4832
XGBoost Cross-Validation Accuracy: 0.7519
XGBoost Cross-Validation Precision: 0.5444
XGBoost Cross-Validation Recall: 0.4997
XGBoost Cross-Validation F1 Score: 0.5209
[LightGBM] [Info] Number of positive: 1217, number of negative: 3290
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 4507, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.270024 -> initscore=-0.994499
[LightGBM] [Info] Start training from score -0.994499
[LightGBM] [Info] Number of positive: 1217, number of negative: 3290
[LightGBM] [Info] Auto-choos

In [17]:
from sklearn.metrics import confusion_matrix

# Choose a model (e.g., HistGradientBoosting)
model = models['HistGradientBoosting']

# Train the model on the full training data
model.fit(X_train_encoded, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_encoded)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

print('Confusion Matrix:')
print(conf_matrix)


Confusion Matrix:
[[922 139]
 [157 191]]


In [19]:
import warnings

# To ignore all warnings:
warnings.filterwarnings("ignore")

from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Specify the models you want to evaluate
models = {
    'Extra Trees': ExtraTreesClassifier(random_state=1),
    'XGBoost': XGBClassifier(random_state=1),
    'LightGBM': LGBMClassifier(random_state=1),
    'HistGradientBoosting': HistGradientBoostingClassifier(random_state=1)
}

# Specify the cross-validation methods
cv_methods = {
    'K-Fold': KFold(n_splits=5, shuffle=True, random_state=1),  # K-Fold with 5 splits
    'Stratified K-Fold': StratifiedKFold(n_splits=5, shuffle=True, random_state=1),  # Stratified K-Fold
    'Leave-One-Out': LeaveOneOut()  # Leave-One-Out
}

# Loop over models and cross-validation methods
for model_name, model in models.items():
    print(f"Model: {model_name}")

    for cv_method_name, cv_method in cv_methods.items():
        print(f"Cross-Validation Method: {cv_method_name}")

        # Lists to store evaluation metrics across folds
        accuracy_scores = []
        precision_scores = []
        recall_scores = []
        f1_scores = []
        confusion_matrices = []

        for train_index, test_index in cv_method.split(X_train_encoded, y_train):
            X_train_cv, X_test_cv = X_train_encoded.iloc[train_index], X_train_encoded.iloc[test_index]
            y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

            # Train the model on the training set of the current fold
            model.fit(X_train_cv, y_train_cv)

            # Make predictions on the test set of the current fold
            y_pred_cv = model.predict(X_test_cv)

            # Calculate and store evaluation metrics
            accuracy_scores.append(accuracy_score(y_test_cv, y_pred_cv))
            precision_scores.append(precision_score(y_test_cv, y_pred_cv))
            recall_scores.append(recall_score(y_test_cv, y_pred_cv))
            f1_scores.append(f1_score(y_test_cv, y_pred_cv))
            confusion_matrices.append(confusion_matrix(y_test_cv, y_pred_cv))

        # Calculate and print mean and standard deviation of metrics across folds
        print(f"Mean Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
        print(f"Mean Precision: {np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}")
        print(f"Mean Recall: {np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}")
        print(f"Mean F1 Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

        # Optionally, you can print or analyze confusion matrices for each fold
        for i, cm in enumerate(confusion_matrices):
            print(f"Confusion Matrix (Fold {i + 1}):\n{cm}\n")

        print("========================================")


Model: Extra Trees
Cross-Validation Method: K-Fold
Mean Accuracy: 0.7449 ± 0.0063
Mean Precision: 0.5341 ± 0.0255
Mean Recall: 0.4325 ± 0.0303
Mean F1 Score: 0.4774 ± 0.0239
Confusion Matrix (Fold 1):
[[700 114]
 [165 148]]

Confusion Matrix (Fold 2):
[[708 131]
 [157 131]]

Confusion Matrix (Fold 3):
[[724 101]
 [182 120]]

Confusion Matrix (Fold 4):
[[706 117]
 [183 121]]

Confusion Matrix (Fold 5):
[[701 111]
 [176 138]]

Cross-Validation Method: Stratified K-Fold
Mean Accuracy: 0.7430 ± 0.0076
Mean Precision: 0.5302 ± 0.0185
Mean Recall: 0.4162 ± 0.0350
Mean F1 Score: 0.4658 ± 0.0272
Confusion Matrix (Fold 1):
[[705 118]
 [174 130]]

Confusion Matrix (Fold 2):
[[708 115]
 [182 122]]

Confusion Matrix (Fold 3):
[[702 121]
 [163 141]]

Confusion Matrix (Fold 4):
[[720 102]
 [174 131]]

Confusion Matrix (Fold 5):
[[718 104]
 [195 109]]

Cross-Validation Method: Leave-One-Out


KeyboardInterrupt: ignored

In [20]:
from sklearn.metrics import confusion_matrix

# Loop over models and calculate confusion matrices
for name, model in models.items():
    model.fit(X_train_encoded, y_train)  # Train the model on the full training data
    y_pred = model.predict(X_test_encoded)  # Make predictions on the test set
    conf_matrix = confusion_matrix(y_test, y_pred)  # Compute the confusion matrix

    print(f'Confusion Matrix for {name}:')
    print(conf_matrix)
    print()


Confusion Matrix for Extra Trees:
[[900 161]
 [182 166]]

Confusion Matrix for XGBoost:
[[882 179]
 [151 197]]

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001129 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Confusion Matrix for LightGBM:
[[914 147]
 [151 197]]

Confusion Matrix for HistGradientBoosting:
[[922 139]
 [157 191]]



In [21]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

# Create an Extra Trees Classifier
et_classifier = ExtraTreesClassifier(random_state=1)

# Define the hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Create the RandomizedSearchCV
randomized_search = RandomizedSearchCV(
    et_classifier,
    param_distributions=hyperparameter_grid,
    scoring='accuracy',
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=1
)

# Fit the RandomizedSearchCV to the data
randomized_search.fit(X_train_encoded, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:")
print(randomized_search.best_params_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters:
{'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


In [23]:
from sklearn.ensemble import ExtraTreesClassifier

# Define the best hyperparameters obtained from RandomizedSearchCV
best_hyperparameters = {
    'n_estimators': 1000,
    'min_samples_leaf': 2,
    'min_samples_split': 3,
    'max_features': 'auto'
}

# Create and train the ExtraTreesClassifier model with the best hyperparameters
optimal_et_classifier = ExtraTreesClassifier(random_state=1, **best_hyperparameters)
optimal_et_classifier.fit(X_train_encoded, y_train)

# Calculate the accuracy of the new optimal model
optimal_model_accuracy = optimal_et_classifier.score(X_test_encoded, y_test)

# Calculate the accuracy of the initial ExtraTreesClassifier model
initial_model_accuracy = models['Extra Trees'].score(X_test_encoded, y_test)

print(f"Accuracy of the New Optimal Model: {optimal_model_accuracy:.4f}")
print(f"Accuracy of the Initial ExtraTreesClassifier Model: {initial_model_accuracy:.4f}")

# Compare the accuracies
if optimal_model_accuracy > initial_model_accuracy:
    print("The accuracy of the new optimal model is higher than the initial model.")
elif optimal_model_accuracy < initial_model_accuracy:
    print("The accuracy of the new optimal model is lower than the initial model.")
else:
    print("The accuracy of the new optimal model is the same as the initial model.")


Accuracy of the New Optimal Model: 0.7771
Accuracy of the Initial ExtraTreesClassifier Model: 0.7566
The accuracy of the new optimal model is higher than the initial model.


In [24]:
# Fit the optimal ExtraTreesClassifier model on the entire training data
optimal_et_classifier.fit(X_train_encoded, y_train)

# Get feature importances
feature_importances = optimal_et_classifier.feature_importances_

# Create a DataFrame to associate feature names with importances
importance_df = pd.DataFrame({'Feature': X_train_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print the two most important features
print("Two Most Important Features:")
print(importance_df.head(2))


Two Most Important Features:
                    Feature  Importance
34  Contract_Month-to-month    0.124257
16        OnlineSecurity_No    0.061179


In [25]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Create and train the LGBM classifier
lgbm_classifier = LGBMClassifier(random_state=1)
lgbm_classifier.fit(X_train_encoded, y_train)

# Make predictions on the test set
y_pred = lgbm_classifier.predict(X_test_encoded)

# Calculate the accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy on the Test Set using LGBM Classifier: {accuracy:.4f}')


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Accuracy on the Test Set using LGBM Classifier: 0.7885
