In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [8]:
# Load the data set
# Data set from: https://www.kaggle.com/datasets/blastchar/telco-customer-churn
import os

from google.colab import files

# Only prompt for an upload when the CSV is not already in the working
# directory. Re-uploading an existing file makes Colab save a renamed copy
# ("... (1).csv") that the read_csv call using the original file name never
# reads, and the interactive prompt blocks "Restart & Run All".
if os.path.exists("WA_Fn-UseC_-Telco-Customer-Churn.csv"):
    uploaded = {}  # nothing new was uploaded; keep the name defined for later cells
else:
    uploaded = files.upload()

Saving WA_Fn-UseC_-Telco-Customer-Churn.csv to WA_Fn-UseC_-Telco-Customer-Churn (1).csv


In [55]:
# Read the Telco churn CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [60]:
# Fit every candidate classifier on the training split, then print its
# per-class precision / recall / F1 on the held-out test split.
for name, model in models.items():
    # Assumes each entry in `models` is a scikit-learn estimator, whose fit()
    # returns the fitted estimator itself, so predict() can be chained.
    predictions = model.fit(x_train, y_train).predict(x_test)
    print(f"Model: {name}")
    print(classification_report(y_test, predictions))
    print("=" * 50)

Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1036
           1       0.69      0.56      0.62       373

    accuracy                           0.82      1409
   macro avg       0.77      0.74      0.75      1409
weighted avg       0.81      0.82      0.81      1409

Model: Random Forest
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1036
           1       0.67      0.49      0.57       373

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409

Model: Gradient Boosting
              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1036
           1       0.69      0.53      0.60       373

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.74      1409
w

In [64]:
# Models Performances:

# In terms of accuracy, SVM and logistic regression perform the best, both achieving an accuracy of 0.82.
# Logistic regression and SVM have higher precision for class 0 (non-churned customers) than the other models, meaning fewer churned customers are wrongly predicted as non-churned.
# Gradient boosting and logistic regression have the highest recall for class 1 (churned customers), indicating better detection of actual churn cases.
# Random forest has lower precision and recall for class 1 compared to other models.
# Overall, logistic regression, SVM, and gradient boosting perform similarly in terms of accuracy and F1-score, while random forest lags slightly behind.

In [63]:
# 5. Tuning & Feature Engineering.
# We tune the hyperparameters of SVM and logistic regression, using a cross-validated search (GridSearchCV / RandomizedSearchCV) to find the best settings.
# For feature engineering, we can try adding interaction terms or polynomial features to capture non-linear relationships.

In [66]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

In [67]:
# Feature engineering: expand X into every polynomial combination of its
# columns up to degree 2 (the transformer's defaults also add a bias column).
poly = PolynomialFeatures(degree=2).fit(X)
X_poly = poly.transform(X)

In [71]:
# Split the polynomial features data.
# test_size and random_state must match the split that produced
# x_train / y_train so the rows line up with those labels — TODO confirm
# against that earlier cell.
# The label halves are kept under their own names (instead of being discarded
# into `_`) so they can be used or compared directly; assigning to `_` would
# also stop IPython from updating its last-output variable.
x_train_poly, x_test_poly, y_train_poly, y_test_poly = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)

In [73]:
# LogisticRegression is not among the imports in the first cell; import it here
# so this cell does not depend on another cell having done so.
from sklearn.linear_model import LogisticRegression
# RandomizedSearchCV stays imported because this cell originally provided it.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Logistic Regression Hyperparameter Tuning
# The grid holds only 3 x 2 x 1 = 6 candidates, so all of them are evaluated
# with an exhaustive grid search. (A randomized search with n_iter=5 leaves one
# candidate untested, and which one depends on the random seed.)
log_reg_param_grid = {
    'C': [0.1, 1, 10],           # inverse regularization strength
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']      # supports both the l1 and l2 penalties
}

log_reg_grid_search = GridSearchCV(LogisticRegression(random_state=42), log_reg_param_grid,
                                   cv=3, scoring='accuracy')
# y_train comes from the earlier train/test split; it is assumed to be
# row-aligned with x_train_poly (same test_size / random_state) — TODO confirm.
log_reg_grid_search.fit(x_train_poly, y_train)

# Backward-compatible alias: the fitted search was previously stored under this name.
log_reg_random_search = log_reg_grid_search

print("Best Logistic Regression Parameters:", log_reg_grid_search.best_params_)


Best Logistic Regression Parameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.1}


In [74]:
# Here's a breakdown of the best parameters:

# Solver: 'liblinear': Think of the solver as the strategy the logistic regression model uses to find the best fit for the data.
# 'liblinear' was the only solver included in the search, so it is selected by construction; it is a reasonable choice for a smaller dataset like this one.

# Penalty: 'l1': The penalty is a regularization technique used to prevent overfitting, which is when a model learns too much from the training data and performs poorly on new data.
# The 'l1' penalty can shrink the coefficients of less useful features all the way to zero, so the model relies only on the most important features and stays simpler.

# C: 0.1: C is a value that controls the amount of regularization applied to the model. A smaller C value means stronger regularization,
# which helps prevent the model from becoming too complex and overfitting the data. In this case,
# a value of 0.1 indicates that the model should prioritize simplicity.


In [75]:
# Putting it all together;
# the best logistic regression model found for this dataset uses the 'liblinear' solver (the only solver searched),
# focuses on the most important features ('l1' penalty), and prioritizes simplicity over complexity (small C value of 0.1).
# This combination should result in a model that is easier to understand and performs well on new dat