Logistic Regression Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Load the dataset

In [None]:
# Location of the raw churn dataset, relative to the notebook's working directory
DATA_FILE = 'Bank Customer Churn Prediction Classification Dataset.csv'

# Read the CSV into the DataFrame that the preprocessing step works on
df = pd.read_csv(DATA_FILE)

# Preview the first ten rows
df.head(10)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure (From how many years he/she is having bank acc in ABC Bank),Account balance,products_number (Number of Product from bank),credit_card (Is this customer have credit card ?),active_member (Is he/she is active Member of bank ?),estimated_salary,churn (Churn Status)
0,15634602,619,France,Female,42,2,0.0,1,1,1,,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,15574012,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,15592531,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,15656148,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,15792365,501,,Male,44,4,142051.07,2,0,1,74940.5,0
9,15592389,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


Data Preprocessing

In [None]:
# Name of the label column; it is deliberately kept out of the imputation step
target_column = 'churn (Churn Status)'

# Drop irrelevant columns (the customer identifier is not used as a feature)
df = df.drop(columns=['customer_id'])

# Rows without a label cannot be used for supervised training or evaluation.
# Dropping them also prevents the label from ever being mean-imputed into a
# fractional value that is no longer a valid class.
df = df.dropna(subset=[target_column])

# One-hot encode categorical variables (drop_first removes the first level of
# each encoded column so the remaining indicators are not perfectly collinear)
categorical_columns = ['country', 'gender']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Fill missing values in numerical feature columns with the column mean.
# The label column is excluded so that only features are imputed.
numerical_columns = [
    column
    for column in df.select_dtypes(include=['float64', 'int64']).columns
    if column != target_column
]
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

# Split the data into features (X) and target variable (y)
X = df.drop(columns=[target_column])
y = df[target_column]

# Scale numerical features.
# NOTE: the imputation means above and the scaler statistics here are computed
# over all rows, including any rows that are later held out for testing.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Display the first few rows of the DataFrame
df.head(10)

Unnamed: 0,credit_score,age,tenure (From how many years he/she is having bank acc in ABC Bank),Account balance,products_number (Number of Product from bank),credit_card (Is this customer have credit card ?),active_member (Is he/she is active Member of bank ?),estimated_salary,churn (Churn Status),country_?,country_France,country_Germany,country_Spain,gender_Male
0,619,42,2,0.0,1,1,1,100063.579469,1,0,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,1,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,0,0,1,0
5,645,44,8,113755.78,2,1,0,149756.71,1,0,0,0,1,1
6,822,50,7,0.0,2,1,1,10062.8,0,0,1,0,0,1
7,376,29,4,115046.74,4,1,0,119346.88,1,0,0,1,0,0
8,501,44,4,142051.07,2,0,1,74940.5,0,0,0,0,0,1
9,684,27,2,134603.88,1,1,1,71725.73,0,0,1,0,0,1


Training the model

In [None]:
# Split the unscaled features into training and testing sets (80% training, 20% testing).
# The split is done before scaling so that the scaler never sees the test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transformation
# to the test rows. Fitting on the full dataset would let test-set means and
# variances influence the training features (data leakage).
train_scaler = StandardScaler()
X_train = train_scaler.fit_transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

# Model Implementation - Logistic Regression
logistic_model = LogisticRegression(random_state=42)

# Train the model on the training set
logistic_model.fit(X_train, y_train)

# Make predictions on the testing set
logistic_pred = logistic_model.predict(X_test)

Evaluation

In [None]:
# Score the baseline predictions against the held-out labels
accuracy = accuracy_score(y_true=y_test, y_pred=logistic_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=logistic_pred)
classification_rpt = classification_report(y_true=y_test, y_pred=logistic_pred)

# Assemble the report sections in display order and emit them in one go
report_sections = [
    f"Accuracy of Logistic Regression Model: {accuracy}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{classification_rpt}",
]
print("\n".join(report_sections))

Accuracy of Logistic Regression Model: 0.8121878121878122
Confusion Matrix:
[[1553   76]
 [ 300   73]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      1629
           1       0.49      0.20      0.28       373

    accuracy                           0.81      2002
   macro avg       0.66      0.57      0.59      2002
weighted avg       0.77      0.81      0.78      2002



Hyperparameter Tuning

In [None]:
# Hyperparameters to tune
# Inverse regularization strengths shared by every solver
c_values = [0.001, 0.01, 0.1, 1, 10, 100]

# A list of grids instead of one Cartesian product: the 'lbfgs' solver rejects
# the 'l1' penalty, so crossing every solver with every penalty produces fits
# that fail and are scored as NaN. Each sub-grid below only contains
# solver/penalty pairs that can actually be fitted.
param_grid = [
    {
        'C': c_values,  # Regularization parameter
        'penalty': ['l1', 'l2'],  # Regularization penalty (L1 or L2)
        'solver': ['liblinear', 'saga'],  # Solvers that handle both penalties
    },
    {
        'C': c_values,  # Regularization parameter
        'penalty': ['l2'],  # lbfgs supports only the L2 (or no) penalty
        'solver': ['lbfgs'],
    },
]

# Initialize GridSearchCV (5-fold cross-validation, selecting on accuracy)
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5, scoring='accuracy')

# Perform Grid Search to find the best hyperparameters
grid_search.fit(X_train, y_train)

30 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.80322392        nan 0.80259931 0.8

Evaluation using the best model

In [None]:
# Get the best hyperparameters and model.
# GridSearchCV was created with its default refit behaviour, so best_estimator_
# has already been refit on the whole training set with the best parameters;
# fitting it again here would only repeat that work.
best_param = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions using the best model
best_pred = best_model.predict(X_test)

# Evaluation for the best model
accuracy_best = accuracy_score(y_test, best_pred)
conf_matrix_best = confusion_matrix(y_test, best_pred)
classification_rpt_best = classification_report(y_test, best_pred)

# Print the best hyperparameters and evaluation metrics
print("Best Hyperparameters:", best_param)
print(f"Accuracy with Best Model: {accuracy_best}")
print(f"Confusion Matrix with Best Model:\n{conf_matrix_best}")
print(f"Classification Report with Best Model:\n{classification_rpt_best}")

Best Hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy with Best Model: 0.8126873126873126
Confusion Matrix with Best Model:
[[1554   75]
 [ 300   73]]
Classification Report with Best Model:
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      1629
           1       0.49      0.20      0.28       373

    accuracy                           0.81      2002
   macro avg       0.67      0.57      0.59      2002
weighted avg       0.77      0.81      0.78      2002

