# CUSTOMER CHURN PREDICTION

# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Data Preprocessing

In [2]:
# Load the dataset and discard identifier columns that are dropped before modelling.
# The drop is chained (no inplace mutation) so `data` is produced in one step.
data = pd.read_csv('Churn_Modelling.csv').drop(
    columns=['RowNumber', 'CustomerId', 'Surname']
)

# Encode categorical variables as integer codes.
# Each column gets its OWN fitted encoder, stored in `label_encoders`, so the
# code -> label mapping of every column stays recoverable via
# `label_encoders[col].inverse_transform(...)`. Re-fitting a single shared
# encoder would overwrite the mapping learned for the earlier column.
# After the loop `label_encoder` refers to the encoder fitted on 'Gender'.
label_encoders = {}
for column in ('Geography', 'Gender'):
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# Define features and target variable
X = data.drop(columns='Exited')
y = data['Exited']

# Split the data into training (80%) and test (20%) sets with a fixed seed.
# NOTE(review): the split is not stratified on `y`; if the classes are
# imbalanced, consider `stratify=y` (this would change the recorded metrics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split, so test statistics do not leak into
# training. Both results are NumPy arrays (column names are not kept).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression

In [3]:
# Fit the logistic-regression baseline on the standardized training data,
# then predict labels for the test set (metrics are computed further below).
log_reg = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

# Random Forest

In [4]:
# Fit the random-forest baseline (default hyperparameters, fixed seed) and
# predict labels for the test set (metrics are computed further below).
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf_clf = rf_clf.predict(X_test)

# Gradient Boosting

In [5]:
# Fit the gradient-boosting baseline (default hyperparameters, fixed seed) and
# predict labels for the test set (metrics are computed further below).
gb_clf = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred_gb_clf = gb_clf.predict(X_test)

In [6]:
# Shared scoring helper used for every model in this notebook
def evaluate_model(y_test, y_pred):
    """Score predictions against the true labels.

    Parameters
    ----------
    y_test : true target labels.
    y_pred : labels predicted by a model for the same samples.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` in that order, each computed
        with the corresponding scikit-learn metric at its default settings.
    """
    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(metric(y_test, y_pred) for metric in metric_functions)

# Score the three baseline models on the held-out test set and report
# accuracy, precision, recall and F1 for each.

# Logistic Regression
log_reg_metrics = evaluate_model(y_test, y_pred_log_reg)
(acc_log_reg, prec_log_reg, rec_log_reg, f1_log_reg) = log_reg_metrics
print(f'Logistic Regression - Accuracy: {acc_log_reg}, Precision: {prec_log_reg}, Recall: {rec_log_reg}, F1 Score: {f1_log_reg}')

# Random Forest
rf_clf_metrics = evaluate_model(y_test, y_pred_rf_clf)
(acc_rf_clf, prec_rf_clf, rec_rf_clf, f1_rf_clf) = rf_clf_metrics
print(f'Random Forest - Accuracy: {acc_rf_clf}, Precision: {prec_rf_clf}, Recall: {rec_rf_clf}, F1 Score: {f1_rf_clf}')

# Gradient Boosting
gb_clf_metrics = evaluate_model(y_test, y_pred_gb_clf)
(acc_gb_clf, prec_gb_clf, rec_gb_clf, f1_gb_clf) = gb_clf_metrics
print(f'Gradient Boosting - Accuracy: {acc_gb_clf}, Precision: {prec_gb_clf}, Recall: {rec_gb_clf}, F1 Score: {f1_gb_clf}')


Logistic Regression - Accuracy: 0.815, Precision: 0.5966386554621849, Recall: 0.1806615776081425, F1 Score: 0.27734375
Random Forest - Accuracy: 0.8645, Precision: 0.7479674796747967, Recall: 0.4681933842239186, F1 Score: 0.5758998435054773
Gradient Boosting - Accuracy: 0.8655, Precision: 0.7540983606557377, Recall: 0.4681933842239186, F1 Score: 0.5777080062794349


# Hyperparameter Tuning

In [7]:
# Hyperparameter search space for the Random Forest model.
# (In the recorded baseline output Gradient Boosting scored marginally higher;
# Random Forest is simply the model being tuned here.)
# 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive grid search with 3-fold cross-validation on all available cores.
# `scoring='accuracy'` is what a classifier is scored with by default; it is
# spelled out so the model-selection criterion is visible to the reader.
# GridSearchCV works on clones of `rf_clf`, so the fitted baseline is untouched.
grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Fit the grid search on the training split only
grid_search.fit(X_train, y_train)

# Report the best parameter combination found by cross-validation
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# Evaluate the best model (refit on the full training split) on the test set
best_rf_clf = grid_search.best_estimator_
y_pred_best_rf_clf = best_rf_clf.predict(X_test)
acc_best_rf_clf, prec_best_rf_clf, rec_best_rf_clf, f1_best_rf_clf = evaluate_model(y_test, y_pred_best_rf_clf)
print(f'Best Random Forest - Accuracy: {acc_best_rf_clf}, Precision: {prec_best_rf_clf}, Recall: {rec_best_rf_clf}, F1 Score: {f1_best_rf_clf}')


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Best Random Forest - Accuracy: 0.867, Precision: 0.7656903765690377, Recall: 0.46564885496183206, F1 Score: 0.579113924050633
