In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [None]:
from google.colab import files
uploaded = files.upload()

# Move the kaggle.json file to the correct location
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Set permissions for the kaggle.json file
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
!kaggle datasets download -d shantanudhakadd/bank-customer-churn-prediction

Dataset URL: https://www.kaggle.com/datasets/shantanudhakadd/bank-customer-churn-prediction
License(s): other
Downloading bank-customer-churn-prediction.zip to /content
  0% 0.00/262k [00:00<?, ?B/s]
100% 262k/262k [00:00<00:00, 70.7MB/s]


In [None]:
# Unzip the downloaded dataset
!unzip /content/bank-customer-churn-prediction.zip -d /content

Archive:  /content/bank-customer-churn-prediction.zip
  inflating: /content/Churn_Modelling.csv  


In [None]:
!ls /content


bank-customer-churn-prediction.zip  Churn_Modelling.csv  kaggle.json  sample_data


In [None]:
data = pd.read_csv('/content/Churn_Modelling.csv')

In [None]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [None]:
data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [None]:
data.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

Data Cleaning and Preprocessing

In [None]:
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [None]:
label_encoder = LabelEncoder()
data['Geography'] = label_encoder.fit_transform(data['Geography'])
data['Gender'] = label_encoder.fit_transform(data['Gender'])

In [None]:
data.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [None]:
scaler = StandardScaler()
numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

In [None]:
x = data.drop(columns=['Exited'])
y = data['Exited']

Feature Engineering

In [None]:
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
x_poly = poly.fit_transform(x)

In [None]:
x_poly_df = pd.DataFrame(x_poly, columns=poly.get_feature_names_out(x.columns))
print(x_poly_df.head())

   CreditScore  Geography  Gender       Age    Tenure   Balance  \
0    -0.326221        0.0     0.0  0.293517 -1.041760 -1.225848   
1    -0.440036        2.0     0.0  0.198164 -1.387538  0.117350   
2    -1.536794        0.0     0.0  0.293517  1.032908  1.333053   
3     0.501521        0.0     0.0  0.007457 -1.387538 -1.225848   
4     2.063884        2.0     0.0  0.388871 -1.041760  0.785728   

   NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  ...  \
0      -0.911583        1.0             1.0         0.021886  ...   
1      -0.911583        0.0             1.0         0.216534  ...   
2       2.527057        1.0             0.0         0.240687  ...   
3       0.807737        0.0             0.0        -0.108918  ...   
4      -0.911583        1.0             1.0        -0.365276  ...   

   Balance NumOfProducts  Balance HasCrCard  Balance IsActiveMember  \
0               1.117463          -1.225848               -1.225848   
1              -0.106974           0.000

Model Training and Evaluation

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_poly, y, test_size=0.2, random_state=42)

In [None]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
y_pred_log_reg = log_reg.predict(x_test)
print("Logistic Regression Report")
print(classification_report(y_test, y_pred_log_reg))

Logistic Regression Report
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      1607
           1       0.68      0.32      0.44       393

    accuracy                           0.84      2000
   macro avg       0.77      0.64      0.67      2000
weighted avg       0.82      0.84      0.81      2000



In [None]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)
print("Random Forest Report")
print(classification_report(y_test, y_pred_rf))

Random Forest Report
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.73      0.48      0.58       393

    accuracy                           0.86      2000
   macro avg       0.81      0.72      0.75      2000
weighted avg       0.85      0.86      0.85      2000



In [None]:
# Gradient Boosting
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)
y_pred_gb = gb.predict(x_test)
print("Gradient Boosting Report")
print(classification_report(y_test, y_pred_gb))

Gradient Boosting Report
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.72      0.47      0.57       393

    accuracy                           0.86      2000
   macro avg       0.80      0.71      0.74      2000
weighted avg       0.85      0.86      0.85      2000



In [None]:
# Model comparison using ROC AUC
log_reg_auc = roc_auc_score(y_test, log_reg.predict_proba(x_test)[:, 1])
rf_auc = roc_auc_score(y_test, rf.predict_proba(x_test)[:, 1])
gb_auc = roc_auc_score(y_test, gb.predict_proba(x_test)[:, 1])

In [None]:
print(f"Logistic Regression AUC: {log_reg_auc}")
print(f"Random Forest AUC: {rf_auc}")
print(f"Gradient Boosting AUC: {gb_auc}")

Logistic Regression AUC: 0.7745993593549849
Random Forest AUC: 0.8462166950887577
Gradient Boosting AUC: 0.8714988971595327
