In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load data
data = pd.read_csv("/content/Churn_Modelling.csv")

# Display the first few rows of the dataset
print(data.head())


   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [None]:
# Check for missing values
print(data.isnull().sum())

# Check data types
print(data.dtypes)

# Check for unique values in the target column 'churn'
print(data['Exited'].value_counts())


RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64
RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object
Exited
0    7963
1    2037
Name: count, dtype: int64


In [None]:
# Fill missing values for numeric columns with their median
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Fill missing values for categorical columns with their mode
categorical_columns = data.select_dtypes(include=['object']).columns
data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])

# Verify there are no more missing values
print(data.isnull().sum())


RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [None]:
# Identify numeric and categorical features
numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
categorical_features = ['Geography', 'Gender']

# Define column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


In [None]:
# Separate features and target variable
X = data.drop('Exited', axis=1)
y = data['Exited']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [None]:
# Define a preprocessing and modeling pipeline
log_reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression(max_iter=1000))])

# Train the model
log_reg_pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = log_reg_pipeline.predict(X_test)

# Evaluation
print("Logistic Regression")
print(classification_report(y_test, y_pred_lr))
print("ROC AUC Score:", roc_auc_score(y_test, log_reg_pipeline.predict_proba(X_test)[:,1]))


Logistic Regression
              precision    recall  f1-score   support

           0       0.81      0.97      0.89      2389
           1       0.54      0.12      0.20       611

    accuracy                           0.80      3000
   macro avg       0.68      0.55      0.54      3000
weighted avg       0.76      0.80      0.75      3000

ROC AUC Score: 0.7642214486883759


In [None]:
# Define a preprocessing and modeling pipeline
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestClassifier(random_state=42))])

# Train the model
rf_pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_pipeline.predict(X_test)

# Evaluation
print("Random Forest")
print(classification_report(y_test, y_pred_rf))
print("ROC AUC Score:", roc_auc_score(y_test, rf_pipeline.predict_proba(X_test)[:,1]))


Random Forest
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      2389
           1       0.75      0.44      0.55       611

    accuracy                           0.85      3000
   macro avg       0.81      0.70      0.73      3000
weighted avg       0.84      0.85      0.84      3000

ROC AUC Score: 0.8484081089061363


In [None]:
# Define a preprocessing and modeling pipeline
gb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', GradientBoostingClassifier(random_state=42))])

# Train the model
gb_pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred_gb = gb_pipeline.predict(X_test)

# Evaluation
print("Gradient Boosting")
print(classification_report(y_test, y_pred_gb))
print("ROC AUC Score:", roc_auc_score(y_test, gb_pipeline.predict_proba(X_test)[:,1]))


Gradient Boosting
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      2389
           1       0.79      0.45      0.57       611

    accuracy                           0.86      3000
   macro avg       0.83      0.71      0.74      3000
weighted avg       0.86      0.86      0.85      3000

ROC AUC Score: 0.8672345084090406
