In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings to keep the notebook output clean
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the churn dataset from an absolute Kaggle input path.
# NOTE(review): presumably this only resolves inside a Kaggle kernel with the dataset attached — confirm.
df = pd.read_csv('/kaggle/input/churn-bank-customer/Churn_Modelling.csv')

In [None]:
df.head()

In [None]:
df.info()

# Missing and Duplicated Rows

In [None]:
# Drop the RowNumber and CustomerId columns (identifier-like by name).
# Reassign instead of using inplace=True, and ignore already-missing columns,
# so the cell can be re-run without raising KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId'], errors='ignore')

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
df.columns

# Categorical columns

In [None]:
# Columns treated as categorical; this list is passed to plot_cat below.
# NOTE(review): 'Surname' looks like free text and is likely high-cardinality, so its count plot may be unreadable — confirm.
catagorical_cols = ['Surname','Geography', 'Gender','NumOfProducts', 'HasCrCard', 'IsActiveMember','Exited','Tenure']

## Count Plots of Categorical Columns

In [None]:
import math

def plot_cat(columns, df):
    """Draw one count plot per column, laid out on a two-column grid.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns in ``df`` to plot.
    df : pandas.DataFrame
        Frame that holds the columns.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``. Nothing is drawn when
        ``columns`` is empty.
    """
    columns = list(columns)
    if not columns:
        # Nothing to plot; avoid building a zero-row figure.
        return

    num_cols = 2  # Number of subplots per row
    num_rows = math.ceil(len(columns) / num_cols)

    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(
        num_rows, num_cols,
        figsize=(num_cols * 5, num_rows * 4),
        squeeze=False,
    )
    flat_axes = axes.ravel()

    for ax, col in zip(flat_axes, columns):
        # Bars are ordered from most to least frequent category.
        sns.countplot(data=df, x=col, order=df[col].value_counts().index, ax=ax)
        ax.set_title(col)
        ax.tick_params(axis='x', rotation=45)

    # Hide the unused grid cell when the number of columns is odd.
    for ax in flat_axes[len(columns):]:
        ax.set_visible(False)

    # Layout is computed once, after every subplot exists.
    fig.tight_layout()
    plt.show()


In [None]:
plot_cat(catagorical_cols,df)

## Report:
- The data covers three countries: France, Germany, and Spain.
- The Surname column does not seem to be of much use.
- Most customers hold 1 or 2 products; far fewer use 3 or 4.
- More than twice as many customers hold a credit card as do not.
- Active and inactive members are roughly balanced.
- Exited customers, on the other hand, are heavily imbalanced: fewer than 2,000 of the 10,000 customers in the dataset have exited.

In [None]:
# Drop the Surname column. Reassign instead of using inplace=True, and ignore
# an already-missing column, so the cell can be re-run without raising KeyError.
df = df.drop(columns='Surname', errors='ignore')

## Plotting relationships between categorical columns

In [None]:
# Categorical column list without 'Surname', which the preceding cell drops from df.
catagorical_cols = ['Geography', 'Gender','NumOfProducts', 'HasCrCard', 'IsActiveMember','Exited','Tenure']

import itertools

def plot_cat_vs_cat(columns, df):
    """Draw a grouped count plot for every unordered pair of columns.

    For each pair ``(col1, col2)`` the first column is placed on the x-axis
    and the second is used as the hue.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns in ``df`` to pair up.
    df : pandas.DataFrame
        Frame that holds the columns.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``. Nothing is drawn when fewer
        than two columns are given.
    """
    pairs = list(itertools.combinations(columns, 2))
    if not pairs:
        # Fewer than two columns: no pairs, so avoid a zero-row figure.
        return

    num_cols = 2  # 2 plots per row
    num_rows = math.ceil(len(pairs) / num_cols)

    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(
        num_rows, num_cols,
        figsize=(num_cols * 6, num_rows * 5),
        squeeze=False,
    )
    flat_axes = axes.ravel()

    for ax, (col1, col2) in zip(flat_axes, pairs):
        sns.countplot(data=df, x=col1, hue=col2, ax=ax)
        ax.set_title(f"{col1} vs {col2}")
        ax.tick_params(axis='x', rotation=45)

    # Hide the unused grid cell when the number of pairs is odd.
    for ax in flat_axes[len(pairs):]:
        ax.set_visible(False)

    # Layout is computed once, after every subplot exists.
    fig.tight_layout()
    plt.show()


In [None]:
plot_cat_vs_cat(catagorical_cols,df)

## Report 
- Customers using more products have a higher exit rate

## Numerical Columns

In [None]:
# Numeric columns; this list is passed to plot_num_cols, plot_num_vs_num and plot_num_vs_cat below.
num_cols = ['CreditScore','Age',
       'Balance','EstimatedSalary']

In [None]:
def plot_num_cols(columns, df):
    """Show a histogram (with KDE) and a boxplot side by side for each column.

    One figure is created and displayed per entry in ``columns``; the
    column name is used as the figure-level title.
    """
    for feature in columns:
        figure, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

        # Figure-level heading, nudged above the two panels
        figure.suptitle(feature, fontsize=14, fontweight='bold', y=1.05)

        # Left panel: distribution
        sns.histplot(df[feature], kde=True, bins=30, ax=hist_ax)
        hist_ax.set(title='Histogram', xlabel=feature, ylabel='Frequency')

        # Right panel: spread and outliers
        sns.boxplot(y=df[feature], ax=box_ax)
        box_ax.set(title='Boxplot', ylabel=feature)

        plt.tight_layout()
        plt.show()


In [None]:
plot_num_cols(num_cols,df
             )

## Report

## Numerical vs Numerical

In [None]:
def plot_num_vs_num(columns, df):
    """Draw a scatter plot for every unordered pair of numeric columns.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns in ``df`` to pair up.
    df : pandas.DataFrame
        Frame that holds the columns.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``. Nothing is drawn when fewer
        than two columns are given.
    """
    pairs = list(itertools.combinations(columns, 2))
    if not pairs:
        # Fewer than two columns: no pairs, so avoid a zero-row figure.
        return

    num_cols = 2
    num_rows = math.ceil(len(pairs) / num_cols)

    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(
        num_rows, num_cols,
        figsize=(num_cols * 6, num_rows * 5),
        squeeze=False,
    )
    flat_axes = axes.ravel()

    for ax, (col1, col2) in zip(flat_axes, pairs):
        sns.scatterplot(data=df, x=col1, y=col2, ax=ax)
        ax.set_title(f"{col1} vs {col2}")

    # Hide the unused grid cell when the number of pairs is odd.
    for ax in flat_axes[len(pairs):]:
        ax.set_visible(False)

    # Layout is computed once, after every subplot exists.
    fig.tight_layout()
    plt.show()

In [None]:
plot_num_vs_num(num_cols,df)

In [None]:
def plot_num_vs_cat(num_cols, cat_cols, df):
    """Compare each numeric column against each categorical column.

    For every (categorical, numeric) combination one figure with two panels
    is shown: a per-category KDE of the numeric column on the left, and a
    bar plot of its per-category mean with standard-deviation error bars on
    the right.

    Parameters
    ----------
    num_cols : iterable of str
        Numeric column names in ``df``.
    cat_cols : iterable of str
        Categorical column names in ``df``.
    df : pandas.DataFrame
        Frame that holds the columns.

    Returns
    -------
    None
        Each figure is shown via ``plt.show()``.
    """
    for cat_col, num_col in itertools.product(cat_cols, num_cols):
        fig, (dist_ax, mean_ax) = plt.subplots(1, 2, figsize=(12, 5))

        # Title for this pair
        fig.suptitle(f'{num_col} vs {cat_col}', fontsize=14, fontweight='bold')

        # --- Left: KDE per category; common_norm=False normalises each
        # category separately so small groups remain visible ---
        sns.kdeplot(x=df[num_col], hue=df[cat_col], common_norm=False, ax=dist_ax)
        dist_ax.set_title(f'Distribution of {num_col} by {cat_col}')

        # --- Right: mean of num_col per category with +/- one standard deviation.
        # `errorbar='sd'` replaces the deprecated `ci='sd'` argument. ---
        sns.barplot(data=df, x=cat_col, y=num_col, estimator='mean',
                    errorbar='sd', ax=mean_ax)
        mean_ax.set_title(f'Mean {num_col} by {cat_col}')
        mean_ax.tick_params(axis='x', rotation=45)

        fig.tight_layout()
        plt.show()

In [None]:
plot_num_vs_cat(num_cols,catagorical_cols,df)

In [None]:
sns.heatmap(df.corr(numeric_only = True),annot = True)

## Report
- Customers aged 40 to 60 exit more often (Age vs Exited)
- Customers with higher balances exit more often, suggesting they are less satisfied (Balance vs Exited)
- Germany has a higher exit rate

In [None]:
df_mid_elder = df[(df['Age'] >=40) & (df['Age'] <= 60)]

In [None]:
prob1 = len(df_mid_elder[df_mid_elder['Exited'] == 1]) / len(df_mid_elder)
prob1

In [None]:
# Complement of the 40-60 (inclusive) age group. Strict inequalities keep
# customers aged exactly 40 or 60 out of this group, so they are not counted
# in both the "40 to 60" and the "not 40 to 60" exit rates.
df_nmid_elder = df[(df['Age'] < 40) | (df['Age'] > 60)]

In [None]:
prob2 = len(df_nmid_elder[df_nmid_elder['Exited'] == 1]) / len(df_nmid_elder)
prob2

In [None]:
print('Percentage of people exiting the bank in age of 40 to 60 is', prob1 * 100)
print('Percentage of people exiting the bank in age of not in age group of 40 to 60 is', prob2 * 100)

In [None]:
df_without0 = df[df['Balance'] != 0]

In [None]:
df_without0.describe()

In [None]:
def findProb(df, x):
    """Return the exit rates on either side of a balance threshold.

    The frame is split into rows with ``Balance <= x`` and rows with
    ``Balance > x``. For each part, the share of rows with ``Exited == 1``
    is computed; an empty part yields 0.

    Returns
    -------
    tuple
        ``(rate_at_or_below_x, rate_above_x)``
    """
    def _exit_share(subset):
        # Share of exited rows in `subset`; 0 when there are no rows at all.
        n_rows = len(subset)
        if n_rows == 0:
            return 0
        n_exited = len(subset[subset['Exited'] == 1])
        return n_exited / n_rows

    at_or_below = df[df['Balance'] <= x]
    above = df[df['Balance'] > x]

    return _exit_share(at_or_below), _exit_share(above)

    

In [None]:
findProb(df_without0,	100181.975000)

In [None]:
findProb(df_without0,	119839.690000)

In [None]:
findProb(df_without0,	139512.290000)


#### As the balance increases, the probability of exiting also increases

In [None]:
df.columns

In [None]:
len(df) - len(df_without0)

In [None]:
# Mark zero balances as missing so they can be imputed afterwards.
# Assign the result back to the column: an in-place replace on df['Balance']
# is chained assignment, which does not update df under pandas Copy-on-Write.
df['Balance'] = df['Balance'].replace(0, np.nan)

In [None]:
df['Balance'].describe()

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# KNN imputation needs other features to decide which customers are "near".
# With 'Balance' as the only column, every missing row has no measurable
# distance and KNNImputer falls back to the column mean.
knn_features = ['CreditScore', 'Age', 'Tenure', 'NumOfProducts', 'EstimatedSalary', 'Balance']

# Standardise first so large-valued columns do not dominate the distance.
# StandardScaler ignores NaN when fitting and keeps it when transforming.
knn_scaler = StandardScaler()
knn_scaled = knn_scaler.fit_transform(df[knn_features])

imputer = KNNImputer(n_neighbors=5)
knn_filled = knn_scaler.inverse_transform(imputer.fit_transform(knn_scaled))

# Keep only the imputed balance, converted back to its original units.
df['Balance_imputed'] = knn_filled[:, knn_features.index('Balance')]

In [None]:
df[['Balance_imputed']].hist()

In [None]:
df['Balance'].hist()

In [None]:
df

In [None]:
sns.scatterplot(data = df,x = 'Balance',y = 'Age', hue = 'Exited',alpha = 0.7)

In [None]:
sns.scatterplot(data = df,x = 'CreditScore',y = 'Age', hue = 'Exited',alpha = 0.7)


In [None]:
import time
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score,f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
# The support-vector entry is an SVC (classifier), so it is labelled "SVC"
# rather than "SVR", which is the name of the regressor.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=4, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=10, weights="distance", metric="minkowski"),
    "SVC": SVC(kernel='rbf', C=100, gamma='scale'),
    "Neural Network (MLP)": MLPClassifier(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', max_iter=1000, random_state=42)
}

In [None]:
    # --- Build the modelling table ---------------------------------------
    # 'Balance_imputed' is used instead of 'Balance', which holds NaN after
    # the zero balances were marked as missing.
    target_col = 'Exited'
    categorical_features = ['Geography', 'Gender']
    numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance_imputed',
                        'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

    X = df[categorical_features + numeric_features]
    y = df[target_col]

    # Stratify so train and test keep the same share of exited customers.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    results = []

    for name, model in models.items():
        print(f"Training {name}...")

        # Encoding and scaling live inside the pipeline so they are fitted
        # on the training split only.
        preprocessor = ColumnTransformer([
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('num', StandardScaler(), numeric_features),
        ])
        clf = Pipeline([('preprocess', preprocessor), ('model', model)])

        # Training
        start_time = time.perf_counter()
        clf.fit(X_train, y_train)
        train_time = time.perf_counter() - start_time

        # Prediction
        start_time = time.perf_counter()
        y_pred = clf.predict(X_test)
        predict_time = time.perf_counter() - start_time

        # Classification metrics. F1 is reported next to accuracy because
        # the target is imbalanced.
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Store results
        results.append([name, accuracy, f1, train_time, predict_time])

    results_df = pd.DataFrame(
        results,
        columns=["Model", "Accuracy", "F1 Score", "Training Time (sec)", "Prediction Time (sec)"],
    )
    results_df