In [None]:
# Make Google Drive visible to this Colab runtime so the dataset can be read from it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import pandas as pd

# Location of the churn dataset on the mounted Google Drive.
# NOTE(review): the folder name "Bank " ends with a space - confirm it matches the Drive folder.
DATA_PATH = '/content/drive/MyDrive/Bank /Churn_Modelling.csv'

# Load dataset
data = pd.read_csv(DATA_PATH)

# Overview of data: column names, dtypes, non-null counts and memory usage.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
data.info()

In [None]:
# Summary statistics of the numeric columns. Left as the cell's last expression so
# the notebook renders the DataFrame as a table instead of wrapped plain text.
data.describe()

In [None]:
# Identify missing values: number of null entries in each column.
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Preview the first ten rows of the loaded dataset.
data.head(n=10)

In [None]:
# Drop irrelevant columns
# RowNumber: Just an index, no predictive value.
# CustomerId: Unique identifier, irrelevant for model training.
# Surname: No direct impact on customer churn prediction.
#
# The result is re-assigned instead of using inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell is a no-op rather
# than a KeyError.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

# Display the updated dataframe
data.head()

In [None]:
# List the distinct values present in the Geography column.
geography_values = data['Geography'].unique()
print(geography_values)

In [None]:
# Preview the first five rows of the current dataframe.
data.head(n=5)

In [None]:
# One-hot encode Geography into one Geography_<value> column per distinct value
# (drop_first=False keeps every category).
# The guard makes the cell safe to re-run: get_dummies removes the original
# Geography column, so a second unguarded call would raise a KeyError.
if 'Geography' in data.columns:
    data = pd.get_dummies(data, columns=['Geography'], drop_first=False)

In [None]:
# Preview the first seven rows after one-hot encoding. Left as the cell's last
# expression so the wide frame is rendered as a table rather than wrapped text.
data.head(7)

In [None]:
# Column dtypes and non-null counts after encoding.
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print() (which would add a stray "None" line).
data.info()

In [None]:
# Encode Gender as a binary feature: Male -> 1, Female -> 0.
# Series.map turns every value that is missing from the dict into NaN, so the
# identity entries (1 -> 1, 0 -> 0) keep this cell idempotent: without them,
# re-running the cell on already-encoded data would silently wipe the column.
gender_codes = {'Male': 1, 'Female': 0, 1: 1, 0: 0}
gender_encoded = data['Gender'].map(gender_codes)

# Fail loudly on unexpected labels instead of letting NaN flow into the plots and models.
if gender_encoded.isna().any():
    unexpected = data.loc[gender_encoded.isna(), 'Gender'].unique()
    raise ValueError(f"Unexpected Gender values: {unexpected}")

data['Gender'] = gender_encoded.astype(int)

In [None]:
# Store the one-hot Geography indicators as integers.
geography_columns = ['Geography_France', 'Geography_Germany', 'Geography_Spain']
data[geography_columns] = data[geography_columns].astype(int)

In [None]:
# Preview the first ten rows of the fully encoded dataframe.
data.head(n=10)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Number of customers per value of the target column (Exited).
churn_ax = sns.countplot(data=data, x='Exited', hue='Exited', palette='coolwarm', legend=False)
churn_ax.set_title('Churn Distribution (Exited)')
plt.show()

# Reading of the chart:
# The blue bar shows approximately 8,000 customers who stayed
# The salmon/pink bar shows approximately 2,000 customers who exited

In [None]:
# Mean of Exited (i.e. the churn rate) for each Gender value.
# `palette` is only meaningful together with `hue`; passing it alone is deprecated
# in seaborn 0.13+, so the x variable is also assigned to hue and the redundant
# legend is suppressed (same pattern as the churn count plot).
sns.barplot(x='Gender', y='Exited', hue='Gender', data=data, palette='coolwarm', legend=False)
plt.title('Churn Rate by Gender')
# Gender is encoded as 0 = Female, 1 = Male, so relabel the two ticks accordingly.
plt.xticks([0, 1], ['Female', 'Male'])
plt.show()
# Female customers have a churn rate of approximately 25% (0.25)
# Male customers have a churn rate of approximately 16-17% (0.16-0.17)

In [None]:
# Age histogram (30 bins, with KDE overlay) split by the Exited label.
age_fig, age_ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=data, x='Age', hue='Exited', bins=30, kde=True, palette='coolwarm', ax=age_ax)
age_ax.set_title('Age Distribution of Churned vs. Retained Customers')
plt.show()
# Reading of the chart:
# Most customers are between 30-45 years old
# There appears to be a higher proportion of churned customers in the older age ranges (45-60)

In [None]:
# Credit score distribution for retained (Exited=0) vs. churned (Exited=1) customers.
# `palette` without `hue` is deprecated in seaborn 0.13+, so Exited is assigned to
# hue as well and the redundant legend is hidden.
sns.boxplot(x='Exited', y='CreditScore', hue='Exited', data=data, palette='coolwarm', legend=False)
plt.title('Credit Score Distribution by Churn')
plt.show()

# The median (middle line in each box) is slightly higher for customers who churned
# The interquartile range (the box height) looks fairly similar between both groups
# The whiskers (lines extending from the boxes) show the range of typical values
# There are a few outliers (dots) at the lower end of the credit score range for customers who churned

In [None]:
# Recover a single Geography label per row from the one-hot columns: idxmax(axis=1)
# returns the name of the column holding the 1; the "Geography_" prefix is stripped
# so the tick labels read France / Germany / Spain.
geography = (
    data[['Geography_France', 'Geography_Germany', 'Geography_Spain']]
    .idxmax(axis=1)
    .str.replace('Geography_', '', regex=False)
)

plt.figure(figsize=(8,4))
# `palette` without `hue` is deprecated in seaborn 0.13+, so the x variable is also
# passed as hue and the redundant legend is hidden.
sns.barplot(x=geography, y=data['Exited'], hue=geography, palette='coolwarm', legend=False)
plt.title('Churn Rate by Geography')
plt.xlabel('Geography')
plt.ylabel('Churn Rate')
plt.show()

# France has a churn rate of about 16%
# Spain has a churn rate of about 17%
# Germany has a significantly higher churn rate of about 32%

In [None]:
# Pairwise correlation of every column, annotated with two-decimal coefficients.
plt.figure(figsize=(12,6))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Feature Correlation Heatmap')
plt.show()

# The notes below are comments rather than a trailing triple-quoted string: a bare
# string as the last expression of a cell is echoed by Jupyter as escaped Out[] text.
#
# Strongest correlations with churn (Exited):
#   Age: 0.29 (positive) - Older customers are more likely to churn
#   IsActiveMember: -0.16 (negative) - Active members are less likely to churn
#   Geography_Germany: 0.17 (positive) - German customers are more likely to churn
#   Gender: -0.11 (negative) - Gender is encoded as 0=Female and 1=Male, so this
#     suggests females are more likely to churn
#   Balance: 0.12 (positive) - Customers with higher balances are somewhat more likely to churn
#
# Other interesting relationships:
#   Balance and NumOfProducts: -0.30 (negative) - Customers with more products tend to have lower balances
#   Balance and Geography_Germany: 0.40 (positive) - German customers tend to have higher balances
#   The Geography variables show negative correlations with each other (as expected with one-hot encoding)
#   Age and IsActiveMember: 0.09 (slight positive) - Older customers are slightly more likely to be active members

In [None]:
# Pairwise scatter plots (and per-variable distributions on the diagonal) of
# Age, Balance and CreditScore, coloured by the Exited label.
sns.pairplot(data[['Age', 'Balance', 'CreditScore', 'Exited']], hue='Exited', palette='coolwarm')
plt.show()

# The notes below are comments rather than a trailing triple-quoted string: a bare
# string as the last expression of a cell is echoed by Jupyter as escaped Out[] text.
#
# Diagonal plots (from top-left to bottom-right) - these show the distribution of each variable:
#   Age: Shows a bimodal distribution with peaks around 30-40 and 60-70 years
#   Balance: Shows two distinct distributions - one at zero (no balance) and one
#     centered around 100,000-120,000
#   CreditScore: Shows a roughly normal distribution centered around 650-700
#
# Scatter plots (off-diagonal) - these show relationships between pairs of variables:
#   Blue dots represent retained customers (Exited=0)
#   Salmon/pink dots represent churned customers (Exited=1)
#
# Key observations from this visualization:
#   Age patterns: Churned customers (pink) are more prevalent in the higher age ranges,
#     confirming the positive correlation we saw in the heatmap
#   Balance patterns: Churned customers appear slightly more concentrated in higher balance ranges
#   CreditScore: Less clear separation between churned and retained customers
#   Zero balance customers: There's a noticeable horizontal line at Balance=0 that
#     appears to be mostly blue (retained customers)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the target column from the predictors.
TARGET_COLUMN = 'Exited'
y = data[TARGET_COLUMN]               # Target variable (1 = Churn, 0 = Stay)
X = data.drop(TARGET_COLUMN, axis=1)  # Features (all columns except target)

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same churn ratio, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only and the same transformation is then applied to both splits.
# NOTE: this cell overwrites X_train / X_test with their scaled versions, so
# re-run the train/test split cell before re-running this one.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Train a basic Random Forest model on the scaled training split
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances
importances = rf.feature_importances_

# Print sorted feature importance (argsort is ascending, so reverse for most-important-first)
feature_names = X.columns
sorted_indices = np.argsort(importances)[::-1]

print("Feature Importance Ranking:")
for i in sorted_indices:
    print(f"{feature_names[i]}: {importances[i]:.4f}")

# The notes below are comments rather than a trailing triple-quoted string: a bare
# string as the last expression of a cell is echoed by Jupyter as escaped Out[] text.
#
# Top Influential Factors:
#   Age (0.2403) - By far the most important predictor of churn, which aligns with what
#     we saw in the visualizations showing older customers more likely to leave
#   EstimatedSalary (0.1444) - The second most important feature, though interestingly,
#     this wasn't strongly correlated with churn in the correlation heatmap
#   CreditScore (0.1427) - Nearly tied with salary in importance
#   Balance (0.1423) - Also highly important, which matches the correlation we saw in the heatmap
#
# Moderately Important:
#   NumOfProducts (0.1280) - Number of products a customer has with the bank
#   Tenure (0.0826) - How long a customer has been with the bank
#
# Less Important:
#   IsActiveMember (0.0398) - Despite showing a noticeable correlation in the heatmap
#     (-0.16), it's ranked lower in feature importance
#   Geography_Germany (0.0227) - While the bar chart showed Germany having a much higher
#     churn rate, its predictive importance is relatively low
#   Gender (0.0192) - Also lower importance, despite the bar chart showing a clear
#     difference in churn rates between genders
#   HasCrCard (0.0183) - Whether the customer has a credit card
#   Geography_France (0.0102) and Geography_Spain (0.0094) - Least important factors

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Class counts of the target before any resampling.
# `palette` without `hue` is deprecated in seaborn 0.13+, so y is also passed as
# hue and the redundant legend is hidden.
sns.countplot(x=y, hue=y, palette='coolwarm', legend=False)
plt.title("Class Distribution of Churn")
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
# NOTE(review): SMOTE interpolates between samples, so binary/one-hot features can
# receive non-binary synthetic values - consider SMOTENC if that matters.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Check new class balance
# `palette` without `hue` is deprecated in seaborn 0.13+, so the labels are also
# passed as hue and the redundant legend is hidden.
sns.countplot(x=y_train_bal, hue=y_train_bal, palette='coolwarm', legend=False)
plt.title("Balanced Class Distribution After SMOTE")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a default logistic regression on the SMOTE-balanced training data.
lr = LogisticRegression().fit(X_train_bal, y_train_bal)

# Score it on the held-out test split.
y_pred = lr.predict(X_test)
lr_report = classification_report(y_test, y_pred)
lr_accuracy = accuracy_score(y_test, y_pred)

print("Logistic Regression Performance:")
print(lr_report)
print("Accuracy:", lr_accuracy)

In [None]:
# Refit the random forest, this time on the SMOTE-balanced training data.
# This rebinds `rf`, replacing the model used for the feature-importance ranking.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_bal, y_train_bal)

# Score it on the held-out test split.
y_pred_rf = rf.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("Random Forest Performance:")
print(rf_report)
print("Accuracy:", rf_accuracy)

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers of 50 units, trained on the
# SMOTE-balanced data for at most 500 iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    max_iter=500,
    random_state=42,
).fit(X_train_bal, y_train_bal)

# Score it on the held-out test split.
y_pred_mlp = mlp.predict(X_test)
mlp_report = classification_report(y_test, y_pred_mlp)
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)

print("Neural Network Performance:")
print(mlp_report)
print("Accuracy:", mlp_accuracy)

In [None]:
# Sample customer, described by feature name instead of by position so a change in
# column order cannot silently shift values into the wrong feature.
customer = {
    'CreditScore': 700,
    'Gender': 1,              # 1 = Male, 0 = Female
    'Age': 35,
    'Tenure': 5,
    'Balance': 120000,
    'NumOfProducts': 2,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'EstimatedSalary': 90000,
    'Geography_France': 0,
    'Geography_Germany': 1,
    'Geography_Spain': 0,
}  # Example: Male, 35 yrs old, Germany

# Every training feature must be supplied; otherwise the frame below would contain NaN.
missing_features = [col for col in X.columns if col not in customer]
if missing_features:
    raise KeyError(f"new customer is missing features: {missing_features}")

# One-row frame with the columns in the same order as the training features.
new_customer_df = pd.DataFrame([customer], columns=X.columns)

# Plain (1, n_features) array of the same values, kept under the original name.
new_customer = new_customer_df.to_numpy()

# Scale the new customer data (using the same scaler from training). The scaler was
# fitted on a DataFrame, so it is given a DataFrame with matching column names;
# a bare array would trigger scikit-learn's "X does not have valid feature names" warning.
new_customer_scaled = scaler.transform(new_customer_df)

In [None]:
# Classify the scaled sample with the most recently fitted `rf` model
# (label 1 = churn, anything else = stay).
prediction = rf.predict(new_customer_scaled)

verdict = (
    "This customer is likely to CHURN!"
    if prediction[0] == 1
    else "This customer is likely to STAY."
)
print(verdict)