# Bank Marketing ML Project

## Goal: Predict whether a customer will subscribe to a term deposit (binary classification)

**Dataset**: `bank_24.pkl`  
**Target Variable**: `deposit` (yes/no)  
**Records**: 11,000 samples, 17 features

---

## Machine Learning Workflow Steps:
1. **Data Loading & Inspection** - Load pickle file and understand the data
2. **Exploratory Data Analysis (EDA)** - Visualize distributions, correlations, class balance
3. **Data Preprocessing** - Handle missing values, encode categorical variables, scale features
4. **Train/Test Split** - Split data for model validation
5. **Model Training** - Train multiple algorithms (Logistic Regression, Decision Tree, Random Forest, KNN)
6. **Model Evaluation** - Compare models using accuracy, precision, recall, F1-score, ROC-AUC
7. **Hyperparameter Tuning** - Optimize the best model
8. **Final Model** - Save the best model for production

---

## 1. Import Required Libraries

In [None]:
# Data handling
import pickle
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)

# Model persistence
import joblib

# Notebook-wide display configuration
pd.options.display.max_columns = None  # show every column when a frame is rendered

# Apply the matplotlib style first, then the seaborn palette, so the palette
# call is the last one to touch the plotting defaults.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette="husl")

print("✅ All libraries imported successfully!")

## 2. Load Dataset from Pickle File

Pickle files store Python objects in a binary format. Because unpickling can execute arbitrary code, only load pickle files from sources you trust. We'll load the dataset and inspect its structure.

In [None]:
# Load the pickle file into `df`.
# SECURITY: unpickling can execute arbitrary code embedded in the file, so
# only load 'bank_24.pkl' when it comes from a trusted source.
with open('bank_24.pkl', 'rb') as f:
    df = pickle.load(f)

# Fail early with a clear message if the pickle holds something other than a
# DataFrame; every later cell relies on DataFrame methods.
if not isinstance(df, pd.DataFrame):
    raise TypeError(
        f"Expected a pandas DataFrame in 'bank_24.pkl', got {type(df).__name__}"
    )

print("✅ Dataset loaded successfully!")
print(f"Shape: {df.shape} (rows, columns)")
print(f"\nColumns: {df.columns.tolist()}")

## 3. Data Inspection & Exploratory Data Analysis (EDA)

Let's understand our data better before building models.

In [None]:
# Peek at the leading records of the dataset
print("First 5 rows:")
display(df.head())

# Structural summary as printed by DataFrame.info(), framed by a ruler line
ruler = "=" * 60
print("\n" + ruler)
print("Dataset Information:")
print(ruler)
df.info()

In [None]:
# Tabulate missing values per column: absolute count and percentage of rows
print("Missing Values:")
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_df = pd.DataFrame({'Missing': missing, 'Percentage': missing_pct})

# Keep only the columns that actually contain gaps; reused for the count below
columns_with_gaps = missing_df[missing_df['Missing'] > 0]
print(columns_with_gaps)

# Note: 'marital' has 447 missing values (4.07%)
print(f"\n⚠️ Found {columns_with_gaps.shape[0]} columns with missing values")

In [None]:
# Descriptive statistics from DataFrame.describe() with its default arguments
print("Numerical Features Summary:")
numeric_summary = df.describe()
display(numeric_summary)

In [None]:
# Class balance of the target: absolute counts and relative shares
class_counts = df['deposit'].value_counts()
class_shares = df['deposit'].value_counts(normalize=True)
class_colors = ['#FF6B6B', '#4ECDC4']

print("Target Variable Distribution (deposit):")
print(class_counts)
print("\nPercentages:")
print(class_shares * 100)

# Side-by-side view: bar chart of counts (left) and pie chart of shares (right)
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
count_ax, share_ax = ax

class_counts.plot(kind='bar', ax=count_ax, color=class_colors)
count_ax.set_title('Class Distribution (Count)', fontsize=14, fontweight='bold')
count_ax.set_xlabel('Deposit')
count_ax.set_ylabel('Count')

class_shares.plot(kind='pie', ax=share_ax, autopct='%1.1f%%', startangle=90, colors=class_colors)
share_ax.set_title('Class Distribution (%)', fontsize=14, fontweight='bold')
share_ax.set_ylabel('')

plt.tight_layout()
plt.show()

# Warn when the rarest class has less than 30% as many rows as the most common one
minority_to_majority = class_counts.min() / class_counts.max()
if minority_to_majority < 0.3:
    print("\n⚠️ Class imbalance detected! Consider using techniques like SMOTE, class weights, or stratified sampling.")

In [None]:
# Visualize the distribution of every numerical feature as a histogram grid
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Size the grid from the number of features so that none is silently dropped.
# A minimum of 3 rows keeps the 3x3, 15x12-inch layout for up to 9 features.
n_grid_cols = 3
n_grid_rows = max(3, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 4 * n_grid_rows))
axes = axes.flatten()

# One histogram per feature; zip stops at the last feature
for axis, col in zip(axes, numerical_cols):
    df[col].hist(bins=30, ax=axis, edgecolor='black', alpha=0.7)
    axis.set_title(str(col), fontsize=12, fontweight='bold')
    axis.set_xlabel(col)
    axis.set_ylabel('Frequency')

# Remove the grid cells that did not receive a histogram
for unused_axis in axes[len(numerical_cols):]:
    fig.delaxes(unused_axis)

plt.tight_layout()
plt.suptitle('Distribution of Numerical Features', y=1.01, fontsize=16, fontweight='bold')
plt.show()

In [None]:
# Categorical features analysis: every object-dtype column except the target.
# Filtering (instead of list.remove) avoids a ValueError when 'deposit' is not
# among the object-dtype columns.
categorical_cols = [
    col
    for col in df.select_dtypes(include=['object']).columns
    if col != 'deposit'
]

# For each feature, report its cardinality and its most frequent values
print(f"Categorical Features ({len(categorical_cols)}):")
for col in categorical_cols:
    print(f"\n{col}: {df[col].nunique()} unique values")
    print(df[col].value_counts().head())

## 4. Data Preprocessing

Now we'll prepare the data for machine learning models.

In [None]:
# Step 1: Handle missing values in 'marital' column
# Option 1: Fill with mode (most common value).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['marital']: that chained in-place call acts
# on an intermediate object and does not update df under pandas Copy-on-Write.
most_common_marital = df['marital'].mode()[0]
df['marital'] = df['marital'].fillna(most_common_marital)

# Option 2 (alternative): Drop rows with missing values
# df.dropna(inplace=True)

print("✅ Missing values handled")
print(f"Remaining missing values: {df.isnull().sum().sum()}")

In [None]:
# Step 2: Separate features (X) and target (y)
X = df.drop('deposit', axis=1)

# Encode target variable (yes=1, no=0)
y = df['deposit'].map({'yes': 1, 'no': 0})

# Series.map turns any label outside the mapping into NaN. Stop here with the
# offending labels instead of carrying a NaN target into model training.
unmapped_labels = df.loc[y.isna(), 'deposit']
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected values in 'deposit' (expected 'yes'/'no'): "
        f"{unmapped_labels.unique().tolist()}"
    )

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")

In [None]:
# Step 3: Partition the feature columns by dtype
# object dtype -> categorical; int64/float64 -> numerical
object_columns = X.select_dtypes(include=['object']).columns
number_columns = X.select_dtypes(include=['int64', 'float64']).columns

categorical_features = list(object_columns)
numerical_features = list(number_columns)

print(f"Categorical features ({len(categorical_features)}): {categorical_features}")
print(f"Numerical features ({len(numerical_features)}): {numerical_features}")