# 01 • EDA (Bank Term Deposit)

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Locate the data directory: look for ./data, ../data and ../../data relative
# to the current working directory and take the first one that contains both
# train.csv and test.csv.
PROJ = Path.cwd()
DATA = next(
    (
        p
        for p in (PROJ/'data', PROJ.parent/'data', PROJ.parent.parent/'data')
        if (p/'train.csv').exists() and (p/'test.csv').exists()
    ),
    None,
)
# Raise explicitly instead of using `assert`: assertions are stripped under
# `python -O`, which would leave DATA as None and fail below with an unrelated
# TypeError when building the CSV paths.
if DATA is None:
    raise FileNotFoundError('data/train.csv or data/test.csv not found')

train = pd.read_csv(DATA/'train.csv')
test  = pd.read_csv(DATA/'test.csv')
# Last expression of the cell: show (rows, columns) for both frames.
train.shape, test.shape

In [None]:
# Preview the first ten rows of the training frame.
train.head(n=10)

## Schema & basic stats

In [None]:
# Summary statistics for the selected feature columns plus the target `y`.
described_cols = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous', 'y',
]
desc_num = train.loc[:, described_cols].describe()
desc_num

## Target distribution

In [None]:
# Relative frequency of each value of the target `y`, as a Series named 'ratio'.
target_share = train['y'].value_counts(normalize=True)
target_share.rename('ratio')

## Categorical levels

In [None]:
# Categorical feature columns inspected in this notebook.
cat_cols = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]
# For every categorical column: counts of its (at most) 20 most frequent levels.
top_levels = {col: train[col].value_counts().head(20) for col in cat_cols}
top_levels

## Numeric distributions (quick sanity)

In [None]:
# Print min, the 1st/50th/99th percentiles and max of each numeric feature,
# one line per column, as a quick range/outlier check.
for col in ('age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'):
    values = train[col]
    p01, p50, p99 = (values.quantile(q) for q in (0.01, 0.5, 0.99))
    print(
        col,
        'min=', values.min(),
        'p1=', p01,
        'p50=', p50,
        'p99=', p99,
        'max=', values.max(),
    )

## Target rate by categorical

In [None]:
import pandas as pd
# Mean of `y` for each level of every categorical column, highest mean first.
rates = {
    col: train.groupby(col)['y'].mean().sort_values(ascending=False)
    for col in cat_cols
}
rates

## Target rate by binned numeric

In [None]:
import pandas as pd
# Mean of `y` within quantile bins of each numeric feature.
out = {}
# Requested number of quantile bins per feature.
bins = {
    'age': 10,
    'balance': 20,
    'duration': 20,
    'campaign': 10,
    'pdays': 10,
    'previous': 10,
}
for c, nb in bins.items():
    # Never request more quantiles than the column has distinct values;
    # duplicate bin edges are dropped instead of raising.
    n_quantiles = min(nb, train[c].nunique())
    bc = pd.qcut(train[c], q=n_quantiles, duplicates='drop')
    # `bc` is categorical, so state `observed` explicitly: relying on the
    # default emits a FutureWarning in recent pandas and the default is
    # scheduled to change. observed=False keeps the current output.
    out[c] = train.groupby(bc, observed=False)['y'].mean()
out

## Simple correlations (numeric only)

In [None]:
# Pairwise correlation matrix over the numeric features and the target `y`.
corr_cols = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous', 'y',
]
train[corr_cols].corr(numeric_only=True)