# 01 • EDA (Bank Term Deposit)

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

PROJ = Path.cwd()
DATA = None
for p in [PROJ/'data', PROJ.parent/'data', PROJ.parent.parent/'data']:
    if (p/'train.csv').exists() and (p/'test.csv').exists():
        DATA = p; break
assert DATA is not None, 'data/train.csv or data/test.csv not found'

train = pd.read_csv('data/train.csv')
test  = pd.read_csv('data/train.csv')
train.shape, test.shape

((750000, 18), (750000, 18))

In [2]:
train.head(10)

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1
5,5,24,admin.,single,secondary,no,1882,yes,no,cellular,20,apr,1010,3,-1,0,unknown,0
6,6,39,blue-collar,married,secondary,no,0,no,no,telephone,21,nov,90,1,-1,0,unknown,0
7,7,50,admin.,single,secondary,no,1595,no,no,telephone,31,jul,49,25,-1,0,unknown,0
8,8,46,blue-collar,married,primary,no,1463,no,no,cellular,4,aug,50,1,-1,0,unknown,0
9,9,39,management,divorced,tertiary,no,25,yes,no,cellular,8,may,119,1,-1,0,unknown,0


## Schema & basic stats

In [3]:
desc_num = train[['age','balance','day','duration','campaign','pdays','previous','y']].describe()
desc_num

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,40.926395,1204.067397,16.117209,256.229144,2.577008,22.412733,0.298545,0.120651
std,10.098829,2836.096759,8.250832,272.555662,2.718514,77.319998,1.335926,0.325721
min,18.0,-8019.0,1.0,1.0,1.0,-1.0,0.0,0.0
25%,33.0,0.0,9.0,91.0,1.0,-1.0,0.0,0.0
50%,39.0,634.0,17.0,133.0,2.0,-1.0,0.0,0.0
75%,48.0,1390.0,21.0,361.0,3.0,-1.0,0.0,0.0
max,95.0,99717.0,31.0,4918.0,63.0,871.0,200.0,1.0


## Target distribution

In [4]:
train['y'].value_counts(normalize=True).rename('ratio')

y
0    0.879349
1    0.120651
Name: ratio, dtype: float64

## Categorical levels

In [5]:
cat_cols = ['job','marital','education','default','housing','loan','contact','month','poutcome']
top_levels = {}
for c in cat_cols:
    top_levels[c] = train[c].value_counts().head(20)
top_levels

{'job': job
 management       175541
 blue-collar      170498
 technician       138107
 admin.            81492
 services          64209
 retired           35185
 self-employed     19020
 entrepreneur      17718
 unemployed        17634
 housemaid         15912
 student           11767
 unknown            2917
 Name: count, dtype: int64,
 'marital': marital
 married     480759
 single      194834
 divorced     74407
 Name: count, dtype: int64,
 'education': education
 secondary    401683
 tertiary     227508
 primary       99510
 unknown       21299
 Name: count, dtype: int64,
 'default': default
 no     737151
 yes     12849
 Name: count, dtype: int64,
 'housing': housing
 yes    411288
 no     338712
 Name: count, dtype: int64,
 'loan': loan
 no     645023
 yes    104977
 Name: count, dtype: int64,
 'contact': contact
 cellular     486655
 unknown      231627
 telephone     31718
 Name: count, dtype: int64,
 'month': month
 may    228411
 aug    128859
 jul    110647
 jun     93670
 

## Numeric distributions (quick sanity)

In [6]:
for c in ['age','balance','day','duration','campaign','pdays','previous']:
    s = train[c]
    print(c, 'min=', s.min(), 'p1=', s.quantile(0.01), 'p50=', s.quantile(0.5), 'p99=', s.quantile(0.99), 'max=', s.max())

age min= 18 p1= 24.0 p50= 39.0 p99= 66.0 max= 95
balance min= -8019 p1= -974.0 p50= 634.0 p99= 10861.0 max= 99717
day min= 1 p1= 2.0 p50= 17.0 p99= 31.0 max= 31
duration min= 1 p1= 7.0 p50= 133.0 p99= 1206.0 max= 4918
campaign min= 1 p1= 1.0 p50= 2.0 p99= 14.0 max= 63
pdays min= -1 p1= -1.0 p50= -1.0 p99= 363.0 max= 871
previous min= 0 p1= 0.0 p50= 0.0 p99= 6.0 max= 200


## Target rate by categorical

In [7]:
import pandas as pd
rates = {}
for c in cat_cols:
    g = train.groupby(c)['y'].mean().sort_values(ascending=False)
    rates[c] = g
rates

{'job': job
 student          0.340784
 retired          0.246241
 unemployed       0.179823
 management       0.150392
 self-employed    0.129443
 unknown          0.120672
 technician       0.118321
 admin.           0.116453
 housemaid        0.084653
 services         0.082714
 entrepreneur     0.081386
 blue-collar      0.067438
 Name: y, dtype: float64,
 'marital': marital
 single      0.170453
 divorced    0.111576
 married     0.101872
 Name: y, dtype: float64,
 'education': education
 tertiary     0.162649
 unknown      0.133387
 secondary    0.105491
 primary      0.083097
 Name: y, dtype: float64,
 'default': default
 no     0.121947
 yes    0.046307
 Name: y, dtype: float64,
 'housing': housing
 no     0.175778
 yes    0.075251
 Name: y, dtype: float64,
 'loan': loan
 no     0.131378
 yes    0.054736
 Name: y, dtype: float64,
 'contact': contact
 cellular     0.156579
 telephone    0.136799
 unknown      0.042953
 Name: y, dtype: float64,
 'month': month
 mar    0.571355
 s

## Target rate by binned numeric

In [8]:
import pandas as pd
out = {}
bins = {
    'age': 10,
    'balance': 20,
    'duration': 20,
    'campaign': 10,
    'pdays': 10,
    'previous': 10,
}
for c, nb in bins.items():
    bc = pd.qcut(train[c], q=min(nb, train[c].nunique()), duplicates='drop')
    out[c] = train.groupby(bc)['y'].mean()
out

  out[c] = train.groupby(bc)['y'].mean()
  out[c] = train.groupby(bc)['y'].mean()
  out[c] = train.groupby(bc)['y'].mean()
  out[c] = train.groupby(bc)['y'].mean()
  out[c] = train.groupby(bc)['y'].mean()
  out[c] = train.groupby(bc)['y'].mean()


{'age': age
 (17.999, 30.0]    0.183734
 (30.0, 32.0]      0.113020
 (32.0, 34.0]      0.112468
 (34.0, 36.0]      0.111206
 (36.0, 39.0]      0.100454
 (39.0, 42.0]      0.090629
 (42.0, 46.0]      0.092775
 (46.0, 51.0]      0.084443
 (51.0, 56.0]      0.093917
 (56.0, 95.0]      0.214691
 Name: y, dtype: float64,
 'balance': balance
 (-8019.001, -287.0]    0.029747
 (-287.0, -63.0]        0.022356
 (-63.0, 0.0]           0.040933
 (0.0, 11.0]            0.040496
 (11.0, 97.0]           0.057919
 (97.0, 476.0]          0.164509
 (476.0, 556.0]         0.144272
 (556.0, 634.0]         0.121671
 (634.0, 719.0]         0.129469
 (719.0, 837.0]         0.112640
 (837.0, 973.0]         0.143443
 (973.0, 1176.0]        0.143564
 (1176.0, 1390.0]       0.156462
 (1390.0, 1693.0]       0.157375
 (1693.0, 2278.0]       0.180729
 (2278.0, 3025.0]       0.210036
 (3025.0, 4493.0]       0.205552
 (4493.0, 99717.0]      0.270223
 Name: y, dtype: float64,
 'duration': duration
 (0.999, 27.0]      

## Simple correlations (numeric only)

In [9]:
train[['age','balance','day','duration','campaign','pdays','previous','y']].corr(numeric_only=True)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y
age,1.0,0.062838,-0.015179,-0.004388,0.002052,-0.021635,0.004541,0.009523
balance,0.062838,1.0,-0.008269,0.109629,-0.027744,0.01004,0.033897,0.122513
day,-0.015179,-0.008269,1.0,-0.056755,0.178806,-0.086197,-0.051082,-0.049625
duration,-0.004388,0.109629,-0.056755,1.0,-0.083016,0.047555,0.040105,0.519283
campaign,0.002052,-0.027744,0.178806,-0.083016,1.0,-0.061465,-0.026707,-0.075829
pdays,-0.021635,0.01004,-0.086197,0.047555,-0.061465,1.0,0.561839,0.089277
previous,0.004541,0.033897,-0.051082,0.040105,-0.026707,0.561839,1.0,0.119552
y,0.009523,0.122513,-0.049625,0.519283,-0.075829,0.089277,0.119552,1.0
