In [3]:

!wget 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'

--2024-12-08 16:26:40--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [    <=>             ] 999.85K  1.08MB/s    in 0.9s    

2024-12-08 16:26:42 (1.08 MB/s) - ‘bank+marketing.zip’ saved [1023843]



In [1]:
import numpy as np
import seaborn as sns

In [2]:
import zipfile
import pandas as pd

# Archive layout: bank+marketing.zip -> bank.zip -> bank-full.csv
outer_zip_path = 'bank+marketing.zip'  # outer archive
inner_zip_path = 'bank.zip'            # archive stored inside the outer one
csv_file_path = 'bank-full.csv'        # data file stored inside the inner archive

# Unpack one level at a time into the working directory:
# first the inner archive out of the outer one, then the CSV out of the inner one.
for archive_path, member_name in (
    (outer_zip_path, inner_zip_path),
    (inner_zip_path, csv_file_path),
):
    with zipfile.ZipFile(archive_path) as archive:
        archive.extract(member_name)

# Load the extracted CSV (fields are separated by ';') and preview the first rows
df = pd.read_csv(csv_file_path, sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
# Keep only the columns used in this analysis ('default' and 'loan' are dropped).
# .copy() makes the result an independent frame, so later column assignments
# (e.g. df['y'] = ...) act on its own data instead of on a selection of the loaded
# frame, which is the situation pandas reports with SettingWithCopyWarning.
df = df[['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous','poutcome','y']].copy()

In [4]:
# Number of missing values in each column
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [6]:
# Frequency of each education level; 'secondary' is the most common one
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [7]:
# Show each column's dtype: numeric columns are int64, string-valued ones are object
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [9]:
# Names of the integer-valued feature columns
numerical = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
]

In [10]:
# Pairwise correlation of the numeric columns; the largest off-diagonal value is
# between pdays and previous (about 0.45)
df.loc[:, numerical].corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [18]:
# Encode the target column as integers: 'yes' -> 1, 'no' -> 0.
# The guard makes the cell safe to re-run: once the column already holds only 0/1,
# an unguarded .map() would find none of its keys and replace every value with NaN.
# assign() builds a new frame rather than writing into a column of the existing one.
target_codes = {'yes': 1, 'no': 0}
if not df['y'].isin(list(target_codes.values())).all():
    df = df.assign(y=df['y'].map(target_codes))

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
seed = 42
df_full_train, df_test, y_full_train, y_test = train_test_split(
    df.drop(columns=['y']),
    df['y'],
    test_size=0.2,
    random_state=seed,
)

In [20]:
# Split the remaining 80% again: a quarter of it becomes the validation set,
# i.e. 20% of all rows, which yields a 60/20/20 train/validation/test partition
df_train, df_val, y_train, y_val = train_test_split(
    df_full_train,
    y_full_train,
    test_size=0.25,
    random_state=seed,
)
y_full_train  # display the target of the combined train+validation rows

3344     0
17965    0
18299    0
10221    0
32192    1
        ..
11284    1
44732    0
38158    0
860      0
15795    0
Name: y, Length: 36168, dtype: int64

In [21]:
from sklearn.metrics import mutual_info_score
def mutual_info_churn_score(series):
    """Return the mutual information between ``series`` and ``y_full_train``.

    ``series`` is one column of labels; it is scored against the notebook-level
    ``y_full_train`` target, so it must have one entry per entry of that target.
    Despite "churn" in the name, the target used here is the dataset's ``y`` column.
    """
    target = y_full_train
    return mutual_info_score(series, target)
# Names of the string-valued feature columns
categories = [
    'job', 'marital', 'education', 'housing',
    'contact', 'month', 'poutcome',
]
# Score every categorical column against the target on the train+validation rows
mi = df_full_train[categories].apply(mutual_info_churn_score)
# Highest score first: poutcome shares the most information with the target
mi.sort_values(ascending=False)


poutcome     0.029257
month        0.024774
contact      0.014164
housing      0.009800
job          0.007765
education    0.002458
marital      0.002019
dtype: float64

In [24]:
from  sklearn.feature_extraction import DictVectorizer
# Turn each row into a {column: value} dict and let DictVectorizer build a dense
# feature matrix from them (string values become one 'column=value' feature each)
dv = DictVectorizer(sparse=False)

model_features = categories + numerical
dicts = df_train[model_features].to_dict(orient='records')
val_dicts = df_val[model_features].to_dict(orient='records')

# Learn the feature layout from the training rows only, then reuse it for validation
X_train = dv.fit_transform(dicts)
X_val = dv.transform(val_dicts)

# Show the names of the resulting feature columns
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [27]:
# Convert the target Series to plain NumPy arrays.
# np.asarray() also accepts an ndarray unchanged, so the cell can be re-run safely;
# `.values` would raise AttributeError the second time, when the names already
# refer to arrays.
y_train = np.asarray(y_train)
y_val = np.asarray(y_val)

In [38]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier trained on all features
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)
# Accuracy on the validation set (about 0.90)
model.score(X_val, y_val)


0.9010174740101747

In [None]:
# Leave-one-feature-out check: retrain without a single feature and report how the
# validation accuracy changes relative to the model trained on all features.

# Reference accuracy, computed here instead of pasted in as a literal so it cannot go
# stale when the data, split or hyper-parameters change.
baseline_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
baseline_model.fit(X_train, y_train)
baseline_accuracy = baseline_model.score(X_val, y_val)

for feature in ['age','balance','marital','previous']:
    temp_list = numerical + categories
    temp_list.remove(feature)  # raises ValueError if the name is not a known feature

    # Use a vectorizer of its own for each reduced feature set: refitting the shared
    # `dv` would overwrite the layout it learned from the full feature set.
    dv_temp = DictVectorizer(sparse=False)
    X_train_temp = dv_temp.fit_transform(df_train[temp_list].to_dict(orient='records'))
    # Validation rows are built from the same reduced column list as the training rows
    X_val_temp = dv_temp.transform(df_val[temp_list].to_dict(orient='records'))

    # Separate name so the notebook-level `model` is left untouched
    model_temp = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_temp.fit(X_train_temp, y_train)

    # Positive value: accuracy went up after removing the feature; negative: it went down
    print(f'Accuracy difference without {feature} is {model_temp.score(X_val_temp, y_val) - baseline_accuracy}')

Accuracy difference without age is 0.0
Accuracy difference without balance is -0.00022119000221187957
Accuracy difference without marital is -0.000774165007741634
Accuracy difference without previous is 0.00022119000221187957


In [41]:
# Try several values of the regularisation parameter C and report validation accuracy
for C in (0.01, 0.1, 1, 10, 100):
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    accuracy = model.fit(X_train, y_train).score(X_val, y_val)
    print(f'C={C}, Accuracy={accuracy}')
# In the recorded run C=10 and C=100 tie for the highest accuracy; 10 is the smaller one

C=0.01, Accuracy=0.8986949789869498
C=0.1, Accuracy=0.900464499004645
C=1, Accuracy=0.9010174740101747
C=10, Accuracy=0.9012386640123866
C=100, Accuracy=0.9012386640123866
