In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as split
import seaborn 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [30]:
bank = pd.read_csv('bank.csv')
bank_data = bank.copy()
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [31]:
jobs = ['management','blue-collar','technician','admin.','services','retired','self-employed','student','unemployed','entrepreneur','housemaid','unknown']
for j in jobs:
    print("{:20} : {:5}".format(j, len(bank_data[(bank_data.deposit == "yes") & (bank_data.job ==j)])))

management           :  1301
blue-collar          :   708
technician           :   840
admin.               :   631
services             :   369
retired              :   516
self-employed        :   187
student              :   269
unemployed           :   202
entrepreneur         :   123
housemaid            :   109
unknown              :    34


In [32]:
bank_data['job'] = bank_data['job'].replace(['management', 'admin.'], 'white-collar')
bank_data['job'] = bank_data['job'].replace(['services','housemaid'], 'pink-collar')
bank_data['job'] = bank_data['job'].replace(['retired', 'student', 'unemployed', 'unknown'], 'other')

In [33]:
bank_data.job.value_counts()

white-collar     3900
blue-collar      1944
technician       1823
other            1565
pink-collar      1197
self-employed     405
entrepreneur      328
Name: job, dtype: int64

In [34]:
bank_data.poutcome.value_counts()
bank_data['poutcome'] = bank_data['poutcome'].replace(['other'] , 'unknown')

In [35]:
bank_data.poutcome.value_counts()

unknown    8863
failure    1228
success    1071
Name: poutcome, dtype: int64

In [36]:
bank_data.drop('contact', axis=1, inplace=True)

In [37]:
bank_data['default_cat'] = bank_data['default'].map( {'yes':1, 'no':0} )
bank_data.drop('default', axis=1,inplace = True)

In [38]:
bank_data["housing_cat"]=bank_data['housing'].map({'yes':1, 'no':0})

In [39]:
bank_data.drop('housing', axis=1,inplace = True)

In [40]:
bank_data["loan_cat"] = bank_data['loan'].map({'yes':1, 'no':0})
bank_data.drop('loan', axis=1, inplace=True)

In [41]:
bank_data.drop('month', axis=1, inplace=True)
bank_data.drop('day', axis=1, inplace=True)

In [42]:
bank_data["deposit_cat"] = bank_data['deposit'].map({'yes':1, 'no':0})

In [43]:
bank_data.drop('deposit', axis=1, inplace=True)

In [44]:
print("Customers that have not been contacted before:", len(bank_data[bank_data.pdays==-1]))
print("Maximum values on padys    :", bank_data['pdays'].max())

Customers that have not been contacted before: 8324
Maximum values on padys    : 854


In [45]:
bank_data.loc[bank_data['pdays'] == -1, 'pdays'] = 10000

In [46]:
bank_data['recent_pdays'] = np.where(bank_data['pdays'], 1/bank_data.pdays, 1/bank_data.pdays)

In [47]:
bank_data.drop('pdays', axis=1, inplace = True)

In [48]:
bank_data

Unnamed: 0,age,job,marital,education,balance,duration,campaign,previous,poutcome,default_cat,housing_cat,loan_cat,deposit_cat,recent_pdays
0,59,white-collar,married,secondary,2343,1042,1,0,unknown,0,1,0,1,0.000100
1,56,white-collar,married,secondary,45,1467,1,0,unknown,0,0,0,1,0.000100
2,41,technician,married,secondary,1270,1389,1,0,unknown,0,1,0,1,0.000100
3,55,pink-collar,married,secondary,2476,579,1,0,unknown,0,1,0,1,0.000100
4,54,white-collar,married,tertiary,184,673,2,0,unknown,0,0,0,1,0.000100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue-collar,single,primary,1,257,1,0,unknown,0,1,0,0,0.000100
11158,39,pink-collar,married,secondary,733,83,4,0,unknown,0,0,0,0,0.000100
11159,32,technician,single,secondary,29,156,2,0,unknown,0,0,0,0,0.000100
11160,43,technician,married,secondary,0,9,2,5,failure,0,0,1,0,0.005814


In [49]:
bank_with_dummies = pd.get_dummies(data=bank_data, columns = ['job', 'marital', 'education', 'poutcome'],prefix = ['job', 'marital', 'education', 'poutcome'])

In [50]:
bank_with_dummies

Unnamed: 0,age,balance,duration,campaign,previous,default_cat,housing_cat,loan_cat,deposit_cat,recent_pdays,...,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,poutcome_failure,poutcome_success,poutcome_unknown
0,59,2343,1042,1,0,0,1,0,1,0.000100,...,0,1,0,0,1,0,0,0,0,1
1,56,45,1467,1,0,0,0,0,1,0.000100,...,0,1,0,0,1,0,0,0,0,1
2,41,1270,1389,1,0,0,1,0,1,0.000100,...,0,1,0,0,1,0,0,0,0,1
3,55,2476,579,1,0,0,1,0,1,0.000100,...,0,1,0,0,1,0,0,0,0,1
4,54,184,673,2,0,0,0,0,1,0.000100,...,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,1,257,1,0,0,1,0,0,0.000100,...,0,0,1,1,0,0,0,0,0,1
11158,39,733,83,4,0,0,0,0,0,0.000100,...,0,1,0,0,1,0,0,0,0,1
11159,32,29,156,2,0,0,0,0,0,0.000100,...,0,0,1,0,1,0,0,0,0,1
11160,43,0,9,2,5,0,0,1,0,0.005814,...,0,1,0,0,1,0,0,1,0,0


In [51]:
bank_with_dummies.describe()

Unnamed: 0,age,balance,duration,campaign,previous,default_cat,housing_cat,loan_cat,deposit_cat,recent_pdays,...,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,poutcome_failure,poutcome_success,poutcome_unknown
count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,...,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
mean,41.231948,1528.538524,371.993818,2.508421,0.832557,0.015051,0.473123,0.130801,0.47384,0.003124,...,0.115839,0.568984,0.315176,0.134385,0.490593,0.330496,0.044526,0.110016,0.095951,0.794033
std,11.913369,3225.413326,347.128386,2.722077,2.292007,0.121761,0.499299,0.337198,0.499338,0.030686,...,0.320047,0.495241,0.464607,0.34108,0.499934,0.470413,0.20627,0.312924,0.294537,0.404424
min,18.0,-6847.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,122.0,138.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,39.0,550.0,255.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0001,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,49.0,1708.0,496.0,3.0,1.0,0.0,1.0,0.0,1.0,0.001919,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
max,95.0,81204.0,3881.0,63.0,58.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [52]:
X = np.array(bank_with_dummies.drop(['deposit_cat'], 1).astype(int))
y = np.array(bank_with_dummies['deposit_cat'])


In [53]:
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=.25,random_state=1)

In [54]:
model = SVC()

In [55]:
model.fit(xtrain,ytrain)

SVC()

In [56]:
model.score(xtest,ytest)

0.7273378717305625