In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

In [53]:
#reading csv file into a panda dataframe
bank_df = pd.read_csv("bank.csv")
bank_df.head(5)

Unnamed: 0,age,job,marital,education,contact,month,day_of_week,duration,y
0,56,housemaid,married,basic.4y,telephone,may,mon,261,no
1,57,services,married,high.school,telephone,may,mon,149,no
2,37,services,married,high.school,telephone,may,mon,226,no
3,40,admin.,married,basic.6y,telephone,may,mon,151,no
4,56,services,married,high.school,telephone,may,mon,307,no


In [54]:
#No. of rows and columns
bank_df.shape

(41188, 9)

In [55]:
#replace yes and no with 1 and 0
bank_df["y"].replace({"no":0, "yes":1}, inplace=True)

In [56]:
bank_df.head(10)

Unnamed: 0,age,job,marital,education,contact,month,day_of_week,duration,y
0,56,housemaid,married,basic.4y,telephone,may,mon,261,0
1,57,services,married,high.school,telephone,may,mon,149,0
2,37,services,married,high.school,telephone,may,mon,226,0
3,40,admin.,married,basic.6y,telephone,may,mon,151,0
4,56,services,married,high.school,telephone,may,mon,307,0
5,45,services,married,basic.9y,telephone,may,mon,198,0
6,59,admin.,married,professional.course,telephone,may,mon,139,0
7,41,blue-collar,married,unknown,telephone,may,mon,217,0
8,24,technician,single,professional.course,telephone,may,mon,380,0
9,25,services,single,high.school,telephone,may,mon,50,0


In [57]:
#renaming column name
bank_df.rename(columns={"y": "purchase"}, inplace=True)

In [58]:
bank_df.head()

Unnamed: 0,age,job,marital,education,contact,month,day_of_week,duration,purchase
0,56,housemaid,married,basic.4y,telephone,may,mon,261,0
1,57,services,married,high.school,telephone,may,mon,149,0
2,37,services,married,high.school,telephone,may,mon,226,0
3,40,admin.,married,basic.6y,telephone,may,mon,151,0
4,56,services,married,high.school,telephone,may,mon,307,0


Factor Variables

In [59]:
factor_x = bank_df.select_dtypes(exclude=["int64", "float64", "category"]).columns.values
print(factor_x)

['job' 'marital' 'education' 'contact' 'month' 'day_of_week']


In [60]:
#Unique values of all factor variables
for i in factor_x:
    print("Factor Variable = " + i)
    print(bank_df[i].unique())

Factor Variable = job
['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student']
Factor Variable = marital
['married' 'single' 'divorced' 'unknown']
Factor Variable = education
['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'unknown' 'university.degree' 'illiterate']
Factor Variable = contact
['telephone' 'cellular']
Factor Variable = month
['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep']
Factor Variable = day_of_week
['mon' 'tue' 'wed' 'thu' 'fri']


Cleaning Dataset and reducing unique variables.

In [61]:
bank_df.replace(["basic.4y", "basic.6y", "basic.9y", "unknown"], "school", inplace=True)

In [62]:
bank_df.replace("professional.course", "certification", inplace=True)

In [63]:
bank_df.replace("university.degree", "degree", inplace=True)

In [64]:
bank_df.education.unique()

array(['school', 'high.school', 'certification', 'degree', 'illiterate'],
      dtype=object)

Replace retired, student and unknown to unemployed and rest to employed.

In [65]:
for i in bank_df.job.unique():
    if i in ["retired", "student", "unknown", "unemployed"]:
        bank_df.job[bank_df.job == i] = "unemployed"

    else:
        bank_df.job[bank_df.job == i] = "employed"

In [66]:
bank_df.job.unique()

array(['employed', 'unemployed'], dtype=object)

convert all X categorical variables

In [67]:
for i in factor_x:
    bank_df[i] = bank_df[i].astype("category", copy=False)

bank_df.dtypes

age               int64
job            category
marital        category
education      category
contact        category
month          category
day_of_week    category
duration          int64
purchase          int64
dtype: object

In [68]:
for var in factor_x:  
    cat_list='var'+'_'+var  
    cat_list = pd.get_dummies(bank_df[var], prefix=var)  
    data1=bank_df.join(cat_list)  
    bank_df = data1  
new_col_set = bank_df.columns  
print(new_col_set)  
# data with new columns  
bank_df.head()

Index(['age', 'job', 'marital', 'education', 'contact', 'month', 'day_of_week',
       'duration', 'purchase', 'job_employed', 'job_unemployed',
       'marital_divorced', 'marital_married', 'marital_school',
       'marital_single', 'education_certification', 'education_degree',
       'education_high.school', 'education_illiterate', 'education_school',
       'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug',
       'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep', 'day_of_week_fri',
       'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
       'day_of_week_wed'],
      dtype='object')


Unnamed: 0,age,job,marital,education,contact,month,day_of_week,duration,purchase,job_employed,...,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed
0,56,employed,married,school,telephone,may,mon,261,0,1,...,0,1,0,0,0,0,1,0,0,0
1,57,employed,married,high.school,telephone,may,mon,149,0,1,...,0,1,0,0,0,0,1,0,0,0
2,37,employed,married,high.school,telephone,may,mon,226,0,1,...,0,1,0,0,0,0,1,0,0,0
3,40,employed,married,school,telephone,may,mon,151,0,1,...,0,1,0,0,0,0,1,0,0,0
4,56,employed,married,high.school,telephone,may,mon,307,0,1,...,0,1,0,0,0,0,1,0,0,0


In [69]:
# get the difference of new and old columns  
to_keep = list(set(new_col_set).difference(set(factor_x)))  
to_keep  
# create the final dataset with the final columns set  
# ---------------------------------------------------  
bank_final = bank_df[to_keep]  
bank_final.head(4)  
# reordering the columns  
# ---------------------------------------------------  
bank_final = pd.concat([bank_final['purchase'], bank_final.drop('purchase', axis=1)], axis=1)  
bank_final.head()

Unnamed: 0,purchase,month_oct,marital_school,month_mar,month_jun,education_certification,month_may,day_of_week_wed,day_of_week_fri,contact_telephone,...,month_sep,education_illiterate,education_school,job_employed,marital_married,education_high.school,day_of_week_tue,month_aug,month_nov,marital_single
0,0,0,0,0,0,0,1,0,0,1,...,0,0,1,1,1,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,1,...,0,0,0,1,1,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,1,...,0,0,0,1,1,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,1,...,0,0,1,1,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,1,...,0,0,0,1,1,1,0,0,0,0
