In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import  is_string_dtype,is_numeric_dtype
import matplotlib.pyplot as plt
from  sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
# Reading the CSV
Working_df = pd.read_csv('bank-additional-full.csv',sep=';')

In [3]:
Working_df.shape

(41188, 21)

In [4]:
Working_df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [5]:
Working_df.columns

Index([u'age', u'job', u'marital', u'education', u'default', u'housing',
       u'loan', u'contact', u'month', u'day_of_week', u'duration', u'campaign',
       u'pdays', u'previous', u'poutcome', u'emp.var.rate', u'cons.price.idx',
       u'cons.conf.idx', u'euribor3m', u'nr.employed', u'y'],
      dtype='object')

In [6]:
#removing duration for modelling purposes
Working_df_new = Working_df.drop('duration',axis =1)

In [7]:
#Assuming unknowns to be negative in the below columns and mapping accordingly
#default,housing,loan,poutcome
Working_df_new.loc[Working_df_new.default =='unknown','default']='no'
Working_df_new.loc[Working_df_new.housing =='unknown','housing']='no'
Working_df_new.loc[Working_df_new.loan =='unknown','loan']='no'
Working_df_new.loc[Working_df_new.poutcome =='nonexistent','poutcome']='failure'

In [8]:
#Classifying as employed and unemployed with good income levels for term deposit
#Blue collar and housemaid classified as unemployed
employed = ['admin','entrepreneur','management','retired','self-employed','services','technician','housemaid','blue-collar']
unemployed =['student','technician','umemployed','unknown']
Working_df_new.loc[Working_df_new.job.isin(employed),'job']=1
Working_df_new.loc[Working_df_new.job.isin(unemployed),'job']=0

In [9]:
#Classifying as married and not married (as per current status)
Working_df_new.loc[Working_df_new.marital == 'married','marital']=1
Working_df_new.loc[Working_df_new.marital != 'married','marital']=0

In [10]:
#Converting datatype to categories
def conv_to_cats(df):
    for n,c in df.iteritems():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

In [11]:
conv_to_cats(Working_df_new)
print(Working_df_new.education.cat.categories)
#Setting ordinality for education
Working_df_new.education.cat.set_categories(['professional.course','university.degree','high.school','basic.9y','basic.6y','basic.4y','illiterate','unknown'],ordered = True,inplace = True)
Working_df_new.education = Working_df_new.education.cat.codes

Index([u'basic.4y', u'basic.6y', u'basic.9y', u'high.school', u'illiterate',
       u'professional.course', u'university.degree', u'unknown'],
      dtype='object')


In [12]:
#Checking the Pdays values
def minMax(x):
    return pd.Series(index=['min','max','mean','25th','75th'],data=[x.min(),x.max(),x.mean(),x.quantile(0.25),x.quantile(0.75)])
minMax(Working_df_new.query("(pdays <999)")[['pdays']])

min                     pdays    0
dtype: int64
max                    pdays    27
dtype: int64
mean           pdays    6.014521
dtype: float64
25th    pdays    3.0
Name: 0.25, dtype: float64
75th    pdays    7.0
Name: 0.75, dtype: float64
dtype: object

In [13]:
#Setting a level 1 - pdays <=3
#          level 2 - 3>pdays <=7
#          level 3 - 7 < pdays <27
#           999    - as is
Working_df_new.loc[:,'pdays_ord'] = np.where(Working_df_new.pdays<= 3 ,1,\
                        np.where(((Working_df_new.pdays >3)&(Working_df_new.pdays<=7)),2,\
                                 np.where (Working_df_new.pdays != 999,3,Working_df_new.pdays)))
Working_df_new.loc[:,'pdays_bool']= np.where(Working_df_new.pdays == 999 ,0,1)
#Dropping the pdays column
Working_df_new.drop('pdays',axis = 1,inplace=True)

In [14]:
#Splitting into numeric and non numeric data types
quantitative = [f for f in Working_df_new.columns if is_numeric_dtype(Working_df_new[f])]
qualitative = Working_df_new.columns.difference(quantitative)
# dropping the y from the independent variable list and mapping it separately
qualitative = qualitative.drop('y')
Working_df_new.y.replace(['yes', 'no'], [1, 0], inplace=True)

In [15]:
#Replace Categorical variable with their codes
def numericalize(df,col):
    if not is_numeric_dtype(df[col]):
        df[col+'_num']= df[col].cat.codes+1

In [16]:
for i in qualitative:
    numericalize(Working_df_new,i)
    #dropping the categorical column after replacing
    Working_df_new.drop(i,axis =1,inplace=True)

In [17]:
Working_df_new.columns

Index([u'age', u'education', u'campaign', u'previous', u'emp.var.rate',
       u'cons.price.idx', u'cons.conf.idx', u'euribor3m', u'nr.employed', u'y',
       u'pdays_ord', u'pdays_bool', u'contact_num', u'day_of_week_num',
       u'default_num', u'housing_num', u'job_num', u'loan_num', u'marital_num',
       u'month_num', u'poutcome_num'],
      dtype='object')

In [18]:
y = Working_df_new['y']
Working_df_new.drop('y',axis=1,inplace=True)


In [19]:
Working_df_new.shape

(41188, 20)

In [20]:
#Ramdomizing sampling
X_train, X_valid, y_train, y_valid = train_test_split(Working_df_new, y, test_size=0.33, random_state=42)

In [21]:
clf = LogisticRegression()
#Deciding columns to be passed into the model
clf.fit(X_train,y_train)
clf.score(X_train,y_train)
clf.score(X_valid,y_valid)

0.89913926285588175

In [27]:
y_pred_valid =clf.predict(X_valid)
metrics.confusion_matrix(np.array(y_valid),np.array(y_pred_valid))

array([[11891,   165],
       [ 1206,   331]], dtype=int64)

In [28]:
metrics.roc_auc_score(np.array(y_valid),np.array(y_pred_valid))

0.60083422773532669

In [25]:
y_pred_train = clf.predict(X_train)
metrics.confusion_matrix(np.array(y_train),np.array(y_pred_train))


array([[24187,   305],
       [ 2440,   663]], dtype=int64)

In [26]:
metrics.roc_auc_score(np.array(y_train),np.array(y_pred_train))

0.60060557502343848