In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import  is_string_dtype,is_numeric_dtype
import matplotlib.pyplot as plt
from  sklearn.linear_model import LogisticRegression

In [2]:
# Reading the CSV
Working_df = pd.read_csv('bank-additional-full.csv',sep=';')

In [None]:
Working_df.shape

In [None]:
Working_df.head()

In [None]:
Working_df.columns

In [3]:
#removing duration for modelling purposes
Working_df_new = Working_df.drop('duration',axis =1)

In [4]:
#Assuming unknowns to be negative in the below columns and mapping accordingly
#default,housing,loan,poutcome
Working_df_new.loc[Working_df_new.default =='unknown','default']='no'
Working_df_new.loc[Working_df_new.housing =='unknown','housing']='no'
Working_df_new.loc[Working_df_new.loan =='unknown','loan']='no'
Working_df_new.loc[Working_df_new.poutcome =='nonexistent','poutcome']='failure'

In [5]:
#Classifying as employed and unemployed with good income levels for term deposit
#Blue collar and housemaid classified as unemployed
employed = ['admin','entrepreneur','management','retired','self-employed','services','technician','housemaid','blue-collar']
unemployed =['student','technician','umemployed','unknown']
Working_df_new.loc[Working_df_new.job.isin(employed),'job']=1
Working_df_new.loc[Working_df_new.job.isin(unemployed),'job']=0

In [6]:
#Classifying as married and not married (as per current status)
Working_df_new.loc[Working_df_new.marital == 'married','marital']=1
Working_df_new.loc[Working_df_new.marital != 'married','marital']=0

In [7]:
#Converting categorical variables to numbers
def conv_to_cats(df):
    for n,c in df.items():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

In [8]:
conv_to_cats(Working_df_new)
print(Working_df_new.education.cat.categories)
#Setting ordinality for education
Working_df_new.education.cat.set_categories(['professional.course','university.degree','high.school','basic.9y','basic.6y','basic.4y','illiterate','unknown'],ordered = True,inplace = True)
Working_df_new.education = Working_df_new.education.cat.codes

Index(['basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate',
       'professional.course', 'university.degree', 'unknown'],
      dtype='object')


In [9]:
#Checking the Pdays values
def minMax(x):
    return pd.Series(index=['min','max','mean','25th','75th'],data=[x.min(),x.max(),x.mean(),x.quantile(0.25),x.quantile(0.75)])
minMax(Working_df_new.query("pdays <999")[['pdays']])

min                     pdays    0
dtype: int64
max                    pdays    27
dtype: int64
mean           pdays    6.014521
dtype: float64
25th    pdays    3.0
Name: 0.25, dtype: float64
75th    pdays    7.0
Name: 0.75, dtype: float64
dtype: object

In [10]:
#Setting a level 1 - pdays <=3
#          level 2 - 3>pdays <=7
#          level 3 - 7 < pdays <27
#           999    - as is
Working_df_new.loc[:,'pdays_ord'] = np.where(Working_df_new.pdays<= 3 ,1,\
                        np.where(((Working_df_new.pdays >3)&(Working_df_new.pdays<=7)),2,\
                                 np.where (Working_df_new.pdays != 999,3,Working_df_new.pdays)))
Working_df_new.loc[:,'pdays_bool']= np.where(Working_df_new.pdays == 999 ,0,1)
#Dropping the pdays column
Working_df_new.drop('pdays',axis = 1,inplace=True)

In [11]:
#Splitting into numeric and non numeric data types
quantitative = [f for f in Working_df_new.columns if is_numeric_dtype(Working_df_new[f])]
qualitative = Working_df_new.columns.difference(quantitative)
# dropping the y from the independent variable list and mapping it separately
qualitative = qualitative.drop('y')
Working_df_new.y.replace(['yes', 'no'], [1, 0], inplace=True)

In [12]:
#Replace Categorical variable with their codes
def numericalize(df,col):
    if not is_numeric_dtype(df[col]):
        df[col+'_num']= df[col].cat.codes+1

In [13]:
for i in qualitative:
    numericalize(Working_df_new,i)
    #dropping the categorical column after replacing
    Working_df_new.drop(i,axis =1,inplace=True)

In [14]:
Working_df_new.columns

Index(['age', 'marital', 'education', 'campaign', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
       'pdays_ord', 'pdays_bool', 'contact_num', 'day_of_week_num',
       'default_num', 'housing_num', 'job_num', 'loan_num', 'month_num',
       'poutcome_num'],
      dtype='object')

In [15]:
y = Working_df_new['y']
Working_df_new.drop('y',axis=1,inplace=True)


In [17]:
Working_df_new.shape

(41188, 20)

In [22]:
#Splitting into train and validation
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

# n_valid = 10297  # 25% of the dataset
n_valid = 20594  # 50% of the dataset
n_trn = len(Working_df_new)-n_valid
raw_train, raw_valid = split_vals(Working_df_new, n_trn)
X_train, X_valid = split_vals(Working_df_new, n_trn)
y_train, y_valid = split_vals(y, n_trn)

In [23]:
clf = LogisticRegression()
#Deciding columns to be passed into the model
clf.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
clf.score(X_train,y_train)

0.9532388074196367

In [25]:
clf.score(X_valid,y_valid)

0.8417985821112945

In [26]:
clf.coef_

array([[-7.52466337e-03,  0.00000000e+00, -1.70143507e-02,
        -3.28250780e-02,  0.00000000e+00,  2.70146574e-03,
        -1.93357049e-02,  6.35818231e-03,  1.04121526e-03,
         1.21540144e-02, -6.36216503e-02,  0.00000000e+00,
        -3.06461037e-02,  3.69180754e-03, -6.36853356e-05,
        -1.54040903e-03, -5.11742971e-03,  2.31560399e-03,
        -6.40382734e-02, -6.36853356e-05]])

In [None]:
X_train.columns

In [None]:
print(list(zip(clf.coef_, X_train.columns)))

In [None]:
coefficients = pd.DataFrame({"Feature":np.array(X_train.columns),"Coefficients":np.transpose(clf.coef_)})

In [27]:
np.transpose(clf.coef_)

array([[-7.52466337e-03],
       [ 0.00000000e+00],
       [-1.70143507e-02],
       [-3.28250780e-02],
       [ 0.00000000e+00],
       [ 2.70146574e-03],
       [-1.93357049e-02],
       [ 6.35818231e-03],
       [ 1.04121526e-03],
       [ 1.21540144e-02],
       [-6.36216503e-02],
       [ 0.00000000e+00],
       [-3.06461037e-02],
       [ 3.69180754e-03],
       [-6.36853356e-05],
       [-1.54040903e-03],
       [-5.11742971e-03],
       [ 2.31560399e-03],
       [-6.40382734e-02],
       [-6.36853356e-05]])

In [28]:
feature_Array = np.ones(len(X_train))
feature_Array =np.array(X_train.columns)

In [29]:
feature_Array

array(['age', 'marital', 'education', 'campaign', 'previous',
       'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m',
       'nr.employed', 'pdays_ord', 'pdays_bool', 'contact_num',
       'day_of_week_num', 'default_num', 'housing_num', 'job_num',
       'loan_num', 'month_num', 'poutcome_num'], dtype=object)

In [None]:
clf.sparsify()

In [None]:
clf.intercept_