In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import glob
import warnings
from IPython.display import display

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# All matplotlib defaults used by this notebook, set in one place.
mpl.rcParams.update({
    # Resolution for both saved and on-screen figures
    'savefig.dpi': 128,
    'figure.dpi': 128,
    # Plot size to 14" x 7"
    'figure.figsize': (14, 7),
    # Font size to 14
    'font.size': 14,
    # Do not display top and right frame lines
    'axes.spines.top': False,
    'axes.spines.right': False,
    # Remove grid lines
    'axes.grid': False,
    # Set background color to white
    'axes.facecolor': 'white',
})

In [2]:
# Read whole year data: every CSV found one directory level below data/.
# glob returns paths in arbitrary, OS-dependent order, so sort them to make
# the row order of the combined frame reproducible between runs.
allFiles = sorted(glob.glob("data/*/*.csv"))
if not allFiles:
    # pd.concat([]) would otherwise fail with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError('No CSV files matched "data/*/*.csv"')

# One frame per file; each file's first row is its header.
list_ = [
    pd.read_csv(file_, index_col=None, header=0, encoding="utf-8")
    for file_ in allFiles
]
# Stack the per-file frames. Each file keeps its own row labels, so the
# combined index contains repeated values.
df = pd.concat(list_)
df.shape

(279386, 49)

In [3]:
# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

In [4]:
def transform_cols(df):
    """Normalise the column labels of ``df`` in place.

    Every label is lower-cased and each underscore is replaced by a space.
    The frame that was passed in is modified and also returned.
    """
    df.columns = [str.lower(label).replace('_', ' ') for label in df.columns]
    return df

In [5]:
# Normalise the column labels (transform_cols lower-cases them and replaces
# "_" with " "), then display the resulting labels.
df = transform_cols(df)
df.columns

Index(['tracking id', 'marketing code', 'original code', 'marketing name',
       'classification', 'app source', 'web source', 'enquired', 'first call',
       'last access', 'last access by', 'next contact date', 'decision',
       'appointed', 'submitted', 'lead approval date',
       'reporting approval date', 'potential settlement date', 'settled',
       'lender', 'hlc', 'rejected reason', 'loan amount', 'enquiry status',
       'hot transfer', 'held', 'held by', 'processing system', 'lead status',
       'client name', 'suburb', 'state', 'post code', 'area echoice direct',
       'email', 'mobile', 'phone', 'home phone', 'dead reason', 'broker name',
       'capacity area', 'lead source', 'commission in', 'commission out',
       'lead assist', 'loan reason', 'property use', 'consumer comment',
       'internal comment'],
      dtype='object')

In [6]:
# Columns kept as features
selected_features = [
    'marketing code',
    'enquired',
    'loan amount',
    'loan reason',
    'property use',
    'suburb',
    'state',
    'post code',
]
# Label column
target = 'enquiry status'
whole_set = [*selected_features, target]

# Restrict the frame to the feature columns plus the label column
df = df[whole_set]
df.shape

(256851, 9)

In [7]:
# Drop rows whose target value is one of these three statuses.
excluded_statuses = ['In Progress','Just Received','On Hold']
has_excluded_status = df[target].isin(excluded_statuses)
df = df[~has_excluded_status]

In [8]:
# Keep only rows in which every remaining column has a value.
df = df.dropna(axis=0, how='any')

In [9]:
# Cast to string first so the commas can be removed with a string replace
amount_text = df['loan amount'].astype("str")
df['loan amount'] = amount_text.str.replace(",","")

In [10]:
def capitalizer(string: str) -> str:
    """Return ``string`` converted to lower case.

    NOTE(review): despite its name this helper lower-cases rather than
    capitalises; confirm which behaviour is intended before relying on it.
    """
    lowered = string.lower()
    return lowered

In [11]:
# 'loan amount' strings that are not plain numbers and are filtered out
# before the column is converted to a numeric type. Despite the variable
# name these are cell values, not column names. Each value is listed once
# (the list previously contained '9999-' twice).
invalid_columns = [
    # ranges
    '500001-$1000000',
    '300001-$500000',
    '0-$300000',
    '250000 - 300000',
    '250000-350000',
    '80-90k',
    # open-ended or malformed amounts
    '1000,001+',  # NOTE(review): cannot match if commas were already stripped from the column
    '1000001+',
    '9999-',
    'I50000',
    '1.5 M',
    # not an amount at all
    '2600 monthly',
    'not_sure',
]

In [12]:
# Discard rows whose 'loan amount' is one of the listed non-numeric strings
is_invalid_amount = df['loan amount'].isin(invalid_columns)
df = df[~is_invalid_amount]
df.shape

(168279, 9)

In [13]:
# 'post code' entries that are removed (only when that column is present)
drop_values = ['QLD','victoria',"270-1176","VIC","2575b","6110r","2150s"]
has_post_code = 'post code' in df.columns
if has_post_code:
    is_bad_post_code = df['post code'].isin(drop_values)
    df = df[~is_bad_post_code]

In [14]:
# Sanity check: show any rows whose 'loan amount' is the literal string "nan"
is_nan_text = df['loan amount'].eq("nan")
df[is_nan_text]

Unnamed: 0,marketing code,enquired,loan amount,loan reason,property use,suburb,state,post code,enquiry status


In [15]:
def transform(df):
    """Convert the cleaned string columns of ``df`` to numeric features, in place.

    * ``'loan amount'`` is cast to float and then truncated to int.
    * If ``'enquired'`` is present it is parsed as a timestamp, split into the
      new columns ``year``, ``month``, ``day``, ``hour`` and ``weekday``
      (Monday = 0), and then dropped.
    * If ``'post code'`` is present it is cast to int.

    The ``'enquired'`` check now guards all of the date handling, so calling
    the function again on an already transformed frame (for example when the
    notebook cell is re-run) no longer raises ``KeyError``.

    NOTE(review): timestamps are parsed with pandas' default settings;
    confirm that the source data is not in a day-first format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; must contain a ``'loan amount'`` column.

    Returns
    -------
    None
        The frame that was passed in is modified directly.
    """
    # Go through float first so values such as "250000.0" parse, then truncate.
    df['loan amount'] = df['loan amount'].astype('float').astype(int)

    if 'enquired' in df.columns:
        # A DatetimeIndex is assigned by position, so repeated row labels in
        # df.index cannot cause misalignment.
        enquired = pd.DatetimeIndex(df['enquired'])
        df['year'] = enquired.year
        df['month'] = enquired.month
        df['day'] = enquired.day
        df['hour'] = enquired.hour
        df['weekday'] = enquired.dayofweek
        # The raw timestamp is replaced by the parts derived from it.
        df.drop(['enquired'], axis = 1, inplace = True)

    if 'post code' in df.columns:
        df['post code'] = df['post code'].astype('int')

In [16]:
# Run transform on df for its in-place effects; the return value is not assigned.
transform(df)

In [17]:
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns at once.

    Exposes ``fit`` / ``transform`` / ``fit_transform`` methods. No fitted
    state is kept: ``fit`` does nothing and every ``transform`` call fits a
    fresh ``LabelEncoder`` per column on the data it is given, so codes from
    two separate calls are not guaranteed to agree.
    """

    def __init__(self,columns = None):
        # Column names to encode; None means "encode every column".
        self.columns = columns

    def fit(self,X,y=None):
        """Do nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        X itself is left untouched; an encoded copy is returned. Listed
        columns are cast to ``str`` before encoding, whereas in the
        all-columns case the values are encoded as they are.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col].astype(str))
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent spelling and also exists in older releases.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Shorthand for ``fit(X, y).transform(X)``."""
        return self.fit(X,y).transform(X)

In [18]:
# Feature matrix: every column of df except the target column.
X = df.loc[:,df.columns != target]

In [19]:
# Sub-frame of X holding only its object-dtype columns
category_column = X.select_dtypes(include=['object'])

In [20]:
# Label-encode the object-dtype columns of X; the other columns pass through unchanged.
multi_encoder = MultiColumnLabelEncoder(columns=category_column.columns)
X = multi_encoder.fit_transform(X)

In [21]:
# Print a summary of the encoded feature frame X.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168273 entries, 1 to 2841
Data columns (total 12 columns):
marketing code    168273 non-null int64
loan amount       168273 non-null int64
loan reason       168273 non-null int64
property use      168273 non-null int64
suburb            168273 non-null int64
state             168273 non-null int64
post code         168273 non-null int64
year              168273 non-null int64
month             168273 non-null int64
day               168273 non-null int64
hour              168273 non-null int64
weekday           168273 non-null int64
dtypes: int64(12)
memory usage: 16.7 MB


In [22]:
# Pairwise correlation between the columns of X.
X.corr()

Unnamed: 0,marketing code,loan amount,loan reason,property use,suburb,state,post code,year,month,day,hour,weekday
marketing code,1.0,0.027223,-0.041408,-0.042057,-0.011394,-0.067629,-0.074586,0.310577,-0.049488,0.039982,0.059369,0.028574
loan amount,0.027223,1.0,-0.057781,0.019784,0.00423,-0.07271,-0.108528,0.074768,0.004439,-0.00075,-0.001363,0.003966
loan reason,-0.041408,-0.057781,1.0,0.096785,-0.012323,0.043509,0.02568,0.026848,0.006747,-0.011477,-0.022969,-0.051413
property use,-0.042057,0.019784,0.096785,1.0,0.01363,0.044194,0.057865,-0.062065,0.008278,0.008884,0.000412,-0.000724
suburb,-0.011394,0.00423,-0.012323,0.01363,1.0,0.016172,-0.003166,0.019837,-0.000161,0.002068,-0.007138,0.004811
state,-0.067629,-0.07271,0.043509,0.044194,0.016172,1.0,0.66537,-0.086592,0.007391,-0.010586,-0.003902,-0.01055
post code,-0.074586,-0.108528,0.02568,0.057865,-0.003166,0.66537,1.0,-0.050882,0.010324,-0.009219,-0.011243,0.001335
year,0.310577,0.074768,0.026848,-0.062065,0.019837,-0.086592,-0.050882,1.0,-0.092301,-0.024464,-0.030756,0.022922
month,-0.049488,0.004439,0.006747,0.008278,-0.000161,0.007391,0.010324,-0.092301,1.0,0.002982,-0.01558,-0.005659
day,0.039982,-0.00075,-0.011477,0.008884,0.002068,-0.010586,-0.009219,-0.024464,0.002982,1.0,0.005572,0.023787
