In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import glob
import warnings
from IPython.display import display

In [2]:
# read whole year data
allFiles = glob.glob("data/*.csv")
df = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    list_.append(df)
df = pd.concat(list_)

df.columns = map(str.lower, df.columns)

In [3]:
selected_features = ['marketing code','suburb', 'state','post code','enquired',
                     'loan amount','loan_reason','property_use']
target = 'enquiry status';
whole_set = selected_features + [target]

In [4]:
df = df[whole_set]
# df = df.replace('On Hold','Rejected')
# convert loan amount to number type, and change string to NaN
df['loan amount'] = pd.to_numeric(df['loan amount'],errors='coerce')

In [5]:
df = df.dropna(axis=0, how='any')
df = df[~df[target].isin(['In Progress','Just Received','On Hold'])]

In [6]:
def changeDateType(df): 
    df['enquired'] = pd.DatetimeIndex(df['enquired'])
    df['loan amount'] = df['loan amount'].astype(int)
    df['post code'] = df['post code'].astype(int)
    return df
    
changeDateType(df)

Unnamed: 0,marketing code,suburb,state,post code,enquired,loan amount,loan_reason,property_use,enquiry status
0,DOM35,Westmead,NSW,2145,2017-07-28 12:06:24,570000,First HomeBuyer,Residence,Rejected
2,A9471,Wallan,VIC,3756,2017-11-24 16:22:51,280000,First HomeBuyer,Residence,Rejected
4,A9234,Sawtell,NSW,2452,2017-11-25 07:42:51,300000,Buying Again,Investment,Rejected
5,A9452,Singleton,NSW,2330,2017-11-25 07:59:03,310000,Refinance,Residence,Rejected
6,GA07,Alfredton,VIC,3350,2017-11-25 15:02:46,270000,First HomeBuyer,Residence,Rejected
7,AH04,West Wyalong,NSW,2671,2017-11-26 10:46:40,109000,Buying Again,Residence,Rejected
8,A9819,Asquith,NSW,2077,2017-11-26 17:57:26,360000,First HomeBuyer,Investment,Rejected
9,A9234,St Johns Park,NSW,2176,2017-11-26 18:03:28,570000,Buying Again,Investment,Rejected
10,GA01,Yorkeys Knob,QLD,4878,2017-11-27 15:18:57,190000,First HomeBuyer,Investment,Rejected
11,A9432,Panania,NSW,2213,2017-11-30 18:25:03,290000,Refinance With Cash Out,Residence,Accepted


In [7]:
# filter years
start_date = '2017-01-01' 
end_date = '2017-12-31'
mask = (df['enquired'] > start_date) & (df['enquired'] <= end_date)
df = df.loc[mask].reset_index(drop=True)

In [8]:
from datetime import datetime
from dateutil.parser import parse

def getDetailDate(df):
    # remove Year feature since it is not important (show below random forest)
    # data_set['Year'] = data_set['Enquired'].dt.year
    df['month'] = df['enquired'].dt.month
    df['day'] = df['enquired'].dt.day
    df['hour'] = df['enquired'].dt.hour
    df['weekday'] = df['enquired'].dt.weekday_name
    
    if("enquired" in df.columns):
        df = df.loc[:,df.columns != 'enquired']
    return df
    
df = getDetailDate(df)

In [9]:
df.columns = df.columns.str.replace('_', ' ')

In [10]:
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col].astype(str))
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [11]:
X = df[df.keys()]
X = df.loc[:,df.columns != target]
y = df[target]

In [12]:
category_column = X.select_dtypes(include='object')

le = LabelEncoder()
y = le.fit_transform(y)
le.classes_

array(['Accepted', 'Rejected'], dtype=object)

In [13]:
from sklearn.model_selection import train_test_split

# split data and labels into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                     test_size=0.4,
                     random_state=0,
                     stratify=y)

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import KernelPCA
from sklearn import svm
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(MultiColumnLabelEncoder(columns=category_column.columns),
                         StandardScaler(),
                         svm.SVC(kernel='rbf',probability=True))
pipeline.fit(X_train, y_train)
predit_labels = pipeline.predict(X_test)
print('Test Accuracy: %.3f' % pipeline.score(X_test, y_test))

Test Accuracy: 0.842


In [15]:
from sklearn.externals import joblib
joblib.dump(pipeline, 'app/models/lrpipeline.pkl')

['app/models/lrpipeline.pkl']

In [16]:
model_columns = list(X.columns)
joblib.dump(model_columns, 'app/models/model_columns.pkl')

['app/models/model_columns.pkl']

In [23]:
import json
my_json_string = json.dumps({
    "marketing_code": "P0001",
    "enquired":"27/10/13 21:58",
    "loan amount": "260000",
    "suburb": "THURSDAY ISLAND",
    "state": "QLD",
    "post code": "4875",
    "property_use": "First HomeBuyer",
    "loan_reason": "Refinance"
})
my_json_string

'{"marketing_code": "P0001", "enquired": "27/10/13 21:58", "loan amount": "260000", "suburb": "THURSDAY ISLAND", "state": "QLD", "post code": "4875", "property_use": "First HomeBuyer", "loan_reason": "Refinance"}'

In [24]:
data = json.loads(my_json_string)
data = pd.DataFrame(data,index=[0])
data.dtypes

enquired          object
loan amount       object
loan_reason       object
marketing_code    object
post code         object
property_use      object
state             object
suburb            object
dtype: object

In [25]:
changeDateType(data)
data = getDetailDate(data)

In [26]:
data.columns = data.columns.str.replace('_', ' ')

In [29]:
y_pred = pipeline.predict(data)
pipeline.predict_proba(data)

array([[ 0.63625231,  0.36374769]])