In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import glob
from IPython.display import display

# read whole year data
allFiles = glob.glob("data/*.csv")
df = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    list_.append(df)
df = pd.concat(list_)

In [2]:
selected_features = ['Marketing Code','Suburb', 'State','Post Code','Classification','Enquired',
                     'Loan Amount','loan_reason','property_use']
target = 'Enquiry Status';
whole_set = selected_features + [target]

df = df[whole_set]
df = df.replace('On Hold','Rejected')
# convert loan amount to number type, and change string to NaN
df['Loan Amount'] = pd.to_numeric(df['Loan Amount'],errors='coerce')
df = df.dropna(axis=0, how='any')
df = df[~df[target].isin(['In Progress','Just Received'])]

In [3]:
df['Enquired'] = pd.DatetimeIndex(df['Enquired'])
df['Loan Amount'] = df['Loan Amount'].astype(int)
df['Post Code'] = df['Post Code'].astype(int)

In [4]:
from datetime import datetime
from dateutil.parser import parse

# filter years
start_date = '2017-01-01' 
end_date = '2017-12-31'
mask = (df['Enquired'] > start_date) & (df['Enquired'] <= end_date)
df = df.loc[mask].reset_index(drop=True)

# remove Year feature since it is not important (show below random forest)
# data_set['Year'] = data_set['Enquired'].dt.year
df['Month'] = df['Enquired'].dt.month
df['Day'] = df['Enquired'].dt.day
df['Hour'] = df['Enquired'].dt.hour
df['Weekday'] = df['Enquired'].dt.weekday_name

df = df.loc[:,df.columns != 'Enquiried']

In [5]:
print("List of data types: \n{}".format(df.dtypes))

encoded_columns = list(df.select_dtypes(include=['category','object']))

print("selected encoded_columns: \n{}".format(encoded_columns))

List of data types: 
Marketing Code            object
Suburb                    object
State                     object
Post Code                  int64
Classification            object
Enquired          datetime64[ns]
Loan Amount                int64
loan_reason               object
property_use              object
Enquiry Status            object
Month                      int64
Day                        int64
Hour                       int64
Weekday                   object
dtype: object
selected encoded_columns: 
['Marketing Code', 'Suburb', 'State', 'Classification', 'loan_reason', 'property_use', 'Enquiry Status', 'Weekday']


In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = le.fit_transform(output[col].astype(str))
        else:
            for colname,col in output.iteritems():
                output[colname] = le.fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [7]:
mce = MultiColumnLabelEncoder(columns = encoded_columns);
df = mce.fit_transform(df)

X = df[df.keys()]
X = X.drop(['Enquiry Status'],1)
X = X.drop(['Enquired'],1)
y = df[target]

In [8]:
from sklearn.model_selection import train_test_split

# split data and labels into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                     test_size=0.3,
                     random_state=0,
                     stratify=y)

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.learning_curve import learning_curve
from sklearn.pipeline import make_pipeline

pipe_svm = make_pipeline(StandardScaler(),PCA(n_components=2),LogisticRegression(random_state=1))

pipe_svm.fit(X_train, y_train)
y_svm_pred = pipe_svm.predict(X_test)
print('Test Accuracy: %.3f' % pipe_svm.score(X_test, y_test))



Test Accuracy: 0.855


In [10]:
from sklearn.ensemble import RandomForestClassifier

pipe_rfc = make_pipeline(StandardScaler(),PCA(n_components=2),RandomForestClassifier(random_state=1))

pipe_rfc.fit(X_train, y_train)
y_rfc_pred = pipe_rfc.predict(X_test)

In [11]:
from sklearn import cross_validation
from sklearn.metrics import classification_report


print("LogisticRegression")
scores = cross_validation.cross_val_score(pipe_svm, X, y, cv=3)
print("cross_validation: {:.2f}".format(np.mean(scores, axis=0)))
print(classification_report(y_test, y_svm_pred))


print("RandomForest")
scores = cross_validation.cross_val_score(pipe_rfc, X, y, cv=3)
print("cross_validation: {:.2f}".format(np.mean(scores, axis=0)))
print(classification_report(y_test, y_rfc_pred))

LogisticRegression
cross_validation: 0.85
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      1346
          1       0.85      1.00      0.92      7916

avg / total       0.73      0.85      0.79      9262

RandomForest


  'precision', 'predicted', average, warn_for)


cross_validation: 0.81
             precision    recall  f1-score   support

          0       0.52      0.32      0.39      1346
          1       0.89      0.95      0.92      7916

avg / total       0.84      0.86      0.84      9262



In [12]:
from sklearn.externals import joblib
joblib.dump(pipe_svm, 'app/models/svmpipeline.pkl')

model_columns = list(X.columns)
joblib.dump(model_columns, 'app/models/svm_model_columns.pkl')

['app/models/svm_model_columns.pkl']