In [9]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import glob
import warnings
from IPython.display import display

# scikit-learn deprecated its vendored joblib (`sklearn.externals.joblib`) in
# 0.21 and removed it in 0.23. Prefer the standalone package and fall back to
# the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Replacement mapping that `transform` passes to `DataFrame.replace`.
# Presumably it maps category labels to numeric codes -- confirm against the
# script that produced the pickle.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load artefacts produced by this project.
encoder = joblib.load('app/models/encoder.pkl')

In [10]:
# Load the raw enquiry data; the path is relative to the notebook's working directory.
df = pd.read_csv("2009-2017.csv")

In [11]:
from sklearn.utils import resample

# Split the enquiries by outcome; 'Rejected' is treated as the majority class.
df_majority = df[df['enquiry status'] == 'Rejected']
df_minority = df[df['enquiry status'] == 'Accepted']

# Downsample the majority class to the size of the minority class. The target
# size is derived from the data instead of being hard-coded, so the two classes
# stay balanced if the input CSV changes.
df_majority_downsampled = resample(df_majority,
                                   replace=False,               # sample without replacement
                                   n_samples=len(df_minority),  # match the minority class
                                   random_state=123)            # reproducible results

# Combine the downsampled majority class with the untouched minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled['enquiry status'].value_counts()

Accepted    50269
Rejected    50269
Name: enquiry status, dtype: int64

In [12]:
# Continue with the class-balanced frame. This only rebinds the name (no copy),
# so `df` and `df_downsampled` refer to the same DataFrame object from here on.
df = df_downsampled

In [13]:
def transform(df, mapping=None):
    """Prepare a raw enquiry frame for the model without modifying the input.

    Processing steps (each one runs only if the column it needs is present):

    * column names are lower-cased and underscores become spaces;
    * ``loan amount`` is cast to float;
    * ``enquired`` is parsed as a timestamp, expanded into ``year``, ``month``,
      ``day``, ``hour`` and ``weekday`` (Monday == 0) columns, then dropped;
    * ``post code`` is cast to int;
    * finally the values are substituted through ``DataFrame.replace``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input. It is copied first, so the caller's frame is left untouched.
    mapping : dict, optional
        Replacement mapping handed to ``DataFrame.replace``. When omitted, the
        module-level ``encoder`` object is used, as before.

    Returns
    -------
    pandas.DataFrame
        A new frame with the steps above applied.

    Raises
    ------
    ValueError or TypeError
        Propagated from ``astype`` / timestamp parsing when a value cannot be
        converted.
    """
    if mapping is None:
        # Default to the mapping loaded at notebook start-up.
        mapping = encoder

    # Work on a copy: the original version renamed and added columns on the
    # caller's frame as a side effect while returning a different object.
    df = df.copy()
    df.columns = df.columns.str.lower().str.replace('_', ' ')

    if 'loan amount' in df.columns:
        df['loan amount'] = df['loan amount'].astype('float')

    if 'enquired' in df.columns:
        df['enquired'] = pd.DatetimeIndex(df['enquired'])
        df['year'] = df['enquired'].dt.year
        df['month'] = df['enquired'].dt.month
        df['day'] = df['enquired'].dt.day
        df['hour'] = df['enquired'].dt.hour
        df['weekday'] = df['enquired'].dt.dayofweek
        # The raw timestamp is not a model feature once it has been decomposed.
        df = df.drop(['enquired'], axis=1)

    if 'post code' in df.columns:
        df['post code'] = df['post code'].astype('int')

    return df.replace(mapping)

In [14]:
# Apply the feature preparation; `df` is rebound to the frame returned by
# `transform` (the result of its final `DataFrame.replace` call).
df = transform(df)

In [15]:
# Name of the label column; every other column is used as a feature.
target = 'enquiry status'

# Features: all columns except the target. Labels: the target column.
# (The earlier `X = df[df.keys()]` was a dead store that copied the whole
# frame only to be overwritten on the next line.)
X = df.loc[:, df.columns != target]
y = df[target]

In [16]:
# Select the feature columns whose dtype is still `object`. Despite the name,
# the result is a DataFrame (a subset of X), not a list of column names.
# NOTE(review): not referenced again in the cells visible here -- confirm
# whether it is still needed.
category_column = X.select_dtypes(include='object')

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows as a test set. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0, stratify=y
)

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Two-step pipeline: standardise the features, then classify with a
# 1000-tree random forest (fixed seed, all CPU cores).
rf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=1000, random_state=1, n_jobs=-1),
)

# Fit on the training partition, then predict the held-out rows.
rf.fit(X_train, y_train)
prediction = rf.predict(X_test)

In [22]:
# Mean accuracy of the fitted pipeline on the held-out test partition.
print('Test Accuracy: %.3f' % rf.score(X_test, y_test))

Test Accuracy: 0.745


In [23]:
# Persist the fitted pipeline (scaler + random forest) under app/models.
# `joblib` is already imported in the notebook's first cell, so the
# `sklearn.externals` import (removed in scikit-learn 0.23) is not repeated here.
joblib.dump(rf, 'app/models/classifier.pkl')

['app/models/classifier.pkl']

In [24]:
# Save the feature column names, in training order, next to the classifier.
model_columns = X.columns.tolist()
joblib.dump(model_columns, 'app/models/model_columns.pkl')

['app/models/model_columns.pkl']