In [4]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import glob
import warnings
from IPython.display import display

warnings.filterwarnings('ignore')
mpl.rcParams['savefig.dpi'] = 128
mpl.rcParams['figure.dpi'] = 128
# Plot size to 14" x 7"
mpl.rc('figure', figsize = (14, 7))
# Font size to 14
mpl.rc('font', size = 14)
# Do not display top and right frame lines
mpl.rc('axes.spines', top = False, right = False)
# Remove grid lines
mpl.rc('axes', grid = False)
# Set backgound color to white
mpl.rc('axes', facecolor = 'white')

In [5]:
df = pd.read_csv("2009-2017.csv")

In [6]:
df.shape

(268639, 9)

In [7]:
print("Shape of data frame: {}".format(df.shape))
print("Keys of enquiries_dataset: \n{}".format(df.keys()))
print("data ytpes of enquiries_dataset: \n{}".format(df.dtypes))

Shape of data frame: (268639, 9)
Keys of enquiries_dataset: 
Index(['classification', 'loan amount', 'loan reason', 'property use',
       'enquiry status', 'month', 'day', 'hour', 'weekday'],
      dtype='object')
data ytpes of enquiries_dataset: 
classification    object
loan amount        int64
loan reason       object
property use      object
enquiry status    object
month              int64
day                int64
hour               int64
weekday            int64
dtype: object


In [8]:
test = df[df['enquiry status'] == 'Rejected']
test.shape

(218370, 9)

In [9]:
test = df[df['enquiry status'] == 'Accepted']
test.shape

(50269, 9)

In [10]:
from sklearn.utils import resample

df_majority = df[df['enquiry status'] == 'Rejected']
df_minority = df[df['enquiry status'] == 'Accepted']


# Upsample minority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,     # sample with replacement
                                 n_samples=50269,    # to match majority class
                                 random_state=123) # reproducible results



# Combine majority class with upsampled minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled['enquiry status'].value_counts()

Rejected    50269
Accepted    50269
Name: enquiry status, dtype: int64

In [11]:
df = df_downsampled

In [12]:
print("Shape of new data frame: {}".format(df.shape))

Shape of new data frame: (100538, 9)


In [13]:
print("data ytpes of enquiries_dataset: \n{}".format(df.dtypes))

data ytpes of enquiries_dataset: 
classification    object
loan amount        int64
loan reason       object
property use      object
enquiry status    object
month              int64
day                int64
hour               int64
weekday            int64
dtype: object


In [14]:
target = 'enquiry status';

X = df[df.keys()]
X = df.loc[:,df.columns != target]
y = df[target]

In [15]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

In [16]:
category_column = X.select_dtypes(include='object')

In [17]:
from sklearn.preprocessing import LabelEncoder

transform_dict = {}
for col in category_column.columns:
    le = preprocessing.LabelEncoder()
    X[col] = le.fit_transform(X[col])
    transform_dict[col] = dict(zip(le.classes_, le.transform(le.classes_)))

In [18]:
inverse_transform_dict = {}
for col, d in transform_dict.items():
    inverse_transform_dict[col] = {v:k for k, v in d.items()}

In [19]:
X.shape

(100538, 8)

In [20]:
# Create correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

In [21]:
to_drop

[]

In [22]:
from sklearn.model_selection import train_test_split

# split data and labels into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                     test_size=0.4,
                     random_state=0,
                     stratify=y)

## Selecting Best Models Using Exhaustive Search

In [23]:
# from sklearn.model_selection import GridSearchCV
# from sklearn import linear_model

# # Create logistic regression
# logistic = linear_model.LogisticRegression()

# # Create range of candidate penalty hyperparameter values
# penalty = ['l1', 'l2']

# # Create range of candidate regularization hyperparamet values
# C = np.logspace(0, 4, 10)

# # Create dictionary hyperparameter candidates
# hyperparameters = dict(C=C, penalty=penalty)

# # Create grid search
# gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

# # Fit grid search
# best_model = gridsearch.fit(X, y)

In [24]:
# # View best hyperparameters
# print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
# print('Best C:', best_model.best_estimator_.get_params()['C'])

## Creating A Baseline Classification Model

In [25]:
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Create dummy classifer
# dummy = DummyClassifier(strategy='uniform', random_state=1)
pipe_dummy = make_pipeline(StandardScaler(),
                       DummyClassifier(strategy='uniform', random_state=1))

# "Train" model
pipe_dummy.fit(X_train, y_train)

# Get accuracy score
pipe_dummy.score(X_test, y_test)

0.50096976327829723

In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

pipe_rf = make_pipeline(StandardScaler(),
                        RandomForestClassifier(n_estimators=1000,random_state=1, n_jobs=-1))

pipe_rf.fit(X_train, y_train)
rf_label = pipe_rf.predict(X_test)
print('Test Accuracy: %.3f' % pipe_rf.score(X_test, y_test))

Test Accuracy: 0.745


In [27]:
# from sklearn.model_selection import learning_curve

# train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_rf,
#                                                         X=X_train,y=y_train,train_sizes=np.linspace(0.1, 1.0, 10),
#                                                         cv=10,n_jobs=1)
# train_mean = np.mean(train_scores,axis=1)
# train_std = np.std(train_scores, axis=1)      
# test_mean = np.mean(test_scores, axis=1)
# test_std = np.std(test_scores, axis=1)


# plt.plot(train_sizes, train_mean,color='blue', marker='o',markersize=5,label='training accuracy')
# plt.fill_between(train_sizes,train_mean + train_std,train_mean - train_std,alpha=0.15, color='blue')
# plt.plot(train_sizes, test_mean,color='green', linestyle='--',marker='s', markersize=5,label='validation accuracy')
# plt.fill_between(train_sizes,test_mean + test_std,test_mean - test_std,alpha=0.15, color='green')
# plt.grid()
# plt.xlabel('Number of training samples')
# plt.ylabel('Accuracy')               
# plt.legend(loc='lower right')
# plt.ylim([0.8, 1.0])
# plt.show()

In [28]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn import cross_validation

scores = cross_validation.cross_val_score(pipe_rf, X, y, cv=5)
print("Random forest cross_validation: {:.2f}".format(np.mean(scores, axis=0)))

print("Random forest")
print(classification_report(y_test, rf_label))



Random forest cross_validation: 0.68
Random forest
             precision    recall  f1-score   support

          0       0.71      0.82      0.76     20108
          1       0.79      0.67      0.73     20108

avg / total       0.75      0.74      0.74     40216



In [29]:
prob_y_2 = pipe_rf.predict_proba(X)
prob_y_2 = [p[1] for p in prob_y_2]
print( roc_auc_score(y, prob_y_2) )

0.969317593181


In [30]:
print( np.unique( rf_label ) )

[0 1]


In [31]:
# from sklearn.ensemble import RandomForestClassifier

# from sklearn import metrics

# # rescale data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# rfc = RandomForestClassifier(n_estimators=500,random_state=1)
# rfc.fit(X_train_scaled, y_train)
# pred_forest = rfc.predict(X_test)

# print("Test score: {:.2f}".format(rfc.score(X_test_scaled, y_test)))


In [32]:
# from sklearn.model_selection import learning_curve
# train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_rf,
#                                                         X=X_train,y=y_train,train_sizes=np.linspace(0.1, 1.0, 10),
#                                                         cv=10,n_jobs=1,
#                                                        verbose=1)
# train_mean = np.mean(train_scores,axis=1)
# train_std = np.std(train_scores, axis=1)      
# test_mean = np.mean(test_scores, axis=1)
# test_std = np.std(test_scores, axis=1)


# plt.plot(train_sizes, train_mean,color='blue', marker='o',markersize=5,label='training accuracy')
# plt.fill_between(train_sizes,train_mean + train_std,train_mean - train_std,alpha=0.15, color='blue')
# plt.plot(train_sizes, test_mean,color='green', linestyle='--',marker='s', markersize=5,label='validation accuracy')
# plt.fill_between(train_sizes,test_mean + test_std,test_mean - test_std,alpha=0.15, color='green')
# plt.grid()
# plt.xlabel('Number of training samples')
# plt.ylabel('Accuracy')               
# plt.legend(loc='lower right')
# plt.ylim([0.8, 1.0])
# plt.show()

In [33]:
# plt.savefig('myfig.png')

In [34]:
classifer = pipe_rf.steps[1]

In [35]:
feat_labels = X.columns[0:]
importances = pipe_rf.steps[1][1].feature_importances_

# reverse the list
indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,feat_labels[indices[f]],importances[indices[f]]))
    
# plt.title('Feature Importance')
# plt.bar(range(X_train.shape[1]),importances[indices],align='center')
# plt.xticks(range(X_train.shape[1]),feat_labels[indices], rotation=90)
# plt.xlim([-1, X_train.shape[1]])
# plt.tight_layout()
# plt.show()

 1) loan amount                    0.360307
 2) day                            0.140194
 3) hour                           0.127441
 4) classification                 0.111714
 5) month                          0.095004
 6) weekday                        0.066136
 7) loan reason                    0.065642
 8) property use                   0.033563


In [36]:
# classifer.oob_score_

In [37]:
# import pickle
# import os

# dest = os.path.join('movieclassifier', 'pkl_objects')
# if not os.path.exists(dest):
#     os.makedirs(dest)

# pickle.dump(stop,open(os.path.join(dest, 'stopwords.pkl'),'wb'),protocol=4)
# pickle.dump(clf,
            
# from sklearn.externals import joblib
# joblib.dump(pipe_rf, 'app/models/classifier.pkl')

In [38]:
from sklearn.externals import joblib
joblib.dump(pipe_rf, 'app/models/classifier.pkl')

['app/models/classifier.pkl']

In [39]:
model_columns = list(X.columns)
joblib.dump(model_columns, 'app/models/model_columns.pkl')

['app/models/model_columns.pkl']

In [40]:
joblib.dump(transform_dict, 'app/models/transform_dict.pkl')

['app/models/transform_dict.pkl']

In [41]:
clf = joblib.load('app/models/classifier.pkl')
model_columns = joblib.load('app/models/model_columns.pkl')
encoder = joblib.load('app/models/encoder.pkl')

In [42]:
import json
my_json_string = json.dumps({
    "classification": "REA",
    "marketing_code": "OME",
    "enquired":"29/5/15 17:10",
    "loan amount": "2010000",
    "property_use": "Residence",
    "loan_reason": "Refinance"
})
my_json_string

'{"classification": "REA", "marketing_code": "OME", "enquired": "29/5/15 17:10", "loan amount": "2010000", "property_use": "Residence", "loan_reason": "Refinance"}'

In [43]:
# test = pd.read_json(my_json_string, orient='index')
# data = pd.read_json(my_json_string, typ='series',orient='index')
# data = pd.DataFrame(data=data)
data = json.loads(my_json_string)
data = pd.DataFrame(data,index=[0])

In [44]:
# def transform_cols(df):

#     return df

In [45]:
# transform_cols(data)

In [61]:
def transform(df): 
    df.columns = map(str.lower, df.columns)
    df.columns = df.columns.str.replace('_', ' ')
    
    if 'loan amount' in df.columns: 
        df['loan amount'] = df['loan amount'].astype('float')
        
    if 'enquired' in df.columns: 
        df['enquired'] = pd.DatetimeIndex(df['enquired'])
        df['year'] = df['enquired'].dt.year
        df['month'] = df['enquired'].dt.month
        df['day'] = df['enquired'].dt.day
        df['hour'] = df['enquired'].dt.hour
        df['weekday'] = df['enquired'].dt.dayofweek
    
    if 'post code' in df.columns: 
        df['post code'] = df['post code'].astype('int')
    
    if 'enquired'in df.columns:
        df.drop(['enquired'], axis = 1, inplace = True)
    
    return df.replace(encoder)

In [62]:
data = transform(data)

In [63]:
def cleanFeatures(data) :
    for col in model_columns: 
        if col not in data.columns:
            data[col] = 0

#  test any error column in test data
for col in data.columns: 
    if col not in model_columns:
        data.drop(col, axis = 1, inplace = True)

In [64]:
# cleanFeatures(data)
data

Unnamed: 0,classification,loan amount,loan reason,property use,month,day,hour,weekday
0,25,304000.0,4,2,12,6,21,0
1,7,250000.0,0,2,1,6,10,3
3,25,200000.0,0,2,8,15,10,0
5,7,295554.0,1,2,3,15,21,3
7,7,318988.0,0,2,7,25,22,2
8,25,350000.0,0,2,8,9,19,3
9,31,210000.0,4,2,8,20,17,0
10,7,337090.0,0,2,9,6,19,3
12,7,525329.0,0,2,9,18,16,1
13,22,333575.0,4,2,9,24,16,0


In [65]:
prediction = clf.predict(data)

In [66]:
prediction

array([0, 0, 0, ..., 0, 0, 0])

In [67]:
print("posibility is: {}".format(np.max(clf.predict_proba(data))))

posibility is: 0.998


In [68]:
data = test.loc[:,test.columns != target]

In [69]:
data = transform(data)

In [73]:
data['property use'].value_counts()

2    39547
0     8807
1     1915
Name: property use, dtype: int64

In [57]:
# cleanFeatures(data)

In [58]:
data.shape

(50269, 8)

In [59]:
model_columns

['classification',
 'loan amount',
 'loan reason',
 'property use',
 'month',
 'day',
 'hour',
 'weekday']

In [74]:
prediction = clf.predict(data)

In [75]:
prediction

array([0, 0, 0, ..., 0, 0, 0])

In [76]:
proba = []
result = clf.predict_proba(data)
for row in range(len(result)):
    proba.append(np.max(result[row]))

In [77]:
data = test.loc[:,test.columns != target]

In [80]:
data["prediction"] = prediction
data["re"] = proba

In [81]:
data[data['prediction'] == 1]

Unnamed: 0,classification,loan amount,loan reason,property use,month,day,hour,weekday,prediction,re
7,Direct,318988,Buying Again,Residence,7,25,22,2,1,0.645000
10,Direct,337090,Buying Again,Residence,9,6,19,3,1,0.547000
29,Direct,237100,First HomeBuyer,Residence,12,1,16,5,1,0.603000
30,Direct,0,Buying Again,Residence,12,4,22,1,1,0.519000
31,Direct,340000,First HomeBuyer,Residence,12,5,17,2,1,0.570000
41,Direct,260000,Buying Again,Residence,12,30,13,6,1,0.660000
114,Google AdWords,62550,Buying Again,Investment,2,23,21,5,1,0.524000
133,Direct,197000,Buying Again,Residence,3,3,22,6,1,0.528000
160,Direct,551000,First HomeBuyer,Residence,3,11,21,0,1,0.562000
185,Google AdWords,364662,First HomeBuyer,Residence,3,15,18,4,1,0.512000
