In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import glob
import warnings
from IPython.display import display

warnings.filterwarnings('ignore')
mpl.rcParams['savefig.dpi'] = 128
mpl.rcParams['figure.dpi'] = 128
# Plot size to 14" x 7"
mpl.rc('figure', figsize = (14, 7))
# Font size to 14
mpl.rc('font', size = 14)
# Do not display top and right frame lines
mpl.rc('axes.spines', top = False, right = False)
# Remove grid lines
mpl.rc('axes', grid = False)
# Set backgound color to white
mpl.rc('axes', facecolor = 'white')

In [2]:
df = pd.read_csv("2011-2017_short.csv")

In [3]:
df.shape

(198252, 9)

In [4]:
print("Shape of data frame: {}".format(df.shape))
print("Keys of enquiries_dataset: \n{}".format(df.keys()))
print("data ytpes of enquiries_dataset: \n{}".format(df.dtypes))

Shape of data frame: (198252, 9)
Keys of enquiries_dataset: 
Index(['marketing code', 'loan amount', 'loan reason', 'property use',
       'enquiry status', 'month', 'day', 'hour', 'weekday'],
      dtype='object')
data ytpes of enquiries_dataset: 
marketing code     object
loan amount       float64
loan reason        object
property use       object
enquiry status     object
month               int64
day                 int64
hour                int64
weekday             int64
dtype: object


In [5]:
test = df[df['enquiry status'] == 'Rejected']
test.shape

(156169, 9)

In [6]:
test = df[df['enquiry status'] == 'Accepted']
test.shape

(42083, 9)

In [7]:
from sklearn.utils import resample

df_majority = df[df['enquiry status'] == 'Rejected']
df_minority = df[df['enquiry status'] == 'Accepted']


# Upsample minority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,     # sample with replacement
                                 n_samples=42084,    # to match majority class
                                 random_state=123) # reproducible results



# Combine majority class with upsampled minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled['enquiry status'].value_counts()

Rejected    42084
Accepted    42083
Name: enquiry status, dtype: int64

In [8]:
df = df_downsampled

In [9]:
print("Shape of new data frame: {}".format(df.shape))

Shape of new data frame: (84167, 9)


In [10]:
print("data ytpes of enquiries_dataset: \n{}".format(df.dtypes))

data ytpes of enquiries_dataset: 
marketing code     object
loan amount       float64
loan reason        object
property use       object
enquiry status     object
month               int64
day                 int64
hour                int64
weekday             int64
dtype: object


In [11]:
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col].astype(str))
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [12]:
target = 'enquiry status';
# df = df.iloc[:number_of_rows]
X = df[df.keys()]
X = df.loc[:,df.columns != target]

le = LabelEncoder()
y = df[target]
y = le.fit_transform(y)
le.classes_

array(['Accepted', 'Rejected'], dtype=object)

In [13]:
category_column = X.select_dtypes(include='object')

In [14]:
# from sklearn import preprocessing

# X = MultiColumnLabelEncoder(columns = category_column.columns).fit_transform(X)
# X.dtypes

In [15]:
# indexes = [df.columns.get_loc(c) for c in df.columns if c in category_column.columns ]
# indexes

In [16]:
# enc = preprocessing.OneHotEncoder(categorical_features=indexes)
# test = enc.fit_transform(X)

# test

In [17]:
# X = MultiColumnLabelEncoder(columns = category_column.columns).fit_transform(X)
X = pd.get_dummies(X, columns=category_column.columns)

In [18]:
X.shape

(84167, 788)

In [19]:
from sklearn.model_selection import train_test_split

# split data and labels into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                     test_size=0.4,
                     random_state=0,
                     stratify=y)

In [20]:
# from sklearn import svm
# from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
# from sklearn.pipeline import make_pipeline

# pipe_svm = make_pipeline(StandardScaler(),
#                         svm.SVC())

# pipe_svm.fit(X_train, y_train)
# svm_label = pipe_svm.predict(X_test)

In [21]:
# print( np.unique( svm_label ) )

In [22]:
# from sklearn.model_selection import learning_curve

# train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_rf,
#                                                         X=X_train,y=y_train,train_sizes=np.linspace(0.1, 1.0, 10),
#                                                         cv=10,n_jobs=1)
# train_mean = np.mean(train_scores,axis=1)
# train_std = np.std(train_scores, axis=1)      
# test_mean = np.mean(test_scores, axis=1)
# test_std = np.std(test_scores, axis=1)


# plt.plot(train_sizes, train_mean,color='blue', marker='o',markersize=5,label='training accuracy')
# plt.fill_between(train_sizes,train_mean + train_std,train_mean - train_std,alpha=0.15, color='blue')
# plt.plot(train_sizes, test_mean,color='green', linestyle='--',marker='s', markersize=5,label='validation accuracy')
# plt.fill_between(train_sizes,test_mean + test_std,test_mean - test_std,alpha=0.15, color='green')
# plt.grid()
# plt.xlabel('Number of training samples')
# plt.ylabel('Accuracy')               
# plt.legend(loc='lower right')
# plt.ylim([0.8, 1.0])
# plt.show()

In [23]:
# from sklearn import cross_validation
# from sklearn.metrics import classification_report
# from sklearn.metrics import roc_auc_score

# scores = cross_validation.cross_val_score(pipe_svm, X, y, cv=5)
# print("SVM cross_validation: {:.2f}".format(np.mean(scores, axis=0)))

# print("Report\n")
# print(classification_report(y_test, svm_label))

In [24]:
# prob_y_2 = pipe_lr.predict_proba(X)
# prob_y_2 = [p[1] for p in prob_y_2]
# print( roc_auc_score(y, prob_y_2) )

In [25]:
# from sklearn.model_selection import learning_curve

# train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr,
#                                                         X=X_train,y=y_train,train_sizes=np.linspace(0.1, 1.0, 10),
#                                                         cv=10,n_jobs=1)
# train_mean = np.mean(train_scores,axis=1)
# train_std = np.std(train_scores, axis=1)      
# test_mean = np.mean(test_scores, axis=1)
# test_std = np.std(test_scores, axis=1)


# plt.plot(train_sizes, train_mean,color='blue', marker='o',markersize=5,label='training accuracy')
# plt.fill_between(train_sizes,train_mean + train_std,train_mean - train_std,alpha=0.15, color='blue')
# plt.plot(train_sizes, test_mean,color='green', linestyle='--',marker='s', markersize=5,label='validation accuracy')
# plt.fill_between(train_sizes,test_mean + test_std,test_mean - test_std,alpha=0.15, color='green')
# plt.grid()
# plt.xlabel('Number of training samples')
# plt.ylabel('Accuracy')               
# plt.legend(loc='lower right')
# plt.ylim([0.8, 1.0])
# plt.show()

In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

pipe_rf = make_pipeline(StandardScaler(),
                        RandomForestClassifier(n_estimators=500,random_state=1,class_weight="balanced"))

pipe_rf.fit(X_train, y_train)
rf_label = pipe_rf.predict(X_test)
# print('Test Accuracy: %.3f' % pipe_rf.score(X_test, y_test))

In [27]:
# from sklearn.model_selection import learning_curve

# train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_rf,
#                                                         X=X_train,y=y_train,train_sizes=np.linspace(0.1, 1.0, 10),
#                                                         cv=10,n_jobs=1)
# train_mean = np.mean(train_scores,axis=1)
# train_std = np.std(train_scores, axis=1)      
# test_mean = np.mean(test_scores, axis=1)
# test_std = np.std(test_scores, axis=1)


# plt.plot(train_sizes, train_mean,color='blue', marker='o',markersize=5,label='training accuracy')
# plt.fill_between(train_sizes,train_mean + train_std,train_mean - train_std,alpha=0.15, color='blue')
# plt.plot(train_sizes, test_mean,color='green', linestyle='--',marker='s', markersize=5,label='validation accuracy')
# plt.fill_between(train_sizes,test_mean + test_std,test_mean - test_std,alpha=0.15, color='green')
# plt.grid()
# plt.xlabel('Number of training samples')
# plt.ylabel('Accuracy')               
# plt.legend(loc='lower right')
# plt.ylim([0.8, 1.0])
# plt.show()

In [28]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn import cross_validation

scores = cross_validation.cross_val_score(pipe_rf, X, y, cv=5)
print("Random forest cross_validation: {:.2f}".format(np.mean(scores, axis=0)))

print("Random forest")
print(classification_report(y_test, rf_label))



Random forest cross_validation: 0.65
Random forest
             precision    recall  f1-score   support

          0       0.73      0.77      0.75     16833
          1       0.75      0.72      0.74     16834

avg / total       0.74      0.74      0.74     33667



In [29]:
prob_y_2 = pipe_rf.predict_proba(X)
prob_y_2 = [p[1] for p in prob_y_2]
print( roc_auc_score(y, prob_y_2) )

0.963785648497


In [30]:
print( np.unique( rf_label ) )

[0 1]


In [31]:
# from sklearn.ensemble import RandomForestClassifier

# from sklearn import metrics

# # rescale data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# rfc = RandomForestClassifier(n_estimators=500,random_state=1)
# rfc.fit(X_train_scaled, y_train)
# pred_forest = rfc.predict(X_test)

# print("Test score: {:.2f}".format(rfc.score(X_test_scaled, y_test)))


In [32]:
# train_sizes, train_scores, test_scores = learning_curve(estimator=rfc,
#                                                         X=X_train,y=y_train,train_sizes=np.linspace(0.1, 1.0, 10),
#                                                         cv=10,n_jobs=1)
# train_mean = np.mean(train_scores,axis=1)
# train_std = np.std(train_scores, axis=1)      
# test_mean = np.mean(test_scores, axis=1)
# test_std = np.std(test_scores, axis=1)


# plt.plot(train_sizes, train_mean,color='blue', marker='o',markersize=5,label='training accuracy')
# plt.fill_between(train_sizes,train_mean + train_std,train_mean - train_std,alpha=0.15, color='blue')
# plt.plot(train_sizes, test_mean,color='green', linestyle='--',marker='s', markersize=5,label='validation accuracy')
# plt.fill_between(train_sizes,test_mean + test_std,test_mean - test_std,alpha=0.15, color='green')
# plt.grid()
# plt.xlabel('Number of training samples')
# plt.ylabel('Accuracy')               
# plt.legend(loc='lower right')
# plt.ylim([0.8, 1.0])
# plt.show()

In [33]:
pipe_rf.steps[1]

('randomforestclassifier',
 RandomForestClassifier(bootstrap=True, class_weight='balanced',
             criterion='gini', max_depth=None, max_features='auto',
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_jobs=1, oob_score=False, random_state=1,
             verbose=0, warm_start=False))

In [34]:
feat_labels = X.columns[0:]
importances = pipe_rf.steps[1][1].feature_importances_

# reverse the list
indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,feat_labels[indices[f]],importances[indices[f]]))
    
# plt.title('Feature Importance')
# plt.bar(range(X_train.shape[1]),importances[indices],align='center')
# plt.xticks(range(X_train.shape[1]),feat_labels[indices], rotation=90)
# plt.xlim([-1, X_train.shape[1]])
# plt.tight_layout()
# plt.show()

 1) loan amount                    0.240895
 2) day                            0.125211
 3) hour                           0.117649
 4) month                          0.089093
 5) weekday                        0.063861
 6) loan reason_Refinance With Cash Out 0.014875
 7) loan reason_First HomeBuyer    0.013633
 8) loan reason_Buying Again       0.013628
 9) property use_Residence         0.012894
10) marketing code_A9452           0.010644
11) property use_Investment        0.009421
12) loan reason_Refinance          0.009094
13) marketing code_P0001           0.008499
14) marketing code_IPH             0.007859
15) marketing code_OME             0.007114
16) property use_Residence,Investment 0.006688
17) marketing code_A119            0.006333
18) marketing code_A5981           0.004863
19) marketing code_A9432           0.004706
20) loan reason_Buying Again,Refinance 0.004612
21) marketing code_A9234           0.004301
22) marketing code_DOM35           0.004151
23) marketing code_D

539) marketing code_A7031           0.000024
540) marketing code_A9362           0.000024
541) marketing code_A9229           0.000024
542) marketing code_P00022          0.000023
543) loan reason_Loan Topup         0.000023
544) marketing code_A9465           0.000023
545) marketing code_A9230           0.000023
546) marketing code_A5602           0.000023
547) marketing code_A9126           0.000023
548) marketing code_A6965           0.000023
549) marketing code_A5982           0.000023
550) marketing code_DOM03           0.000023
551) marketing code_A9248           0.000022
552) marketing code_A9359           0.000022
553) marketing code_DOM02           0.000022
554) marketing code_A9473           0.000022
555) marketing code_A141            0.000021
556) marketing code_CPLDOM06        0.000021
557) marketing code_A9514.02        0.000021
558) marketing code_A6655.11        0.000021
559) marketing code_EDOM13.17       0.000021
560) marketing code_A7071           0.000021
561) marke

In [35]:
# import pickle
# import os

# dest = os.path.join('movieclassifier', 'pkl_objects')
# if not os.path.exists(dest):
#     os.makedirs(dest)

# pickle.dump(stop,open(os.path.join(dest, 'stopwords.pkl'),'wb'),protocol=4)
# pickle.dump(clf,
            
# from sklearn.externals import joblib
# joblib.dump(pipe_rf, 'app/models/classifier.pkl')

In [36]:
from sklearn.externals import joblib
joblib.dump(pipe_rf, 'app/models/classifier.pkl')

['app/models/classifier.pkl']

In [37]:
model_columns = list(X.columns)
joblib.dump(model_columns, 'app/models/model_columns.pkl')

['app/models/model_columns.pkl']

In [38]:
clf = joblib.load('app/models/classifier.pkl')
model_columns = joblib.load('app/models/model_columns.pkl')

In [113]:
import json
my_json_string = json.dumps({
    "marketing_code": "IPH",
    "enquired":"29/5/15 17:10",
    "loan amount": "530000",
    "property_use": "Residence",
    "loan_reason": "Refinance"
})
my_json_string

'{"marketing_code": "IPH", "enquired": "29/5/15 17:10", "loan amount": "530000", "property_use": "Residence", "loan_reason": "Refinance"}'

In [114]:
# test = pd.read_json(my_json_string, orient='index')
# data = pd.read_json(my_json_string, typ='series',orient='index')
# data = pd.DataFrame(data=data)
data = json.loads(my_json_string)
data = pd.DataFrame(data,index=[0])

In [115]:
def transform_cols(df):
    df.columns = map(str.lower, df.columns)
    df.columns = df.columns.str.replace('_', ' ')
    return df

In [116]:
transform_cols(data)

Unnamed: 0,enquired,loan amount,loan reason,marketing code,property use
0,29/5/15 17:10,530000,Refinance,IPH,Residence


In [117]:
def transform(df): 
    df['loan amount'] = df['loan amount'].astype('float')
    df['enquired'] = pd.DatetimeIndex(df['enquired'])
    df['month'] = df['enquired'].dt.month
    df['day'] = df['enquired'].dt.day
    df['hour'] = df['enquired'].dt.hour
    df['weekday'] = df['enquired'].dt.dayofweek
    
    if 'post code' in df.columns: 
        df['post code'] = df['post code'].astype('int')
    
    if 'enquired'in df.columns:
        df.drop(['enquired'], axis = 1, inplace = True)

In [118]:
transform(data)

In [119]:
data = pd.get_dummies(data)

data

Unnamed: 0,loan amount,month,day,hour,weekday,loan reason_Refinance,marketing code_IPH,property use_Residence
0,530000.0,5,29,17,4,1,1,1


In [120]:
def cleanFeatures(data) :
    for col in model_columns: 
        if col not in data.columns:
            data[col] = 0

#  test any error column in test data
for col in data.columns: 
    if col not in model_columns:
        print(col)

In [121]:
cleanFeatures(data)

In [122]:
prediction = clf.predict(data)

In [123]:
prediction

array([0])

In [124]:
print("posibility is: {}".format(np.max(clf.predict_proba(data))))

posibility is: 0.646


In [125]:
data = test.loc[:,test.columns != target]

In [126]:
data = pd.get_dummies(data)

In [127]:
cleanFeatures(data)

transform(test)

In [128]:
prediction = clf.predict(data)

In [129]:
prediction

array([0, 0, 0, ..., 0, 0, 0])

In [130]:
# print("posibility is: {}".format(np.max(clf.predict_proba(data))))

In [131]:
proba = []
result = clf.predict_proba(data)
for row in range(len(result)):
    proba.append(np.max(result[row]))

In [132]:
data = test.loc[:,test.columns != target]

In [133]:
data["proba"] = proba
data

Unnamed: 0,marketing code,loan amount,loan reason,property use,month,day,hour,weekday,proba
0,2406,250000.00,Buying Again,Residence,1,6,10,3,0.664000
2,A1111,200000.00,Buying Again,Residence,8,15,10,0,0.796000
4,OME,295554.00,First HomeBuyer,Residence,3,15,21,3,0.740000
6,OME,318988.00,Buying Again,Residence,7,25,22,2,0.696000
7,A1111,350000.00,Buying Again,Residence,8,9,19,3,0.746000
8,IPH,210000.00,Refinance With Cash Out,Residence,8,20,17,0,0.566000
9,OME,337090.00,Buying Again,Residence,9,6,19,3,0.708000
11,OME,525329.00,Buying Again,Residence,9,18,16,1,0.734000
12,A9016,333575.00,Refinance,Residence,9,24,16,0,0.750000
13,A7133,360000.00,"Buying Again,Refinance","Residence,Investment",10,3,22,2,0.700000
