In [442]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import glob
import warnings
from IPython.display import display

warnings.filterwarnings('ignore')
mpl.rcParams['savefig.dpi'] = 128
mpl.rcParams['figure.dpi'] = 128
# Plot size to 14" x 7"
mpl.rc('figure', figsize = (14, 7))
# Font size to 14
mpl.rc('font', size = 14)
# Do not display top and right frame lines
mpl.rc('axes.spines', top = False, right = False)
# Remove grid lines
mpl.rc('axes', grid = False)
# Set backgound color to white
mpl.rc('axes', facecolor = 'white')

In [443]:
df = pd.read_csv("2009-2017.csv")
df.shape

(268639, 9)

In [444]:
from sklearn.utils import resample

df_majority = df[df['enquiry status'] == 'Rejected']
df_minority = df[df['enquiry status'] == 'Accepted']


# Upsample minority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,     # sample with replacement
                                 n_samples=50269,    # to match majority class
                                 random_state=123) # reproducible results



# Combine majority class with upsampled minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled['enquiry status'].value_counts()

Accepted    50269
Rejected    50269
Name: enquiry status, dtype: int64

In [445]:
# Separate majority and minority classes
df_majority = df[df['enquiry status'] == 'Rejected']
df_minority = df[df['enquiry status'] == 'Accepted']
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=218370,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled['enquiry status'].value_counts()

Rejected    218370
Accepted    218370
Name: enquiry status, dtype: int64

In [446]:
df = df_downsampled

In [447]:
target = 'enquiry status';

X = df[df.keys()]
X = df.loc[:,df.columns != target]
y = df[target]

In [448]:
category_column = X.select_dtypes(include='object')

In [449]:
from sklearn import preprocessing

transform_dict = {}
for col in category_column.columns:
    le = preprocessing.LabelEncoder()
    X[col] = le.fit_transform(X[col])
    transform_dict[col] = dict(zip(le.classes_, le.transform(le.classes_)))
    

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

In [450]:
from sklearn.preprocessing import StandardScaler

feat_labels = X.columns[0:]
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [451]:
# from sklearn.model_selection import GridSearchCV
# from sklearn import linear_model
# from sklearn.ensemble import RandomForestClassifier

# # Create logistic regression
# randomForest = RandomForestClassifier()

# # Create range of candidate penalty hyperparameter values
# n_estimators = [100, 500, 1000]
# max_features = [3, 5, 8]

# # Create dictionary hyperparameter candidates
# hyperparameters = dict(n_estimators=n_estimators, max_features=max_features)

# # Create grid search
# gridsearch = GridSearchCV(randomForest, hyperparameters, cv=5, verbose=0 , n_jobs=-1)

# # Fit grid search
# best_model = gridsearch.fit(X, y)

In [452]:
# # View best hyperparameters
# print('Best n_estimators:', best_model.best_estimator_.get_params()['n_estimators'])
# print('Best max_features:', best_model.best_estimator_.get_params()['max_features'])

In [453]:
from sklearn.model_selection import train_test_split

# split data and labels into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                     test_size=0.4,
                     random_state=0,
                     stratify=y)

In [454]:
from sklearn.ensemble import RandomForestClassifier

randomForest = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1)
randomForest.fit(X_train, y_train)
rf_label = randomForest.predict(X_test)
print('Test Accuracy: %.3f' % randomForest.score(X_test, y_test))

Test Accuracy: 0.743


In [455]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn import cross_validation

scores = cross_validation.cross_val_score(randomForest, X, y, cv=5)
print("Random forest cross_validation: {:.2f}".format(np.mean(scores, axis=0)))

print("Random forest")
print(classification_report(y_test, rf_label))

Random forest cross_validation: 0.68
Random forest
             precision    recall  f1-score   support

          0       0.71      0.82      0.76     20108
          1       0.78      0.67      0.72     20108

avg / total       0.75      0.74      0.74     40216



In [456]:
prob_y_2 = randomForest.predict_proba(X)
prob_y_2 = [p[1] for p in prob_y_2]
print( roc_auc_score(y, prob_y_2) )

0.967454760776


In [457]:
# feat_labels = X.columns[0:]
importances = randomForest.feature_importances_

# reverse the list
indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,feat_labels[indices[f]],importances[indices[f]]))

 1) loan amount                    0.367462
 2) day                            0.141022
 3) hour                           0.125577
 4) classification                 0.111818
 5) month                          0.093629
 6) weekday                        0.066603
 7) loan reason                    0.065295
 8) property use                   0.028593


In [510]:
from sklearn.externals import joblib
encoder = joblib.load('app/models/encoder.pkl')

In [511]:
import json
my_json_string = json.dumps({
#     'marketing_code':'P0001',
    "classification": "Direct",
    "enquired":"29/1/15 12:10",
    "loan amount": "1184000",
    "property_use": "Residence",
    "loan_reason": "Refinance"
})

In [512]:
data = json.loads(my_json_string)
data = pd.DataFrame(data,index=[0])

In [513]:
def transform_cols(df):
    df.columns = map(str.lower, df.columns)
    df.columns = df.columns.str.replace('_', ' ')
    return df

In [514]:
transform_cols(data)

Unnamed: 0,classification,enquired,loan amount,loan reason,property use
0,Direct,29/1/15 12:10,1184000,Refinance,Residence


In [515]:
data.dtypes

classification    object
enquired          object
loan amount       object
loan reason       object
property use      object
dtype: object

In [516]:
def transform(df): 
    df['loan amount'] = df['loan amount'].astype('float')
    df['enquired'] = pd.DatetimeIndex(df['enquired'])
    df['month'] = df['enquired'].dt.month
    df['day'] = df['enquired'].dt.day
    df['hour'] = df['enquired'].dt.hour
    df['weekday'] = df['enquired'].dt.dayofweek
    
    if 'post code' in df.columns: 
        df['post code'] = df['post code'].astype('int')
    
    if 'enquired'in df.columns:
        df.drop(['enquired'], axis = 1, inplace = True)

    return df.replace(encoder)

In [517]:
data = transform(data)
data.dtypes

classification      int64
loan amount       float64
loan reason         int64
property use        int64
month               int64
day                 int64
hour                int64
weekday             int64
dtype: object

In [518]:
data

Unnamed: 0,classification,loan amount,loan reason,property use,month,day,hour,weekday
0,7,1184000.0,4,2,1,29,12,3


In [519]:
randomForest.predict(data)

array([0])

In [520]:
print("posibility is: {}".format(np.max(randomForest.predict_proba(data))))

posibility is: 0.57


In [521]:
df = pd.read_csv("2009-2017_origin.csv")

In [522]:
df['classification'].value_counts()

Domain                         47775
Google AdWords                 26835
Direct                         25516
Coregistration                 21864
REA                            19767
Alternative Media              19131
Affiliate                      17338
None                           14050
Omnilead                       12845
Home Sales                      8089
All Homes                       7678
Partners                        5800
First Home Buyers               5516
Domain Campaigns                5171
Phone                           4731
Email eChoice Database          4357
Finder                          3971
Domain Internal CBA Program     3620
Domain Campaigns CPA            1975
Domain Competitions             1791
External Email                  1275
Folio                           1173
Dynamic Home Loans              1093
Retention                        901
Refinancing AdWords              893
Portals                          775
Portal Campaigns                 735
M

In [523]:
test = df[df['enquiry status'] == 'Accepted']
# test = test[test['classification'] == 'Domain']
test = test.loc[:,test.columns != 'enquiry status']

In [524]:
test = transform(test)

In [525]:
# test

In [526]:
prediction = randomForest.predict(test)

In [533]:
proba = []
result = randomForest.predict_proba(test)
for row in range(len(result)):
    proba.append(np.max(result[row]))

In [534]:
test["prediction"] = prediction
test["proba"] = proba

In [535]:
test.sort_values(by=['proba'],ascending=False)

Unnamed: 0,classification,loan amount,loan reason,property use,month,day,hour,weekday,prediction,proba
94476,0,100000.00,4,0,5,1,0,4,0,0.81
50243,2,472137.34,1,2,2,18,0,1,0,0.80
259581,2,360800.00,1,2,6,6,0,0,0,0.79
61142,2,324000.00,1,2,6,8,0,6,0,0.78
64444,2,324000.00,1,2,6,8,0,6,0,0.78
149772,2,320000.00,1,2,6,7,0,2,0,0.78
90936,2,400000.00,1,2,11,5,0,3,0,0.78
155838,2,320000.00,1,2,6,7,0,2,0,0.78
175363,2,358000.00,1,2,2,23,0,3,0,0.78
3544,0,242000.00,4,2,5,1,15,2,0,0.77


In [542]:
test[test['proba'] < 0.56].count()

classification    2281
loan amount       2281
loan reason       2281
property use      2281
month             2281
day               2281
hour              2281
weekday           2281
prediction        2281
proba             2281
dtype: int64