In [14]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import glob
import warnings
from IPython.display import display

warnings.filterwarnings('ignore')
mpl.rcParams['savefig.dpi'] = 128
mpl.rcParams['figure.dpi'] = 128
# Plot size to 14" x 7"
mpl.rc('figure', figsize = (14, 7))
# Font size to 14
mpl.rc('font', size = 14)
# Do not display top and right frame lines
mpl.rc('axes.spines', top = False, right = False)
# Remove grid lines
mpl.rc('axes', grid = False)
# Set backgound color to white
mpl.rc('axes', facecolor = 'white')

In [15]:
df = pd.read_csv("2011-2017_short.csv")

In [16]:
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col].astype(str))
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [17]:
category_column = X.select_dtypes(include='object')

In [30]:
df.dtypes

marketing code     object
loan amount       float64
property use       object
enquiry status     object
year                int64
month               int64
day                 int64
hour                int64
weekday             int64
dtype: object

In [18]:
from sklearn.preprocessing import LabelEncoder

target = 'enquiry status';
# df = df.iloc[:number_of_rows]
X = df[df.keys()]
X = df.loc[:,df.columns != target]

le = LabelEncoder()
y = df[target]
y = le.fit_transform(y)

In [23]:
X = MultiColumnLabelEncoder(columns=category_column.columns).fit_transform(X)

marketing code      int64
loan amount       float64
property use        int64
year                int64
month               int64
day                 int64
hour                int64
weekday             int64
dtype: object

In [35]:
from sklearn.covariance import EllipticEnvelope

# Create detector
outlier_detector = EllipticEnvelope(contamination=.1)

# Fit detector
outlier_detector.fit(X)

# Predict outliers
result = outlier_detector.predict(X)

for i in range(len(result)) :
    if result[i] < -1:
        print(i)

In [33]:
# Create one feature
feature = X.iloc[:,1]

# Create a function to return index of outliers
def indicies_of_outliers(x):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))

# Run function
indicies_of_outliers(feature)

(array([    60,    112,    159, ..., 198152, 198266, 198285]),)

In [36]:
from sklearn.cluster import KMeans

# # Create DataFrame
# dataframe = pd.DataFrame(X, columns=["feature_1", "feature_2"])

# Make k-means clusterer
clusterer = KMeans(3, random_state=0)

# Fit clusterer
clusterer.fit(X)

# Predict values
X["group"] = clusterer.predict(X)

In [43]:
X["group"].value_counts()

1    163506
0     34558
2       246
Name: group, dtype: int64

In [46]:
# X[X["group"] == 1]

In [47]:
from sklearn.externals import joblib
model_columns = joblib.load('app/models/model_columns.pkl')

In [48]:
model_columns

['loan amount',
 'year',
 'month',
 'day',
 'hour',
 'weekday',
 'marketing code_1824',
 'marketing code_2405',
 'marketing code_2406',
 'marketing code_2408',
 'marketing code_2673',
 'marketing code_2699',
 'marketing code_2825',
 'marketing code_2826',
 'marketing code_3004',
 'marketing code_3005',
 'marketing code_3007',
 'marketing code_3012',
 'marketing code_3014',
 'marketing code_3035',
 'marketing code_3036',
 'marketing code_446',
 'marketing code_5030',
 'marketing code_6500',
 'marketing code_7001',
 'marketing code_7063',
 'marketing code_A1004',
 'marketing code_A10045',
 'marketing code_A1005',
 'marketing code_A111',
 'marketing code_A1111',
 'marketing code_A1112',
 'marketing code_A112',
 'marketing code_A113',
 'marketing code_A114',
 'marketing code_A119',
 'marketing code_A120',
 'marketing code_A123',
 'marketing code_A125',
 'marketing code_A12665',
 'marketing code_A12666',
 'marketing code_A12667',
 'marketing code_A12668',
 'marketing code_A12669',
 'marketi

In [7]:
# mce = MultiColumnLabelEncoder(columns = encoded_columns);
# df = mce.fit_transform(df)

X = df[df.keys()]
X = df.loc[:,df.columns != target]
y = df[target]

In [10]:
print("Keys of enquiries_dataset: \n{}".format(X.keys()))

Keys of enquiries_dataset: 
Index(['Marketing Code', 'Suburb', 'State', 'Post Code', 'Classification',
       'Loan Amount', 'loan_reason', 'property_use', 'Month', 'Day', 'Hour',
       'Weekday'],
      dtype='object')


In [8]:
from sklearn.model_selection import train_test_split

# split data and labels into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                     test_size=0.3,
                     random_state=0,
                     stratify=y)

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.learning_curve import learning_curve
from sklearn.pipeline import make_pipeline

pipe_svm = make_pipeline(MultiColumnLabelEncoder(columns = encoded_columns),StandardScaler(),
                         PCA(n_components=2),LogisticRegression(random_state=1))


pipe_svm.fit(X_train, y_train)
y_svm_pred = pipe_svm.predict(X_test)
y_svm_train_pred = pipe_svm.predict(X_train)



Pipeline(memory=None,
     steps=[('multicolumnlabelencoder', <__main__.MultiColumnLabelEncoder object at 0x110555e80>), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), (...nalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


KeyError: 'Enquiry Status'

In [None]:
from sklearn.ensemble import RandomForestClassifier

pipe_rfc = make_pipeline(StandardScaler(),PCA(n_components=2),RandomForestClassifier(n_estimators=10,
                                                                                    random_state=1))

pipe_rfc.fit(X_train, y_train)
y_rfc_train_pred = pipe_rfc.predict(X_train)
y_rfc_pred = pipe_rfc.predict(X_test)

In [None]:
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score

print("LogisticRegression")
scores = cross_validation.cross_val_score(pipe_svm, X, y, cv=3)
print("cross_validation: {:.2f}".format(np.mean(scores, axis=0)))
print(classification_report(y_test, y_svm_pred))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_svm_train_pred),r2_score(y_test, y_svm_pred)))

print("RandomForest")
scores = cross_validation.cross_val_score(pipe_rfc, X, y, cv=3)
print("cross_validation: {:.2f}".format(np.mean(scores, axis=0)))
print(classification_report(y_test, y_rfc_pred))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_rfc_train_pred),r2_score(y_test, y_rfc_pred)))

In [None]:
from sklearn.externals import joblib
joblib.dump(pipe_rfc, 'app/models/rfpipeline.pkl')

model_columns = list(X.columns)
joblib.dump(model_columns, 'app/models/rf_model_columns.pkl')