## Lending Tree APR Modeling

In [39]:
import time
import pandas as pd
import numpy as np
from subprocess import call
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

import keras
from keras.layers import Input, Dense, Activation, Dropout, ReLU
from keras.optimizers import Adam, RMSprop, SGD
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.models import Model
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

## Data Preprocessing

In [2]:
%%time
offer_status = pd.read_csv('data/RULO_OfferStatusChange.csv', low_memory=False)
questioner = pd.read_csv('data/RULO_Questioner_Data.csv', low_memory=False)

Wall time: 595 ms


In [3]:
%%time
months= ['JAN','FEB','MAR','APR','MAY','JUNE', 'JULY', 'AUGUST','SEP']
data=None
i = 0;
for month in months:
    filename = "./data/Offer_2018-" + months[i] + ".csv"
    sheet_data = pd.read_csv(filename)
    if data is None:
        data = sheet_data
    else:
        data = pd.concat([data, sheet_data], axis=0)
    i+=1
data['AmortizationType']=data['AmortizationType'].apply(lambda x: 'Fixed' if x=='FIXED' else x)
data.drop('Status', axis=1, inplace=True)

Wall time: 12.4 s


**Merge all the three dataset**

In [11]:
offer_status_prod=pd.merge(data, offer_status,on=['QuestionerId','QuotesId','OfferId'], how='inner', suffixes=['_offer','_offerProd'])
offer_data_questioner=pd.merge(offer_status_prod, questioner,on=['QuestionerId','QuotesId'], how='inner', suffixes=['_offer','_questioner'])

**Feature Selection**

We did select features based on obvious significance and availability and try including and excluding other features to see it that improves the accuracy

In [12]:
desired_columns = ['EstimatedCreditScore','EstimatedPropertyValue','CurrentMortgageBalance','LoanAmountRequested',
                   'RequestedLoanTypeId','PropertyState', 'Veteran', 'PropertyTypeId', 'PropertyUse', 'IsFHALoan', 
                   'IsJumboLoan', 'IsVALoan','APRPercentage']

data=offer_data_questioner[desired_columns].dropna().reset_index(drop=True)

features_cols = desired_columns[:-1]
target_col = desired_columns[-1]
numeric_cols = desired_columns[0:4]
#Update the propertyType amd LoanType to category which will make it easy for dummy encoding
data.PropertyTypeId = data.PropertyTypeId.apply(lambda x: "Type_"+str(x))
data.RequestedLoanTypeId = data.RequestedLoanTypeId.apply(lambda x: "Type_"+str(x))
#Seperate Feature and target and perform dummy encoding the categorical column
features = pd.get_dummies(data[features_cols])
target = data[target_col]

**Perform Normalization on of the numeric columns**

In [13]:
for col in numeric_cols:
    features[col]=(features[col]-features[col].mean())/features[col].std()

**Data Conversion:**
We will convert pandas dataframe to numpy array

In [14]:
X = np.array(features)
y = np.array(target)

**Train Test Split**

In [15]:
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=11)

## Learning Model

**Linear Regression**

In [16]:
lr = LinearRegression().fit(train_X, train_y)
train_pred = lr.predict(train_X)
train_mse = mean_squared_error(train_pred, train_y)
print("Training MSE: %.3f" %train_mse)

val_pred = lr.predict(val_X)
val_mse = mean_squared_error(val_pred, val_y)
print("Validation MSE: %.3f" %val_mse)

Training MSE: 0.222
Validation MSE: 0.213


**Deep Learning**

In [81]:
seed=11
feature_size = train_X.shape[1]
input_layer = Input(shape=(feature_size,))
last_layer = Dense(300, input_dim=feature_size,kernel_initializer=
                                keras.initializers.glorot_normal(seed=seed))(input_layer)
last_layer = Dropout(0.5)(last_layer)
last_layer = PReLU(weights=None)(last_layer)

last_layer= Dense(100, kernel_initializer=keras.initializers.glorot_normal(seed=seed))(last_layer)
last_layer = Dropout(0.5)(last_layer)
last_layer = PReLU(weights=None, alpha_initializer="zero")(last_layer)

last_layer = Dense(50,  kernel_initializer=keras.initializers.glorot_normal(seed=seed))(last_layer)
last_layer = Dropout(0.5)(last_layer)
last_layer = PReLU(weights=None, alpha_initializer="zero")(last_layer)

last_layer = Dense(10, kernel_initializer=keras.initializers.glorot_normal(seed=seed))(last_layer)
last_layer = Dropout(0.3)(last_layer)
last_layer = PReLU(weights=None, alpha_initializer="zero")(last_layer)                                             

output_layer = Dense(1,kernel_initializer=keras.initializers.glorot_normal(seed=seed))(last_layer)
model = Model(input_layer, output_layer)

model.compile(loss="mean_squared_error", optimizer=Adam(0.001))
model.fit(train_X, train_y, validation_data=(val_X, val_y), epochs=1500, batch_size=32, verbose=0)

<keras.callbacks.History at 0x243b1ecfd68>

**Prediction on Train and Test**

In [82]:
train_pred = model.predict(train_X)
train_mse = mean_squared_error(train_pred, train_y)
print("Training MSE: %.3f" %train_mse)

val_pred = model.predict(val_X)
val_mse = mean_squared_error(val_pred, val_y)
print("Validation MSE: %.3f" %val_mse)

Training MSE: 0.080
Validation MSE: 0.159


We can see that this model is overfitting a bit though it is performing better then the linear regression and the overfitting can be reduced more by tuning the dropouts

**LBFGS**:

Let's try teh lbfgs optimizer which which uses 2nd order derivative of the lost function and see if that improves out learning.

In [83]:
mlr = MLPRegressor(hidden_layer_sizes=(100,10), activation='relu', solver='lbfgs', alpha=0.001, batch_size=64, 
                   learning_rate='constant', learning_rate_init=0.001, max_iter=3000, 
                   random_state=11, tol=0.0001, verbose=False, warm_start=False, 
                   early_stopping=True, epsilon=1e-08)

mlr.fit(X, y)
train_pred = mlr.predict(train_X)
train_mse = mean_squared_error(train_pred, train_y)
print("Training MSE: %.3f" %train_mse)

val_pred = mlr.predict(val_X)
val_mse = mean_squared_error(val_pred, val_y)
print("Validation MSE: %.3f" %val_mse)

Training MSE: 0.062
Validation MSE: 0.061


We can see that this approach gave us a better MSE with neural network then the one we used previously.

**See Some prediction of the APR on the validation data**

First columns shows the the predicted APR and the second column shows actual APR

In [85]:
pd.DataFrame({"Predicted APR":val_pred, "Actual APR":val_y})

Unnamed: 0,Predicted APR,Actual APR
0,3.848653,3.711
1,4.871358,4.787
2,4.334078,4.567
3,3.430230,3.433
4,4.376498,4.360
5,4.585208,4.627
6,3.991868,3.995
7,4.174042,4.106
8,3.656466,3.574
9,3.713638,3.764
