## Landing Tree APR Modeling

In [1]:
import time
import pandas as pd
import numpy as np
from subprocess import call
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

## Data Preprocessing

In [2]:
%%time
offer_status = pd.read_csv('data/RULO_OfferStatusChange.csv', low_memory=False)
questioner = pd.read_csv('data/RULO_Questioner_Data.csv', low_memory=False)

Wall time: 503 ms


In [3]:
%%time
months= ['JAN','FEB','MAR','APR','MAY','JUNE', 'JULY', 'AUGUST','SEP']
data=None
i = 0;
for month in months:
    filename = "./data/Offer_2018-" + months[i] + ".csv"
    sheet_data = pd.read_csv(filename)
    if data is None:
        data = sheet_data
    else:
        data = pd.concat([data, sheet_data], axis=0)
    i+=1
data['AmortizationType']=data['AmortizationType'].apply(lambda x: 'Fixed' if x=='FIXED' else x)
data.drop('Status', axis=1, inplace=True)

Wall time: 10.1 s


**Merge all the three dataset**

In [4]:
offer_status_prod=pd.merge(data, offer_status,on=['QuestionerId','QuotesId','OfferId'], how='inner', suffixes=['_offer','_offerProd'])
offer_data_questioner=pd.merge(offer_status_prod, questioner,on=['QuestionerId','QuotesId'], how='inner', suffixes=['_offer','_questioner'])

**Feature Selection**

We did select features based on obvious significance and availability and try including and excluding other features to see it that improves the accuracy

In [5]:
desired_columns = ['EstimatedCreditScore','EstimatedPropertyValue','CurrentMortgageBalance','LoanAmountRequested',
                   'RequestedLoanTypeId','PropertyState', 'Veteran', 'PropertyTypeId', 'PropertyUse', 'IsFHALoan', 
                   'IsJumboLoan', 'IsVALoan','APRPercentage']

data=offer_data_questioner[desired_columns].dropna().reset_index(drop=True)

features_cols = desired_columns[:-1]
target_col = desired_columns[-1]
numeric_cols = desired_columns[0:4]
#Update the propertyType amd LoanType to category which will make it easy for dummy encoding
data.PropertyTypeId = data.PropertyTypeId.apply(lambda x: "Type_"+str(x))
data.RequestedLoanTypeId = data.RequestedLoanTypeId.apply(lambda x: "Type_"+str(x))
#Seperate Feature and target and perform dummy encoding the categorical column
features = pd.get_dummies(data[features_cols])
target = data[target_col]

**Perform Normalization on of the numeric columns**

In [6]:
for col in numeric_cols:
    features[col]=(features[col]-features[col].mean())/features[col].std()

**Data Conversion:**
We will convert pandas dataframe to numpy array

In [7]:
X = np.array(features)
y = np.array(target)

**Train Test Split**

In [8]:
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=11)

## Learning Model

**Linear Regression**

In [9]:
lr = LinearRegression().fit(train_X, train_y)
train_pred = lr.predict(train_X)
train_mse = mean_squared_error(train_pred, train_y)
print("Training MSE: %.3f" %train_mse)

val_pred = lr.predict(val_X)
val_mse = mean_squared_error(val_pred, val_y)
print("Validation MSE: %.3f" %val_mse)

Training MSE: 0.222
Validation MSE: 0.213


**Neural Network**

In [10]:
mlr = MLPRegressor(hidden_layer_sizes=(100,10), activation='relu', solver='lbfgs', alpha=0.0001, batch_size=32, 
                   learning_rate='constant', learning_rate_init=0.001, max_iter=1500, 
                   random_state=11, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                   early_stopping=True, epsilon=1e-08)

mlr.fit(X, y)
train_pred = mlr.predict(train_X)
train_mse = mean_squared_error(train_pred, train_y)
print("Training MSE: %.3f" %train_mse)

val_pred = mlr.predict(val_X)
val_mse = mean_squared_error(val_pred, val_y)
print("Validation MSE: %.3f" %val_mse)

Training MSE: 0.070
Validation MSE: 0.069


**See Some prediction of the APR on the validation data**

First columns shows the the predicted APR and the second column shows actual APR

In [11]:
np.hstack((val_pred.reshape(-1,1), val_y.reshape(-1,1)))[0:10]

array([[3.85499911, 3.711     ],
       [4.94848189, 4.787     ],
       [4.55714265, 4.567     ],
       [3.42446088, 3.433     ],
       [4.38664338, 4.36      ],
       [4.60658601, 4.627     ],
       [4.07591409, 3.995     ],
       [4.183947  , 4.106     ],
       [3.61628957, 3.574     ],
       [3.67148852, 3.764     ]])