# MDST Quicken Loans Starter Tutorial

In [3]:
# MDST / QL Lending Strategies Prediction Challenge Starter code
#
# Description:
#
# Loads data and trains several simple classifiers. Generates
# submission files in the correct Kaggle format.
#
# Usage:
#
# python starter.py
# Make sure that your data is saved in ./data . This code will need
# to be modified if you use a different location.
#
# Authors:
#
# Arya Farahi, Jonathan Stroud

import os, sys
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import auc, roc_curve, roc_auc_score




First, let's load the data and do some data wrangling. You don't need to worry about the details here, but note some of the techniques that are being used so that we can potentially change them later.

For this to work, the challenge files `loan_table_train.csv` and `loan_table_test.csv` must be saved in a directory called `data` next to this notebook.

In [4]:

##### Load data tables

# Directory holding the challenge CSV files; edit this if your copy of
# the data lives somewhere else.
path = "./data/"

# Parse the 'id' and 'zip' columns as strings rather than numbers.
dtypes = {'id': str, 'zip': str}


def _read_loan_table(filename):
    """Read one CSV file from `path` and index the frame by its 'id' column."""
    table = pd.read_csv(os.path.join(path, filename), dtype=dtypes)
    return table.set_index('id')


train = _read_loan_table('loan_table_train.csv')
test = _read_loan_table('loan_table_test.csv')

##### Drop some columns

# Left out of the feature set for now; they may turn out to be useful later.
todrop = ['zip', 'state']

# Rebind the names to frames without those columns (same result as an
# in-place drop, but each statement reads as "new frame from old frame").
train = train.drop(todrop, axis=1)
test = test.drop(todrop, axis=1)

##### Convert date strings to numerical values

def _expand_date_column(frame, column_name, date_format=None):
    """Replace one date column with numeric month/year/day/day-of-week columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table that contains `column_name`.
    column_name : str
        Name of the column to parse with `pd.to_datetime`.
    date_format : str or None
        Explicit strptime-style format. When None, the format is inferred
        with the year assumed to come first.

    Returns
    -------
    pandas.DataFrame
        A new frame without `column_name` and with the columns
        `<column_name>_month`, `_year`, `_day` and `_dayofweek` appended
        in that order.
    """
    if date_format is not None:
        day_time = pd.to_datetime(frame[column_name], format=date_format)
    else:
        day_time = pd.to_datetime(frame[column_name], infer_datetime_format=True, yearfirst=True)

    expanded = frame.drop(column_name, axis=1)
    # The order of this tuple fixes the order of the appended columns.
    for part in ('month', 'year', 'day', 'dayofweek'):
        # Assign plain arrays so values are placed by position instead of
        # being re-aligned on the index.
        expanded[column_name + '_' + part] = np.array(getattr(day_time.dt, part))
    return expanded


# 'datekey' is parsed with an explicit %Y%m%d format; the mortgage open
# date has its format inferred.
for column_name, date_format in [('datekey', '%Y%m%d'),
                                 ('largest_open_mortgage_open_date', None)]:
    test = _expand_date_column(test, column_name, date_format)
    train = _expand_date_column(train, column_name, date_format)


##### Convert categorical variables to dummy variables

# Every test column that is not float/int typed is treated as categorical.
cols = test.select_dtypes(exclude=['float', 'int']).columns
train = pd.get_dummies(train, columns=cols)
test = pd.get_dummies(test, columns=cols)

# The two tables are encoded independently, so a category level that only
# occurs in the test data produces a dummy column the training table lacks.
# Selecting the test columns from `train` would then raise a KeyError.
# Add each such dummy column to `train` filled with 0, since no training
# row carries that level.
dummy_prefixes = tuple(col + '_' for col in cols)
for column in test.columns.difference(train.columns):
    # Only patch columns created by the encoding above; any other mismatch
    # between the tables is left visible rather than papered over.
    if column.startswith(dummy_prefixes):
        train[column] = 0



# Imputation
Like most real-world datasets, this one contains some missing values. For now, we'll impute them with a constant placeholder value of -10.

In [5]:
##### Impute Missing Values

# Placeholder strategy: every missing entry becomes the same constant.
# A smarter imputation scheme should improve performance.

train = train.fillna(value=-10.0)
test = test.fillna(value=-10.0)

# Features are all columns of the test table; the response is the
# 'result' column.
features = test.columns.tolist()
response = ['result']


Let's take a look at the data now.

In [6]:
# Display the first rows of the processed training table.
train.head()

Unnamed: 0_level_0,FICO,largest_open_mortgage_UPB,largest_open_mortgage_payment,largest_open_mortgage_original_term,total_revolving_tradeline_debt,total_revolving_tradeline_payment,total_student_loan_tradeline_debt,total_student_loan_tradeline_payment,total_mortgage_tradeline_debt,total_mortgage_tradeline_payment,...,strategy_Biakabutuka,strategy_Brady,strategy_Darboh,strategy_Funchess,strategy_Henne,strategy_Hurst,strategy_Nordin,strategy_Peppers,strategy_Woodson,strategy_Zoltan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
83,660,269720.0,1792.0,360.0,10418.0,351.0,0.0,0.0,269720.0,1792.0,...,0,0,0,0,0,0,0,0,1,0
98,714,194889.0,1804.0,180.0,31911.0,958.0,0.0,0.0,194889.0,1804.0,...,0,0,0,0,0,0,0,0,0,1
165,0,105838.0,890.0,360.0,891.0,80.0,0.0,0.0,105838.0,890.0,...,0,0,1,0,0,0,0,0,0,0
245,787,9805.0,827.0,180.0,149160.0,715.0,0.0,0.0,155811.0,1405.0,...,0,0,0,0,0,0,1,0,0,0
293,778,101419.0,1038.0,360.0,12875.0,252.0,0.0,0.0,101419.0,1038.0,...,0,0,0,0,0,0,0,0,0,1


# Modeling
This is where the fun begins. We'll explore some different variants of our models here, and see how we can improve them.

In [8]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble     import RandomForestClassifier

# Fixed seed so that re-running the notebook fits the same models and
# therefore writes the same submission files.
SEED = 0

# Models to compare, keyed by the label used in the submission file name.
# Alternative LogisticRegression settings to try:
#   solver='sag', tol=1e-1, C=1.e4 / train[features].shape[0]
classifiers = {
    "LR": LogisticRegression(random_state=SEED),
    "RF": RandomForestClassifier(max_depth=25, random_state=SEED),
}

X = train[features]

# np.array copies the response column, so the in-place edits below do not
# modify `train`.
Y = np.array(train[response]).ravel()

# Binarise the response: values greater than -1 become 1, the rest 0.
mask = Y > -1
Y[mask] = 1.0
Y[~mask] = 0.0


##### Fit the model

# Submission files are written here; create the folder on first use.
submission_dir = './submission/'
if not os.path.exists(submission_dir):
    os.makedirs(submission_dir)

for classifier_label, clf in classifiers.items():
    # Train on the full training set.
    clf.fit(X, Y)

    # Keep the second probability column, i.e. the score for label 1 of
    # the binarised response.
    test_probabilities = clf.predict_proba(test[features])
    y_pred = np.array(test_probabilities[:, 1])

    # One row per test id, in the two-column layout of the submission file.
    df = pd.DataFrame({"id": test.index.values, "target": y_pred},
                      columns=["id", "target"])

    df.to_csv("./submission/Submission_%s.csv"%classifier_label, index=False)


What if we try a different imputation method, such as replacing missing values with the column mean?

In [16]:
from sklearn.preprocessing import Imputer
vals = train[features].values
imputer = Imputer(strategy='mean')
imputed_X = imputer.fit_transform(vals)

In [17]:
for classifier_label in classifiers.keys():
    clf = classifiers[classifier_label]

    clf.fit(imputed_X, Y)

    y_pred = np.array(clf.predict_proba(test[features])[:, 1])

    df = {"id":test.index.values, "target":y_pred}
    df = pd.DataFrame(df, columns=["id", "target"])

    df.to_csv("./submission/Submission_%s_imputed.csv"%classifier_label, index=False)