## Get Outputs from Regression Runs

Below are results from some of the linear regression runs. Each was the "best" at the time it was generated. The best model fitted further down (which required some qualitative wrangling) yielded an RMSE of 0.447 on the dev set.

In [1]:
import ast
import os

import numpy as np
import pandas as pd

In [2]:
DATA_DIR = '/home/ec2-user/daniel/common_vars_regs_outputs'

In [3]:
# Read every pickled results frame in DATA_DIR and stack them in one concat.
# Appending inside the loop copies the accumulated frame on every iteration
# (quadratic), and DataFrame.append no longer exists in pandas >= 2.0.
# The leading empty frame keeps the expected columns even if DATA_DIR is empty.
# Files are read in sorted order so the row order does not depend on os.listdir.
result_frames = [
    pd.read_pickle(os.path.join(DATA_DIR, f))
    for f in sorted(os.listdir(DATA_DIR))
]
best_models = pd.concat(
    [pd.DataFrame(columns=['aic','rmse','rsquared','tables'])] + result_frames
)

In [4]:
# Order the models from lowest to highest RMSE, then preview the fit metrics
# of the first 20 (the bulky 'tables' column is left out of the display).
best_models = best_models.sort_values(by='rmse')
best_models.loc[:, ['aic','rmse','rsquared']].head(20)

Unnamed: 0,aic,rmse,rsquared
"(u'all001', u'estimated_loss_rate', u'estimated_return', u'lender_yield')",184948.009367,0.510979,0.219581
"(u'all001', u'borrower_rate', u'estimated_loss_rate', u'estimated_return')",185565.062726,0.511623,0.217494
"(u'all001', u'effective_yield', u'estimated_loss_rate', u'lender_yield')",187801.141715,0.511692,0.209885
"(u'estimated_loss_rate', u'estimated_return', u'lender_yield')",185746.940599,0.511855,0.216879
"(u'all001', u'borrower_rate', u'effective_yield', u'estimated_loss_rate')",188431.345946,0.512397,0.207727
"(u'all021', u'borrower_rate', u'estimated_loss_rate', u'estimated_return')",186346.300564,0.512499,0.214844
"(u'borrower_rate', u'estimated_loss_rate', u'estimated_return')",186356.247821,0.512511,0.214811
"(u'effective_yield', u'estimated_loss_rate', u'lender_yield')",188812.267805,0.512725,0.206421
"(u'borrower_rate', u'effective_yield', u'estimated_loss_rate')",189437.008777,0.513442,0.204273
"(u'investment_typeid', u'lender_yield', 'term')",196363.253767,0.521516,0.18006


### Variables from the top 20 Regressions

In [5]:
# Gather the regressors used by every model in best_models. Each index label
# is the string repr of a tuple of column names, so it is parsed with
# ast.literal_eval, which only accepts literals; the previous eval() would
# execute arbitrary code found in the pickled index.
variables = []
for label in best_models.index:
    variables.extend(ast.literal_eval(label))

In [6]:
# De-duplicate the collected variable names. set() does not preserve order,
# so the ordering of the displayed list is arbitrary.
variables = list(set(variables))
variables

[u'investment_typeid',
 'term',
 u'months_employed',
 u'investment_type_description',
 u'all051',
 u'borrower_rate',
 u'all001',
 u'partial_funding_indicator',
 u'effective_yield',
 u'lender_yield',
 u'all026',
 u'estimated_return',
 u'all021',
 u'installment_balance',
 u'employment_status_description',
 u'estimated_loss_rate',
 u'amount_funded',
 u'funding_threshold']

In [7]:
len(variables)

18

In [8]:
best_models.tail()

Unnamed: 0,aic,rmse,rsquared,tables
"(u'all001', u'all021', u'effective_yield', u'estimated_loss_rate', u'lender_yield')",187767.903607,87.313822,0.210006,"[[[Dep. Variable:, simple_return, R-squared:..."
"(u'all001', u'all021', u'borrower_rate', u'effective_yield', u'estimated_loss_rate')",188398.051619,87.434014,0.207848,"[[[Dep. Variable:, simple_return, R-squared:..."
"(u'all001', u'all021', u'all026', u'all051', u'installment_balance', u'investment_typeid', u'months_employed')",193691.826634,88.229071,0.15021,"[[[Dep. Variable:, simple_return, R-squared:..."
"(u'all001', u'all021', u'all026', u'installment_balance', u'investment_typeid', u'months_employed')",193690.136127,88.229273,0.150209,"[[[Dep. Variable:, simple_return, R-squared:..."
"(u'all001', u'all021', u'installment_balance', u'investment_typeid', u'months_employed')",194942.041122,88.44037,0.145351,"[[[Dep. Variable:, simple_return, R-squared:..."


### Save Best Model

In [64]:
# Directory where fitted models are saved. Define the constant first and reuse
# it, so the path lives in exactly one place; create the directory if needed.
MODELS_DIR = '/home/ec2-user/daniel/models'
if not os.path.exists(MODELS_DIR):
    os.makedirs(MODELS_DIR)

In [65]:
best_models.index[0]

"(u'all001', u'estimated_loss_rate', u'estimated_return', u'lender_yield')"

In [66]:
import os
import pandas as pd
import statsmodels.formula.api as smf

from math import sqrt
from db_conn import DBConn

In [67]:
# Set up the connection to the database; `con` is the handle passed to
# pd.read_sql in the cells below.
# NOTE(review): `<db creds>` looks like a redacted placeholder rather than valid
# Python; supply real connection arguments before running, ideally read from
# the environment instead of hard-coded in the notebook.
eng = DBConn(<db creds>)
con = eng.connect()

In [68]:
# Variable categories: load each column's name and category from the
# column_category table. get_vars_by_type looks names up in this frame and
# compares the category against 'categorical'.
CAT_STMT = "select name, category from column_category"
CATS = pd.read_sql(CAT_STMT, con)        

In [69]:
#functions

def get_sql_data(varset, dataset):
    '''Fetch the columns listed in varset from the merged_<dataset> table.

    dataset is expected to be one of train, dev, or test. Rows whose term
    equals 12 are excluded. The column names and the table suffix are
    interpolated directly into the SQL text, and the query is run on the
    module-level connection `con`. Returns whatever pd.read_sql returns.
    '''
    columns = ','.join(varset)
    query = 'select {0} from merged_{1} where term<>12'.format(columns, dataset)
    return pd.read_sql(query, con)

def get_vars_by_type(varset):
    '''Split varset into numerical and categorical variables.

    Each name is looked up in the module-level CATS frame (columns `name` and
    `category`). A variable whose category is 'categorical' goes into the
    categorical list; every other category is treated as numerical.

    Returns a 2-tuple (num_vars, cat_vars), each preserving the order of
    varset. Raises KeyError if a name has no row in CATS.
    '''
    cat_vars = []
    num_vars = []
    for c in varset:
        # Look up the category of *this* variable. The earlier version always
        # looked up a single hard-coded column name, so every variable
        # received the same classification.
        matches = CATS['category'][CATS['name'] == c].values
        if len(matches) == 0:
            raise KeyError('no category recorded for variable %r' % (c,))
        # If a name appears more than once, the first row wins.
        if matches[0] == 'categorical':
            cat_vars.append(c)
        else:
            num_vars.append(c)
    return (num_vars, cat_vars)

def get_formula(dep_var, varsets):
    '''Build a regression formula string of the form "dep~term1+term2+...".

    dep_var is the name of the dependent variable. varsets[0] holds the
    numerical variables and varsets[1] the categorical ones (the shape
    returned by get_vars_by_type). Categorical variables are wrapped as
    C(name) and listed first, followed by the numerical variables.
    '''
    # Interpolate each categorical name into C(...); the earlier version left
    # the '%s' placeholder unformatted, producing the literal term 'C(%s)'.
    cat_vars = ['C(%s)' % c for c in varsets[1]]
    all_vars = cat_vars + list(varsets[0])
    formula = dep_var + '~' + '+'.join(all_vars)
    return formula

In [70]:
# The index label of the best (lowest-rmse) model is the string repr of a
# tuple of column names. Parse it with ast.literal_eval, which only accepts
# literals, instead of eval(), which would execute whatever the label contains.
ind_variables = list(ast.literal_eval(best_models.index[0]))
# Always include age_in_months and term in addition to the selected regressors.
ind_variables += ['age_in_months', 'term']
formula = get_formula('simple_return',
                      get_vars_by_type(ind_variables))
formula

u'simple_return~all001+estimated_loss_rate+estimated_return+lender_yield+age_in_months+term'

In [71]:
# Pull the regressors plus the dependent variable (simple_return) for the
# train split and the dev split, in that order.
data_train, data_dev = (
    get_sql_data(ind_variables+['simple_return'], split)
    for split in ('train', 'dev')
)

In [72]:
# Keep only complete rows (no missing values) in each split.
data_t, data_d = data_train.dropna(), data_dev.dropna()

In [73]:
# Fit an ordinary least squares model for the formula on the complete-case
# training rows; `reg` is the unfitted model and `model` the fitted results.
reg = smf.ols(
    data=data_t,
    formula=formula,
    missing='drop',
)
model = reg.fit()

In [74]:
model.summary()

0,1,2,3
Dep. Variable:,simple_return,R-squared:,0.395
Model:,OLS,Adj. R-squared:,0.395
Method:,Least Squares,F-statistic:,25180.0
Date:,"Sun, 27 Nov 2016",Prob (F-statistic):,0.0
Time:,19:49:09,Log-Likelihood:,-62981.0
No. Observations:,231064,AIC:,126000.0
Df Residuals:,231057,BIC:,126000.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.3793,0.003,112.791,0.000,0.373 0.386
all001,0.0023,5.86e-05,39.596,0.000,0.002 0.002
estimated_loss_rate,-3.3952,0.120,-28.411,0.000,-3.629 -3.161
estimated_return,0.4790,0.058,8.261,0.000,0.365 0.593
lender_yield,2.5431,0.091,27.909,0.000,2.365 2.722
age_in_months,0.0108,4.61e-05,234.798,0.000,0.011 0.011
term,-0.0045,6.09e-05,-74.616,0.000,-0.005 -0.004

0,1,2,3
Omnibus:,1477.233,Durbin-Watson:,1.58
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2240.723
Skew:,-0.012,Prob(JB):,0.0
Kurtosis:,3.482,Cond. No.,14100.0


In [75]:
#predict
dev_pred = model.predict(data_d)

In [52]:
# Error of the dev-set predictions against the observed simple_return.
# NOTE(review): the factor of 2 inside the square root is not part of the
# standard RMSE definition, sqrt(mean(squared error)); confirm whether it is
# intentional before comparing this figure with other RMSE values.
squared_errors = (dev_pred - data_d['simple_return'])**2
rmse = sqrt(2*sum(squared_errors)/float(len(dev_pred)))


In [53]:
rmse

0.4471252979449878

In [55]:
model.save(os.path.join(MODELS_DIR,'rmse_best_model.pkl'))

#### test

In [56]:
import pickle

In [61]:
# Reload the saved model. Open the file in binary mode (pickles are bytes) and
# use a context manager so the handle is closed after loading.
# NOTE(review): only unpickle files you created yourself; pickle.load can
# execute arbitrary code from an untrusted file.
with open(os.path.join(MODELS_DIR,'rmse_best_model.pkl'), 'rb') as model_file:
    modl = pickle.load(model_file)

In [63]:
modl.aic

125976.86328574317

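### Known issue: a pickled model cannot predict after being reloaded
The model predicts fine before it is pickled, but once it is loaded back from the pickle, calling `predict` on a new DataFrame fails with `AttributeError: 'DataFrame' object has no attribute 'design_info'` (reproduced at the end of this notebook). See the upstream statsmodels issues: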

https://github.com/statsmodels/statsmodels/issues/1263

https://github.com/statsmodels/statsmodels/issues/1729

In [79]:
import datetime as dt
# Query window: dev-split loans originated in the three days starting at
# startdate (start inclusive, end exclusive), excluding 12-month terms.
dataset = 'dev'
startdate = '2014-10-06'
enddate = dt.datetime.strptime(startdate, '%Y-%m-%d') + dt.timedelta(days=3)
edate = enddate.strftime('%Y-%m-%d')

stmt = (
    "select * from merged_{table} where origination_date >= "
    "'{start}' and origination_date < '{end}' and term <> 12"
).format(table=dataset, start=startdate, end=edate)

In [80]:
# Run the window query with loan_number as the index, then keep only the
# rows that have a listing_number.
listings = pd.read_sql(stmt, con, index_col='loan_number')
listings = listings.loc[listings.listing_number.notnull()]

In [84]:
sublist = listings[listings.prosper_rating == 'AA']

In [85]:
mypred = model.predict(sublist)

In [86]:
mypred[:10]

array([ 0.64766666,  0.66668492,  0.67469061,  0.66545971,  0.65381871,
        0.66668492,  0.63990411,  0.65850241,  0.75126643,  0.67317677])

In [87]:
model.save('myexample.pkl')

In [93]:
from statsmodels.iolib.smpickle import load_pickle
import statsmodels.api as sm

In [100]:
# Load the same saved file through three different entry points: the smpickle
# helper, the top-level statsmodels API, and the load method of the fitted
# results object. Presumably this is to compare whether any of them restores a
# model that can still predict; only smodel3 is exercised in the next cell.
smodel = load_pickle('myexample.pkl')
smodel2 = sm.load('myexample.pkl')
smodel3 = model.load('myexample.pkl')

In [101]:
smodel3.predict(sublist)

AttributeError: 'DataFrame' object has no attribute 'design_info'