## Multiple Regressions

This code iteratively investigates regressions of the form: 
    
    simple_return ~ age_in_months + borrower_rate + payment + C(term) + <additional terms>
   
where C() is used to evaluate a term categorically. The outputs are stored in the pickled pandas dataframe: best_aic_models.pkl. 

In [1]:
import itertools as it
import logging
import os
import time

import numpy as np
import pandas as pd
import scipy as sp
import scipy.special
import sqlalchemy as sql
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
#setup log
# Records go to logger.log; the root logger is used so every module logs there.
logging.basicConfig(filename='logger.log')
log = logging.getLogger()
# logging.INFO is the numeric form of the 'INFO' level name
log.setLevel(logging.INFO)

In [3]:
# Smoke test: emit one INFO record to confirm the logger writes to its file.
log.info('Testing write to log')

In [4]:
#db connect
# Connection settings can be supplied through environment variables so real
# credentials never need to be saved in the notebook; the fallbacks are the
# original placeholder values.
creds   = os.environ.get('P2P_DB_CREDS', '<creds>')
ip      = os.environ.get('P2P_DB_HOST', 'ip-address')
port    = os.environ.get('P2P_DB_PORT', 'portno')
dbase   = os.environ.get('P2P_DB_NAME', 'p2p_lending')
# Use %s for the port: it is held as a string here, and %d raises TypeError on a
# str. %s formats both str and int ports correctly.
constring = 'postgresql://%s@%s:%s/%s' %(creds, ip, port, dbase)
eng = sql.create_engine(constring)
con = eng.connect()

In [5]:
#get columns that we care about (from single regression tests)
# Candidate explanatory fields. These names are selected as columns from the
# merged_train table when the data is loaded, and every pair of them is tried
# as the additional terms of the regression.
cols = ['prosper_score','iln023','rtr002','iln022','all903','all146','iln130','investment_typeid',
'bankcard_utilization','rev130','listing_category_id','rev118','ale403','rep075','fin001','all078',
'rep071','all075','rep078','all071','delinquencies_over30_days','rtl071','rtl075','ale908','rtl078',
'fin801','ale901','rtr906','bac905','bac904','iln026','all092','heq001','all090','all091','all155',
'rtr904','all153','all151','rtr084','prior_prosper_loan_earliest_pay_off','effective_yield','rev114',
'all804','all807','estimated_return','all801','all803','all062','rev023','rev022','rep005','rep001',
'rep002','all005','all001','all502','all002','all085','all084','all128','ale078','all081',
'total_open_revolving_accounts','fil001','ale071','all126','all127','ale075','all125','cru001',
'cap801','rtr078','bac075','all701','bac071','total_trade_items','bac078','all501','ale804','all503',
'all504','all505','ale801','iln084','rtr075','satisfactory_accounts','rtr071','bac022','bac403',
'income_range_description','cap026','bac023','iln007','iln006','iln005','iln002','iln001','rev084',
'rev002','rev001','iln804','rev005','current_credit_lines','ale007','ale005','ale002','borrower_state',
'ale001','ale084','delinquencies_last7_years','ale081','rep084','rep081','income_range','fil023',
'fil022','all086','all101','all102','bac804','rep026','rtr001','bac801','rtr905','rtr005','iln078',
'all130','borrower_apr','ale601','iln071','rev081','iln075','prior_prosper_loans_late_cycles','rtl501',
'aut001','rtl503','rtl502','all904','all905','all906','bnk026','all901','rev086','iln085','rev075',
'partial_funding_indicator','rev085','rtr501','rev071','iln403','rev078','iln081','all115','rev550',
'all129','iln102','iln101','scorex_change','rep904','credit_lines_last7_years','rtr081','iln801',
'bac550','rep905','lfi801','ale022','ale023','rep906','ale026','all026','all024','all023','all022',
'iln908','fico_score','rep501','rep503','bac026','iln901','was_delinquent_derog','iln601','bac903',
'ale503','ale502','ale501','ale905','rep901','bnk001','rep903','rtl001','rtl002','rep908','rtl005',
'rtl081','rtl084','prior_prosper_loans','iln502','iln503','ref001','iln501','iln504','ale906',
'prior_prosper_loans_cycles_billed','iln914','rev504','rev115','all403','delinquencies_over90_days',
'rtr022','open_credit_lines','rtr023','all010','iln126','is_homeowner','bac084','scorex','bac081',
'delinquencies_over60_days','bac005','rev127','bac001','bac002','rev128','rev129','rtl904','rev903',
'rep601','bac906','aut071','rev904','rev905','all602','rtl905','rev906','rtl906']

# Drop the textual income range; the same information is carried by income_range.
cols.remove('income_range_description')  #redundant with income_range
# print() with a single argument behaves the same on Python 2 and Python 3
print(len(cols))

235


In [6]:
#get data
# Columns every regression needs, fetched together with the candidate fields.
reg_cols = ['simple_return', 'age_in_months', 'borrower_rate', 'payment', 'term']
select_list = ','.join(cols + reg_cols)
# term<>12 excludes the rows whose term equals 12
stmt = 'select %s from merged_train where term<>12' % select_list
data = pd.read_sql(stmt, con)

In [7]:
# Allow up to 550 rows to be displayed so long per-field listings are not truncated.
pd.set_option('display.max_rows', 550)

Since not every record has every field filled in (some values are NaN), it's worth checking how many non-null observations we have for each field.

In [11]:
#check number of observations for each field
# count() tallies the non-null entries per column; list the best-populated fields first
(
    data
    .count()
    .sort_values(ascending=False)
)

total_open_revolving_accounts          231070
satisfactory_accounts                  231070
is_homeowner                           231070
borrower_apr                           231070
estimated_return                       231070
partial_funding_indicator              231070
effective_yield                        231070
income_range                           231070
was_delinquent_derog                   231070
total_trade_items                      231070
delinquencies_over30_days              231070
listing_category_id                    231070
prior_prosper_loans                    231070
investment_typeid                      231070
delinquencies_over90_days              231070
delinquencies_over60_days              231070
all084                                 231064
all081                                 231064
all001                                 231064
rep002                                 231064
all062                                 231064
ref001                            

In [13]:
#check correlation (note: this only looks at numerical fields)
# Pairwise correlation between the columns of `data`; the result is a square
# matrix labelled by field name on both axes.
corr_matrix = data.corr()

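Most of the fields aren't highly correlated. Note that the count of entries above 0.90 shown below covers the whole 229 x 229 matrix, so it includes the diagonal (each field with itself) and counts every pair twice: 769 entries correspond to roughly (769 - 229) / 2 = 270 distinct field pairs out of 26,106.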

In [18]:
# print() calls work on both Python 2 and Python 3. The original print statements
# are Python 2 only, and on Python 3 the second one would parse as print(...).sum().
print(corr_matrix.shape)
# NOTE: this counts entries over the whole matrix, so it includes the diagonal
# and counts each correlated pair twice (once per triangle).
print((corr_matrix > .90).sum().sum())

(229, 229)
769


In [16]:
# Preview the first five rows of the correlation matrix.
corr_matrix.head(n=5)

Unnamed: 0,prosper_score,iln023,rtr002,iln022,all903,all146,iln130,investment_typeid,bankcard_utilization,rev130,...,rev903,rep601,bac906,aut071,rev904,rev905,all602,rtl905,rev906,rtl906
prosper_score,1.0,-0.047764,0.044946,-0.069794,-0.129533,-0.075582,-0.035954,0.067559,-0.271823,-0.02806,...,-0.096066,0.004058,-0.051951,0.047272,-0.076269,-0.066587,-0.019463,-0.048934,-0.056151,-0.043203
iln023,-0.047764,1.0,0.039586,0.742872,-0.089822,-0.000456,-0.000556,-0.004395,-0.05252,-0.024428,...,-0.030311,0.000877,-0.01782,0.430657,-0.022066,-0.021405,0.038705,-0.015491,-0.019853,-0.014336
rtr002,0.044946,0.039586,1.0,0.032704,0.01061,0.08109,0.016901,0.016547,-0.0547,0.074361,...,0.030126,0.014405,0.089859,0.109725,0.040815,0.048057,0.018547,0.061364,0.059575,0.067428
iln022,-0.069794,0.742872,0.032704,1.0,-0.050276,0.01203,-0.000663,0.01436,-0.079659,-0.008708,...,-0.005667,-0.000699,0.001621,0.30656,0.001195,0.000977,0.026061,-0.001311,0.000376,-0.001725
all903,-0.129533,-0.089822,0.01061,-0.050276,1.0,0.40516,0.152064,0.007075,0.022181,0.180396,...,0.841454,0.02524,0.632991,-0.087216,0.759839,0.719135,0.018021,0.41871,0.666808,0.395069


In [45]:
#determine number of parameters we will be able to test for multiple reg
# Number of distinct pairs of candidate fields, divided by 10**8.
# scipy.special.comb replaces scipy.misc.comb, which is no longer available in
# current SciPy; len(cols) replaces the hard-coded field count.
sp.special.comb(len(cols), 2)/10.**8    #~100 million

0.00027494999999999999

In [27]:
#keep track of categorical fields
# column_category holds one (name, category) row per field; the regression loops
# use it to decide whether a field is wrapped in C().
cat_stmt = 'select name, category from column_category'
cats = pd.read_sql(sql=cat_stmt, con=con)

In [28]:
# Spot-check the lookup: category recorded for the first row named 'amount_funded'.
cats.loc[cats.name == 'amount_funded', 'category'].values[0]

u'numerical'

In [10]:
# Empty results table: one row per model (indexed by its comma-joined field
# names in the loop that fills it), holding that model's AIC and R-squared.
best_models = pd.DataFrame(columns=['aic','rsquared'])

In [12]:
# Resume from the results saved by a previous run, ordered from lowest to highest AIC.
# NOTE(review): read_pickle should only be used on files you produced yourself.
best_models = (
    pd.read_pickle('best_multiple_reg.pkl')    #ADDED FOR SECOND RUN
    .sort_values(by='aic')
)

In [None]:
# Fit one OLS model per pair of candidate fields (on top of the fixed base terms)
# and keep the 100 models with the lowest AIC in `best_models`, indexed by the
# comma-joined pair of field names.
for i, cur_cols in enumerate(it.combinations(cols, 2)):
    model_key = ','.join(cur_cols)
    #log progress
    if i % 1000 == 0:
        log.info(time.ctime()+'::iter: %d  fields: %s' %(i, model_key))
    #skip pairs already stored (e.g. loaded from the pickle of an earlier run)
    #so a resumed run neither refits them nor records them a second time
    if model_key in best_models.index:
        continue
    #setup formula
    formula = 'simple_return ~ age_in_months + borrower_rate + payment + C(term)'
    for c in cur_cols:
        #categorical fields are wrapped in C() so they are dummy-coded
        if cats.category[cats.name==c].values[0] == 'categorical':
            formula += '+C(%s)' %c
        else:
            formula += '+%s' %c
    #run regression
    df = data[['simple_return','age_in_months','borrower_rate','payment','term'] + list(cur_cols)]
    #missing='drop' discards rows with a NaN in any of the model's columns
    reg = smf.ols(formula=formula, data=df, missing='drop')
    model = reg.fit()

    #one-row frame describing this model
    result = pd.DataFrame(
        {'aic':[model.aic], 'rsquared':[model.rsquared]},
        index=[model_key]
    )

    #compare reg
    if len(best_models) < 100:
        #still filling the table: keep every model
        #(pd.concat replaces DataFrame.append, which newer pandas no longer provides)
        best_models = pd.concat([best_models, result])
    else:
        #sort so the worst (highest AIC) model is the last row
        best_models = best_models.sort_values(by='aic')
        #compare
        if model.aic < best_models.aic.max():
            #pop the worst model, append the new one, and restore the ordering
            best_models = pd.concat([best_models[:-1], result]).sort_values(by='aic')
            #write
            best_models.to_pickle('best_multiple_reg.pkl')

Check Output

In [9]:
# Allow up to 550 rows to be displayed when inspecting the saved results.
pd.set_option('display.max_rows', 550)

In [17]:
# Load the saved pairwise results and drop rows that repeat the same values.
# NOTE(review): this reads 'bmr.pkl' while the search loop writes
# 'best_multiple_reg.pkl' -- presumably a renamed copy; confirm.
outputs = pd.read_pickle('bmr.pkl').drop_duplicates()

In [18]:
# Order the models from lowest to highest AIC.
outputs = outputs.sort_values(by='aic')

Get Variables Involved

In [19]:
# Each index entry is a comma-joined list of fields; flatten them all into the
# set of distinct field names.
all_keys = ','.join(outputs.index)
top_fields = set(all_keys.split(','))
top_fields

{'ale002',
 'ale005',
 'ale022',
 'ale023',
 'ale071',
 'ale075',
 'ale078',
 'ale081',
 'ale084',
 'ale403',
 'ale801',
 'ale804',
 'ale905',
 'ale908',
 'all002',
 'all005',
 'all023',
 'all071',
 'all086',
 'all091',
 'all092',
 'all126',
 'all146',
 'all151',
 'all153',
 'all155',
 'all505',
 'all807',
 'aut001',
 'aut071',
 'bac002',
 'bac005',
 'bac022',
 'bac023',
 'bac071',
 'bac078',
 'bac081',
 'bac084',
 'bac550',
 'bac801',
 'bac804',
 'bnk001',
 'cap801',
 'cru001',
 'fico_score',
 'fil001',
 'fin001',
 'fin801',
 'iln005',
 'iln007',
 'iln071',
 'iln101',
 'iln403',
 'iln801',
 'prior_prosper_loan_earliest_pay_off',
 'rep001',
 'rep071',
 'rep078',
 'rep081',
 'rep084',
 'rep501',
 'rep901',
 'rep904',
 'rep905',
 'rep906',
 'rev002',
 'rev005',
 'rev022',
 'rev023',
 'rev078',
 'rev081',
 'rev084',
 'rev085',
 'rev086',
 'rev115',
 'rev128',
 'rev129',
 'rev130',
 'rev550',
 'rtl002',
 'rtl005',
 'rtl071',
 'rtl905',
 'rtr002',
 'rtr005',
 'rtr022',
 'rtr023',
 'rtr071'}

In [20]:
# Number of distinct fields that appear in at least one of the saved models.
len(top_fields)

88

Seek Better AIC by adding fields to regression

In [21]:
# AIC of the first row of `outputs` (the lowest AIC once the frame is sorted by aic).
outputs.loc[outputs.index[0], 'aic']

4093.2656349968784

In [22]:
# Start a fresh results table (AIC and R-squared per model) for the search that
# adds further fields to each saved model.
best_models = pd.DataFrame(columns=['aic','rsquared'])

In [43]:
# For every saved model, keep adding fields taken from the other saved models
# while each addition lowers the AIC, and record the best model found per
# starting point in `best_models` (written to best_aic_models.pkl).
for i, base_v in enumerate(outputs.index):
    base_vars = base_v.split(',')
    #get base model for comparison
    base_aic = outputs.aic[base_v]
    base_rsquared = outputs.rsquared[base_v]
    #AIC of the previous step; starts at the base model and is updated after every fit
    last_aic = base_aic
    var_str = base_v
    best_mod = [base_v, base_aic, base_rsquared]
    #candidate fields: everything used by the other saved models, minus the
    #fields already in the base model (re-adding one would put duplicate
    #columns in df). Sorted so the order of additions is reproducible.
    other_models = outputs.index.difference([base_v])
    variables = sorted(
        f for f in set(','.join(other_models).split(','))
        if f and f not in base_vars
    )
    formula = 'simple_return ~ age_in_months + borrower_rate + payment + C(term)'
    #add base vars to formula
    for c in base_vars:
        #setup formula
        if cats.category[cats.name==c].values[0] == 'categorical':
            formula += '+C(%s)' %c
        else:
            formula += '+%s' %c
    #get new variable
    #Stop at the first addition that fails to lower the AIC. The original never
    #updated last_aic, and its `cur_aic = 0` sentinel skipped the search whenever
    #the base AIC was not positive.
    improved = True
    while improved and variables:
        v = variables.pop()
        var_str += (','+v)
        #log progress
        log.info(time.ctime()+':aic:iter: %d  fields: %s' %(i, var_str))

        #add v to formula
        if cats.category[cats.name==v].values[0] == 'categorical':
            formula += '+C(%s)' %v
        else:
            formula += '+%s' %v
        #run regression
        df = data[['simple_return','age_in_months','borrower_rate','payment','term'] + var_str.split(',')]
        reg = smf.ols(formula=formula, data=df, missing='drop')
        model = reg.fit()
        #check aic
        cur_aic = model.aic
        #save best aic model
        if cur_aic < best_mod[1]:
            best_mod = [var_str, cur_aic, model.rsquared]
        #compare with the previous step, then remember this step for the next one
        improved = cur_aic < last_aic
        last_aic = cur_aic
    #pd.concat replaces DataFrame.append, which newer pandas no longer provides
    best_models = pd.concat([
        best_models,
        pd.DataFrame({'aic':[best_mod[1]], 'rsquared':[best_mod[2]]}, index=[best_mod[0]])
    ])
    best_models.to_pickle('best_aic_models.pkl')

In [None]:
# Debugging aid: category rows recorded for the field last held in `v`.
cats.loc[cats.name == v, 'category']

In [38]:
# Debugging aid: distinct field names across the comma-joined entries of `variables`.
list(set(field for field in ','.join(variables).split(',')))

['all146',
 'rev130',
 'ale403',
 'fin001',
 'rep071',
 'rep078',
 'all071',
 'rtl071',
 'ale908',
 'ale905',
 'all092',
 'all091',
 'all155',
 'all153',
 'all151',
 'prior_prosper_loan_earliest_pay_off',
 'all807',
 'rev023',
 'rev022',
 'rep001',
 'all005',
 'all002',
 'ale078',
 'fil001',
 'ale071',
 'all126',
 'ale075',
 'iln071',
 'cap801',
 'bac071',
 'bac078',
 'ale804',
 'all505',
 'ale801',
 'rtr071',
 'bac022',
 'bac023',
 'iln007',
 'iln005',
 'rev086',
 'rev085',
 'rev084',
 'rev550',
 'iln801',
 'ale005',
 'ale002',
 'ale084',
 'ale081',
 'rep084',
 'rep081',
 'bac804',
 'bac801',
 'rtr005',
 'cru001',
 'rev081',
 'aut001',
 'iln403',
 'rev078',
 'rev002',
 'iln101',
 'rev005',
 'bac550',
 'ale022',
 'ale023',
 'all086',
 'all023',
 'fico_score',
 'rep501',
 'rep904',
 'rep905',
 'rep906',
 'rep901',
 'bnk001',
 'rtl002',
 'rtl005',
 'rev115',
 'fin801',
 'bac084',
 'bac081',
 'bac005',
 'bac002',
 'rev128',
 'rev129',
 'aut071',
 'rtr022',
 'rtr023',
 'rtl905']