## Regressions on Common Variables

Searches for the best OLS regressions over combinations of the common listing fields, ranked by RMSE on the dev data. Generates the file `common_rmse_best.pkl`.

In [1]:
import os
import logging
import Queue
import pandas as pd
import numpy as np
import threading as th
import statsmodels.formula.api as smf

from math import sqrt
from itertools import combinations
from db_conn import DBConn

In [2]:
#get data
def get_sql_data(varset, dataset):
    '''Fetch the columns named in varset from the merged SQL table.

    dataset selects the table: the query reads from merged_<dataset>
    (train, dev, or test). The query excludes rows whose term equals 12.
    The module-level connection con is used, and the result of pd.read_sql
    is returned.
    '''
    columns = ','.join(varset)
    query = 'select %s from merged_%s where term<>12' % (columns, dataset)
    return pd.read_sql(query, con)

def get_vars_by_type(varset, cats=None):
    '''Split varset into numerical and categorical variables.

    Each variable is looked up by name in the category table (columns
    name and category). A variable whose category is 'categorical' goes to
    the categorical list; every other variable goes to the numerical list.

    varset: iterable of variable names.
    cats: optional DataFrame with name and category columns; defaults to
        the module-level CATS table.

    Returns the 2-tuple (numerical variables, categorical variables), each
    in the order the names appear in varset. A name that is absent from
    the table is treated as numerical.
    '''
    if cats is None:
        cats = CATS
    # name -> category, built once instead of filtering the table for every
    # variable; setdefault keeps the first row when a name is repeated
    category_of = {}
    for name, category in zip(cats['name'], cats['category']):
        category_of.setdefault(name, category)
    cat_vars = []
    num_vars = []
    for c in varset:
        # classify the variable itself; previously every variable was
        # classified by the category of one hard-coded column name
        if category_of.get(c) == 'categorical':
            cat_vars.append(c)
        else:
            num_vars.append(c)
    return (num_vars, cat_vars)

def get_formula(dep_var, varsets):
    '''Build the regression formula string dep_var~term+term+...

    varsets[0] is numerical variables, varsets[1] is categorical variables.
    Each categorical variable is wrapped as C(name). The categorical terms
    are listed first, followed by the numerical variables unchanged.
    '''
    # interpolate each name into C(...); without the % the literal text
    # 'C(%s)' ended up in the formula
    cat_vars = ['C(%s)' % c for c in varsets[1]]
    # list() so the numerical variables may be given as a tuple as well
    all_vars = cat_vars + list(varsets[0])
    return dep_var + '~' + '+'.join(all_vars)

def reg_worker():
    '''Run the regressions for subset sizes pulled from the job queue.

    Takes subset sizes r from the module-level queue q until it is empty.
    For each r, fits an OLS regression of simple_return on every
    r-combination of ind_variables using data_train, scores it on data_dev,
    and keeps the five best fits (lowest rmse value, ties ordered by aic).
    Whenever the top five changes it is pickled to
    common_vars_regs_outputs/rmse_common_vars<r>.pkl and a log line is
    written. Returns None.
    '''
    while True:
        # Non-blocking get: testing q.empty() and then calling a blocking
        # q.get() lets another worker take the last job in between, which
        # leaves this thread waiting forever and the final join() hanging.
        try:
            r = q.get_nowait()
        except Queue.Empty:
            return
        # placeholder rows of +inf so that any finite rmse displaces them
        top5_outpt = pd.DataFrame(np.ones((5,4))*np.inf,
                              columns=['rmse','aic','rsquared','tables'])
        for fields in combinations(ind_variables, r):
            cols = list(fields)+['simple_return']
            #run regression with variables and r chosen
            data_t = data_train[cols].dropna()
            varsets = get_vars_by_type(fields)
            formula = get_formula('simple_return', varsets)
            reg = smf.ols(formula=formula, data=data_t, missing='drop')
            model = reg.fit()
            #check results
            data_d = data_dev[cols].dropna()
            pred_dev = model.predict(data_d)
            # NOTE(review): the factor 2 makes this sqrt(2*MSE) rather than
            # the textbook RMSE; the ranking is unaffected, but confirm it
            # is intended.
            rmse = sqrt(2*sum(
                        (pred_dev - data_d['simple_return'])**2
                          )/float(len(pred_dev))
                   )
            #keep track of best outputs
            if rmse < top5_outpt.rmse.max():
                # sort so the worst entry is last, drop it, add the new fit
                top5_outpt.sort_values(by=['rmse','aic'], inplace=True)
                top5_outpt = top5_outpt[:-1]
                top5_outpt = top5_outpt.append(pd.DataFrame({'rmse': [rmse],
                                                             'aic': [model.aic],
                                                             'rsquared': [model.rsquared],
                                                             'tables': [model.summary().tables]},
                                                            index=[str(fields)])
                                              )
                top5_outpt.sort_values(by=['rmse','aic'], inplace=True)
                #save top results for each r
                log.info('%s: r: %d updated reg outputs',
                         th.current_thread().getName(), r)
                top5_outpt.to_pickle('common_vars_regs_outputs/' +
                                     'rmse_common_vars%d.pkl' %r)

In [5]:
#plan (original design notes; the cells below start 4 threads and queue at
#most 4 subset sizes):
#    3 threads
#    pull jobs from queue (1,88) --> need worker
#    for each job:
#        best_results_job = ?
#        iterate over combinations
#            run regressions, get output, compare
#    get best results overall

#setup logging: root logger at INFO level, written to a log file
logging.basicConfig(filename='rmse_common_vars_regs-copy1'+'.log')
log = logging.getLogger()
log.setLevel('INFO')

In [4]:
#setup connection to db
#DBConn comes from the local db_conn module; con is the connection object
#passed to the pd.read_sql calls in this notebook
eng = DBConn(username='',
             password='')   #fill in credentials before running
con = eng.connect()

In [None]:
#write a marker to the log file once this cell runs
log.info('db connected')

In [5]:
#variable categories: name/category pairs read from the column_category
#table; get_vars_by_type checks for the category label 'categorical'
CAT_STMT = "select name, category from column_category"
CATS = pd.read_sql(CAT_STMT, con)

In [8]:
#create data_dir (if necessary); reg_worker pickles its per-r results into
#this folder
if not os.path.exists('common_vars_regs_outputs'):
    os.mkdir('common_vars_regs_outputs')

#### Get Fields

In [9]:
#let pandas display up to 200 rows
pd.options.display.max_rows = 200

In [10]:
#load the common listing fields; the field names are the index of the
#pickled object
v = pd.read_pickle('common_listing_fields.pkl')
#remove the fields that are excluded from the regressions
v = list(v.index.difference(['borrower_city', 'member_key', 'listing_number',
                             'listing_monthly_payment', 'listing_term', 'first_recorded_credit_line']))
#remove every field whose name contains 'date'
v = [el for el in v if 'date' not in el]
#add two further columns by name
v += ['term','payment']
#show the final field list and how many fields it has
print v
print len(v)

[u'all001', u'all021', u'all026', u'all051', u'all052', u'all062', u'all064', u'all081', u'all084', u'all141', u'all142', u'all201', u'all207', u'all208', u'all701', u'all806', u'all901', u'amount_delinquent', u'amount_funded', u'amount_remaining', u'bac026', u'bac401', u'bac403', u'bankcard_utilization', u'borrower_apr', u'borrower_rate', u'borrower_state', u'credit_lines_last7_years', u'current_credit_lines', u'current_delinquencies', u'delinquencies_last7_years', u'delinquencies_over30_days', u'delinquencies_over60_days', u'delinquencies_over90_days', u'dti_wprosper_loan', u'effective_yield', u'employment_status_description', u'estimated_loss_rate', u'estimated_return', u'funding_threshold', u'iln201', u'income_range', u'income_range_description', u'income_verifiable', u'inquiries_last6_months', u'installment_balance', u'investment_type_description', u'investment_typeid', u'is_homeowner', u'lender_indicator', u'lender_yield', u'listing_category_id', u'listing_status', u'listing_stat

In [11]:
#variables
#candidate independent variables; reg_worker takes combinations of these
ind_variables = v

In [12]:
#get train data: candidate variables plus the dependent variable
#simple_return, read from merged_train
data_train = get_sql_data(ind_variables+['simple_return'], 'train')
log.info('downloaded training data')
#get dev data: same columns from merged_dev; reg_worker scores each fitted
#model on it
data_dev = get_sql_data(ind_variables+['simple_return'], 'dev')
log.info('downloaded dev data')

In [19]:
#setup queue: one job per subset size r, counting down from
#min(4, number of variables) to 1, so the largest size is handed out first
q = Queue.Queue()
for i in xrange(min(4,len(ind_variables)), 0, -1):
    q.put(i)

In [None]:
#threads: build four workers running reg_worker, start them all, then
#block until every one has finished
threads = [th.Thread(target=reg_worker) for i in xrange(4)]
for thread in threads:
    thread.start()

for t in threads:
    t.join()

In [None]:
import pandas as pd

#check overall best rmse
#Collect the per-r top-5 tables written by the workers. Only .pkl entries
#are read, so a stray file or sub-directory in the output folder cannot
#break the load; sorted() makes the read order deterministic.
out_dir = 'common_vars_regs_outputs'
frames = [pd.read_pickle(os.path.join(out_dir, f))
          for f in sorted(os.listdir(out_dir)) if f.endswith('.pkl')]
#The leading empty frame keeps the expected columns when no outputs exist
#yet; a single concat replaces growing the frame with repeated append.
all_outputs = pd.concat(
    [pd.DataFrame(columns=['rmse','aic','rsquared','tables'])] + frames)

#best (lowest rmse, then lowest aic) first
all_outputs.sort_values(by=['rmse','aic'], inplace=True)
all_outputs.to_pickle('common_rmse_best.pkl')
all_outputs