## Regression Exploration

In [1]:
import os
import copy
import pickle as pk
from collections import Counter
import itertools as it
import sqlalchemy as sql
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [None]:
#==============================================================================
# setup
#==============================================================================

In [3]:
#db connection
# Build the connection URL from the credentials, then open a live connection.
# `eng` ends up bound to the Connection object rather than the Engine.
creds = '<creds>'
db_url = 'postgresql://%s@localhost:5432/p2p_lending' % creds
eng = sql.create_engine(db_url).connect()

In [None]:
#dataframe for outputs
# Naming: the "3" objects belong to 36-month-term loans and the "6" objects to
# 60-month-term loans (the iteration loop queries term=36 for months 1..36 and
# term=60 for months 1..60), so the 36-term index must cover 36 months and the
# 60-term index 60 months.
# 'rsquared' matches the column label produced by Reg.get_outputs and the label
# the analysis section sorts on.
vrs = ['pvalue','rsquared','coef','conf_low','conf_high']
months3 = np.repeat(np.arange(1, 37), len(vrs))
months6 = np.repeat(np.arange(1, 61), len(vrs))
variables3 = np.array(vrs*36)
variables6 = np.array(vrs*60)
# Two-level column index: (month, var), with every var repeated for each month
mindex3 = pd.MultiIndex.from_arrays([months3, variables3], names=['month', 'var'])
mindex6 = pd.MultiIndex.from_arrays([months6, variables6], names=['month', 'var'])
# Empty result frames; rows (one per regression term) are added later
regouts3 = pd.DataFrame(columns=mindex3, index=[])
regouts6 = pd.DataFrame(columns=mindex6, index=[])

In [None]:
#series for pointers to raw output
# Explicit object dtype: these series store one model object per column name,
# and an empty Series created without a dtype is deprecated in pandas.
raw_out3 = pd.Series(dtype=object)
raw_out6 = pd.Series(dtype=object)

#columns to iterate over
# Only the column names are needed, so a single row is fetched.
cols = pd.read_sql('select * from p2p_lending limit 1', con=eng).columns
# Skip date columns, and skip the response variable itself: regressing
# simple_return on simple_return is meaningless and would select the same
# column twice in the per-month queries.
cols = [c for c in cols if 'date' not in c and c != 'simple_return']


In [None]:
#==============================================================================
# iteration
#==============================================================================
class Reg(object):
    '''Single-predictor OLS regression of ``simple_return`` on one column.

    The predictor is wrapped in ``C(...)`` (treated as categorical) when
    ``_chk_cat`` flags it, otherwise it enters the formula as-is. The model is
    fitted on construction; rows with missing values are dropped.

    Attributes:
        categorical: 1 if the predictor was treated as categorical, else 0.
        model: the fitted statsmodels regression results.
    '''

    def __init__(self, df, col):
        '''Fit ``simple_return ~ col`` on `df`.

        Args:
            df: DataFrame containing ``simple_return`` and `col`.
            col: name of the predictor column.
        '''
        self.categorical = self._chk_cat(df, col)
        if self.categorical:
            formula = 'simple_return ~ C(%s)' % col
        else:
            formula = 'simple_return ~ %s' % col
        # Fit right away: p-values, R-squared, params and confidence intervals
        # only exist on the fitted results, not on the unfitted model.
        self.model = smf.ols(formula=formula, data=df, missing='drop').fit()

    def _chk_cat(self, df, col, max_cat=20):
        '''check if column is categorical

        A column counts as categorical when it has fewer than `max_cat`
        distinct values or holds text/object data. Returns 1 or 0.
        '''
        series = df[col]
        few_levels = len(pd.unique(series)) < max_cat
        # pandas stores text as object (or a dedicated string dtype), never as
        # the Python `str` type, so test the dtype family instead.
        textual = (pd.api.types.is_object_dtype(series)
                   or pd.api.types.is_string_dtype(series))
        return 1 if (few_levels or textual) else 0

    def get_outputs(self):
        '''Return per-term regression statistics, excluding the intercept.

        Returns:
            DataFrame indexed by regression term with columns ``pvalue``,
            ``rsquared`` (the model R-squared, repeated on every row),
            ``coef``, ``conf_low`` and ``conf_high``.
        '''
        conf = self.model.conf_int()  # column 0 = lower bound, 1 = upper bound
        outputs = pd.DataFrame(
            {
                'pvalue': self.model.pvalues,
                'rsquared': self.model.rsquared,
                'coef': self.model.params,
                'conf_low': conf[0],
                'conf_high': conf[1],
            },
            columns=['pvalue', 'rsquared', 'coef', 'conf_low', 'conf_high'],
        )
        # 'Intercept' is a row label (a regression term), not a column
        return outputs.loc[outputs.index != 'Intercept']

    def get_model_out(self):
        '''Return a deep copy of the fitted results held in ``self.model``.'''
        return copy.deepcopy(self.model)

def _monthly_regressions(col, term, n_months):
    '''Regress simple_return on `col` for every loan age 1..n_months of a term.

    Args:
        col: predictor column name.
        term: loan term used in the query filter (36 or 60).
        n_months: number of loan-age months to iterate over.

    Returns:
        (rows, last_reg). `rows` is a DataFrame with one row per regression
        term and (month, var) MultiIndex columns; statistics that do not exist
        for a given month are NaN. `last_reg` is the Reg built for the last
        month that returned data. Both are None when no month returned rows.
    '''
    by_month = {}
    last_reg = None
    for m in range(1, n_months + 1):
        stmt = "select simple_return,%s from p2p_lending where age_in_months=%s and term=%s" %(col,m,term)
        df = pd.read_sql(stmt, con=eng)
        if df.empty:
            # nothing to fit for this loan age
            continue
        last_reg = Reg(df, col)
        by_month[m] = last_reg.get_outputs()
    if not by_month:
        return None, None
    months = sorted(by_month)
    # Side-by-side join keyed by month: rows align on the regression-term
    # labels, and gaps stay NaN (a 0 placeholder would rank as the best p-value)
    rows = pd.concat([by_month[m] for m in months], axis=1,
                     keys=months, names=['month', 'var'])
    return rows, last_reg


frames3, frames6 = [], []
for col in cols:
    rows3, reg3 = _monthly_regressions(col, 36, 36)
    if rows3 is not None:
        frames3.append(rows3)
        raw_out3[col] = reg3.get_model_out()
    rows6, reg6 = _monthly_regressions(col, 60, 60)
    if rows6 is not None:
        frames6.append(rows6)
        raw_out6[col] = reg6.get_model_out()

# DataFrame.append returned a new frame (its result was being discarded), so
# the collected rows are concatenated once and assigned back. The empty
# regouts frames go first so their (month, var) columns are always present;
# astype(float) undoes any object upcast caused by the empty template.
regouts3 = pd.concat([regouts3] + frames3).astype(float)
regouts6 = pd.concat([regouts6] + frames6).astype(float)

In [None]:
        
#==============================================================================
# analyze results
#==============================================================================
#top p-values
#end of month
# Rows of regouts3 / regouts6 are regression terms; columns are (month, var).
pval_sorted3 = regouts3.sort_values(by=[(36,'pvalue')])
print('End of term top fields (by pvalue)')
print(pval_sorted3.head(30))

pval_sorted6 = regouts6.sort_values(by=[(60,'pvalue')])
print('End of term top fields (by pvalue)')
print(pval_sorted6.head(30))

#all months
# Count, across months, how often each row label lands among the 50 smallest
# p-values of that month.
topp3 = Counter()
for month in range(1,37):
    pval_sorted3 = regouts3.sort_values(by=[(month,'pvalue')])
    topp3.update(pval_sorted3.head(50).index)
print(topp3)

topp6 = Counter()
for month in range(1,61):
    pval_sorted6 = regouts6.sort_values(by=[(month,'pvalue')])
    topp6.update(pval_sorted6.head(50).index)
print(topp6)

#top R-squared
# regouts[month] selects one month and drops the month level, so the sort key
# is the bare 'rsquared' column. The result is printed because an expression
# statement inside a loop displays nothing.
for month in range(1,37):
    print(regouts3[month].sort_values(by='rsquared', ascending=False).head(30))
for month in range(1,61):
    print(regouts6[month].sort_values(by='rsquared', ascending=False).head(30))

In [None]:
#==============================================================================
# Manually inspect top 100 pvalue columns
#==============================================================================
# `topp` is never defined; the p-value counters are topp3 (36-month term) and
# topp6 (60-month term), so show the 50 most common entries of each.
print(topp3.most_common(50))
print(topp6.most_common(50))

In [None]:
#==============================================================================
# choose top 50 for regression analysis based on R-squared, pval, and coefficient
#==============================================================================
#delete misfit fields