## Regressions

In this code I run the regression: 
    simple_return ~ age_in_months + borrower_rate + payment + C(term) + %s

Where %s is replaced, one at a time, by each of the other variables. The output, simple_regressions.csv, records for each substituted variable the p-value, R-squared, coefficient, and the lower (2.5%) and upper (97.5%) bounds of the 95% confidence interval. (In this notebook the rows are appended to the `simple_regressions` database table.)

In [1]:
import random
import os
import copy
import pickle as pk
from collections import Counter
import itertools as it
import sqlalchemy as sql
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm

#### DB Connection

In [3]:
# Connection settings for the local PostgreSQL database.
# Credentials are taken from the P2P_DB_CREDS environment variable so they do
# not have to be stored in the notebook; the placeholder is used only when the
# variable is unset.
creds   = os.environ.get('P2P_DB_CREDS', '<creds>')
ip      = '127.0.0.1'
port    = 5432
dbase   = 'p2p_lending'
# SQLAlchemy URL of the form postgresql://<creds>@<host>:<port>/<database>
constring = 'postgresql://%s@%s:%d/%s' %(creds, ip, port, dbase)
eng = sql.create_engine(constring)   # engine, passed to pd.read_sql below
con = eng.connect()                  # open connection, used for raw statements

#### Check number of unique values in each column

In [4]:
# Fetch a single row purely to learn the column names of merged_train
sample_row = pd.read_sql('select * from merged_train limit 1', con=eng)
cols = sample_row.columns

In [5]:
# Leave out of the cardinality check every column whose name contains 'date',
# plus the explicitly listed fields.
date_like = [name for name in cols if 'date' in name]
not_profiled = ['member_key', 'loan_number', 'listing_number',
                'amount_funded', 'borrower_rate', 'late_fees_paid',
                'principal_balance', 'service_fees_paid',
                'interest_paid', 'principal_paid', 'simple_return',
                'next_payment_due_amount', 'prosper_rating']
cols = cols.difference(date_like + not_profiled)

In [5]:
# Number of distinct values per column, keyed by column name.
# The dtype is given explicitly because the default dtype of an empty Series
# differs between pandas releases (and newer ones warn when it is omitted).
unique_counts = pd.Series(dtype='int64')

In [6]:
# Count the distinct values of every remaining column, one query per column,
# and store the result in unique_counts under the column's name.
for i,col in enumerate(cols):
    # Inner query reduces the column to its distinct values; the outer query
    # counts the resulting rows.
    stmt = ('with outpt as (select distinct %s from merged_train) ' %col +
            'select count(*) from outpt')
    cur = con.execute(stmt)
    val = cur.fetchone()[0]
    unique_counts[col] = val
    # Single-argument print(...) gives the same text on Python 2 and Python 3
    # (the two spaces reproduce the separator the print statement inserted).
    print('%d %s:  %s' %(i, col, val))

0 age_in_months:  106
1 ale001:  36
2 ale002:  37
3 ale005:  36
4 ale007:  8
5 ale022:  11
6 ale023:  14
7 ale026:  8
8 ale071:  36
9 ale074:  7
10 ale075:  10
11 ale076:  7
12 ale077:  5
13 ale078:  10
14 ale080:  4
15 ale081:  10
16 ale084:  10
17 ale403:  127
18 ale501:  11
19 ale502:  10
20 ale503:  11
21 ale601:  10
22 ale720:  87
23 ale724:  86
24 ale740:  86
25 ale801:  12
26 ale804:  8
27 ale901:  52
28 ale903:  51
29 ale904:  46
30 ale905:  44
31 ale906:  44
32 ale908:  51
33 all001:  111
34 all002:  116
35 all003:  61
36 all005:  104
37 all006:  56
38 all007:  61
39 all010:  59
40 all021:  20
41 all022:  32
42 all023:  42
43 all024:  63
44 all026:  57
45 all051:  19
46 all052:  9
47 all062:  61
48 all064:  19
49 all067:  19
50 all071:  103
51 all074:  17
52 all075:  36
53 all076:  20
54 all077:  14
55 all078:  35
56 all080:  23
57 all081:  36
58 all082:  17
59 all084:  35
60 all085:  17
61 all086:  20
62 all090:  30
63 all091:  30
64 all092:  30
65 all101:  59
66 all102:  63


In [13]:
# Raise the row-display cap to 550 so long listings are not truncated,
# then show the counts ordered by column name.
pd.set_option('display.max_rows', 550)
unique_counts.sort_index()

age_in_months                                          106
ale001                                                  36
ale002                                                  37
ale005                                                  36
ale007                                                   8
ale022                                                  11
ale023                                                  14
ale026                                                   8
ale071                                                  36
ale074                                                   7
ale075                                                  10
ale076                                                   7
ale077                                                   5
ale078                                                  10
ale080                                                   4
ale081                                                  10
ale084                                                  

#### Run regressions

In [20]:
# Candidate predictors to substitute into the regression: every column of
# merged_train except those whose name contains 'date' and those listed in
# no_use_cols.
all_columns = pd.read_sql('select * from merged_train limit 1', con=eng).columns
no_use_cols = ['loan_number', 'borrower_rate', 'term', 'amount_borrowed',
               'age_in_months', 'simple_return', 'analysis_class',
               'listing_number', 'member_key', 'first_recorded_credit_line',
               'interest_paid', 'principal_paid', 'listing_term',
               'prosper_rating']
cols = [name for name in all_columns
        if not ('date' in name or name in no_use_cols)]

### Test Run

In [4]:
# Trial run of the regression on a single categorical predictor.
col = 'is_homeowner'

# Pull only the columns the model uses; rows with term = 12 are excluded.
stmt = 'select simple_return, age_in_months,'
stmt += 'borrower_rate, payment, term, %s from merged_train where term<>12' % col
df = pd.read_sql(stmt, con)

# The predictor is wrapped in C(...) so it is treated as categorical.
formula = 'simple_return ~ age_in_months + '
formula += 'borrower_rate + payment + C(term) + C(%s)' % col

# missing='drop' discards rows with missing values before fitting.
reg = smf.ols(formula=formula, data=df, missing='drop')
model = reg.fit()

In [5]:
# Display the full OLS summary of the trial regression fitted above
model.summary()

0,1,2,3
Dep. Variable:,simple_return,R-squared:,0.323
Model:,OLS,Adj. R-squared:,0.323
Method:,Least Squares,F-statistic:,22000.0
Date:,"Mon, 31 Oct 2016",Prob (F-statistic):,0.0
Time:,04:29:04,Log-Likelihood:,-76127.0
No. Observations:,231070,AIC:,152300.0
Df Residuals:,231064,BIC:,152300.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.3746,0.003,148.524,0.000,0.370 0.380
C(term)[T.60],-0.0973,0.002,-62.793,0.000,-0.100 -0.094
C(is_homeowner)[T.True],0.0252,0.001,17.443,0.000,0.022 0.028
age_in_months,0.0089,3.16e-05,282.599,0.000,0.009 0.009
borrower_rate,0.7395,0.012,60.645,0.000,0.716 0.763
payment,-6.206e-05,3.06e-06,-20.310,0.000,-6.81e-05 -5.61e-05

0,1,2,3
Omnibus:,346.026,Durbin-Watson:,1.385
Prob(Omnibus):,0.0,Jarque-Bera (JB):,374.878
Skew:,-0.064,Prob(JB):,3.9499999999999995e-82
Kurtosis:,3.15,Cond. No.,7570.0


In [11]:
# Isolate the p-value(s) of the substituted variable by removing the terms
# that appear in every regression.
pvals = model.pvalues
base_terms = ['Intercept', 'C(term)[T.60]', 'age_in_months',
              'borrower_rate', 'payment']
column = pvals.index.difference(base_terms)
pvals[column]

C(is_homeowner)[T.True]    4.285614e-68
dtype: float64

In [7]:
# Gather the per-term statistics of the fitted model into one table,
# indexed by regression term.
conf_bounds = model.conf_int()   # column 0 = lower bound, column 1 = upper bound
outpt = pd.DataFrame(
    {'pvalues': model.pvalues,
     'rsquared': model.rsquared,   # scalar, repeated on every row
     'coef': model.params,
     'conf_low': conf_bounds[0],
     'conf_high': conf_bounds[1]},
    columns=['pvalues', 'rsquared', 'coef', 'conf_low', 'conf_high'])

In [16]:
# Show only the rows belonging to the substituted variable.
# .loc is the explicit label-based indexer; .ix is deprecated and no longer
# exists in current pandas.
outpt.loc[column,:]

Unnamed: 0,pvalues,rsquared,coef,conf_low,conf_high
C(is_homeowner)[T.True],4.2856140000000006e-68,0.322526,0.025155,0.022328,0.027981


### Run Regressions Iteratively

In [44]:
# Look up how the current column is classified (e.g. 'categorical') in the
# column_category table.
cat_stmt = "select category from column_category where name='%s'" % col
cat_row = con.execute(cat_stmt).fetchone()
cat = cat_row[0]

In [45]:
# Display the category that was looked up for the current column
cat

u'categorical'

In [67]:
# Index of the regression terms that are not in the fixed list below,
# i.e. the terms contributed by the substituted variable.
shared_terms = ['Intercept', 'C(term)[T.60]', 'age_in_months',
                'borrower_rate', 'payment', 'prosper_rating']
column = pvals.index.difference(shared_terms)

In [6]:
# Last-minute fix: drop columns with too many categories (they cause memory errors)
high_cardinality = ['borrower_city']
cols = [name for name in cols if name not in high_cardinality]

In [18]:
# Fit one regression per candidate column and append the statistics of the
# substituted variable to the simple_regressions table.

# Terms that are not reported; every other term belongs to the substituted variable.
base_terms = ['Intercept','C(term)[T.60]','age_in_months','borrower_rate',
              'payment','prosper_rating']

for col in cols:
    print(col)
    #get category
    cat_stmt = "select category from column_category where name='%s'" %col
    cat_row = con.execute(cat_stmt).fetchone()
    if cat_row is None:
        # fail with a clear message instead of an opaque TypeError on None[0]
        raise ValueError('no entry in column_category for column %s' %col)
    cat = cat_row[0]
    if 'date' in cat:
        # date columns are not regressed; skip before pulling any data
        continue

    # categorical predictors are wrapped in C(...) so they are dummy-coded
    if cat == 'categorical':
        term = 'C(%s)' %col
    else:
        term = col
    formula = ('simple_return ~ age_in_months + ' +
               'borrower_rate + payment + C(term) + %s' %term)

    # pull only the columns the model uses; rows with term = 12 are excluded
    stmt = ('select simple_return, age_in_months,' +
            'borrower_rate, payment, term, %s from merged_train where term<>12' %col)
    df = pd.read_sql(stmt, con)

    #run regression (rows with missing values are dropped)
    reg = smf.ols(formula=formula, data=df, missing='drop')
    model = reg.fit()
    pvals = model.pvalues

    #get results
    conf = model.conf_int()   # computed once: column 0 = low, column 1 = high
    outpt = pd.DataFrame()
    outpt['pvalues'] = pvals
    outpt['rsquared'] = model.rsquared
    outpt['coef'] = model.params
    outpt['conf_low'] = conf[0]
    outpt['conf_high'] = conf[1]
    columns = pvals.index.difference(base_terms)
    # .loc is the label-based replacement for the deprecated .ix
    output = outpt.loc[columns,:]

    #write results
    output.to_sql('simple_regressions', con=con, if_exists='append',index=True, index_label='field')
    #print results
    print(pvals[columns])

days_past_due
days_past_due    0.0
dtype: float64
principal_balance
principal_balance    0.0
dtype: float64
service_fees_paid
service_fees_paid    0.0
dtype: float64
prosper_fees_paid
prosper_fees_paid    5.041465e-33
dtype: float64
late_fees_paid
late_fees_paid    7.659300e-70
dtype: float64
debt_sale_proceeds_received
debt_sale_proceeds_received    2.542908e-54
dtype: float64
loan_status
C(loan_status)[T.2]     0.000000e+00
C(loan_status)[T.3]     0.000000e+00
C(loan_status)[T.4]     0.000000e+00
C(loan_status)[T.6]    2.991768e-287
dtype: float64
loan_status_description
C(loan_status_description)[T.CHARGEOFF]     0.000000e+00
C(loan_status_description)[T.COMPLETED]     1.704665e-04
C(loan_status_description)[T.CURRENT]      2.991768e-287
C(loan_status_description)[T.DEFAULTED]     0.000000e+00
dtype: float64
loan_default_reason
C(loan_default_reason)[T.3.0]    1.309634e-24
C(loan_default_reason)[T.4.0]    3.029138e-12
C(loan_default_reason)[T.5.0]    0.000000e+00
C(loan_default_reas

MemoryError: 

I forgot to remove the 'borrower_city' field, which uses too much memory.

I continue after this index.

In [21]:
# Position of 'borrower_city' in cols; the run is resumed from the next position.
# NOTE(review): list.index raises ValueError when the value is absent, so this
# only works while cols still contains 'borrower_city' (i.e. the cell that
# filters it out has not been run in this kernel) — confirm execution order.
cols.index('borrower_city')

39

In [22]:
# Resume the regressions with the columns that come after 'borrower_city',
# appending the statistics of each substituted variable to simple_regressions.
# If 'borrower_city' is no longer in cols there is no resume point; presumably
# the earlier loop then covered every column, so nothing is re-run here
# (this avoids a ValueError and duplicate rows in the table).
if 'borrower_city' in cols:
    remaining = cols[cols.index('borrower_city')+1:]
else:
    remaining = []

# Terms that are not reported; every other term belongs to the substituted variable.
base_terms = ['Intercept','C(term)[T.60]','age_in_months','borrower_rate',
              'payment','prosper_rating']

for col in remaining:
    print(col)
    #get category
    cat_stmt = "select category from column_category where name='%s'" %col
    cat_row = con.execute(cat_stmt).fetchone()
    if cat_row is None:
        # fail with a clear message instead of an opaque TypeError on None[0]
        raise ValueError('no entry in column_category for column %s' %col)
    cat = cat_row[0]
    if 'date' in cat:
        # date columns are not regressed; skip before pulling any data
        continue

    # categorical predictors are wrapped in C(...) so they are dummy-coded
    if cat == 'categorical':
        term = 'C(%s)' %col
    else:
        term = col
    formula = ('simple_return ~ age_in_months + ' +
               'borrower_rate + payment + C(term) + %s' %term)

    # pull only the columns the model uses; rows with term = 12 are excluded
    stmt = ('select simple_return, age_in_months,' +
            'borrower_rate, payment, term, %s from merged_train where term<>12' %col)
    df = pd.read_sql(stmt, con)

    #run regression (rows with missing values are dropped)
    reg = smf.ols(formula=formula, data=df, missing='drop')
    model = reg.fit()
    pvals = model.pvalues

    #get results
    conf = model.conf_int()   # computed once: column 0 = low, column 1 = high
    outpt = pd.DataFrame()
    outpt['pvalues'] = pvals
    outpt['rsquared'] = model.rsquared
    outpt['coef'] = model.params
    outpt['conf_low'] = conf[0]
    outpt['conf_high'] = conf[1]
    columns = pvals.index.difference(base_terms)
    # .loc is the label-based replacement for the deprecated .ix
    output = outpt.loc[columns,:]

    #write results
    output.to_sql('simple_regressions', con=con, if_exists='append',index=True, index_label='field')
    #print results
    print(pvals[columns])

prior_prosper_loans_active
prior_prosper_loans_active    0.000941
dtype: float64
prior_prosper_loans
prior_prosper_loans    1.951761e-91
dtype: float64
prior_prosper_loans_principal_borrowed
prior_prosper_loans_principal_borrowed    0.000009
dtype: float64
prior_prosper_loans_principal_outstanding
prior_prosper_loans_principal_outstanding    4.166028e-76
dtype: float64
prior_prosper_loans_balance_outstanding
prior_prosper_loans_balance_outstanding    8.940177e-76
dtype: float64
prior_prosper_loans_cycles_billed
prior_prosper_loans_cycles_billed    1.081401e-65
dtype: float64
prior_prosper_loans_ontime_payments
prior_prosper_loans_ontime_payments    1.970251e-64
dtype: float64
prior_prosper_loans_late_cycles
prior_prosper_loans_late_cycles    0.000032
dtype: float64
prior_prosper_loans_late_payments_one_month_plus
prior_prosper_loans_late_payments_one_month_plus    0.128805
dtype: float64
max_prior_prosper_loan
max_prior_prosper_loan    6.915650e-65
dtype: float64
min_prior_prosper_loan