In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
import itertools
import scipy.stats as scs
import statsmodels.api as sm

In [2]:
from get_clean_data_DTR import get_data, clean_data, get_feats, get_target

In [3]:
def get_clean_feats(year):
    """Build the regression feature matrix and target for one plan year.

    Loads the raw data with ``get_data``, cleans it with ``clean_data``,
    keeps four base columns from ``get_feats`` and appends three interaction
    columns formed as ``eir / 100`` times each dollar-amount column.

    Parameters
    ----------
    year : int
        Year passed to the data helpers and used to build the year-suffixed
        column names (e.g. ``fndng_tgt_2014``).

    Returns
    -------
    X : pandas.DataFrame
        Columns ``eir``, ``fndng_tgt_<year>``, ``tgt_nrml_cost_<year>``,
        ``pmts_to_part_<year>``, ``eir_ft``, ``eir_tnc`` and ``eir_pmt``.
    y
        Whatever ``get_target(df, year)`` returns for the cleaned frame.
    """
    ft_col = 'fndng_tgt_{}'.format(year)
    tnc_col = 'tgt_nrml_cost_{}'.format(year)
    pmt_col = 'pmts_to_part_{}'.format(year)

    prelim_df = get_data(year)
    df = clean_data(prelim_df, year)
    feats = get_feats(df, year)

    # Take an explicit copy: the column assignments below must write to an
    # independent frame, not to a slice of `feats` (chained assignment).
    X = feats[['eir', ft_col, tnc_col, pmt_col]].copy()

    # eir looks like a percentage (e.g. 6.67 in the displayed frame), so it is
    # divided by 100 before being multiplied into the dollar amounts.
    X['eir_ft'] = X['eir']/100 * X[ft_col]
    X['eir_tnc'] = X['eir']/100 * X[tnc_col]
    X['eir_pmt'] = X['eir']/100 * X[pmt_col]

    y = get_target(df, year)
    return X, y
def get_clean_feats_75(year):
    """Build features and target for plans below the 75th-percentile funding target.

    Same pipeline as ``get_clean_feats``, except that the raw frame is first
    restricted to rows whose ``fndng_tgt_<year>`` is strictly below that
    column's 0.75 quantile (computed on the raw, uncleaned data).

    Parameters
    ----------
    year : int
        Year passed to the data helpers and used to build the year-suffixed
        column names.

    Returns
    -------
    X : pandas.DataFrame
        Columns ``eir``, ``fndng_tgt_<year>``, ``tgt_nrml_cost_<year>``,
        ``pmts_to_part_<year>``, ``eir_ft``, ``eir_tnc`` and ``eir_pmt``.
    y
        Whatever ``get_target(df, year)`` returns for the cleaned frame.
    """
    ft_col = 'fndng_tgt_{}'.format(year)
    tnc_col = 'tgt_nrml_cost_{}'.format(year)
    pmt_col = 'pmts_to_part_{}'.format(year)

    prelim_df = get_data(year)
    # Drop the largest plans before cleaning: keep rows strictly below the
    # 75th percentile of the funding target.
    prelim_df = prelim_df[prelim_df[ft_col] < prelim_df[ft_col].quantile(.75)]
    df = clean_data(prelim_df, year)
    feats = get_feats(df, year)

    # Explicit copy so the new columns are written to an independent frame
    # rather than to a slice of `feats`.
    X = feats[['eir', ft_col, tnc_col, pmt_col]].copy()

    # Scale eir by 1/100 exactly as get_clean_feats does, so the interaction
    # columns of the two feature matrices are on the same scale.
    X['eir_ft'] = X['eir']/100 * X[ft_col]
    X['eir_tnc'] = X['eir']/100 * X[tnc_col]
    X['eir_pmt'] = X['eir']/100 * X[pmt_col]

    y = get_target(df, year)
    return X, y

<h3>Features:</h3>
    <ul><li>Prior year funding target</li>
        <li>Prior year target normal cost (expected growth in funding target)</li>
        <li>Prior year benefits paid</li>
        <li>Prior year effective interest rate (single effective rate that approximates the effect of the 3-tiered rates actually used to value the funding target)</li>
    </ul>

<h3>Interaction Features: Interest on FT, TNC, Pmts</h3>

In [4]:
# Build the 2014 feature matrix X and target y.
X, y = get_clean_feats(2014)

<h3>Isolate bottom 75% of data (exclude top 25% by plan size)</h3>

In [None]:
# 2014 features/target restricted to rows below the 75th-percentile funding target.
X75, y75 = get_clean_feats_75(2014)

<h2>Fit a Linear Regression</h2>
<br>Use K-Fold Cross Validation

In [5]:
def summary_model(X, y, label='scatter'):
    """Fit an ordinary-least-squares model of ``y`` on ``X`` and return its summary.

    A constant (intercept) column is added to ``X`` with ``sm.add_constant``
    before fitting.

    Parameters
    ----------
    X : feature matrix accepted by ``sm.add_constant`` / ``sm.OLS``.
    y : regression target accepted by ``sm.OLS``.
    label : str, optional
        Not used by the function body; retained so existing calls keep working.

    Returns
    -------
    The object produced by ``.summary()`` on the fitted statsmodels results.
    """
    design = sm.add_constant(X)
    fitted = sm.OLS(y, design).fit()
    return fitted.summary()

In [6]:
# OLS summary using every column of X (base features plus interaction features).
summary_model(X,y)

0,1,2,3
Dep. Variable:,fndng_tgt_2015,R-squared:,0.979
Model:,OLS,Adj. R-squared:,0.979
Method:,Least Squares,F-statistic:,292100.0
Date:,"Tue, 27 Nov 2018",Prob (F-statistic):,0.0
Time:,14:01:07,Log-Likelihood:,-907770.0
No. Observations:,44492,AIC:,1816000.0
Df Residuals:,44484,BIC:,1816000.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.97e+07,3.14e+07,-1.262,0.207,-1.01e+08,2.19e+07
eir,6.532e+06,4.86e+06,1.345,0.179,-2.99e+06,1.61e+07
fndng_tgt_2014,1.2851,0.077,16.625,0.000,1.134,1.437
tgt_nrml_cost_2014,-1.3457,1.164,-1.156,0.248,-3.628,0.936
pmts_to_part_2014,-1.8582,0.624,-2.979,0.003,-3.081,-0.636
eir_ft,-3.5166,1.192,-2.951,0.003,-5.852,-1.181
eir_tnc,49.7560,17.981,2.767,0.006,14.513,84.999
eir_pmt,16.5571,9.633,1.719,0.086,-2.324,35.438

0,1,2,3
Omnibus:,140460.63,Durbin-Watson:,0.227
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16823176717.394
Skew:,50.783,Prob(JB):,0.0
Kurtosis:,3013.729,Cond. No.,45300000000.0


<h3>Results indicate strong multicollinearity (condition number ≈ 4.5e10) - drop the non-interaction features</h3>

In [7]:
# Preview only the first rows instead of displaying the whole frame.
X.head(10)

Unnamed: 0,eir,fndng_tgt_2014,tgt_nrml_cost_2014,pmts_to_part_2014,eir_ft,eir_tnc,eir_pmt
0,6.67,3.618638e+06,229096.0,1217861.0,2.413632e+05,1.528070e+04,8.123133e+04
1,6.67,3.618638e+06,229096.0,1217861.0,2.413632e+05,1.528070e+04,8.123133e+04
2,6.67,3.618638e+06,229096.0,1217861.0,2.413632e+05,1.528070e+04,8.123133e+04
3,6.67,3.618638e+06,229096.0,1217861.0,2.413632e+05,1.528070e+04,8.123133e+04
4,6.67,3.618638e+06,229096.0,1217861.0,2.413632e+05,1.528070e+04,8.123133e+04
5,6.67,3.618638e+06,229096.0,1217861.0,2.413632e+05,1.528070e+04,8.123133e+04
6,6.67,3.618638e+06,229096.0,1217861.0,2.413632e+05,1.528070e+04,8.123133e+04
7,6.67,3.618638e+06,229096.0,1217861.0,2.413632e+05,1.528070e+04,8.123133e+04
8,6.67,3.618638e+06,229096.0,1217861.0,2.413632e+05,1.528070e+04,8.123133e+04
9,6.55,1.197172e+07,734485.0,610404.0,7.841477e+05,4.810877e+04,3.998146e+04


In [8]:
# Reduced design matrix: keep only the three interest-interaction columns.
X_small = X.loc[:, ['eir_ft', 'eir_tnc', 'eir_pmt']]

In [10]:
# OLS summary for the reduced model (interaction features only).
summary_model(X_small, y)

0,1,2,3
Dep. Variable:,fndng_tgt_2015,R-squared:,0.977
Model:,OLS,Adj. R-squared:,0.977
Method:,Least Squares,F-statistic:,637100.0
Date:,"Tue, 27 Nov 2018",Prob (F-statistic):,0.0
Time:,14:01:55,Log-Likelihood:,-909240.0
No. Observations:,44492,AIC:,1818000.0
Df Residuals:,44488,BIC:,1819000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.433e+06,8.85e+05,2.750,0.006,6.99e+05,4.17e+06
eir_ft,16.0132,0.045,359.628,0.000,15.926,16.100
eir_tnc,23.0121,0.856,26.894,0.000,21.335,24.689
eir_pmt,-6.2354,0.448,-13.905,0.000,-7.114,-5.357

0,1,2,3
Omnibus:,136010.447,Durbin-Watson:,0.227
Prob(Omnibus):,0.0,Jarque-Bera (JB):,12924194642.831
Skew:,46.405,Prob(JB):,0.0
Kurtosis:,2641.748,Cond. No.,78600000.0


In [None]:
# Shared colour cycle: every call to plot_results consumes the next colour,
# so successive plots are drawn in cyan, magenta, yellow, cyan, ...
colors = itertools.cycle(["c", "m", "y"])
def plot_results(y_true, y_pred):
    """Scatter-plot predicted values against actual values.

    Parameters
    ----------
    y_true : array-like
        Actual target values; plotted on the x-axis.
    y_pred : array-like
        Predicted target values; plotted on the y-axis.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(20,8))
    plt.scatter(y_true, y_pred, s=20, color=next(colors), label="data")
    # Label each axis with the quantity it actually carries so the plot can
    # be read without looking at the call site.
    plt.xlabel("Actual Funding Target")
    plt.ylabel("Predicted Funding Target")
    plt.title("Predicting Funding Target from Linear Regression trained on 2014 data")
    plt.legend()
    plt.show()

def plot_resid(y_true, y_pred):
    """Scatter-plot the residuals ``y_true - y_pred`` against observation position.

    Parameters
    ----------
    y_true : array-like
        Actual target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    resid = y_true - y_pred

    plt.figure(figsize=(20,8))
    # Integer positions 0..n-1, one per observation (linspace(0, n, num=n)
    # would stretch the positions to non-integer values ending at n).
    xx = np.arange(len(y_true))
    plt.scatter(xx, resid, s=20, c="blue", label="residuals")
    plt.xlabel("data")
    plt.ylabel("residual")
    plt.title("Residuals of Predictions vs Actuals (Linear Regression)")
    plt.legend()
    plt.show()

<h1>Test on 2015 Data</h1>

In [None]:
# Build the 2015 feature matrix and target.
X15, y15 = get_clean_feats(2015)

In [None]:
# Number of rows in the 2015 feature matrix.
len(X15)

In [None]:
# NOTE(review): `linreg` is not defined in any cell visible in this notebook (only a
# statsmodels OLS is fitted above). Unless it is created in a cell not shown here, this
# will presumably raise NameError on Restart & Run All — restore the fitting cell.
linreg.score(X15,y15)

In [None]:
# Predictions from `linreg` for the 2015 features.
yhat_15 = linreg.predict(X15)

In [None]:
# Scatter of 2015 targets (x-axis) against their predictions (y-axis).
plot_results(y15, yhat_15)

In [None]:
# Residuals (target minus prediction) for the 2015 data.
plot_resid(y15, yhat_15)

<h1>Test on 2016 Data</h1>

In [None]:
# Build the 2016 feature matrix and target.
X16, y16 = get_clean_feats(2016)

In [None]:
# Number of rows in the 2016 feature matrix.
len(X16)

In [None]:
# Score of `linreg` on the 2016 features and target.
linreg.score(X16,y16)

In [None]:
# Predictions from `linreg` for the 2016 features.
yhat_16 = linreg.predict(X16)

In [None]:
# Scatter of 2016 targets (x-axis) against their predictions (y-axis).
plot_results(y16,yhat_16)

In [None]:
# Residuals (target minus prediction) for the 2016 data.
plot_resid(y16, yhat_16)

<h1>Test on 2017 Data</h1>

In [None]:
# Build the 2017 feature matrix and target.
X17,y17 = get_clean_feats(2017)

In [None]:
# Preview only the first rows instead of displaying the whole frame.
X17.head(10)

In [None]:
# Score of `linreg` on the 2017 features and target.
linreg.score(X17,y17)

In [None]:
# Predictions from `linreg` for the 2017 features.
yhat_17 = linreg.predict(X17)