In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.special as spec
import scipy.stats as st
import statsmodels.formula.api as smf

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

%matplotlib inline

In [2]:
data_dir = "https://raw.githubusercontent.com/chirayukong/infsci2725-spring-2018/master/assignment-03/Retention.txt"
df = pd.read_table(data_dir, sep="\t")
df.head()

Unnamed: 0,spend,apret,top10,rejr,tstsc,pacc,strat,salar
0,9855,52.5,15,29.474,65.063,36.887,12.0,60800
1,10527,64.25,36,22.309,71.063,30.97,12.8,63900
2,7904,37.75,26,25.853,60.75,41.985,20.3,57800
3,6601,57.0,23,11.296,67.188,40.289,17.0,51200
4,7251,62.0,17,22.635,56.25,46.78,18.1,48000


In [3]:
df.describe(include='all')

Unnamed: 0,spend,apret,top10,rejr,tstsc,pacc,strat,salar
count,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0
mean,10974.511765,56.721076,38.458824,30.654218,66.164165,43.173106,16.086471,61357.647059
std,5500.06558,18.077097,23.406393,17.098104,6.975306,13.105195,4.006503,9802.786457
min,4125.0,18.75,8.0,0.0,48.125,8.964,7.2,38640.0
25%,7371.75,45.37475,22.0,19.171,61.111,33.90375,13.4,54650.0
50%,9265.0,55.7085,30.0,27.3905,64.7815,40.8505,16.0,61150.0
75%,12838.0,68.6875,49.5,36.8075,70.45325,51.77325,18.575,67100.0
max,35863.0,95.25,98.0,84.067,87.5,76.253,29.2,87900.0


In [4]:
df.median()

spend     9265.0000
apret       55.7085
top10       30.0000
rejr        27.3905
tstsc       64.7815
pacc        40.8505
strat       16.0000
salar    61150.0000
dtype: float64

In [5]:
lm = smf.ols(formula='apret ~ tstsc + pacc	+ strat + tstsc*pacc', data=df).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:                  apret   R-squared:                       0.659
Model:                            OLS   Adj. R-squared:                  0.650
Method:                 Least Squares   F-statistic:                     79.58
Date:                Thu, 05 Apr 2018   Prob (F-statistic):           1.72e-37
Time:                        19:10:53   Log-Likelihood:                -641.46
No. Observations:                 170   AIC:                             1293.
Df Residuals:                     165   BIC:                             1309.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -4.4215     26.205     -0.169      0.8

# Stacking

In [6]:
X = df[['tstsc', 'pacc', 	'strat']]
X['tstsc_pacc'] = df['tstsc'] * df['pacc']
lm = smf.ols(formula='tstsc_pacc ~ tstsc*pacc', data=X).fit()
print(lm.summary())
X.head()

                            OLS Regression Results                            
Dep. Variable:             tstsc_pacc   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.013e+31
Date:                Thu, 05 Apr 2018   Prob (F-statistic):               0.00
Time:                        19:10:53   Log-Likelihood:                 4336.5
No. Observations:                 170   AIC:                            -8665.
Df Residuals:                     166   BIC:                            -8652.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   3.865e-12   4.93e-12      0.784      0.4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,tstsc,pacc,strat,tstsc_pacc
0,65.063,36.887,12.0,2399.978881
1,71.063,30.97,12.8,2200.82111
2,60.75,41.985,20.3,2550.58875
3,67.188,40.289,17.0,2706.937332
4,56.25,46.78,18.1,2631.375


In [7]:
y = df['apret']
y.head()

0    52.50
1    64.25
2    37.75
3    57.00
4    62.00
Name: apret, dtype: float64

In [8]:
lr = LinearRegression()
lr.fit(X, y)
y_LR = lr.predict(X)
diff = y - y_LR
diff.describe()

count    1.700000e+02
mean    -6.185901e-15
std      1.056217e+01
min     -2.528376e+01
25%     -6.321386e+00
50%      2.216940e-02
75%      6.503852e+00
max      2.598902e+01
Name: apret, dtype: float64

In [9]:
rbf = SVR(kernel='rbf')
rbf.fit(X, y)
y_RBF = rbf.predict(X)
diff = y - y_RBF
diff.describe()

count    170.000000
mean       0.785812
std       17.279108
min      -36.188307
25%       -9.563557
50%       -0.099675
75%       11.749192
max       38.311693
Name: apret, dtype: float64

In [10]:
stacking_df = df.copy()
stacking_df['LR'] = y_LR
stacking_df['RBF'] = y_RBF
stacking_df.head()

Unnamed: 0,spend,apret,top10,rejr,tstsc,pacc,strat,salar,LR,RBF
0,9855,52.5,15,29.474,65.063,36.887,12.0,60800,58.266107,54.938307
1,10527,64.25,36,22.309,71.063,30.97,12.8,63900,69.16517,56.938307
2,7904,37.75,26,25.853,60.75,41.985,20.3,57800,45.6313,54.938306
3,6601,57.0,23,11.296,67.188,40.289,17.0,51200,59.006532,56.900294
4,7251,62.0,17,22.635,56.25,46.78,18.1,48000,36.730226,56.938307


In [11]:
lm = smf.ols(formula='apret ~ tstsc + pacc	+ strat + tstsc*pacc + LR + RBF', data=stacking_df).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:                  apret   R-squared:                       0.808
Model:                            OLS   Adj. R-squared:                  0.802
Method:                 Least Squares   F-statistic:                     138.3
Date:                Thu, 05 Apr 2018   Prob (F-statistic):           6.18e-57
Time:                        19:10:55   Log-Likelihood:                -592.39
No. Observations:                 170   AIC:                             1197.
Df Residuals:                     164   BIC:                             1216.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -86.0590      8.306    -10.361      0.0

In [12]:
lm = smf.ols(formula='apret ~ LR*tstsc + LR*pacc	+ LR*strat + LR*tstsc*pacc + RBF*tstsc + RBF*pacc	+ RBF*strat + RBF*tstsc*pacc', data=stacking_df).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:                  apret   R-squared:                       0.829
Model:                            OLS   Adj. R-squared:                  0.815
Method:                 Least Squares   F-statistic:                     58.11
Date:                Thu, 05 Apr 2018   Prob (F-statistic):           6.40e-53
Time:                        19:10:56   Log-Likelihood:                -582.77
No. Observations:                 170   AIC:                             1194.
Df Residuals:                     156   BIC:                             1237.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       -701.4760    458.180     -1.