In [2]:
import numpy as np
import pandas as pd
from patsy import dmatrices

import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Feature selection using p values leads to overestimates for p values

In [3]:
df = pd.DataFrame()
n = 500
p = 100
for j in range(p):
    xp = 'x'+str(j)
    df[xp] = np.random.randn(n)
y = np.random.randn(n)
df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x90,x91,x92,x93,x94,x95,x96,x97,x98,x99
0,0.225804,-2.251261,-0.104595,1.00986,-1.122611,0.470112,0.706946,0.187692,1.059212,0.832626,...,0.341726,0.634054,0.347543,0.186686,1.383417,-0.93304,-0.682474,0.810223,-0.332989,-0.39077
1,-0.293295,-0.857337,1.027413,-1.223776,-0.26375,-0.665609,0.637551,-0.094047,-0.484369,0.677276,...,-0.108033,-0.157103,0.350644,-1.075067,-0.039971,-1.442195,0.05973,0.082846,1.361562,0.180055
2,-0.014903,-0.001313,-1.438793,0.976974,-0.477221,0.080698,0.864227,-0.106084,-0.402679,-1.34266,...,-0.313825,0.734296,-1.652189,0.142787,-1.188169,-0.213004,2.194653,-0.790417,1.261227,-0.352331
3,0.327586,0.041229,-0.217257,-0.258589,-2.154041,1.767841,-0.397898,-1.154858,-0.653197,1.902532,...,-1.153752,-0.497645,1.198966,-0.738298,0.708095,1.15603,0.428264,-1.50378,0.58796,-1.725343
4,-1.98074,1.325467,0.724483,-0.254084,-0.715255,2.372744,1.240983,0.818632,-0.43894,-0.165537,...,-0.644622,1.423509,-0.629435,-0.593211,0.071661,0.013227,0.240894,0.7737,-1.209361,-0.308385


In [4]:
variables = df.columns
model = sm.OLS(y,df[variables]).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.199
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.9947
Date:,"Wed, 22 Apr 2020",Prob (F-statistic):,0.501
Time:,14:33:44,Log-Likelihood:,-665.19
No. Observations:,500,AIC:,1530.0
Df Residuals:,400,BIC:,1952.0
Df Model:,100,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x0,-0.0067,0.055,-0.122,0.903,-0.115,0.101
x1,0.0186,0.052,0.357,0.721,-0.084,0.121
x2,0.0264,0.049,0.534,0.594,-0.071,0.124
x3,0.0853,0.053,1.606,0.109,-0.019,0.190
x4,0.0792,0.051,1.543,0.124,-0.022,0.180
x5,-0.0949,0.052,-1.809,0.071,-0.198,0.008
x6,-0.0299,0.052,-0.575,0.566,-0.132,0.072
x7,0.0559,0.048,1.160,0.247,-0.039,0.151
x8,0.0320,0.051,0.625,0.532,-0.069,0.133

0,1,2,3
Omnibus:,0.22,Durbin-Watson:,1.907
Prob(Omnibus):,0.896,Jarque-Bera (JB):,0.322
Skew:,-0.032,Prob(JB):,0.851
Kurtosis:,2.893,Cond. No.,2.55


In [5]:
# uncomment this line the first time you run the cell; comment it out for subsequent runs
stat_sig_vars = variables
stat_sig_vars = stat_sig_vars[model.pvalues <= .05]
model = sm.OLS(y,df[stat_sig_vars]).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.035
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,4.478
Date:,"Wed, 22 Apr 2020",Prob (F-statistic):,0.00146
Time:,14:33:44,Log-Likelihood:,-711.84
No. Observations:,500,AIC:,1432.0
Df Residuals:,496,BIC:,1449.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x13,-0.1014,0.044,-2.309,0.021,-0.188,-0.015
x37,-0.1143,0.046,-2.484,0.013,-0.205,-0.024
x42,-0.0781,0.047,-1.675,0.094,-0.170,0.013
x73,0.0933,0.047,1.984,0.048,0.001,0.186

0,1,2,3
Omnibus:,0.26,Durbin-Watson:,1.847
Prob(Omnibus):,0.878,Jarque-Bera (JB):,0.358
Skew:,-0.04,Prob(JB):,0.836
Kurtosis:,2.896,Cond. No.,1.13


# Solution: held out test set

In [6]:
X = df
y = y

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.5, random_state=1)
print("Training set: ", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)

Training set:  (250, 100) (250,)
Test set: (250, 100) (250,)


## select model using the training set, compute statistics on test set

In [8]:
# model selection using p values
variables = X.columns
model = sm.OLS(y_train,X_train[variables]).fit()
stat_sig_vars = variables
while any(model.pvalues > .05):
    stat_sig_vars = stat_sig_vars[model.pvalues <= .05]
    model = sm.OLS(y_train,X_train[stat_sig_vars]).fit()
# statistics on training set 
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.021
Model:,OLS,Adj. R-squared:,0.017
Method:,Least Squares,F-statistic:,5.289
Date:,"Wed, 22 Apr 2020",Prob (F-statistic):,0.0223
Time:,14:33:44,Log-Likelihood:,-363.86
No. Observations:,250,AIC:,729.7
Df Residuals:,249,BIC:,733.2
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x18,-0.1568,0.068,-2.300,0.022,-0.291,-0.023

0,1,2,3
Omnibus:,1.421,Durbin-Watson:,1.896
Prob(Omnibus):,0.491,Jarque-Bera (JB):,1.475
Skew:,-0.13,Prob(JB):,0.478
Kurtosis:,2.727,Cond. No.,1.0


In [9]:
# report statistics on test set - notice p values are no longer "statistically significant"!
model = sm.OLS(y_test,X_test[stat_sig_vars]).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,1.996
Date:,"Wed, 22 Apr 2020",Prob (F-statistic):,0.159
Time:,14:33:44,Log-Likelihood:,-352.91
No. Observations:,250,AIC:,707.8
Df Residuals:,249,BIC:,711.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x18,0.0908,0.064,1.413,0.159,-0.036,0.217

0,1,2,3
Omnibus:,0.19,Durbin-Watson:,1.983
Prob(Omnibus):,0.909,Jarque-Bera (JB):,0.334
Skew:,0.01,Prob(JB):,0.846
Kurtosis:,2.822,Cond. No.,1.0


# Now for real data

In [10]:
boston_dataset = datasets.load_boston()
boston_dataset

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [11]:
print(boston_dataset.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [12]:
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [13]:
X = boston
y = boston_dataset.target

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.5, random_state=0)

In [15]:
# model selection using p values on training set
variables = X.columns
model = sm.OLS(y_train,X_train[variables]).fit()
stat_sig_vars = variables
while any(model.pvalues > .05):
    stat_sig_vars = stat_sig_vars[model.pvalues <= .05]
    model = sm.OLS(y_train,X_train[stat_sig_vars]).fit()

# report statistics on test set
model = sm.OLS(y_test,X_test[stat_sig_vars]).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.954
Model:,OLS,Adj. R-squared:,0.952
Method:,Least Squares,F-statistic:,632.3
Date:,"Wed, 22 Apr 2020",Prob (F-statistic):,7.3e-159
Time:,14:33:46,Log-Likelihood:,-770.64
No. Observations:,253,AIC:,1557.0
Df Residuals:,245,BIC:,1586.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CRIM,-0.0040,0.052,-0.077,0.939,-0.107,0.099
ZN,0.0209,0.022,0.947,0.345,-0.023,0.064
CHAS,2.0500,1.319,1.554,0.121,-0.548,4.648
RM,5.2624,0.376,14.012,0.000,4.523,6.002
DIS,-0.5661,0.241,-2.354,0.019,-1.040,-0.092
PTRATIO,-0.4688,0.152,-3.092,0.002,-0.767,-0.170
B,0.0164,0.004,4.569,0.000,0.009,0.023
LSTAT,-0.4770,0.060,-7.914,0.000,-0.596,-0.358

0,1,2,3
Omnibus:,97.592,Durbin-Watson:,2.019
Prob(Omnibus):,0.0,Jarque-Bera (JB):,488.778
Skew:,1.476,Prob(JB):,7.300000000000001e-107
Kurtosis:,9.136,Cond. No.,1470.0


## Model selection by minimizing the AIC

In [16]:
def minAIC(X,y):
    variables = X.columns
    model = sm.OLS(y,X[variables]).fit()
    while True:
        maxp = np.max(model.pvalues)
        newvariables = variables[model.pvalues < maxp]
        newmodel = sm.OLS(y,X[newvariables]).fit()
        if newmodel.aic < model.aic:
            model = newmodel
            variables = newvariables
        else:
            break
    return model,variables

model,variables = minAIC(X,y)
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.959
Model:,OLS,Adj. R-squared:,0.958
Method:,Least Squares,F-statistic:,1162.0
Date:,"Wed, 22 Apr 2020",Prob (F-statistic):,0.0
Time:,14:33:47,Log-Likelihood:,-1524.6
No. Observations:,506,AIC:,3069.0
Df Residuals:,496,BIC:,3111.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CRIM,-0.0898,0.034,-2.630,0.009,-0.157,-0.023
ZN,0.0512,0.014,3.630,0.000,0.024,0.079
CHAS,2.7212,0.892,3.052,0.002,0.970,4.473
RM,5.7113,0.245,23.353,0.000,5.231,6.192
DIS,-0.8664,0.167,-5.185,0.000,-1.195,-0.538
RAD,0.1820,0.063,2.867,0.004,0.057,0.307
TAX,-0.0109,0.003,-3.292,0.001,-0.017,-0.004
PTRATIO,-0.4004,0.109,-3.682,0.000,-0.614,-0.187
B,0.0146,0.003,5.475,0.000,0.009,0.020

0,1,2,3
Omnibus:,198.034,Durbin-Watson:,0.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1249.0
Skew:,1.575,Prob(JB):,6.070000000000001e-272
Kurtosis:,10.022,Cond. No.,2240.0


In [17]:
# select on training set, fit on test set 
model,variables = minAIC(X_train, y_train)
model = sm.OLS(y_test,X_test[variables]).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.955
Model:,OLS,Adj. R-squared:,0.954
Method:,Least Squares,F-statistic:,521.3
Date:,"Wed, 22 Apr 2020",Prob (F-statistic):,5.42e-158
Time:,14:33:47,Log-Likelihood:,-766.0
No. Observations:,253,AIC:,1552.0
Df Residuals:,243,BIC:,1587.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CRIM,-0.0085,0.057,-0.149,0.882,-0.120,0.103
ZN,0.0374,0.023,1.654,0.100,-0.007,0.082
CHAS,1.9031,1.303,1.461,0.145,-0.663,4.469
RM,5.4851,0.378,14.527,0.000,4.741,6.229
DIS,-0.7162,0.250,-2.862,0.005,-1.209,-0.223
RAD,0.2473,0.099,2.503,0.013,0.053,0.442
TAX,-0.0159,0.005,-3.010,0.003,-0.026,-0.006
PTRATIO,-0.3330,0.165,-2.022,0.044,-0.657,-0.009
B,0.0167,0.004,4.585,0.000,0.010,0.024

0,1,2,3
Omnibus:,103.489,Durbin-Watson:,2.065
Prob(Omnibus):,0.0,Jarque-Bera (JB):,592.224
Skew:,1.529,Prob(JB):,2.51e-129
Kurtosis:,9.843,Cond. No.,2260.0


In [18]:
# compare with p values on training set 
model = sm.OLS(y_train,X_train[variables]).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.964
Model:,OLS,Adj. R-squared:,0.963
Method:,Least Squares,F-statistic:,654.2
Date:,"Wed, 22 Apr 2020",Prob (F-statistic):,1.78e-169
Time:,14:33:47,Log-Likelihood:,-751.95
No. Observations:,253,AIC:,1524.0
Df Residuals:,243,BIC:,1559.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CRIM,-0.1356,0.043,-3.174,0.002,-0.220,-0.051
ZN,0.0584,0.018,3.228,0.001,0.023,0.094
CHAS,3.9211,1.230,3.187,0.002,1.497,6.345
RM,5.9295,0.328,18.096,0.000,5.284,6.575
DIS,-0.9108,0.231,-3.936,0.000,-1.367,-0.455
RAD,0.1159,0.083,1.397,0.164,-0.047,0.279
TAX,-0.0060,0.004,-1.400,0.163,-0.014,0.002
PTRATIO,-0.5235,0.149,-3.514,0.001,-0.817,-0.230
B,0.0143,0.004,3.346,0.001,0.006,0.023

0,1,2,3
Omnibus:,121.312,Durbin-Watson:,2.062
Prob(Omnibus):,0.0,Jarque-Bera (JB):,780.111
Skew:,1.81,Prob(JB):,3.9899999999999994e-170
Kurtosis:,10.804,Cond. No.,2260.0


# Another method for model selection: LASSO CV

In [19]:
# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV()
variables = X.columns

# Set a minimum threshold of .5 (controls number of features selected)
sfm = SelectFromModel(clf, threshold=.5)
sfm.fit(X_train, y_train)
selected_features = variables[sfm.get_support()]
selected_features

Index(['RM', 'DIS', 'PTRATIO', 'LSTAT'], dtype='object')

In [21]:
sfm.get_support()

array([False, False, False, False, False,  True, False,  True, False,
       False,  True, False,  True])

In [22]:
# fit model on test set, using just selected features, to report statistics
model = sm.OLS(y_test,X_test[selected_features]).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,1143.0
Date:,"Fri, 17 Apr 2020",Prob (F-statistic):,6.809999999999999e-159
Time:,16:12:10,Log-Likelihood:,-784.7
No. Observations:,253,AIC:,1577.0
Df Residuals:,249,BIC:,1592.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,5.9550,0.357,16.688,0.000,5.252,6.658
DIS,-0.2204,0.175,-1.262,0.208,-0.564,0.124
PTRATIO,-0.4275,0.139,-3.078,0.002,-0.701,-0.154
LSTAT,-0.5102,0.061,-8.341,0.000,-0.631,-0.390

0,1,2,3
Omnibus:,87.742,Durbin-Watson:,1.986
Prob(Omnibus):,0.0,Jarque-Bera (JB):,405.005
Skew:,1.333,Prob(JB):,1.13e-88
Kurtosis:,8.595,Cond. No.,27.0


# Random real features

In [23]:
# random birthdays (not randomizing year)
def random_date_generator(start_date='1925-01-01', range_in_days=365):
    random_date = np.datetime64(start_date) + int(np.random.rand()*range_in_days) + int(365*95*np.random.rand())
    return random_date

random_date = random_date_generator()
random_date

numpy.datetime64('2016-03-24')