In [1]:
! pip install -U mlxtend
! pip install -U statsmodels
! pip install -U scikit-learn

Collecting mlxtend
  Downloading mlxtend-0.21.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 3.4 MB/s eta 0:00:01
Collecting scikit-learn>=1.0.2
  Downloading scikit_learn-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 12.9 MB/s eta 0:00:01
Collecting joblib>=0.13.2
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[K     |████████████████████████████████| 297 kB 17.8 MB/s eta 0:00:01
Installing collected packages: joblib, scikit-learn, mlxtend
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.2
    Uninstalling scikit-learn-0.24.2:
      Successfully uninstalled scikit-learn-0.24.2
Successfully installed joblib-1.2.0 mlxtend-0.21.0 scikit-learn-1.2.0
Collecting statsmodels
  Downloading statsmodels-0.13.5-cp39

In [2]:
from sklearn import datasets
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
from itertools import cycle
from tqdm.auto import tqdm
# Load the Boston housing dataset
boston=datasets.load_boston()

# Multiple Regression

In [3]:
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
boston_data = pd.DataFrame(boston.data)
boston_data.columns = boston['feature_names']
boston_data['MEDV'] = boston.target

In [5]:
boston_data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [6]:
boston_data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


Run a full model

In [7]:
y = boston_data["MEDV"]
X = boston_data.drop(["MEDV"], axis=1)

In [8]:
X = sm.add_constant(X) # by default statsmodels does not add an intercept
# add a constant is the same as adding a column of 1 to X .
full_model = sm.OLS(y, X).fit()
full_model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,108.1
Date:,"Mon, 12 Dec 2022",Prob (F-statistic):,6.72e-135
Time:,13:37:55,Log-Likelihood:,-1498.8
No. Observations:,506,AIC:,3026.0
Df Residuals:,492,BIC:,3085.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,36.4595,5.103,7.144,0.000,26.432,46.487
CRIM,-0.1080,0.033,-3.287,0.001,-0.173,-0.043
ZN,0.0464,0.014,3.382,0.001,0.019,0.073
INDUS,0.0206,0.061,0.334,0.738,-0.100,0.141
CHAS,2.6867,0.862,3.118,0.002,0.994,4.380
NOX,-17.7666,3.820,-4.651,0.000,-25.272,-10.262
RM,3.8099,0.418,9.116,0.000,2.989,4.631
AGE,0.0007,0.013,0.052,0.958,-0.025,0.027
DIS,-1.4756,0.199,-7.398,0.000,-1.867,-1.084

0,1,2,3
Omnibus:,178.041,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,783.126
Skew:,1.521,Prob(JB):,8.84e-171
Kurtosis:,8.281,Cond. No.,15100.0


It might be easier to use R-style formulas when there are only a few variables. You do not need to explicitly add an intercept term as in R.

In [9]:
model_2 = smf.ols(formula='MEDV ~ CRIM + ZN', data=boston_data)
res = model_2.fit()
res.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.234
Model:,OLS,Adj. R-squared:,0.231
Method:,Least Squares,F-statistic:,76.82
Date:,"Mon, 12 Dec 2022",Prob (F-statistic):,7.68e-30
Time:,13:48:50,Log-Likelihood:,-1772.8
No. Observations:,506,AIC:,3552.0
Df Residuals:,503,BIC:,3564.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,22.4856,0.442,50.904,0.000,21.618,23.353
CRIM,-0.3521,0.043,-8.267,0.000,-0.436,-0.268
ZN,0.1161,0.016,7.392,0.000,0.085,0.147

0,1,2,3
Omnibus:,164.581,Durbin-Watson:,0.757
Prob(Omnibus):,0.0,Jarque-Bera (JB):,432.206
Skew:,1.625,Prob(JB):,1.4e-94
Kurtosis:,6.152,Cond. No.,32.0


## Bootstrap

Boostrap is a key tool in modern statistics to quantify the uncertainty of estimation. It is is a computer-intensive procedure that substitutes fast computation for theoretical math. The idea is quite simple.  

1. Suppose you have a dataset (sample) and use it to find $\hat{\theta}$, your estimate the unknown parameter $\theta$. 
2. Then draw a new random sample of size n, with replacement, from the orignial dataset. The sample size n is the same as the size of the original dataset (sample). 
3. This new sample is called a bootstrap sample. For this bootstrap sample, we can calculate a new estimate $\hat{\theta}_1$. 
4. Repeat step 2 and step 3 $K$ times and get $\hat{\theta}_1$, $\hat{\theta}_2$,...,$\hat{\theta}_K$.
5. The spread in these estimates tells us how large the estimation error is. Suppose we want to set a 95% confidence interval on $\theta$, the true parameter value. And suppose we take K = 5000 bootstrap samples. The bootstrap theory suggests that approximately 95% of the time, the true parameter value falls between the 2.5th percentile of the bootstrap samples (or the 125 smallest out of 5000) and the 97.5th percentile (or the 125 largest). As such, the 2.5th percentile of $\hat{\theta}_1$, $\hat{\theta}_2$,...,$\hat{\theta}_{5000}$ and the 97.5th percentile provides the 95% CI for $\theta$.

In [10]:
bootstrapped_ests = []
for i in tqdm(range(1000)):
    boston_data_boot = boston_data.sample(n=len(boston_data), replace=True)
    boston_model_boot = smf.ols(formula='MEDV ~ CRIM + ZN', data=boston_data_boot).fit()
    bootstrapped_ests.append(boston_model_boot.params[['CRIM', 'ZN']])

b_CRIMs, b_ZNs = zip(*bootstrapped_ests)

print("The 95% bootstrapped CI of b_CRIMs is [{:.2f}, {:.2f}].".format(
    np.percentile(b_CRIMs, 2.5),
    np.percentile(b_CRIMs, 97.5)))

print("The 95% bootstrapped CI of b_ZNs is [{:.2f}, {:.2f}].".format(
    np.percentile(b_ZNs, 2.5),
    np.percentile(b_ZNs, 97.5)))

  0%|          | 0/1000 [00:00<?, ?it/s]

The 95% bootstrapped CI of b_CRIMs is [-0.53, -0.26].
The 95% bootstrapped CI of b_ZNs is [0.08, 0.15].


# Variable Selection

In [11]:
from sklearn.linear_model import LinearRegression

## sklearn model without selection  
[Read the manual](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

In [12]:
model_3 = LinearRegression()

In [13]:
model_3.fit(X, y)

LinearRegression()

In [14]:
model_3.coef_

array([ 0.00000000e+00, -1.08011358e-01,  4.64204584e-02,  2.05586264e-02,
        2.68673382e+00, -1.77666112e+01,  3.80986521e+00,  6.92224640e-04,
       -1.47556685e+00,  3.06049479e-01, -1.23345939e-02, -9.52747232e-01,
        9.31168327e-03, -5.24758378e-01])

## Stepwise Regression using sklearn + mlxtend.  
[Read the manual and examples](http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/)

In [15]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
model_sfs = SFS(model_3, scoring='neg_mean_squared_error', k_features=4, verbose=1, cv=5,)

ModuleNotFoundError: No module named 'mlxtend'

In [16]:
model_sfs.fit(X, y)

NameError: name 'model_sfs' is not defined

In [None]:
model_sfs.get_metric_dict()

In [None]:
pd.DataFrame.from_dict(model_sfs.get_metric_dict()).T

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

fig1 = plot_sfs(model_sfs.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection')
plt.grid()
plt.show()

After selection, we need to refit model

In [None]:
X_selected = model_sfs.transform(X)
X_selected.shape

In [None]:
model_3_after_selection = model_3.fit(X_selected, y)

We can use the refitted model to make predictions (in-sample)

In [None]:
model_3_after_selection.predict(X_selected)[:10] 

## Recursive feature elimination with sklearn

[Manual](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html)

In [None]:
from sklearn.feature_selection import RFE

In [None]:
model_RFE = RFE(model_3, n_features_to_select=4)
model_RFE.fit(X, y)

In [None]:
model_RFE.get_support()

Get the selected X variables:

In [None]:
X.loc[:, model_RFE.get_support()]

# Regularization

[Lasso Manual](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)  
[LassoCV Manual](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLarsCV.html#sklearn.linear_model.LassoLarsCV)


In [17]:
from sklearn.linear_model import Lasso, LassoCV, lars_path

We can set the alpha ($\lambda$ in slides)

In [18]:
model_lasso = Lasso(alpha = 0.1, normalize=True)
model_lasso.fit(X, y)
model_lasso.coef_

array([ 0.        , -0.        ,  0.        , -0.        ,  0.        ,
       -0.        ,  2.95469455, -0.        ,  0.        , -0.        ,
       -0.        , -0.24795837,  0.        , -0.42817439])

Which variables are selected? 

In [19]:
X.columns[np.abs(model_lasso.coef_) > 0]

Index(['RM', 'PTRATIO', 'LSTAT'], dtype='object')

LassoCV can search for best alpha automatically.

In [20]:
model_lassoCV = LassoCV(cv=5, normalize=True)
model_lassoCV.fit(X, y)

LassoCV(cv=5, normalize=True)

In [21]:
model_lassoCV.alpha_

0.008002884283707804

We can also visualize the Lasso Path

In [22]:
from scipy import interpolate
from sklearn import preprocessing

In [23]:
alphas_lasso, coefs_lasso, _ = model_lasso.path(X, y)

In [24]:
_, _, coefs = lars_path(X.values, y.values.flatten(), method='lasso')

xx = np.sum(np.abs(coefs.T), axis=1)
xx /= xx[-1]
plt.plot(xx, coefs.T)
ymin, ymax = plt.ylim()
plt.vlines(xx, ymin, ymax, linestyle='dashed')
plt.xlabel('|coef| / max|coef|')
plt.ylabel('Coefficients')
plt.title('LASSO Path')
plt.axis('tight')
plt.show()

NameError: name 'plt' is not defined

# Train-test split 

In [1]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [27]:
X_train.shape

(404, 14)

In [28]:
X_test.shape

(102, 14)

**Important**: Standardizing/Normalizing data is part of the model training process. You shoud fit a standardizer (learning the mean and std from the train set) and use it to transform both the train and test set. See [here](https://scikit-learn.org/stable/modules/preprocessing.html). 

In [29]:
scaler = preprocessing.StandardScaler().fit(X_train)

In [30]:
model_lassoCV = LassoCV(cv=5, normalize=False) # note we don't need to normalize again
model_lassoCV.fit(scaler.transform(X_train), y_train) # note we transform X_train

LassoCV(cv=5)

Predict test set and evaluate performance

In [31]:
y_hat_test = model_lassoCV.predict(scaler.transform((X_test))) 
# note we transform X_test using the scaler learned from the train set

In [32]:
from sklearn import metrics

In [33]:
metrics.mean_squared_error(y_true=y_test, y_pred=y_hat_test)

29.20161666631384