In [1]:
! pip install -U mlxtend
! pip install -U statsmodels
! pip install -U scikit-learn



In [2]:
from sklearn import datasets
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
from itertools import cycle
from tqdm.auto import tqdm
# Load the Boston housing dataset
boston=datasets.load_boston()

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


# Multiple Regression

In [None]:
print(boston['DESCR'])

In [None]:
boston_data = pd.DataFrame(boston.data)
boston_data.columns = boston['feature_names']
boston_data['MEDV'] = boston.target

In [None]:
boston_data.head(5)

In [None]:
boston_data.describe()

Run a full model

In [None]:
y = boston_data["MEDV"]
X = boston_data.drop(["MEDV"], axis=1)

In [None]:
X = sm.add_constant(X) # by default statsmodels does not add an intercept
# add a constant is the same as adding a column of 1 to X .
full_model = sm.OLS(y, X).fit()
full_model.summary()

It might be easier to use R-style formulas when there are only a few variables. You do not need to explicitly add an intercept term as in R.

In [None]:
model_2 = smf.ols(formula='MEDV ~ CRIM + ZN', data=boston_data)
res = model_2.fit()
res.summary()

## Bootstrap

Boostrap is a key tool in modern statistics to quantify the uncertainty of estimation. It is is a computer-intensive procedure that substitutes fast computation for theoretical math. The idea is quite simple.  

1. Suppose you have a dataset (sample) and use it to find $\hat{\theta}$, your estimate the unknown parameter $\theta$. 
2. Then draw a new random sample of size n, with replacement, from the orignial dataset. The sample size n is the same as the size of the original dataset (sample). 
3. This new sample is called a bootstrap sample. For this bootstrap sample, we can calculate a new estimate $\hat{\theta}_1$. 
4. Repeat step 2 and step 3 $K$ times and get $\hat{\theta}_1$, $\hat{\theta}_2$,...,$\hat{\theta}_K$.
5. The spread in these estimates tells us how large the estimation error is. Suppose we want to set a 95% confidence interval on $\theta$, the true parameter value. And suppose we take K = 5000 bootstrap samples. The bootstrap theory suggests that approximately 95% of the time, the true parameter value falls between the 2.5th percentile of the bootstrap samples (or the 125 smallest out of 5000) and the 97.5th percentile (or the 125 largest). As such, the 2.5th percentile of $\hat{\theta}_1$, $\hat{\theta}_2$,...,$\hat{\theta}_{5000}$ and the 97.5th percentile provides the 95% CI for $\theta$.

In [None]:
bootstrapped_ests = []
for i in tqdm(range(1000)):
    boston_data_boot = boston_data.sample(n=len(boston_data), replace=True)
    boston_model_boot = smf.ols(formula='MEDV ~ CRIM + ZN', data=boston_data_boot).fit()
    bootstrapped_ests.append(boston_model_boot.params[['CRIM', 'ZN']])

b_CRIMs, b_ZNs = zip(*bootstrapped_ests)

print("The 95% bootstrapped CI of b_CRIMs is [{:.2f}, {:.2f}].".format(
    np.percentile(b_CRIMs, 2.5),
    np.percentile(b_CRIMs, 97.5)))

print("The 95% bootstrapped CI of b_ZNs is [{:.2f}, {:.2f}].".format(
    np.percentile(b_ZNs, 2.5),
    np.percentile(b_ZNs, 97.5)))

# Variable Selection

In [None]:
from sklearn.linear_model import LinearRegression

## sklearn model without selection  
[Read the manual](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

In [None]:
model_3 = LinearRegression()

In [None]:
model_3.fit(X, y)

In [None]:
model_3.coef_

## Stepwise Regression using sklearn + mlxtend.  
[Read the manual and examples](http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/)

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
model_sfs = SFS(model_3, scoring='neg_mean_squared_error', k_features=4, verbose=1, cv=5,)

In [None]:
model_sfs.fit(X, y)

In [None]:
model_sfs.get_metric_dict()

In [None]:
pd.DataFrame.from_dict(model_sfs.get_metric_dict()).T

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

fig1 = plot_sfs(model_sfs.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection')
plt.grid()
plt.show()

After selection, we need to refit model

In [None]:
X_selected = model_sfs.transform(X)
X_selected.shape

In [None]:
model_3_after_selection = model_3.fit(X_selected, y)

We can use the refitted model to make predictions (in-sample)

In [None]:
model_3_after_selection.predict(X_selected)[:10] 

## Recursive feature elimination with sklearn

[Manual](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html)

In [None]:
from sklearn.feature_selection import RFE

In [None]:
model_RFE = RFE(model_3, n_features_to_select=4)
model_RFE.fit(X, y)

In [None]:
model_RFE.get_support()

Get the selected X variables:

In [None]:
X.loc[:, model_RFE.get_support()]

# Regularization

[Lasso Manual](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)  
[LassoCV Manual](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLarsCV.html#sklearn.linear_model.LassoLarsCV)


In [None]:
from sklearn.linear_model import Lasso, LassoCV, lars_path

We can set the alpha ($\lambda$ in slides)

In [None]:
model_lasso = Lasso(alpha = 0.1, normalize=True)
model_lasso.fit(X, y)
model_lasso.coef_

Which variables are selected? 

In [None]:
X.columns[np.abs(model_lasso.coef_) > 0]

LassoCV can search for best alpha automatically.

In [None]:
model_lassoCV = LassoCV(cv=5, normalize=True)
model_lassoCV.fit(X, y)

In [None]:
model_lassoCV.alpha_

We can also visualize the Lasso Path

In [None]:
from scipy import interpolate
from sklearn import preprocessing

In [None]:
alphas_lasso, coefs_lasso, _ = model_lasso.path(X, y)

In [None]:
_, _, coefs = lars_path(X.values, y.values.flatten(), method='lasso')

xx = np.sum(np.abs(coefs.T), axis=1)
xx /= xx[-1]
plt.plot(xx, coefs.T)
ymin, ymax = plt.ylim()
plt.vlines(xx, ymin, ymax, linestyle='dashed')
plt.xlabel('|coef| / max|coef|')
plt.ylabel('Coefficients')
plt.title('LASSO Path')
plt.axis('tight')
plt.show()

# Train-test split 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train.shape

In [None]:
X_test.shape

**Important**: Standardizing/Normalizing data is part of the model training process. You shoud fit a standardizer (learning the mean and std from the train set) and use it to transform both the train and test set. See [here](https://scikit-learn.org/stable/modules/preprocessing.html). 

In [None]:
scaler = preprocessing.StandardScaler().fit(X_train)

In [None]:
model_lassoCV = LassoCV(cv=5, normalize=False) # note we don't need to normalize again
model_lassoCV.fit(scaler.transform(X_train), y_train) # note we transform X_train

Predict test set and evaluate performance

In [None]:
y_hat_test = model_lassoCV.predict(scaler.transform((X_test))) 
# note we transform X_test using the scaler learned from the train set

In [None]:
from sklearn import metrics

In [None]:
metrics.mean_squared_error(y_true=y_test, y_pred=y_hat_test)