# Regularized Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Lasso

Let's look at a couple of models built on the King County housing dataset. 

In [None]:
# Load the raw King County sales records.
kc = pd.read_csv('data/kc_house_data.csv')

# Raw columns used to build the design matrix ('date', 'yr_built' and
# 'yr_renovated' only feed the derived features and are dropped afterwards).
feature_columns = [
    'date', 'bedrooms', 'bathrooms', 'sqft_living',
    'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

# Year in which each sale took place.
sale_year = pd.to_datetime(kc['date']).dt.year

# Row-wise max of construction year and renovation year.
last_major_work = kc[['yr_built', 'yr_renovated']].max(axis=1)

X = (
    kc[feature_columns]
    .assign(
        age_at_sale=sale_year - kc['yr_built'],
        years_since_renovation=sale_year - last_major_work,
    )
    .drop(columns=['date', 'yr_built', 'yr_renovated'])
)

# One indicator column per zipcode (no level is dropped).
X = pd.get_dummies(X, columns=['zipcode'])

# Target: sale price.
y = kc['price']

# Fixed seed so the train/test partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=321)

Let's start by just making a basic linear regression model with minimal preprocessing.

In [None]:
# Baseline: ordinary least squares on the unscaled training features.
baseline_model = LinearRegression()
linreg = baseline_model.fit(X_train, y_train)

In [None]:
# Score the baseline model on the held-out test split.
linreg_test_predictions = linreg.predict(X_test)
print(f'R2 Score: {r2_score(y_test, linreg_test_predictions)}')

Let's look at the prediction we get on a particular house, say the first one in the test set.

In [None]:
# Zero-based position (not index label) of the test-set house to examine.
house_num = 0

# Pull that single row out of the test features as a Series.
sample = X_test.iloc[house_num, :]

In [None]:
# Predict from a one-row DataFrame: the double brackets keep the input 2-D and
# keep the column names, so scikit-learn does not warn that the input lacks the
# feature names the model was fitted with (which a bare `.values` array does).
prediction = linreg.predict(X_test.iloc[[house_num]])[0]

# Format as dollars with thousands separators and two decimals.
print(f'Prediction: ${prediction:,.2f}')

How can we understand how the model came up with this prediction? It is a linear regression model, which means that we have coefficients.

In [None]:
# Pair every feature name with the weight the baseline model learned for it.
feature_names = X.columns
fitted_weights = linreg.coef_

coefficients = pd.DataFrame(
    {
        'feature': feature_names,
        'coefficient': fitted_weights,
    }
)

In [None]:
# Display the feature/coefficient table (one row per column of X).
coefficients

To get the prediction, you multiply each feature value by its corresponding coefficient, sum the results, and add the model's intercept. Let's import a helper function to better see this.

In [None]:
from explainer import tell_me_why

In [None]:
# Zero-based position of the test-set house to explain.
house_num = 0
selected_house = X_test.iloc[house_num]

# Hand the baseline model, the chosen row, and the feature names to the helper
# from explainer.py (its output format is defined there, not in this notebook).
tell_me_why(linreg, selected_house, X.columns)

We can see that this model has some ridiculous coefficients. This has to do with the fact that we have strong multicollinearity in our dataset (for example, the one-hot zipcode columns always sum to 1, which duplicates the intercept), so the coefficients can grow quite large.

We can remedy this (and often get better generalizability) by using a regularized model.

Remember that ridge and lasso both use the magnitude of the coefficients in their penalty terms, and a coefficient's magnitude depends on the units of its feature. For this reason, we should use a StandardScaler to put our variables on a common scale prior to fitting.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Standardize the features, then fit ridge regression; RidgeCV picks its
# penalty strength by cross-validation over its default alpha grid.
ridge_steps = [
    ('scaler', StandardScaler()),
    ('linreg', RidgeCV()),
]
ridge = Pipeline(ridge_steps).fit(X_train, y_train)


# Same preprocessing as the ridge pipeline, but with an L1 penalty; LassoCV
# chooses its penalty strength by cross-validation.
lasso_steps = [
    ('scaler', StandardScaler()),
    ('linreg', LassoCV()),
]
lasso = Pipeline(lasso_steps).fit(X_train, y_train)

In [None]:
# Display the fitted ridge pipeline (scaler step followed by the RidgeCV step).
ridge

In [None]:
# Held-out R2 for each cross-validated pipeline, computed first and reported after.
ridge_r2 = r2_score(y_test, ridge.predict(X_test))
lasso_r2 = r2_score(y_test, lasso.predict(X_test))

print(f'Ridge R2 Score: {ridge_r2}')
print(f'Lasso R2 Score: {lasso_r2}')

In [None]:
from explainer import tell_me_why_pipe

In [None]:
# Zero-based position of the test-set house to explain.
house_num = 0
selected_house = X_test.iloc[house_num]

# Explain the ridge pipeline's prediction for this row via the pipeline-aware
# helper from explainer.py (its output format is defined there).
tell_me_why_pipe(ridge, selected_house, X.columns)

In [None]:
# Zero-based position of the test-set house to explain.
house_num = 0
selected_house = X_test.iloc[house_num]

# Explain the lasso pipeline's prediction for this row via the pipeline-aware
# helper from explainer.py (its output format is defined there).
tell_me_why_pipe(lasso, selected_house, X.columns)

We can also manually set the penalty strength (the `alpha` parameter) if we want to build a simpler model.

In [None]:
# Lasso with a hand-picked penalty strength instead of a cross-validated one.
# A larger alpha pushes more coefficients to exactly zero.
lasso_manual = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('linreg', Lasso(alpha=10000)),
    ]
).fit(X_train, y_train)

# The mean of a boolean mask is a fraction in [0, 1]; the `%` format type
# multiplies by 100 and appends a percent sign so the value matches its label.
print(f'Percent of nonzero Coefficients: {(lasso_manual[-1].coef_ != 0).mean():.1%}')

# Held-out R2 for the manually penalized model.
print(f'Lasso R2 Score: {r2_score(y_test, lasso_manual.predict(X_test))}')

In [None]:
# Zero-based position of the test-set house to explain.
house_num = 0
selected_house = X_test.iloc[house_num]

# Explain the manually penalized lasso's prediction for this row via the
# pipeline-aware helper from explainer.py (its output format is defined there).
tell_me_why_pipe(lasso_manual, selected_house, X.columns)

Finally, we can try a "relaxed lasso" (https://glmnet.stanford.edu/articles/relax.html) where we refit a non-penalized model on just the variables that had nonzero coefficients. (There is actually a little more to the relaxed lasso, but we'll just do this simple version of it).

In [None]:
# Coefficients of the manually penalized lasso, labelled with the columns the
# pipeline was fitted on. These weights apply to the standardized features.
coefficients = pd.DataFrame({'feature': X_train.columns,
                             'coefficient': lasso_manual[-1].coef_})

# Names of the features the lasso kept (coefficient not exactly zero).
nonzeros = coefficients.loc[coefficients['coefficient'] != 0, 'feature']

X_relaxed = X[nonzeros]

# Subset the existing split instead of calling train_test_split again. The rows
# are then guaranteed to match y_train / y_test, with no second copy of the
# random seed to keep in sync and no reassignment of the targets.
X_train_relaxed = X_train[nonzeros]
X_test_relaxed = X_test[nonzeros]

# Refit without any penalty on just the selected features.
lasso_relaxed = LinearRegression().fit(X_train_relaxed, y_train)

print(f'Relaxed Lasso R2 Score: {r2_score(y_test, lasso_relaxed.predict(X_test_relaxed))}')