---
title: "Penalized Regression"
format:
  html:
    embed-resources: true
    code-fold: true
execute:
  echo: true
  warning: false
  message: false
---

# **Day 2**

Exploring more of the data today

**Topic of Penalized Regression**

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

In [None]:
# Load the raw Ames housing data from the CSV file
ames = pd.read_csv("/content/AmesHousing (1).csv")

# Boolean mask over columns: True where the column has fewer than 100 missing values
good_cols = ames.isna().sum().lt(100)

# Keep only those columns, then drop every row that still contains a missing value
ames = ames.loc[:, good_cols].dropna()

In [None]:
# Target is the sale price; predictors are everything else except the
# "Order" and "PID" columns. PID is removed because it would badly overfit:
# the model could reproduce the training data without actually predicting well.
y = ames["SalePrice"]
X = ames.drop(columns=["SalePrice", "Order", "PID"])

# Preprocessing shared by every model below:
#   - object-dtype columns are one-hot encoded (unknown categories are ignored)
#   - numeric columns are standardized
#   - any column matched by neither selector is passed through unchanged
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Baseline: ordinary least squares on the preprocessed features
lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)

In [None]:
# 5-fold cross-validated R2 for the OLS baseline (one score per fold)
cross_val_score(estimator=lr_pipeline_1, X=X, y=y, scoring="r2", cv=5)

array([0.89727873, 0.91038253, 0.78900365, 0.77208628, 0.9006982 ])

Essentially, the penalty is a back-and-forth over how much you care about overfitting.

Use a placeholder lambda

Using the ridge method instead

In [None]:
# Ridge regression on the same preprocessing, with a placeholder penalty of alpha = 1
lr_pipeline_2 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("Ridge", Ridge(alpha=1)),
    ]
)

In [None]:
# 5-fold cross-validated R2 for Ridge with alpha = 1 (one score per fold)
cross_val_score(estimator=lr_pipeline_2, X=X, y=y, scoring="r2", cv=5)

array([0.89815807, 0.91744024, 0.79493606, 0.78522563, 0.91389818])

In [None]:
# Trying different lambda values (scikit-learn calls this parameter `alpha`)

Using the same pipeline as previously, perform tuning on $\lambda$.

You should always try $\lambda$ values on a log scale; that is, don't use [1,2,3,4]; instead use something like [0.001, 0.01, 0.1, 1, 10]

In [None]:
# Candidate penalty strengths (lambda, called `alpha` in scikit-learn), on a log scale.
# 1, 10 and 100 are kept as ints so the printed labels and the keys of
# `mean_scores` read "1", "10", "100" rather than "1.0", "10.0", "100.0".
ridge_alphas = [0.001, 0.01, 0.1, 1, 10, 100]

# Map each alpha to its array of per-fold R2 scores from 5-fold cross-validation.
# One loop replaces six copy-pasted pipeline/score pairs, so adding or removing
# a candidate value only means editing `ridge_alphas`.
ridge_cv_scores = {}
for alpha in ridge_alphas:
    ridge_pipeline_alpha = Pipeline(
      [("preprocessing", ct),
      ("Ridge", Ridge(alpha = alpha))]
    )
    ridge_cv_scores[alpha] = cross_val_score(ridge_pipeline_alpha, X, y, cv = 5, scoring = 'r2')

# Per-fold scores for every candidate
for alpha, fold_scores in ridge_cv_scores.items():
    print(f"lambda = {alpha}: {fold_scores}")

# Mean R2 across folds for each alpha
mean_scores = {
    alpha: np.mean(fold_scores)
    for alpha, fold_scores in ridge_cv_scores.items()
}

# The alpha with the highest mean cross-validated R2
best_alpha = max(mean_scores, key=mean_scores.get)
print(f"\nBest average R2 score is with lambda (alpha) = {best_alpha}")

lambda = 0.001: [0.8972854  0.91040618 0.78901601 0.7721318  0.90076168]
lambda = 0.01: [0.89734306 0.91061417 0.7891259  0.77253192 0.90131686]
lambda = 0.1: [0.89774358 0.91230557 0.79010977 0.77576412 0.90558729]
lambda = 1: [0.89815807 0.91744024 0.79493606 0.78522563 0.91389818]
lambda = 10: [0.8977621  0.92081211 0.80057243 0.78711955 0.91509487]
lambda = 100: [0.88776492 0.92197931 0.79341952 0.77214935 0.91355403]

Best average R2 score is with lambda (alpha) = 10


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd

# Ridge pipeline whose penalty is left at its default here; the grid search sets it
ridge_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("Ridge", Ridge()),
    ]
)

# Log-scale candidates for the Ridge penalty; the key targets the
# `alpha` parameter of the pipeline step named "Ridge"
param_grid_ridge_explicit = {'Ridge__alpha': [0.001, 0.01, 0.1, 1, 10]}

# 5-fold cross-validated grid search scored by R2
grid_search_ridge_explicit = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid_ridge_explicit,
    scoring='r2',
    cv=5,
)
grid_search_ridge_explicit.fit(X, y)

# Winning alpha and its mean cross-validated score
print("Best parameters for Ridge (explicit values):", grid_search_ridge_explicit.best_params_)
print("Best cross-validated R2 score for Ridge (explicit values):", grid_search_ridge_explicit.best_score_)

# Full table of per-candidate timings, per-fold scores and ranks
print("\nAll cross-validation results for Ridge:")
display(pd.DataFrame(grid_search_ridge_explicit.cv_results_))

Best parameters for Ridge (explicit values): {'Ridge__alpha': 10}
Best cross-validated R2 score for Ridge (explicit values): 0.8642722110166747

All cross-validation results for Ridge:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_Ridge__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.088632,0.015765,0.033589,0.010272,0.001,{'Ridge__alpha': 0.001},0.897285,0.910406,0.789016,0.772132,0.900762,0.85392,0.060278,5
1,0.117135,0.035633,0.036683,0.01118,0.01,{'Ridge__alpha': 0.01},0.897343,0.910614,0.789126,0.772532,0.901317,0.854186,0.06028,4
2,0.111965,0.045716,0.035426,0.009374,0.1,{'Ridge__alpha': 0.1},0.897744,0.912306,0.79011,0.775764,0.905587,0.856302,0.06025,3
3,0.115277,0.042223,0.032158,0.01044,1.0,{'Ridge__alpha': 1},0.898158,0.91744,0.794936,0.785226,0.913898,0.861932,0.059104,2
4,0.095833,0.00599,0.036627,0.01213,10.0,{'Ridge__alpha': 10},0.897762,0.920812,0.800572,0.78712,0.915095,0.864272,0.058157,1


Create a LASSO pipeline, and tune $\lambda$.

Fit your best model on the full Ames data, and compare the coefficients to Ridge and OLS

In [None]:
from sklearn.linear_model import Lasso

# Lasso with a fixed penalty of alpha = 1.0 on the shared preprocessing;
# max_iter is raised to 10000 for the coordinate-descent solver
lasso_pipeline_single = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("Lasso", Lasso(alpha=1.0, max_iter=10000)),
    ]
)

# Per-fold R2 from 5-fold cross-validation of this single Lasso model
cross_val_score_lasso_single = cross_val_score(
    estimator=lasso_pipeline_single, X=X, y=y, scoring='r2', cv=5
)

print(f"Cross-validation R2 score for Lasso with alpha = 1.0: {cross_val_score_lasso_single}")
print(f"Mean cross-validation R2 score for Lasso with alpha = 1.0: {np.mean(cross_val_score_lasso_single)}")

Cross-validation R2 score for Lasso with alpha = 1.0: [0.89774385 0.91093785 0.79691806 0.77426245 0.90589714]
Mean cross-validation R2 score for Lasso with alpha = 1.0: 0.857151870606031


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
import numpy as np
import pandas as pd

# Log-scale candidates for the Lasso penalty; the key targets the
# `alpha` parameter of the pipeline step named "Lasso"
param_grid_lasso_explicit = {
    'Lasso__alpha': [0.001, 0.01, 0.1, 1, 10]
}

# Lasso pipeline on the shared preprocessing. max_iter is raised well above the
# default, matching `lasso_pipeline_single`: with the default iteration budget
# the coordinate-descent solver emitted repeated convergence warnings during
# the search, which makes the scores for those candidates unreliable.
lasso_pipeline = Pipeline(
  [("preprocessing", ct),
  ("Lasso", Lasso(max_iter=10000))]
)

# 5-fold cross-validated grid search scored by R2
grid_search_lasso_explicit = GridSearchCV(lasso_pipeline, param_grid_lasso_explicit, cv=5, scoring='r2')
grid_search_lasso_explicit.fit(X, y)

# Winning alpha and its mean cross-validated score
print("Best parameters for Lasso (explicit values):", grid_search_lasso_explicit.best_params_)
print("Best cross-validated R2 score for Lasso (explicit values):", grid_search_lasso_explicit.best_score_)

# Full table of per-candidate timings, per-fold scores and ranks
print("\nAll cross-validation results for Lasso:")
display(pd.DataFrame(grid_search_lasso_explicit.cv_results_))

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for Lasso (explicit values): {'Lasso__alpha': 10}
Best cross-validated R2 score for Lasso (explicit values): 0.860632243732114

All cross-validation results for Lasso:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_Lasso__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.644492,1.283524,0.042343,0.028153,0.001,{'Lasso__alpha': 0.001},0.897202,0.910396,0.79032,0.77402,0.905557,0.855499,0.060242,5
1,0.915956,0.023222,0.030172,0.001546,0.01,{'Lasso__alpha': 0.01},0.897206,0.910401,0.790859,0.77406,0.905502,0.855606,0.060107,4
2,1.278797,0.642062,0.043876,0.013495,0.1,{'Lasso__alpha': 0.1},0.897258,0.910451,0.795951,0.774072,0.90536,0.856618,0.059025,3
3,0.751356,0.104901,0.027373,0.001452,1.0,{'Lasso__alpha': 1},0.897744,0.910938,0.796918,0.774262,0.905899,0.857152,0.059018,2
4,0.29761,0.111995,0.026845,0.000817,10.0,{'Lasso__alpha': 10},0.900776,0.915067,0.80142,0.776649,0.90925,0.860632,0.059157,1


Elastic Net combines these two

Create an Elastic Net pipeline, and tune $\lambda$ and $\alpha$.

Fit your best model on the full Ames data, and compare the coefficients to Ridge and OLS.

In [None]:
from sklearn.base import clone
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

# Elastic Net pipeline on the shared preprocessing. max_iter is raised well
# above the default because the default iteration budget produced repeated
# coordinate-descent convergence warnings during the search.
elastic_net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("ElasticNet", ElasticNet(max_iter=10000))]
)

# Tune both the overall penalty strength (`alpha`, the lambda of the notes)
# and the L1/L2 mix (`l1_ratio`, the alpha of the notes).
param_grid_elastic_net = {
    'ElasticNet__alpha': [0.001, 0.01, 0.1, 1, 10],  # log scale
    'ElasticNet__l1_ratio': [0.1, 0.5, 0.9]
}

# 5-fold cross-validated grid search scored by R2
grid_search_elastic_net = GridSearchCV(elastic_net_pipeline, param_grid_elastic_net, cv=5, scoring='r2')
grid_search_elastic_net.fit(X, y)

print("Best parameters for Elastic Net:", grid_search_elastic_net.best_params_)
print("Best cross-validated R2 score for Elastic Net:", grid_search_elastic_net.best_score_)

# With GridSearchCV's default refit=True, best_estimator_ has already been
# refit on all of X, y using the best parameters, so no extra .fit() is needed.
best_elastic_net_model = grid_search_elastic_net.best_estimator_
elastic_net_coef = best_elastic_net_model.named_steps['ElasticNet'].coef_

# Models to compare against, all fit on the full data:
#   - OLS: fit a clone of the baseline pipeline, so neither `lr_pipeline_1`
#     nor the shared `ct` object is mutated by fitting here
#   - Ridge / Lasso: the refit winners of the earlier grid searches
ols_model = clone(lr_pipeline_1).fit(X, y)
best_ridge_model = grid_search_ridge_explicit.best_estimator_
best_lasso_model = grid_search_lasso_explicit.best_estimator_

# Feature names must come from a *fitted* transformer. The module-level `ct`
# is never fitted itself (cross_val_score and GridSearchCV fit clones), so read
# the names from the preprocessing step inside the fitted best pipeline.
feature_names = best_elastic_net_model.named_steps['preprocessing'].get_feature_names_out()

# One row per preprocessed feature, one column of coefficients per model
coef_comparison_elastic_net = pd.DataFrame({
    'Feature': feature_names,
    'OLS_Coefficient': ols_model.named_steps['linear_regression'].coef_,
    'Ridge_Coefficient': best_ridge_model.named_steps['Ridge'].coef_,
    'Lasso_Coefficient': best_lasso_model.named_steps['Lasso'].coef_,
    'ElasticNet_Coefficient': elastic_net_coef
})

# Display the coefficient comparison
print("\nCoefficient Comparison (OLS vs Ridge vs Lasso vs Elastic Net):")
display(coef_comparison_elastic_net)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best parameters for Elastic Net: {'ElasticNet__alpha': 0.01, 'ElasticNet__l1_ratio': 0.5}
Best cross-validated R2 score for Elastic Net: 0.8642678883652006

Could not retrieve exact feature names. Displaying coefficients without names.


  model = cd_fast.enet_coordinate_descent(


NameError: name 'ols_model' is not defined