In [None]:
import pandas as pd
import numpy as np
import plotnine as p9
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

In [None]:
# Read the Ames housing data from the CSV file
ames = pd.read_csv("/content/AmesHousing.csv")

# Keep only the columns that have fewer than 100 missing values
good_cols = ames.isna().sum().lt(100)

# Apply the column filter, then drop every row that still contains a missing value
ames = ames.loc[:, good_cols].dropna()
ames

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,...,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,31770,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,120,0,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,14267,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,11160,Pave,Reg,Lvl,AllPub,Corner,...,0,0,0,0,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,13830,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,3,2010,WD,Normal,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,80,RL,7937,Pave,IR1,Lvl,AllPub,CulDSac,...,0,0,0,0,0,3,2006,WD,Normal,142500
2926,2927,923276100,20,RL,8885,Pave,IR1,Low,AllPub,Inside,...,0,0,0,0,0,6,2006,WD,Normal,131000
2927,2928,923400125,85,RL,10441,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,700,7,2006,WD,Normal,132000
2928,2929,924100070,20,RL,10010,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,4,2006,WD,Normal,170000


In [None]:
# Default (unpenalized) linear regression
# Target is SalePrice; predictors are every other column except Order and PID.
y = ames["SalePrice"]
X = ames.drop(columns = ["SalePrice", "Order", "PID"])

# Column selectors: object-dtype columns get one-hot encoded,
# numeric columns get standardized.
categorical_cols = make_column_selector(dtype_include = object)
numeric_cols = make_column_selector(dtype_include = np.number)

# Columns matched by neither selector are passed through unchanged.
ct = ColumnTransformer(
  transformers = [
    ("dummify", OneHotEncoder(sparse_output = False, handle_unknown = 'ignore'), categorical_cols),
    ("standardize", StandardScaler(), numeric_cols),
  ],
  remainder = "passthrough",
)

# Preprocessing followed by ordinary least squares
lr_pipeline_1 = Pipeline(
  steps = [
    ("preprocessing", ct),
    ("linear_regression", LinearRegression()),
  ]
)

In [None]:
# 5-fold cross-validated R^2 of the unpenalized linear regression pipeline
lr_scores = cross_val_score(lr_pipeline_1, X, y, cv = 5, scoring = 'r2')
lr_scores

array([-4.59217770e+20, -1.21723181e+20, -8.37868327e+19, -9.71584476e+20,
       -3.78389822e+20])

In [None]:
# Target is SalePrice; predictors are every other column except Order and PID.
y = ames["SalePrice"]
X = ames.drop(columns = ["SalePrice", "Order", "PID"])

# Column selectors: object-dtype columns get one-hot encoded,
# numeric columns get standardized.
categorical_cols = make_column_selector(dtype_include = object)
numeric_cols = make_column_selector(dtype_include = np.number)

# Columns matched by neither selector are passed through unchanged.
ct = ColumnTransformer(
  transformers = [
    ("dummify", OneHotEncoder(sparse_output = False, handle_unknown = 'ignore'), categorical_cols),
    ("standardize", StandardScaler(), numeric_cols),
  ],
  remainder = "passthrough",
)

# Same preprocessing, followed by ridge regression with alpha = 1
ridge_pipeline = Pipeline(
  steps = [
    ("preprocessing", ct),
    ("ridge_regression", Ridge(alpha = 1)),
  ]
)

In [None]:
# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
ridge_scores = cross_val_score(ridge_pipeline, X, y, cv = 5, scoring = 'r2')
print(ridge_scores)
print(np.mean(ridge_scores))

[0.8983804  0.91749817 0.79251623 0.78248152 0.91492025]
0.8611593144036662


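These scores are far, far better than the ones for the previous model: the unpenalized linear regression had hugely negative cross-validated R^2 values (around -10^20) in every fold, while ridge with alpha = 1 averages about 0.86.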

In [None]:
# Ridge regression with alpha = 0.001, using the shared preprocessing step `ct`
ridge_pipeline = Pipeline(
  [("preprocessing", ct),
  ("ridge_regression", Ridge(alpha = .001))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
ridge_scores = cross_val_score(ridge_pipeline, X, y, cv = 5, scoring = 'r2')
print(ridge_scores)
print(np.mean(ridge_scores))

[0.89854203 0.90960213 0.78694665 0.76870417 0.90093231]
0.8529454580763808


In [None]:
# Ridge regression with alpha = 0.01, using the shared preprocessing step `ct`
ridge_pipeline = Pipeline(
  [("preprocessing", ct),
  ("ridge_regression", Ridge(alpha = .01))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
ridge_scores = cross_val_score(ridge_pipeline, X, y, cv = 5, scoring = 'r2')
print(ridge_scores)
print(np.mean(ridge_scores))

[0.89857655 0.90983697 0.78705498 0.76912045 0.90152915]
0.8532236194833424


In [None]:
# Ridge regression with alpha = 0.1, using the shared preprocessing step `ct`
ridge_pipeline = Pipeline(
  [("preprocessing", ct),
  ("ridge_regression", Ridge(alpha = .1))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
ridge_scores = cross_val_score(ridge_pipeline, X, y, cv = 5, scoring = 'r2')
print(ridge_scores)
print(np.mean(ridge_scores))

[0.89877654 0.91174047 0.78801513 0.77248969 0.90609412]
0.8554231896664215


In [None]:
# Ridge regression with alpha = 10, using the shared preprocessing step `ct`
ridge_pipeline = Pipeline(
  [("preprocessing", ct),
  ("ridge_regression", Ridge(alpha = 10))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
ridge_scores = cross_val_score(ridge_pipeline, X, y, cv = 5, scoring = 'r2')
print(ridge_scores)
print(np.mean(ridge_scores))

[0.89725065 0.92136221 0.79751148 0.78466735 0.91598258]
0.8633548524419264


Among the alphas tried (0.001, 0.01, 0.1, 1, and 10), alpha = 10 appears to be the best choice, as it gives the highest average R^2 (about 0.8634).

# Part 2: Lasso

In [None]:
# Lasso regression with alpha = 1, using the shared preprocessing step `ct`
lasso_pipeline = Pipeline(
  [("preprocessing", ct),
  ("lasso_regression", Lasso(alpha = 1))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
lasso_scores = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')
print(lasso_scores)
print(np.mean(lasso_scores))



[0.89890206 0.91021274 0.79352217 0.7703388  0.90647455]




0.8558900650255857




In [None]:
# Lasso regression with alpha = 0.001, using the shared preprocessing step `ct`
lasso_pipeline = Pipeline(
  [("preprocessing", ct),
  ("lasso_regression", Lasso(alpha = .001))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
lasso_scores = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')
print(lasso_scores)
print(np.mean(lasso_scores))



[0.89845958 0.90958752 0.78788765 0.77001534 0.90624892]




0.8544397995269115




In [None]:
# Lasso regression with alpha = 0.01, using the shared preprocessing step `ct`
lasso_pipeline = Pipeline(
  [("preprocessing", ct),
  ("lasso_regression", Lasso(alpha = .01))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
lasso_scores = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')
print(lasso_scores)
print(np.mean(lasso_scores))



[0.89846254 0.90959277 0.78843197 0.77007616 0.90618235]




0.854549158954238




In [None]:
# Lasso regression with alpha = 0.1, using the shared preprocessing step `ct`
lasso_pipeline = Pipeline(
  [("preprocessing", ct),
  ("lasso_regression", Lasso(alpha = .1))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
lasso_scores = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')
print(lasso_scores)
print(np.mean(lasso_scores))



[0.89850667 0.90965082 0.79279655 0.7701185  0.90589892]




0.8553942923313158




In [None]:
# Lasso regression with alpha = 10, using the shared preprocessing step `ct`
lasso_pipeline = Pipeline(
  [("preprocessing", ct),
  ("lasso_regression", Lasso(alpha = 10))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
lasso_scores = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')
print(lasso_scores)
print(np.mean(lasso_scores))

[0.90126483 0.91466516 0.79834316 0.77275615 0.91044612]
0.8594950842585085


The best lasso model was the one with an alpha of 10 (mean R^2 of about 0.8595). It was not better than the best-performing ridge model (about 0.8634), although it is still far better than the OLS model.

# Part 3: Elastic Net

alpha = 10 gave us the best results for both ridge and lasso, so we will start with that value for elastic net and vary l1_ratio.

In [None]:
# Elastic net with alpha = 10 and l1_ratio = 0.1, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 10, l1_ratio = .1))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.51857267 0.60545365 0.56551838 0.52749108 0.58959984]
0.5613271234724171


In [None]:
# Elastic net with alpha = 10 and l1_ratio = 0.2, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 10, l1_ratio = .2))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.5452929  0.63225164 0.58936202 0.54996608 0.61599404]
0.586573338240022


In [None]:
# Elastic net with alpha = 10 and l1_ratio = 0.3, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 10, l1_ratio = .3))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.57448069 0.66105656 0.61470614 0.57394039 0.64437059]
0.6137108752901929


In [None]:
# Elastic net with alpha = 10 and l1_ratio = 0.4, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 10, l1_ratio = .4))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.60642199 0.69195506 0.64148681 0.5993884  0.67482306]
0.6428150628494537


In [None]:
# Elastic net with alpha = 10 and l1_ratio = 0.5, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 10, l1_ratio = .5))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.6414183  0.72495609 0.66949369 0.62617536 0.70739037]
0.6738867625545876


In [None]:
# Elastic net with alpha = 10 and l1_ratio = 0.6, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 10, l1_ratio = .6))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.67977875 0.75995082 0.69826358 0.65396424 0.74199763]
0.7067910042238292


In [None]:
# Elastic net with alpha = 10 and l1_ratio = 0.7, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 10, l1_ratio = .7))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.72179896 0.79661752 0.72686734 0.68206736 0.77845047]
0.7411603302801135


In [None]:
# Elastic net with alpha = 10 and l1_ratio = 0.8, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 10, l1_ratio = .8))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.76784457 0.83443673 0.75356924 0.70924416 0.81651998]
0.7763229359426165


In [None]:
# Elastic net with alpha = 10 and l1_ratio = 0.9, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 10, l1_ratio = .9))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.81908537 0.87346804 0.77511063 0.73376357 0.85711255]
0.8117080324541781


In [None]:
# Elastic net with alpha = 10 and l1_ratio = 0.99, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 10, l1_ratio = .99))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.88023223 0.91886489 0.78641743 0.76176058 0.90843509]
0.8511420436156341


Based on these results, with alpha = 10 elastic net only gives good results when l1_ratio is near 1. Let's try some other alphas.

In [None]:
# Elastic net with alpha = 0.001 and l1_ratio = 0.99, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .001, l1_ratio = .99))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))



[0.8985533  0.91016468 0.78746074 0.76949634 0.90245839]




0.853626689930415




In [None]:
# Elastic net with alpha = 0.001 and l1_ratio = 0.5, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .001, l1_ratio = .5))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))



[0.89833171 0.91776933 0.79283004 0.78291948 0.91505287]




0.8613806869559927




In [None]:
# Elastic net with alpha = 0.001 and l1_ratio = 0.25, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .001, l1_ratio = .25))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))



[0.89818501 0.91856248 0.79394458 0.78419233 0.9152417 ]




0.8620252220028732




In [None]:
# Elastic net with alpha = 0.001 and l1_ratio = 0.1, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .001, l1_ratio = .1))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))



[0.89813129 0.91886753 0.79443593 0.78463365 0.91525145]




0.8622639708214237




In [None]:
# Elastic net with alpha = 0.01 and l1_ratio = 0.9, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .01, l1_ratio = .9))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))



[0.89810373 0.9190335  0.79471785 0.7848517  0.91525069]




0.8623914920036897




In [None]:
# Elastic net with alpha = 0.01 and l1_ratio = 0.6, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .01, l1_ratio = .6))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))



[0.89738131 0.92117134 0.79744001 0.78488968 0.91586869]




0.8633502065818149




This one is about as good as our previous best from ridge alone!

In [None]:
# Elastic net with alpha = 0.01 and l1_ratio = 0.5, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .01, l1_ratio = .5))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))



[0.89707388 0.92159413 0.79756085 0.78435902 0.91612413]




0.8633424013104791




In [None]:
# Elastic net with alpha = 0.01 and l1_ratio = 0.4, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .01, l1_ratio = .4))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.89675349 0.92195091 0.79755187 0.78379669 0.91634021]
0.8632786333452508


In [None]:
# Elastic net with alpha = 0.01 and l1_ratio = 0.25, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .01, l1_ratio = .25))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.89626552 0.92238805 0.79739278 0.78294203 0.91659151]
0.863115976972448


In [None]:
# Elastic net with alpha = 0.1 and l1_ratio = 0.9, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .1, l1_ratio = .9))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.89546139 0.92292223 0.79693198 0.7815566  0.91685411]
0.8627452619329926


In [None]:
# Elastic net with alpha = 0.1 and l1_ratio = 0.5, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .1, l1_ratio = .5))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.88685578 0.92263593 0.78992096 0.76872349 0.91380872]
0.8563889762359503


In [None]:
# Elastic net with alpha = 0.1 and l1_ratio = 0.25, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .1, l1_ratio = .25))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.88337032 0.92087376 0.78791151 0.76475035 0.91110037]
0.8536012629243084


In [None]:
# Elastic net with alpha = 0.1 and l1_ratio = 0.1, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = .1, l1_ratio = .1))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.88154593 0.91975921 0.78712297 0.76295714 0.90955755]
0.852188558499626


In [None]:
# Elastic net with alpha = 1 and l1_ratio = 0.9, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 1, l1_ratio = .9))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.88038833 0.91900231 0.78668326 0.76191068 0.9085572 ]
0.8513083584934382


In [None]:
# Elastic net with alpha = 1 and l1_ratio = 0.5, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 1, l1_ratio = .5))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.84881153 0.89544437 0.78218137 0.74552196 0.88112237]
0.8306163198059204


In [None]:
# Elastic net with alpha = 1 and l1_ratio = 0.25, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 1, l1_ratio = .25))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.83343264 0.88406276 0.77915019 0.73961742 0.86855425]
0.8209634525083622


In [None]:
# Elastic net with alpha = 1 and l1_ratio = 0.1, using the shared preprocessing step `ct`
net_pipeline = Pipeline(
  [("preprocessing", ct),
  ("net_regression", ElasticNet(alpha = 1, l1_ratio = .1))]
)

# Cross-validate once (5-fold, R^2) and reuse the result for both the
# per-fold printout and the mean, instead of running the same CV twice.
net_scores = cross_val_score(net_pipeline, X, y, cv = 5, scoring = 'r2')
print(net_scores)
print(np.mean(net_scores))

[0.82480667 0.87771502 0.77689093 0.73620154 0.8616636 ]
0.8154555502063175


After many tries, the best average R^2 we could get was about 0.86335, which was achieved by both ridge regression (alpha = 10) and elastic net (alpha = 0.01, l1_ratio = 0.6).