In [65]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.compose import ColumnTransformer

Consider four possible models for predicting house prices:

    Using only the size and number of rooms.
    Using size, number of rooms, and building type.
    Using size and building type, and their interaction.
    Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.


In [66]:
ames = pd.read_csv("AmesHousing.csv")
ames.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [67]:
ames['Bldg Type']

0       1Fam
1       1Fam
2       1Fam
3       1Fam
4       1Fam
        ... 
2925    1Fam
2926    1Fam
2927    1Fam
2928    1Fam
2929    1Fam
Name: Bldg Type, Length: 2930, dtype: object

In [68]:
X = ames.drop(columns=['SalePrice'], axis=1)
y = ames["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Size and Bedroom Pipeline

In [69]:
#LotArea = size? and Bedroom

#Standardizes these columns and drops the rest
ct1 = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Lot Area", "Bedroom AbvGr"])
  ],
  remainder = "drop"
)

pipe1 = Pipeline(
  [("standardize", ct1),
  ("linear_regression", LinearRegression())]
).set_output(transform = "pandas")


pipe1_fit = pipe1.fit(X_train, y_train)
y_preds = pipe1_fit.predict(X_test)
print(pipe1_fit.named_steps['linear_regression'].coef_)
print(mean_squared_error(y_test, y_preds))
scores = cross_val_score(pipe1, X, y, cv=5, scoring='neg_mean_squared_error')
-scores.mean()


[20624.35738284  8911.61822334]
5322854570.032586


5942920483.4934025

## Size, Bedroom, and Building Type Pipeline


In [70]:
#Standardizes these columns and drops the rest
ct2 = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Lot Area", "Bedroom AbvGr"])
  ],
  remainder = "drop"
)

pipe2 = Pipeline(
  [("standardize", ct2),
  ("linear_regression", LinearRegression())]
).set_output(transform = "pandas")


pipe2_fit = pipe2.fit(X_train, y_train)
y_preds = pipe2_fit.predict(X_test)
print(pipe2_fit.named_steps['linear_regression'].coef_)
print(mean_squared_error(y_test, y_preds))
scores = cross_val_score(pipe2, X, y, cv=5, scoring='neg_mean_squared_error')
-scores.mean()

[ 21352.52219231 -47432.48366306 -39226.33174702   2359.21268099
  62947.08053678  21232.35255458  15871.80068945]
5062558346.127863


5635180047.845639

##  Size and Building Type and Interaction

In [71]:
X_train_dummified = ct3.fit_transform(X_train)
X_train_dummified

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Lot Area,remainder__Order,remainder__PID,remainder__MS SubClass,remainder__MS Zoning,...,remainder__Screen Porch,remainder__Pool Area,remainder__Pool QC,remainder__Fence,remainder__Misc Feature,remainder__Misc Val,remainder__Mo Sold,remainder__Yr Sold,remainder__Sale Type,remainder__Sale Condition
533,1.0,0.0,0.0,0.0,0.0,-0.083155,534,531363010,20,RL,...,0,0,,,,0,4,2009,WD,Normal
802,1.0,0.0,0.0,0.0,0.0,0.553796,803,906203120,20,RL,...,0,0,,,,0,6,2009,WD,Normal
955,1.0,0.0,0.0,0.0,0.0,0.515045,956,916176030,20,RL,...,233,0,,,,0,1,2009,COD,Abnorml
459,0.0,0.0,0.0,0.0,1.0,-0.476060,460,528180130,120,RL,...,0,0,,,,0,4,2009,WD,Normal
486,1.0,0.0,0.0,0.0,0.0,-0.066977,487,528290030,80,RL,...,0,0,,,,0,5,2009,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2763,1.0,0.0,0.0,0.0,0.0,-0.028477,2764,906420020,60,RL,...,0,0,,,,0,2,2006,WD,Abnorml
905,1.0,0.0,0.0,0.0,0.0,-0.497631,906,909129090,50,RH,...,0,0,,,,0,7,2009,WD,Abnorml
1096,1.0,0.0,0.0,0.0,0.0,0.274887,1097,528292060,60,RL,...,0,0,,,,0,5,2008,WD,Normal
235,1.0,0.0,0.0,0.0,0.0,0.044762,236,905426060,30,RL,...,0,0,,,Shed,400,5,2010,COD,Abnorml


In [72]:
#Standardizes these columns and drops the rest
ct3 = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Lot Area"])
  ],
  remainder = "passthrough"
)

inter = ColumnTransformer(
  [
    ("interaction1", PolynomialFeatures(interaction_only = True), ["dummify__Bldg Type_1Fam", "standardize__Lot Area"]),
      ("interaction2", PolynomialFeatures(interaction_only = True), ["dummify__Bldg Type_2fmCon", "standardize__Lot Area"]),
      ("interaction3", PolynomialFeatures(interaction_only = True), ["dummify__Bldg Type_Duplex", "standardize__Lot Area"]),
      ("interaction4", PolynomialFeatures(interaction_only = True), ["dummify__Bldg Type_Twnhs", "standardize__Lot Area"]),
      ("interaction5", PolynomialFeatures(interaction_only = True), ["dummify__Bldg Type_TwnhsE", "standardize__Lot Area"])
  ],
  remainder = "drop"
)

pipe3 = Pipeline(
  [("standardize", ct3),
   ("interaction",inter),
  ("linear_regression", LinearRegression())]
).set_output(transform = "pandas")


pipe3_fit = pipe3.fit(X_train, y_train)
y_preds = pipe3_fit.predict(X_test)
print(pipe3_fit.named_steps['linear_regression'].coef_)
print(mean_squared_error(y_test, y_preds))
scores = cross_val_score(pipe3, X, y, cv=5, scoring='neg_mean_squared_error')
-scores.mean()

[ 0.00000000e+00 -4.90489481e+04  1.87575908e+04 -7.21346784e+04
  5.82076609e-11 -1.09771091e+05  1.87575908e+04 -8.23564957e+04
 -2.91038305e-11 -9.11374936e+04  1.87575908e+04 -6.83358944e+04
 -1.88079096e-37  2.02164355e+05  1.87575908e+04  2.08772694e+05
 -4.59177481e-41  4.77931778e+04  1.87575908e+04  3.28119655e+04]
4998044119.378019


5742039455.364227

##  5th order polynomials + building type

In [73]:
#Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type
#Standardizes these columns and drops the rest
ct4 = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(degree=5, include_bias = False), ["Lot Area", "Bedroom AbvGr"])
  ],
  remainder = "drop"
)

pipe4 = Pipeline(
  [("standardize", ct4),
  ("linear_regression", LinearRegression())]
).set_output(transform = "pandas")


pipe4_fit = pipe4.fit(X_train, y_train)
y_preds = pipe4_fit.predict(X_test)
print(pipe4_fit.named_steps['linear_regression'].coef_)
print(mean_squared_error(y_test, y_preds))
scores = cross_val_score(pipe4, X, y, cv=5, scoring='neg_mean_squared_error')
-scores.mean()

[ 2.97596428e-17 -2.39661292e-07 -8.51502821e-13 -5.45661058e-14
 -9.73169894e-17  2.60210759e-12  1.00326326e-15  3.78087136e-08
  1.85556575e-11  6.81047071e-15  8.08099567e-10  2.21619026e-07
  9.82797909e-11  3.88102660e-14 -3.46349473e-14  1.22968796e-09
  1.08526286e-06  5.06152307e-10  2.17398673e-13 -8.80884917e-20
  2.25119792e-14 -9.12200380e-10  5.45817098e-06  2.68203830e-09
  1.25442861e-12]
7247582155.362603


253471039910.07037

Consider one hundred modeling options for house price:

    House size, trying degrees 1 through 10
    Number of rooms, trying degrees 1 through 10
    Building Type

Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?


In [85]:
ames = pd.read_csv("AmesHousing.csv")
X = ames[["Lot Area", "TotRms AbvGrd","Bldg Type"]]
y = ames["SalePrice"]

In [86]:
ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial1", PolynomialFeatures(), ["Lot Area"]),
    ("polynomial2", PolynomialFeatures(), ["TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial1__degree': np.arange(1, 10),'preprocessing__polynomial2__degree': np.arange(1, 10)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_['mean_test_score']

array([  0.3434329 ,   0.34013712,   0.3499688 ,   0.34895661,
         0.35062807,   0.33441832,   0.1560209 ,  -0.84208817,
         0.21330422,   0.35891578,   0.35642933,   0.36788072,
         0.36714644,   0.36837807,   0.36158475,   0.1385557 ,
        -1.72718411,  -2.02990958,   0.27886493,   0.26407843,
         0.27050641,   0.27605854,   0.30006853,   0.30830868,
         0.27796525,   0.27596013,  -2.43755036,  -1.51122832,
        -1.51122832,  -1.51122832,  -1.51122832,  -0.72370927,
        -0.30994997,  -0.51130538,  -0.46398827,  -0.4050215 ,
        -3.90302896,  -3.90302896,  -3.90302896,  -3.90302896,
        -3.90302896,  -3.90302896,  -3.90302896,  -3.90302896,
        -2.92800433, -11.32761461, -11.32761461, -11.32761461,
       -11.32761461, -11.32761461, -11.32761461, -11.32761461,
       -11.32761461, -11.32761461, -16.66475385, -16.66475385,
       -16.66475385, -16.66475385, -16.66475385, -16.66475385,
       -16.66475385, -16.66475385, -16.66475385,  -9.42

In [87]:
df = pd.DataFrame(data = {"degrees_size": np.array(list(map(lambda x: [x]*9, list(range(1,10))))).reshape(1,-1)[0], "degrees_bedroom": list(range(1,10))*9, "scores": gscv_fitted.cv_results_['mean_test_score']})
df

Unnamed: 0,degrees_size,degrees_bedroom,scores
0,1,1,0.343433
1,1,2,0.340137
2,1,3,0.349969
3,1,4,0.348957
4,1,5,0.350628
...,...,...,...
76,9,5,-7.050214
77,9,6,-7.050214
78,9,7,-7.050214
79,9,8,-7.050214


In [88]:
df.iloc[df[df["scores"]==df["scores"].max()].index.values[0]]

degrees_size       2.000000
degrees_bedroom    5.000000
scores             0.368378
Name: 13, dtype: float64

Model with cubic size and linear bedrooms did the best.
Trying all values could get computationally long to perform, probably smarter to have a cutoff point such as a check to see if the metric is only decreasing and then stop continuing to increase complexity at that point and move on.