---
title: "Cross Validation and Tuning"
format:
  html:
    embed-resources: true
    code-fold: true
execute:
  echo: true
  warning: false
  message: false
---

# **Day 1**

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
lr = LinearRegression()


ames = pd.read_csv("/content/AmesHousing (1).csv")
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]



X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train_s = (X_train - X_train.mean())/X_train.std()

lr_fitted = lr.fit(X_train_s, y_train)
lr_fitted.coef_

array([ 68195.68377691, -16752.45579666])

In [3]:
y_preds = lr_fitted.predict(X_test)

r2_score(y_test, y_preds)

-1682645.5266480977

In [4]:
y_preds[1:5]

array([1.58720521e+08, 7.75497188e+07, 7.58101359e+07, 8.24765605e+07])

THESE ARE WAYYY OFF

In [5]:
new_house = pd.DataFrame(data = {"Gr Liv Area": [889], "TotRms AbvGrd": [6]})
new_house

Unnamed: 0,Gr Liv Area,TotRms AbvGrd
0,889,6


In [6]:
new_house_s = (new_house - new_house.mean())/new_house.std()
new_house_s

Unnamed: 0,Gr Liv Area,TotRms AbvGrd
0,,


AGAIN, doesn't work correctly

In [7]:
X_test_s = (X_test - X_train.mean())/X_train.std()
y_preds = lr_fitted.predict(X_test_s)

r2_score(y_test, y_preds)

0.5579561022194479

In [8]:
new_house_s = (new_house - X_train.mean())/X_train.std()
lr_fitted.predict(new_house_s)

array([101402.3256956])

Now let's try to use piplines instead to normalize this process

In [9]:
lr_pipeline = Pipeline(
  [("standardize", StandardScaler()),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

In [10]:
lr_pipeline_fitted = lr_pipeline.fit(X_train, y_train)

y_preds = lr_pipeline_fitted.predict(X_test)
r2_score(y_test, y_preds)

0.5579561022194479

In [11]:
lr_pipeline_fitted.predict(new_house)

array([101402.3256956])

In [12]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

In [13]:
X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]



X_train, X_test, y_train, y_test = train_test_split(X, y)

lr_fitted = lr_pipeline.fit(X_train, y_train)


In [15]:
ct_fitted = ct.fit(X_train)

ct.transform(X_train)

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.46616696, -0.2587848 ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12972972, -0.2587848 ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.50463161,  1.00325783],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.62696802,  1.00325783],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         0.65064603,  1.00325783],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.22346732, -0.88980611]])

In [16]:
ct.transform(X_test)

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.83120301, -0.2587848 ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60920951, -0.2587848 ],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.57566565,  0.37223651],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.63486069,  0.37223651],
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.10605171,  0.37223651],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.3822952 , -0.2587848 ]])

In [17]:
lr_pipeline_fitted.named_steps['linear_regression'].coef_

array([ 68180.16182752, -16748.64278744])

In [18]:
lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


ct.fit_transform(X_train)

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Gr Liv Area,standardize__TotRms AbvGrd
1740,1.0,0.0,0.0,0.0,0.0,-0.466167,-0.258785
2738,1.0,0.0,0.0,0.0,0.0,0.129730,-0.258785
2647,1.0,0.0,0.0,0.0,0.0,0.504632,1.003258
2385,1.0,0.0,0.0,0.0,0.0,0.701948,1.003258
617,1.0,0.0,0.0,0.0,0.0,0.388215,-0.889806
...,...,...,...,...,...,...,...
1366,1.0,0.0,0.0,0.0,0.0,-1.320549,-1.520827
2583,1.0,0.0,0.0,0.0,0.0,-0.235306,-0.889806
207,0.0,1.0,0.0,0.0,0.0,0.626968,1.003258
324,0.0,0.0,1.0,0.0,0.0,0.650646,1.003258


In [19]:
ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

ct_inter.fit_transform(X_train)

Unnamed: 0,interaction__1,interaction__Gr Liv Area,interaction__TotRms AbvGrd,interaction__Gr Liv Area TotRms AbvGrd
1740,1.0,1258.0,6.0,7548.0
2738,1.0,1560.0,6.0,9360.0
2647,1.0,1750.0,8.0,14000.0
2385,1.0,1850.0,8.0,14800.0
617,1.0,1691.0,5.0,8455.0
...,...,...,...,...
1366,1.0,825.0,4.0,3300.0
2583,1.0,1375.0,5.0,6875.0
207,1.0,1812.0,8.0,14496.0
324,1.0,1824.0,8.0,14592.0


In [20]:
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])],
  remainder = "passthrough"
).set_output(transform = "pandas")

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["remainder__TotRms AbvGrd", "dummify__Bldg Type_1Fam"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

X_train_dummified = ct_dummies.fit_transform(X_train)
X_train_dummified

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,remainder__Order,remainder__PID,remainder__MS SubClass,remainder__MS Zoning,remainder__Lot Frontage,...,remainder__Screen Porch,remainder__Pool Area,remainder__Pool QC,remainder__Fence,remainder__Misc Feature,remainder__Misc Val,remainder__Mo Sold,remainder__Yr Sold,remainder__Sale Type,remainder__Sale Condition
1740,1.0,0.0,0.0,0.0,0.0,1741,528222090,20,RL,64.0,...,0,0,,,,0,8,2007,New,Partial
2738,1.0,0.0,0.0,0.0,0.0,2739,905427140,80,RL,78.0,...,0,0,,MnPrv,,0,5,2006,WD,Normal
2647,1.0,0.0,0.0,0.0,0.0,2648,902125020,70,RM,50.0,...,0,0,,MnPrv,,0,6,2006,WD,Abnorml
2385,1.0,0.0,0.0,0.0,0.0,2386,528116030,20,RL,98.0,...,0,0,,,,0,3,2006,WD,Normal
617,1.0,0.0,0.0,0.0,0.0,618,534475250,20,RL,90.0,...,0,0,,MnPrv,,0,4,2009,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1366,1.0,0.0,0.0,0.0,0.0,1367,903475020,20,RM,49.0,...,0,0,,MnPrv,,0,6,2008,WD,Normal
2583,1.0,0.0,0.0,0.0,0.0,2584,535303050,20,RL,75.0,...,0,0,,,,0,6,2006,WD,Normal
207,0.0,1.0,0.0,0.0,0.0,208,903476030,190,RM,76.0,...,0,0,,MnPrv,,0,5,2010,WD,Normal
324,0.0,0.0,1.0,0.0,0.0,325,923204150,90,RL,94.0,...,0,0,,,,0,4,2010,WD,Normal


In [21]:
ct_inter.fit_transform(X_train_dummified)

Unnamed: 0,interaction__1,interaction__remainder__TotRms AbvGrd,interaction__dummify__Bldg Type_1Fam,interaction__remainder__TotRms AbvGrd dummify__Bldg Type_1Fam
1740,1.0,6.0,1.0,6.0
2738,1.0,6.0,1.0,6.0
2647,1.0,8.0,1.0,8.0
2385,1.0,8.0,1.0,8.0
617,1.0,5.0,1.0,5.0
...,...,...,...,...
1366,1.0,4.0,1.0,4.0
2583,1.0,5.0,1.0,5.0
207,1.0,8.0,0.0,0.0
324,1.0,8.0,0.0,0.0


Consider four possible models for predicting house prices:

* Using only the size and number of rooms.

* Using size, number of rooms, and building type.

* Using size and building type, and their interaction.

* Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [22]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Define X and y
X = ames.drop(columns=["SalePrice"])
y = ames["SalePrice"]

# Model 1: Using only the size and number of rooms
features1 = ["Gr Liv Area", "TotRms AbvGrd"]

ct_1 = ColumnTransformer(
    [
        ("select", "passthrough", features1)
    ],
    remainder="drop"
)

pipeline1 = Pipeline([
    ('preprocessing', ct_1),
    ('standardize', StandardScaler()),
    ('linear_regression', LinearRegression())
])

X1_train = X_train[features1]
X1_test = X_test[features1]


pipeline1.fit(X_train, y_train) # Fit using the full X_train
y1_preds = pipeline1.predict(X_test) # Predict using the full X_test

rmse1 = np.sqrt(mean_squared_error(y_test, y1_preds))
print(f"RMSE for Model 1: {rmse1}")


# Model 2: Using size, number of rooms, and building type.
# Defing 'Gr Liv Area', 'TotRms AbvGrd', and 'Bldg Type'.
features2 = ["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]
X2_train = X_train[features2]
X2_test = X_test[features2]

# Creatinv a ColumnTransformer for preprocessing.
# 'dummify' applies OneHotEncoder to 'Bldg Type' to handle categorical data.
# 'standardize' applies StandardScaler to 'Gr Liv Area' and 'TotRms AbvGrd' to scale numerical features.
ct2 = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop", # Drop any columns not specified in the transformers.
)

# Create a pipeline for Model 2, including preprocessing and linear regression.
pipeline2 = Pipeline([
    ('preprocessing', ct2),
    ('linear_regression', LinearRegression())
])

# Fit the pipeline to the training data.
pipeline2.fit(X_train, y_train) # Fit using the full X_train

# Make predictions on the test data.
y2_preds = pipeline2.predict(X_test) # Predict using the full X_test

# Calculate the Root Mean Squared Error (RMSE) for Model 2.
rmse2 = np.sqrt(mean_squared_error(y_test, y2_preds))
print(f"RMSE for Model 2: {rmse2}")


# Model 3: Using size and building type, and their interaction.
# Define the features for this model: 'Gr Liv Area', 'Bldg Type'. 'TotRms AbvGrd' is not used directly here.
features3 = ["Gr Liv Area", "Bldg Type"]
X3_train = X_train[features3]
X3_test = X_test[features3]

# Create a ColumnTransformer for preprocessing, including dummification and interaction terms.
ct3 = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]), # One-hot encode 'Bldg Type'.
        ("standardize", StandardScaler(), ["Gr Liv Area"]), # Standardize Gr Liv Area
        ("interaction", PolynomialFeatures(interaction_only=True, include_bias=False), ["Gr Liv Area"]), # Create interaction term for 'Gr Liv Area'.

    ],
    remainder="drop", # Keep 'Gr Liv Area' for interaction
)

# Create a pipeline for Model 3, including preprocessing and linear regression.
pipeline3 = Pipeline([
    ('preprocessing', ct3),
    ('linear_regression', LinearRegression())
])

# Fit the pipeline to the training data.
pipeline3.fit(X_train, y_train) # Fit using the full X_train

# Make predictions on the test data.
y3_preds = pipeline3.predict(X_test) # Predict using the full X_test

# Calculate the Root Mean Squared Error (RMSE) for Model 3.
rmse3 = np.sqrt(mean_squared_error(y_test, y3_preds))
print(f"RMSE for Model 3: {rmse3}")


# Model 4: Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
# Define the features for this model: 'Gr Liv Area', 'TotRms AbvGrd', and 'Bldg Type'.
features4 = ["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]
X4_train = X_train[features4]
X4_test = X_test[features4]

# Create a ColumnTransformer for preprocessing.
# 'dummify' applies OneHotEncoder to 'Bldg Type'.
# 'poly_size' applies PolynomialFeatures (degree 5) to 'Gr Liv Area'.
# 'poly_rooms' applies PolynomialFeatures (degree 5) to 'TotRms AbvGrd'.
ct4 = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("poly_size", PolynomialFeatures(degree=5, include_bias=False), ["Gr Liv Area"]),
        ("poly_rooms", PolynomialFeatures(degree=5, include_bias=False), ["TotRms AbvGrd"]),
    ],
    remainder="drop", # Drop any columns not specified in the transformers.
)

# Create a pipeline for Model 4, including preprocessing and linear regression.
pipeline4 = Pipeline([
    ('preprocessing', ct4),
    ('linear_regression', LinearRegression())
])

# Fit the pipeline to the training data.
pipeline4.fit(X_train, y_train) # Fit using the full X_train

# Make predictions on the test data.
y4_preds = pipeline4.predict(X_test) # Predict using the full X_test

# Calculate the Root Mean Squared Error (RMSE) for Model 4.
rmse4 = np.sqrt(mean_squared_error(y_test, y4_preds))
print(f"RMSE for Model 4: {rmse4}")

RMSE for Model 1: 57259.11459131699
RMSE for Model 2: 55285.95375826663
RMSE for Model 3: 55729.04375333086
RMSE for Model 4: 58079.74529672229


In [23]:
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Define X and y
X = ames.drop(columns=["SalePrice"])
y = ames["SalePrice"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)


# Model 1: Using only the size and number of rooms
ct_1 = ColumnTransformer(
    [
        ("select", "passthrough", ["Gr Liv Area", "TotRms AbvGrd"])
    ],
    remainder="drop"
)

pipeline1 = Pipeline([
    ('preprocessing', ct_1),
    ('standardize', StandardScaler()),
    ('linear_regression', LinearRegression())
])

# Fit using the full X_train, ColumnTransformer will select the features
pipeline1.fit(X_train, y_train)
# Predict using the full X_test, ColumnTransformer will select the features
y1_preds = pipeline1.predict(X_test)

rmse1 = np.sqrt(mean_squared_error(y_test, y1_preds))
print(f"RMSE for Model 1: {rmse1}")


# Model 2: Using size, number of rooms, and building type.
# Create a ColumnTransformer for preprocessing.
# 'dummify' applies OneHotEncoder to 'Bldg Type' to handle categorical data.
# 'standardize' applies StandardScaler to 'Gr Liv Area' and 'TotRms AbvGrd' to scale numerical features.
ct2 = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop", # Drop any columns not specified in the transformers.
)

# Create a pipeline for Model 2, including preprocessing and linear regression.
pipeline2 = Pipeline([
    ('preprocessing', ct2),
    ('linear_regression', LinearRegression())
])

# Fit the pipeline to the training data.
pipeline2.fit(X_train, y_train) # Fit using the full X_train

# Make predictions on the test data.
y2_preds = pipeline2.predict(X_test) # Predict using the full X_test

# Calculate the Root Mean Squared Error (RMSE) for Model 2.
rmse2 = np.sqrt(mean_squared_error(y_test, y2_preds))
print(f"RMSE for Model 2: {rmse2}")


# Model 3: Using size and building type, and their interaction.
# Create a ColumnTransformer for preprocessing, including dummification and interaction terms.
ct3 = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]), # One-hot encode 'Bldg Type'.
        ("standardize", StandardScaler(), ["Gr Liv Area"]), # Standardize Gr Liv Area
        ("interaction", PolynomialFeatures(interaction_only=True, include_bias=False), ["Gr Liv Area"]), # Create interaction term for 'Gr Liv Area'.

    ],
    remainder="drop", # Keep 'Gr Liv Area' for interaction
)

# Create a pipeline for Model 3, including preprocessing and linear regression.
pipeline3 = Pipeline([
    ('preprocessing', ct3),
    ('linear_regression', LinearRegression())
])

# Fit the pipeline to the training data.
pipeline3.fit(X_train, y_train) # Fit using the full X_train

# Make predictions on the test data.
y3_preds = pipeline3.predict(X_test) # Predict using the full X_test

# Calculate the Root Mean Squared Error (RMSE) for Model 3.
rmse3 = np.sqrt(mean_squared_error(y_test, y3_preds))
print(f"RMSE for Model 3: {rmse3}")


# Model 4: Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
# Create a ColumnTransformer for preprocessing.
# 'dummify' applies OneHotEncoder to 'Bldg Type'.
# 'poly_size' applies PolynomialFeatures (degree 5) to 'Gr Liv Area'.
# 'poly_rooms' applies PolynomialFeatures (degree 5) to 'TotRms AbvGrd'.
ct4 = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("poly_size", PolynomialFeatures(degree=5, include_bias=False), ["Gr Liv Area"]),
        ("poly_rooms", PolynomialFeatures(degree=5, include_bias=False), ["TotRms AbvGrd"]),
    ],
    remainder="drop", # Drop any columns not specified in the transformers.
)

# Create a pipeline for Model 4, including preprocessing and linear regression.
pipeline4 = Pipeline([
    ('preprocessing', ct4),
    ('linear_regression', LinearRegression())
])

# Fit the pipeline to the training data.
pipeline4.fit(X_train, y_train) # Fit using the full X_train

# Make predictions on the test data.
y4_preds = pipeline4.predict(X_test) # Predict using the full X_test

# Calculate the Root Mean Squared Error (RMSE) for Model 4.
rmse4 = np.sqrt(mean_squared_error(y_test, y4_preds))
print(f"RMSE for Model 4: {rmse4}")

RMSE for Model 1: 55268.303363037936
RMSE for Model 2: 54210.31262903667
RMSE for Model 3: 54503.904169999565
RMSE for Model 4: 60813.42506213173


Cross validation

In [24]:
from sklearn.model_selection import cross_val_score

X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='r2')
scores

array([0.53197809, 0.53225302, 0.42829534, 0.56574793, 0.60613781])

TEST QUESTION -- how many times was the data tested -- in this case 5 times

TUNING -- note that degree name must reference the other parts of the code

In [25]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [26]:
gscv_fitted = gscv.fit(X, y) # This made 45 possibe models -- 9 models x 5 cross validation searches

gscv_fitted.cv_results_

{'mean_fit_time': array([0.05013499, 0.06169529, 0.03756366, 0.04180427, 0.03344402,
        0.04539924, 0.05305147, 0.04707866, 0.05618773]),
 'std_fit_time': array([0.01472467, 0.02616279, 0.01676121, 0.01546946, 0.02519663,
        0.01501647, 0.02574327, 0.02644647, 0.0254758 ]),
 'mean_score_time': array([0.02436042, 0.02984934, 0.01819777, 0.02481065, 0.01268392,
        0.02224922, 0.02689366, 0.01776705, 0.0199779 ]),
 'std_score_time': array([0.00499742, 0.00738552, 0.01001368, 0.00389093, 0.00149426,
        0.00639919, 0.00699826, 0.00904722, 0.00691561]),
 'param_preprocessing__polynomial__degree': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value=999999),
 'params': [{'preprocessing__polynomial__degree': np.int64(1)},
  {'preprocessing__polynomial__degree': np.int64(2)},
  {'preprocessing__polynomial__degree': np.int64(3)},
  {'preprocessing__polynomial__

In [27]:
# Find mean for each of the 9 models that we were testing out
gscv_fitted.cv_results_['mean_test_score']

array([ 0.52988868,  0.5314061 ,  0.55123644,  0.49921823,  0.49713958,
        0.44472675,  0.26412025, -0.26944803, -1.83251917])

Just nicer for vizualizing!

In [28]:
pd.DataFrame(data = {"degrees": np.arange(1, 10), "scores": gscv_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,1,0.529889
1,2,0.531406
2,3,0.551236
3,4,0.499218
4,5,0.49714
5,6,0.444727
6,7,0.26412
7,8,-0.269448
8,9,-1.832519


Model 3 would be the best in this case!

Practice Assignment

Once again consider four modeling options for house price:

* Using size, number of rooms, and building type.
* Using size and building type, and their interaction.
* Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
* Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [30]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np

# Model 1: Using only the size and number of rooms
# This pipeline (pipeline1) is already defined in cell YsI8u0DbB5so

# Calculate cross-validated RMSE for Model 1
rmse_scores_model1 = -cross_val_score(pipeline1, X, y, cv=5, scoring='neg_root_mean_squared_error')
print(f"Cross-validated RMSE for Model 1: {np.mean(rmse_scores_model1)}")


# Model 2: Using size, number of rooms, and building type.
# This pipeline (pipeline2) is already defined in cell YsI8u0DbB5so

# Calculate cross-validated RMSE for Model 2
rmse_scores_model2 = -cross_val_score(pipeline2, X, y, cv=5, scoring='neg_root_mean_squared_error')
print(f"Cross-validated RMSE for Model 2: {np.mean(rmse_scores_model2)}")

# Model 3: Using size and building type, and their interaction.
# This pipeline (pipeline3) is already defined in cell YsI8u0DbB5so

# Calculate cross-validated RMSE for Model 3
rmse_scores_model3 = -cross_val_score(pipeline3, X, y, cv=5, scoring='neg_root_mean_squared_error')
print(f"Cross-validated RMSE for Model 3: {np.mean(rmse_scores_model3)}")

# Model 4: Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
# This pipeline (pipeline4) is already defined in cell YsI8u0DbB5so

# Calculate cross-validated RMSE for Model 4
rmse_scores_model4 = -cross_val_score(pipeline4, X, y, cv=5, scoring='neg_root_mean_squared_error')
print(f"Cross-validated RMSE for Model 4: {np.mean(rmse_scores_model4)}")

# Determine which model performed best based on the average cross-validated RMSE
mean_rmses = {
    "Model 1": np.mean(rmse_scores_model1),
    "Model 2": np.mean(rmse_scores_model2),
    "Model 3": np.mean(rmse_scores_model3),
    "Model 4": np.mean(rmse_scores_model4)
}

best_model = min(mean_rmses, key=mean_rmses.get)
print(f"\nBased on cross-validation, the best performing model is: {best_model}")

Cross-validated RMSE for Model 1: 55806.32634926364
Cross-validated RMSE for Model 2: 54168.08142919383
Cross-validated RMSE for Model 3: 54344.554815480275
Cross-validated RMSE for Model 4: 56255.73634469771

Based on cross-validation, the best performing model is: Model 2


Based on the cross-validation results, **Model 2** (using size, number of rooms, and building type) had the lowest average RMSE, indicating it is the best performing model among the four considered.

To get the corresponding R-squared metric for Model 2 using cross-validation, we can run `cross_val_score` with the 'r2' scoring.

In [31]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Model 2 pipeline (assuming it's defined as pipeline2 in a previous cell)
# If not defined, you would need to redefine it here or in an earlier cell.
# ct2 = ColumnTransformer(
#     [
#         ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
#         ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
#     ],
#     remainder="drop",
# )
# pipeline2 = Pipeline([
#     ('preprocessing', ct2),
#     ('linear_regression', LinearRegression())
# ])


r2_scores_model2 = cross_val_score(pipeline2, X, y, cv=5, scoring='r2')
print(f"Cross-validated R-squared for Model 2: {np.mean(r2_scores_model2)}")

Cross-validated R-squared for Model 2: 0.5328824390692034
