## Practice Activity 7.1 : Cross-Validation and Tuning
## Author: David Greco
# 11/8/24

In [8]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

## 13.2.5 Practice Activity

In [6]:
lr = LinearRegression()

ames = pd.read_csv(r"c:\Users\elect\OneDrive\Desktop\AmesHousing.csv")

X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
# Size and Number of Rooms
lr = LinearRegression()

ct = ColumnTransformer(
    [("standardize", StandardScaler(),["Gr Liv Area", "TotRms AbvGrd"])],
    remainder = "drop"
)

my_pipeline = Pipeline([
    ("PreProcessing", ct),
    ("Regression", lr)
]).set_output(transform="pandas")

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_test)

mse = mean_squared_error(y_true = y_test, y_pred = y_pred)
rmse = mse**(1/2)
print(f"Root Mean Squared Error: {rmse}")

scores = cross_val_score(fitted_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
avg_rmse = (-1*scores.mean())**(1/2)
print(f"Average Root Mean Squared Error: {avg_rmse}")

Root Mean Squared Error: 64733.52286904902
Average Root Mean Squared Error: 56001.24023779208


In [12]:
# Size, Number of Rooms, Building Type
ct = ColumnTransformer(
    [("Dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(),["Gr Liv Area", "TotRms AbvGrd"])],
    remainder="drop"
)

my_pipeline = Pipeline([
    ("PreProcessing", ct),
    ("Regression", lr)
]).set_output(transform="pandas")

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_test)

mse = mean_squared_error(y_true = y_test, y_pred = y_pred)
rmse = mse**(1/2)
print(f"Root Mean Squared Error: {rmse}")

scores = cross_val_score(fitted_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
avg_rmse = (-1*scores.mean())**(1/2)
print(f"Average Root Mean Squared Error: {avg_rmse}")

Root Mean Squared Error: 62435.308801263
Average Root Mean Squared Error: 54332.25522744965


In [13]:
# Size, Building Type, and their interaction
ct = ColumnTransformer(
    [("Dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(),["Gr Liv Area"])],
    remainder="drop"
).set_output(transform="pandas")

ct_inter = ColumnTransformer(
    [("Interaction", PolynomialFeatures(interaction_only=True), ["standardize__Gr Liv Area", "Dummify__Bldg Type_1Fam", "Dummify__Bldg Type_2fmCon", "Dummify__Bldg Type_Duplex", "Dummify__Bldg Type_Twnhs", "Dummify__Bldg Type_TwnhsE"])],
    remainder="drop"
).set_output(transform="pandas")

X_train_dummified = ct.fit_transform(X_train)
X_train_inter = ct_inter.fit_transform(X_train_dummified)

my_pipeline = Pipeline([
    ("PreProcessing", ct),
    ("Interact", ct_inter),
    ("Regression", lr)
]).set_output(transform="pandas")

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_test)

mse = mean_squared_error(y_true = y_test, y_pred = y_pred)
rmse = mse**(1/2)
print(f"Root Mean Squared Error: {rmse}")

scores = cross_val_score(fitted_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
avg_rmse = (-1*scores.mean())**(1/2)
print(f"Average Root Mean Squared Error: {avg_rmse}")

Root Mean Squared Error: 60699.94625913544
Average Root Mean Squared Error: 53583.84060019522


In [14]:
# 5 Degree Polynomial on Size and Number of Rooms, with building type
ct = ColumnTransformer(
    [("Dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
     ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])],
    remainder = "drop"
).set_output(transform="pandas")

poly = PolynomialFeatures(degree = (1, 5))

ct_poly = ColumnTransformer(
    [("5-Degree Polynomial", poly, ["standardize__Gr Liv Area", "standardize__TotRms AbvGrd"])],
    remainder = "drop"
).set_output(transform = "pandas")

X_train_dummified = ct.fit_transform(X_train)
X_train_poly = ct_poly.fit_transform(X_train_dummified)

my_pipeline = Pipeline([
    ("PreProcessing", ct),
    ("Poly", ct_poly),
    ("Regression", lr)
]).set_output(transform="pandas")

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_test)

mse = mean_squared_error(y_true = y_test, y_pred = y_pred)
rmse = mse**(1/2)
print(f"Root Mean Squared Error: {rmse}")

scores = cross_val_score(fitted_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
avg_rmse = (-1*scores.mean())**(1/2)
print(f"Average Root Mean Squared Error: {avg_rmse}")

Root Mean Squared Error: 144594.56677503738
Average Root Mean Squared Error: 78352.3821374074


The model with the lowest Root Mean Squared Error (RMSE) is the one that includes size, building type, and their interaction.

Root Mean Squared Error: 60699.95

Average Root Mean Squared Error: 53583.84

After performing cross-validation on all four models, we found that this model consistently produced the lowest average RMSE, confirming our earlier result obtained from a single RMSE calculation. This indicates that incorporating the interaction between size and building type enhances the model’s accuracy.

## 2 Practice Activity 

In [18]:
ct = ColumnTransformer(
    [("Dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
     ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])],
    remainder = "drop"
).set_output(transform="pandas")

poly = PolynomialFeatures()

ct_poly = ColumnTransformer(
    [("Poly_1", poly, ["standardize__Gr Liv Area"]),
    ("Poly_2", poly, ["standardize__TotRms AbvGrd"])],
    remainder = "drop"
)

ct_poly_1 = ColumnTransformer(
    [("Polynomial_1", poly, ["standardize__Gr Liv Area"])],
    remainder = "passthrough"
).set_output(transform = "pandas")

ct_poly_2 = ColumnTransformer(
    [("Polynomial_2", poly, ["remainder__standardize__TotRms AbvGrd"])],
    remainder = "drop"
).set_output(transform = "pandas")

X_train_dummified = ct.fit_transform(X_train)
X_train_poly = ct_poly.fit_transform(X_train_dummified)

my_pipeline = Pipeline([
    ("PreProcessing", ct),
    ("Poly", ct_poly),
    ("Regression", lr)
]).set_output(transform="pandas")

degrees = {'Poly__Poly_1__degree': list(range(1, 11)), "Poly__Poly_2__degree": list(range(1, 11))}

gscv = GridSearchCV(my_pipeline, degrees, cv = 5, scoring='r2')

In [16]:
gscv_fitted = gscv.fit(X, y)

## 13.3.3 Practice Activity Answer

In [17]:
scores = gscv_fitted.cv_results_['mean_test_score']

reshaped_scores = scores.reshape(-1, 10)

df = pd.DataFrame(reshaped_scores)

df.index = np.arange(1, len(df) + 1)

df.columns = [i for i in range(1, 11)]
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
1,0.504209,0.506386,0.509102,0.514402,0.508661,0.513551,0.109585,-1.243942,0.225695,-1.718506
2,0.510919,0.506613,0.506505,0.5074,0.491352,0.504176,0.360343,-0.301577,0.457286,-0.251378
3,0.529559,0.530418,0.527478,0.523482,0.510005,0.509287,0.452479,0.383339,-1.146477,-64.898834
4,0.52286,0.526006,0.525562,0.530995,0.520465,0.516726,0.452285,0.384775,-1.340203,-67.134251
5,0.476571,0.48966,0.496708,0.515624,0.492705,0.419309,0.177101,-0.00165,-1.708776,-69.114476
6,0.427372,0.461563,0.473685,0.525176,0.516866,0.247598,-1.113622,-3.440775,-5.53108,-69.024411
7,0.515959,0.506099,0.478719,0.282558,-0.031128,0.387929,-0.330182,-5.203017,-11.353276,-78.811957
8,-1.274206,-1.482915,-1.663327,-3.664042,-4.715253,-2.227257,-0.051164,-6.513658,-25.254123,-111.896176
9,-44.096407,-45.397061,-46.331579,-50.623998,-61.639079,-53.738819,-38.330567,-20.75673,-32.837642,-107.503956
10,-529.00328,-517.904336,-519.703542,-541.599417,-569.397962,-548.834326,-503.57074,-479.702403,-532.579372,-925.573737


## Q1:
The most effective model is the one that incorporates a polynomial degree of 4 for both house size and number of rooms. This configuration strikes the best balance between model complexity and accuracy, capturing the non-linear relationships in the data without overfitting.

## Q2:
One downside of testing all possible modeling options is the significant increase in computational time due to the large number of calculations required. To mitigate this, a practical approach would be to set a cutoff threshold, where testing of higher-degree polynomials stops once a certain performance metric or value is reached, thus optimizing efficiency without sacrificing model quality.