In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV



In [2]:
ames = pd.read_csv("AmesHousing.csv")


In [3]:
print(ames.head())
print("-----")
print(ames.describe)
print("-----")
print(ames.columns)


   Order        PID  MS SubClass MS Zoning  Lot Frontage  Lot Area Street  \
0      1  526301100           20        RL         141.0     31770   Pave   
1      2  526350040           20        RH          80.0     11622   Pave   
2      3  526351010           20        RL          81.0     14267   Pave   
3      4  526353030           20        RL          93.0     11160   Pave   
4      5  527105010           60        RL          74.0     13830   Pave   

  Alley Lot Shape Land Contour  ... Pool Area Pool QC  Fence Misc Feature  \
0   NaN       IR1          Lvl  ...         0     NaN    NaN          NaN   
1   NaN       Reg          Lvl  ...         0     NaN  MnPrv          NaN   
2   NaN       IR1          Lvl  ...         0     NaN    NaN         Gar2   
3   NaN       Reg          Lvl  ...         0     NaN    NaN          NaN   
4   NaN       IR1          Lvl  ...         0     NaN  MnPrv          NaN   

  Misc Val Mo Sold Yr Sold Sale Type  Sale Condition  SalePrice  
0       

In [4]:
data = ames[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type', 'SalePrice']].copy()


In [5]:
data = data.dropna()

X = data[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']]
y = data['SalePrice']


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
num_features = ['Gr Liv Area', 'TotRms AbvGrd']
cat_features = ['Bldg Type']

In [8]:
# Model 1: size + rooms
preprocessor1 = ColumnTransformer([
        ('num', StandardScaler(), num_features)
], 
remainder='drop')

model1 = Pipeline([
        ('preprocessor', preprocessor1),
        ('regressor', LinearRegression())
])

model1

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [9]:
# Model 2: size + rooms + building type
preprocessor2 = ColumnTransformer([
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(drop='first'), cat_features)
])

model2 = Pipeline([
        ('preprocessor', preprocessor2),
        ('regressor', LinearRegression())
])

model2

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [10]:
# Model 3: size + building type + interaction
preprocessor3 = ColumnTransformer([
        ('num', StandardScaler(), ['Gr Liv Area']),
        ('cat', OneHotEncoder(drop='first'), cat_features)
])

model3 = Pipeline([
        ('preprocessor', preprocessor3),
        # degree=2 with interaction_only=True adds only pairwise products (no squares), i.e.,
        # (Gr Liv Area) × (each building-type dummy)
        ('interaction', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
        ('regressor', LinearRegression())
])

model3

0,1,2
,steps,"[('preprocessor', ...), ('interaction', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,degree,2
,interaction_only,True
,include_bias,False
,order,'C'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [11]:
#  Model 4: 5-degree polynomial on size and rooms + building type
preprocessor4 = ColumnTransformer([
        ('num', PolynomialFeatures(degree=5, include_bias=False), num_features),
        ('cat', OneHotEncoder(drop='first'), cat_features)
])

model4 = Pipeline([
        ('preprocessor', preprocessor4),
        ('scaler', StandardScaler()),
        ('regressor', LinearRegression())
])

model4

0,1,2
,steps,"[('preprocessor', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,degree,5
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [12]:
models = [model1, model2, model3, model4]
names = ['Model 1', 'Model 2', 'Model 3', 'Model 4']

rmses = []

for name, model in zip(names, models):
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        rmses.append(rmse)
        print(f"{name} RMSE: {rmse}")


Model 1 RMSE: 61928.53719680032
Model 2 RMSE: 59589.20317423357
Model 3 RMSE: 58276.72598588448
Model 4 RMSE: 61791.58851621564


In [13]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in zip(names, models):
        neg_rmse = cross_val_score(model, X, y, cv=cv, scoring='neg_root_mean_squared_error')
        print(f"{name} CV RMSE: {(-neg_rmse).mean()}")

Model 1 CV RMSE: 55769.33212965836
Model 2 CV RMSE: 53975.92253624382
Model 3 CV RMSE: 53256.947429096
Model 4 CV RMSE: 84852.67117354981


In [14]:
preprocessor = ColumnTransformer([
        ('num', PolynomialFeatures(include_bias=False), ['Gr Liv Area', 'TotRms AbvGrd']),
        ('cat', OneHotEncoder(drop='first'), ['Bldg Type'])
])

pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('regressor', LinearRegression())
])

param_grid = {
        'preprocessor__num__degree': list(range(1, 11))
}


In [15]:
grid = GridSearchCV(pipe, param_grid, scoring='neg_root_mean_squared_error', cv=5)
grid.fit(X, y)


0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'preprocessor__num__degree': [1, 2, ...]}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,degree,3
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [16]:
print("Best degree:", grid.best_params_['preprocessor__num__degree'])
print("Best cross-validated RMSE:", -grid.best_score_)

Best degree: 3
Best cross-validated RMSE: 53805.94033479634
