In [2]:
# 
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression


# Load The Data

In [3]:
# Load dataset
filename = "./Data/ComputerHardware/machine.data"
names = "./Data/ComputerHardware/machine.names"
names = ["vendor name", "Model Name", "MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "PRP", "ERP"]
dataset = pd.read_csv(filename, names=names)

# Cross-Validation and Hyperparameter Search with LinearRegression

- KFold cross-validation
- GridSearchCV and RandomizedSearchCV
- Different preprocessing options (StandardScaler, MinMaxScaler)
- Handling categorical columns (drop vs. one-hot encoding)
- LinearRegression hyperparameters (fit_intercept, normalize)

In [4]:
X = dataset.copy()
y = X["PRP"]
X = X.drop(columns=["PRP", "ERP"])

In [5]:
class ColumnsDroper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop: list):
        self.columns_to_drop = columns_to_drop
    
    def drop_columns(self, X):
        X = X.drop(self.columns_to_drop, axis=1)
        return(X)
        

    def fit(self, X:pd.DataFrame, y = None):
        return self

    def transform(self, X, y = None):
        X = self.drop_columns(X)
        return(X)

In [6]:
# Option 1: Drop categorical columns
drop_cat = ColumnsDroper(["vendor name", "Model Name"])

In [7]:
# Option 2: Encode categorical columns
cat_features = ["vendor name", "Model Name"]
cat_encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), cat_features),
    remainder="passthrough"
)

In [9]:
# Preprocessing options
scalers = {
    "standard": StandardScaler(),
    "minmax": MinMaxScaler()
}

In [10]:
# Define pipelines for grid search
pipelines = {
    "drop_standard": make_pipeline(
        ColumnsDroper(["vendor name", "Model Name"]),
        StandardScaler(),
        LinearRegression()
    ),
    "drop_minmax": make_pipeline(
        ColumnsDroper(["vendor name", "Model Name"]),
        MinMaxScaler(),
        LinearRegression()
    ),
    "encode_standard": make_pipeline(
        cat_encoder,
        StandardScaler(),
        LinearRegression()
    ),
    "encode_minmax": make_pipeline(
        cat_encoder,
        MinMaxScaler(),
        LinearRegression()
    )
}

In [11]:
# KFold cross-validation with one pipeline
from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = pipelines["drop_standard"]
scores = cross_val_score(pipe, X, y, cv=kf, scoring="neg_mean_squared_error")
print("KFold CV (drop_standard):", scores.mean(), scores.std())

KFold CV (drop_standard): -4719.553300618649 2695.549166896928


In [12]:
# GridSearchCV: different preprocessing and LinearRegression options
param_grid = [
    {
        "columnsdroper__columns_to_drop": [["vendor name", "Model Name"]],
        "standardscaler": [StandardScaler()],
        "linearregression__fit_intercept": [True, False]
    },
    {
        "columnsdroper__columns_to_drop": [["vendor name", "Model Name"]],
        "minmaxscaler": [MinMaxScaler()],
        "linearregression__fit_intercept": [True, False]
    },
    {
        "columntransformer": [cat_encoder],
        "standardscaler": [StandardScaler()],
        "linearregression__fit_intercept": [True, False]
    },
    {
        "columntransformer": [cat_encoder],
        "minmaxscaler": [MinMaxScaler()],
        "linearregression__fit_intercept": [True, False]
    }
]

In [13]:
from sklearn.pipeline import Pipeline

# Build a generic pipeline with all possible steps (some will be skipped depending on params)
full_pipeline = Pipeline([
    ("columnsdroper", ColumnsDroper(["vendor name", "Model Name"])),
    ("columntransformer", "passthrough"),
    ("standardscaler", "passthrough"),
    ("minmaxscaler", "passthrough"),
    ("linearregression", LinearRegression())
])

In [17]:
grid = GridSearchCV(full_pipeline, param_grid, cv=kf, scoring="neg_mean_squared_error", n_jobs=1)

In [18]:
grid.fit(X, y)
print("Best GridSearchCV score:", grid.best_score_)
print("Best params:", grid.best_params_)

Best GridSearchCV score: -4719.5533006186415
Best params: {'columnsdroper__columns_to_drop': ['vendor name', 'Model Name'], 'linearregression__fit_intercept': True, 'minmaxscaler': MinMaxScaler()}


20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/envs/py39_ml/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'vendor name'

The above exception was the direc

In [20]:
for params in grid.cv_results_['params']:
    print(params)

{'columnsdroper__columns_to_drop': ['vendor name', 'Model Name'], 'linearregression__fit_intercept': True, 'standardscaler': StandardScaler()}
{'columnsdroper__columns_to_drop': ['vendor name', 'Model Name'], 'linearregression__fit_intercept': False, 'standardscaler': StandardScaler()}
{'columnsdroper__columns_to_drop': ['vendor name', 'Model Name'], 'linearregression__fit_intercept': True, 'minmaxscaler': MinMaxScaler()}
{'columnsdroper__columns_to_drop': ['vendor name', 'Model Name'], 'linearregression__fit_intercept': False, 'minmaxscaler': MinMaxScaler()}
{'columntransformer': ColumnTransformer(remainder='passthrough',
                  transformers=[('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['vendor name', 'Model Name'])]), 'linearregression__fit_intercept': True, 'standardscaler': StandardScaler()}
{'columntransformer': ColumnTransformer(remainder='passthrough',
                  transformers=[('one

In [23]:
grid.cv_results_["params"].__len__()

8

In [28]:
# RandomizedSearchCV

param_dist = {
    "columnsdroper__columns_to_drop": [["vendor name", "Model Name"]],
    "standardscaler": [StandardScaler(), "passthrough"],
    "minmaxscaler": [MinMaxScaler(), "passthrough"],
    "linearregression__fit_intercept": [True, False]
}

random_search = RandomizedSearchCV(full_pipeline, param_distributions=param_dist, n_iter=8, cv=kf, scoring="neg_mean_squared_error", random_state=42, n_jobs=-1)
random_search.fit(X, y)
print("Best RandomizedSearchCV score:", random_search.best_score_)
print("Best params:", random_search.best_params_)

Best RandomizedSearchCV score: -4719.553300618643
Best params: {'standardscaler': StandardScaler(), 'minmaxscaler': MinMaxScaler(), 'linearregression__fit_intercept': True, 'columnsdroper__columns_to_drop': ['vendor name', 'Model Name']}
