# More about pipelines & hyperparameter tuning

In [60]:
# Imports
import pandas as pd
import numpy as np
import joblib # use it to save trained models to disk so you don't have to retrain every time.

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV # RandomizedSearch is faster than GridSearch
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer # preprocessing different columns in different ways inside a single unified pipeline.
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer # used to fill missing values in a dataset.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from scipy.stats import randint

In [61]:
# Widen pandas' console display so every column of the frame is printed on one row.
for option_name, option_value in (
    ("display.width", None),
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
):
    pd.set_option(option_name, option_value)

# Load the vehicle emissions data set (path is relative to the notebook's folder).
data = pd.read_csv("../data/vehicle_emissions.csv")

# Quick look: first rows, then column dtypes and non-null counts.
print("Data-Set HEAD:\n--------------")
print(data.head())
print("\nData-Set INFO:\n--------------")
data.info()

Data-Set HEAD:
--------------
   Model_Year   Make              Model Vehicle_Class  Engine_Size  Cylinders Transmission  Fuel_Consumption_in_City(L/100 km)  Fuel_Consumption_in_City_Hwy(L/100 km)  Fuel_Consumption_comb(L/100km)  CO2_Emissions  Smog_Level
0        2021  Acura                ILX       Compact          2.4          4          AM8                                 9.9                                     7.0                             8.6            199           3
1        2021  Acura                NSX    Two-seater          3.5          6          AM9                                11.1                                    10.8                            11.0            256           3
2        2021  Acura         RDX SH-AWD    SUV: Small          2.0          4         AS10                                11.0                                     8.6                             9.9            232           6
3        2021  Acura  RDX SH-AWD A-SPEC    SUV: Small          2.0

In [62]:
# Target: the CO2 emissions column; features: every remaining column.
y = data["CO2_Emissions"]
X = data.drop(columns=["CO2_Emissions"])

# Hold out 20% of the rows for the final evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [63]:
# Column groups, kept separate because each group gets its own preprocessing:
#   - numerical columns   -> imputation + scaling
#   - categorical columns -> imputation + encoding
numerical_cols = [
    "Model_Year",
    "Engine_Size",
    "Cylinders",
    "Fuel_Consumption_in_City(L/100 km)",
    "Fuel_Consumption_in_City_Hwy(L/100 km)",
    "Fuel_Consumption_comb(L/100km)",
    "Smog_Level",
]

categorical_cols = [
    "Make",
    "Model",
    "Vehicle_Class",
    "Transmission",
]

In [64]:
# Per-dtype preprocessing branches.

# Numerical branch: fill gaps with the column mean, then standardise.
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),  # StandardScaler() only works on numbers.
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" keeps transform from failing on categories not seen during fit.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),  # OneHotEncoder() only works on categories.
    ]
)

In [65]:
# Combine the pipelines:
# Route each column group through its own preprocessing branch. Columns that are
# in neither list are dropped (ColumnTransformer's default remainder="drop").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),  # numerical_pipeline only sees the numerical columns.
        ("cat", categorical_pipeline, categorical_cols),  # categorical_pipeline only sees the categorical columns.
    ]
)

# Full model: preprocessing followed by a random forest regressor.
# The forest is seeded so that bootstrapping / feature sub-sampling is repeatable;
# without it the scores differ on every "Restart & Run All" even though the
# train/test split and the hyperparameter search are seeded.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)


In [72]:
# Search space for the random forest. The "model__" prefix addresses the
# 'model' step of the pipeline. randint(a, b) samples integers from [a, b).
param_distributions = {
    "model__n_estimators": randint(100, 500),
    "model__max_depth": [None, *range(10, 50, 10)],
    "model__min_samples_split": randint(2, 20),
    "model__min_samples_leaf": randint(1, 10),
    "model__max_features": ["sqrt", "log2", None],
}

# Randomised search: 100 sampled candidates, each scored with 5-fold CV on
# negative MSE (so "higher is better"), using all available cores.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    scoring="neg_mean_squared_error",
    n_iter=100,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

In [74]:
# Run the hyperparameter search on the training split.
random_search.fit(X_train, y_train)

# Keep the interesting results of the search under short names.
best_model = random_search.best_estimator_
best_params = random_search.best_params_
best_score = random_search.best_score_  # mean CV score of the best candidate (negative MSE)
cv_results = random_search.cv_results_

print(f"Best Params: {best_params}")
print(f"Best Score: {best_score}")
print(f"Best Model: {best_model}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Params: {'model__max_depth': None, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 3, 'model__n_estimators': 229}
Best Score: -137.6843439388704
Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Model_Year', 'Engine_Size',
                                                   'Cylinders',
                                                   'Fuel_Consumption_in_City(L/100 '
                                                   'km)',
                               

In [75]:
# Predict the target for the held-out test split with the best pipeline found by the search.
prediction = best_model.predict(X_test)

In [67]:
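# View the encoding done (uncomment to inspect the one-hot encoded feature names).
# Read them from `best_model`, the fitted pipeline returned by the search: in the cells shown here
# `pipeline` itself is only passed to RandomizedSearchCV and never fitted, so it has no `named_transformers_`.
# encoded_cols = best_model.named_steps['preprocessor'].named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)
# print(encoded_cols)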

In [76]:
# Score the test-set predictions with the usual regression metrics.
r2 = r2_score(y_test, prediction)  # higher is better
mae = mean_absolute_error(y_test, prediction)  # lower is better
mse = mean_squared_error(y_test, prediction)  # lower is better
rmse = np.sqrt(mse)  # lower is better; same units as the target

report_lines = [
    "Model Performance:",
    f"R2 Score: {r2:.2f}",
    f"Root Mean Squared Error: {rmse:.2f}",
    f"Mean Absolute Error: {mae:.2f}",
]
print("\n".join(report_lines))

Model Performance:
R2 Score: 0.97
Root Mean Squared Error: 10.11
Mean Absolute Error: 3.19
