In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("hellbuoy/car-price-prediction")

print("Path to dataset files:", path)

In [None]:
from pathlib import Path
import kagglehub

csv_file = Path(path)/'CarPrice_Assignment.csv'

df = pd.read_csv(csv_file)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.select_dtypes(include='object').isnull().sum()

In [None]:
df.select_dtypes(include='object').value_counts()

In [None]:
cars = df

In [None]:
cars.describe()

In [None]:
IMAGE_PATH = Path() / 'images' / 'car_price_prediction'
IMAGE_PATH.mkdir(parents=True, exist_ok=True)

def fig_save(fig, tight_layout=True, img_extension='png', resolution=300):
    """Save the current matplotlib figure under ``IMAGE_PATH``.

    Parameters
    ----------
    fig : str
        File stem (without extension) for the saved image. Despite the name,
        this is formatted into the file name, not used as a Figure object.
    tight_layout : bool, default True
        If true, call ``plt.tight_layout()`` before saving.
    img_extension : str, default 'png'
        Used both as the file suffix and as the ``format`` given to savefig.
    resolution : int, default 300
        DPI passed to ``plt.savefig``.
    """
    file_name = '{}.{}'.format(fig, img_extension)
    destination = IMAGE_PATH / file_name
    if tight_layout:
        plt.tight_layout()
    # Saves whichever figure is currently active in pyplot.
    plt.savefig(destination, format=img_extension, dpi=resolution)

In [None]:
plt.rc('font', size=10)
plt.rc('axes', labelsize=10, titlesize=10)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

cars.hist(bins=50, figsize=(12, 8))
fig_save('attribute_histogram_plots')

In [None]:
np.random.permutation(10)

# Create Test Set

In [None]:
def split_train_test(data, test_size, random_state=None):
    """Randomly split ``data`` into a train part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object supporting ``len`` and positional ``.iloc`` indexing.
    test_size : float
        Fraction of rows (between 0 and 1) assigned to the test part. The test
        part holds ``int(len(data) * test_size)`` rows.
    random_state : int or None, default None
        Seed for a dedicated NumPy generator. When ``None`` the global
        ``np.random`` state is used, as before.

    Returns
    -------
    (train, test) : tuple
        Two ``.iloc`` selections of ``data``.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size!r}")

    n_rows = len(data)
    if random_state is None:
        shuffled_idx = np.random.permutation(n_rows)
    else:
        shuffled_idx = np.random.default_rng(random_state).permutation(n_rows)

    test_data_size = int(n_rows * test_size)
    # Split on a non-negative position: slicing with ``-test_data_size`` breaks
    # when the size is 0, because ``[:-0]`` is empty and ``[-0:]`` is everything.
    split_at = n_rows - test_data_size
    train_data_idx = shuffled_idx[:split_at]
    test_data_idx = shuffled_idx[split_at:]
    return data.iloc[train_data_idx], data.iloc[test_data_idx]

In [None]:
train_data, test_data = split_train_test(cars, 0.2)

In [None]:
train_data.shape, test_data.shape

In [None]:
len(train_data), len(test_data)

In [None]:
cars

In [None]:
cars.horsepower.min(), cars.horsepower.max()

In [None]:
# Bucket horsepower into four ordered categories so the train/test split can
# be stratified on them. Intervals are (10, 120], (120, 160], (160, 200]
# and (200, inf).
horsepower_bin_edges = [10, 120, 160, 200, np.inf]
horsepower_bin_labels = [1, 2, 3, 4]
cars["horsepower_cat"] = pd.cut(
    cars["horsepower"],
    bins=horsepower_bin_edges,
    labels=horsepower_bin_labels,
)

In [None]:
cars.horsepower_cat.value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Horsepower Category")
plt.ylabel("Number of Cars")
fig_save("cars_horsepower_cat_bar_plot")
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Ten independent 80/20 shuffles, each stratified on the horsepower category.
splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
# Each entry is a [train_frame, test_frame] pair selected by position.
strat_splits = [
    [cars.iloc[train_idx], cars.iloc[test_idx]]
    for train_idx, test_idx in splitter.split(cars, cars["horsepower_cat"])
]

In [None]:
strat_train_set, strat_test_set = strat_splits[0]

In [None]:
strat_train_set.shape, strat_test_set.shape

In [None]:
from sklearn.model_selection import train_test_split

strat_train_set, strat_test_set = train_test_split(
    cars, stratify=cars.horsepower_cat, test_size=0.2, random_state=42
)

In [None]:
# strat_test_set.horsepower_cat.value_counts()

In [None]:
strat_test_set.horsepower_cat.value_counts() / len(strat_test_set)

In [None]:
def horsepower_cat_proportions(data):
    """Return the share of rows in ``data`` falling in each ``horsepower_cat``.

    The counts per category are divided by the total number of rows, so the
    result is a Series indexed by category.
    """
    category_counts = data["horsepower_cat"].value_counts()
    return category_counts / len(data)


# Purely random 80/20 split, used as the baseline in the comparison below.
train_set, test_set = train_test_split(cars, test_size=0.2, random_state=42)

# Share of each horsepower category in the full data set and in the two
# candidate test sets (stratified vs. purely random).
compare_props = pd.DataFrame(
    {
        "Overall %": horsepower_cat_proportions(cars),
        "Stratified %": horsepower_cat_proportions(strat_test_set),
        "Random %": horsepower_cat_proportions(test_set),
    }
)

# Relative deviation of each test set's category share from the overall share.
compare_props["Strat. Error %"] = (
    compare_props["Stratified %"] / compare_props["Overall %"] - 1
)
compare_props["Rand. Error %"] = (
    compare_props["Random %"] / compare_props["Overall %"] - 1
)

# Display everything as percentages.
(compare_props * 100).round(2)

In [None]:
# The helper category was only needed for stratification. Reassign instead of
# dropping in place, and tolerate an already-removed column, so that this cell
# can be re-run without a KeyError.
strat_train_set = strat_train_set.drop(columns="horsepower_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="horsepower_cat", errors="ignore")

In [None]:
cars = strat_train_set.copy()

In [None]:
cars

In [None]:
cars.plot(kind="scatter", x="highwaympg", y="price")
fig_save('highwaympg-price')

## Looking for Correlations

In [None]:
corr_matrix = cars.corr(numeric_only=True)

In [None]:
corr_matrix['price'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

attributes = [
    "price",
    "enginesize",
    "curbweight",
    "horsepower",
    "carwidth",
    "carlength",
    "boreratio",
    "wheelbase",
    "carheight",
]

scatter_matrix(cars[attributes], figsize=(12, 8))
plt.show()

In [None]:
cars.plot(kind="scatter", x="enginesize", y="price", 
          alpha=0.3, grid=True)
fig_save("engine_size_vs_price_scatterplot")
plt.show()

In [None]:
cars.select_dtypes(include=['number'])

In [None]:
cars.dtypes

## Experimenting with Attribute Combinations

In [None]:
# Candidate engineered features, added in place to the exploration copy.
cars["weight_per_engine"] = cars["curbweight"].div(cars["enginesize"])
# NOTE(review): dividing by 1000 only yields liters if enginesize is in cubic
# centimeters - confirm the unit used by the data set.
cars["enginesize_liters"] = cars["enginesize"].div(1000)
# Bounding-box volume from the three outer dimensions.
cars["car_volume"] = cars["carlength"].mul(cars["carwidth"]).mul(cars["carheight"])
cars["density"] = cars["curbweight"].div(cars["car_volume"])
cars["height_to_width"] = cars["carheight"].div(cars["carwidth"])

In [None]:
import seaborn as sns

features_to_plot = [
    "price",
    "enginesize", "curbweight", "horsepower", "carwidth",
    "carlength", "boreratio", "wheelbase", "carheight",
    "weight_per_engine", "density", "height_to_width"
]

# 3. Plot pairplot
sns.pairplot(cars[features_to_plot])
plt.tight_layout()
plt.show()

In [None]:
corr_matrix = cars.corr(numeric_only=True)
corr_matrix["price"].sort_values(ascending=False)

In [None]:
cars = cars.drop(columns=["car_ID", "CarName"])
cars

In [None]:
cars = strat_train_set.drop("price", axis=1)
cars_labels = strat_train_set["price"].copy()

In [None]:
# Blank out 'symboling' on five rows so the imputation steps below have
# something to fill. The sample is seeded: these NaNs stay in the training
# features, so an unseeded draw would change every downstream result per run.
cars.loc[cars.sample(5, random_state=42).index, 'symboling'] = np.nan

In [None]:
null_rows_idx = cars.isnull().any(axis=1)
cars.loc[null_rows_idx].head()

In [None]:
from sklearn.impute import SimpleImputer

impute = SimpleImputer(strategy="median")

In [None]:
cars_num = cars.select_dtypes(include=[np.number])

In [None]:
impute.fit(cars_num)

In [None]:
impute.statistics_

In [None]:
cars_num.median().values

In [None]:
X = impute.transform(cars_num)

In [None]:
impute.feature_names_in_

In [None]:
cars_tr = pd.DataFrame(X, columns=cars_num.columns,
                       index=cars_num.index)

In [None]:
cars_tr.loc[null_rows_idx].head()

In [None]:
impute.strategy

### Checking Outliers

In [None]:
from sklearn.ensemble import IsolationForest

isolation_forest = IsolationForest(random_state=42)
outlier_pred = isolation_forest.fit_predict(X)

In [None]:
# outlier_pred

In [None]:
# Keep only rows the isolation forest labelled as inliers (prediction == 1).
# The predictions are row-aligned with cars_num, so translate them to index
# labels and filter by label. A positional boolean mask would stop matching
# once `cars` has been shortened, making the cell fail when re-run.
inlier_index = cars_num.index[outlier_pred == 1]
cars = cars.loc[cars.index.isin(inlier_index)]
cars_labels = cars_labels.loc[cars_labels.index.isin(inlier_index)]

In [None]:
cars

In [None]:
cars.iloc[9]

### Handling Text and Categorical Attributes

In [None]:
cars_cat = cars.select_dtypes(include='object')

In [None]:
cars_cat

In [None]:
cars_cat.head(8)

In [None]:
cars_cat.fueltype.value_counts()

In [None]:
# Reduce the full car name to its first space-separated token.
cars_cat["CarName"] = cars_cat["CarName"].str.split(" ").str.get(0)

In [None]:
cars_cat.CarName.value_counts()

In [None]:
# Normalise brand spellings. Lower-casing first merges case variants of the
# same brand; the mapping then folds known misspellings and abbreviations.
# NOTE(review): 'toyouta' and 'porcshce' are misspellings reported for this
# data set - confirm against the value_counts() output above. Keys that do not
# occur are simply ignored by replace().
cars_cat["CarName"] = cars_cat["CarName"].str.lower().replace({
    'vw': 'volkswagen',
    'vokswagen': 'volkswagen',
    'maxda': 'mazda',
    'toyouta': 'toyota',
    'porcshce': 'porsche',
})

In [None]:
cars_cat.CarName.value_counts()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
cars_cat_encoded = ordinal_encoder.fit_transform(cars_cat)

In [None]:
cars_cat_encoded

In [None]:
cars_ct_en = pd.DataFrame(cars_cat_encoded, columns=cars_cat.columns, index=cars_cat.index)

In [None]:
cars_ct_en

In [None]:
ordinal_encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

onehot_encode = OneHotEncoder()
cars_cat_1hot = onehot_encode.fit_transform(cars_cat)

In [None]:
cars_cat_1hot

In [None]:
cars_cat_1hot.toarray()

In [None]:
onehot_encode = OneHotEncoder(sparse_output=False)
cars_cat_1hot = onehot_encode.fit_transform(cars_cat)
cars_cat_1hot

In [None]:
onehot_encode.categories_

In [None]:
onehot_encode.feature_names_in_

In [None]:
onehot_encode.get_feature_names_out()

### Feature Scaling

In [None]:
cars_num

In [None]:
from sklearn.preprocessing import MinMaxScaler

min_max_scalar = MinMaxScaler(feature_range=(-1, 1))
cars_num_min_max_scaled = min_max_scalar.fit_transform(cars_num)

In [None]:
cars_num.isnull().sum()

In [None]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
cars_num_std_scaler = std_scaler.fit_transform(cars_num)

In [None]:
# Side-by-side histograms of engine size and of its natural log, sharing the
# y axis, to show how the log transform reshapes the distribution.
fig, axs = plt.subplots(1, 2, figsize=(8, 3), sharey=True)
cars["enginesize"].hist(ax=axs[0], bins=15)
cars["enginesize"].apply(np.log).hist(ax=axs[1], bins=15)
# The plotted column is enginesize, so label the axes accordingly.
axs[0].set_xlabel("Engine size")
axs[1].set_xlabel("Log of engine size")
axs[0].set_ylabel("Number of cars")
fig_save("long_tail_plot")
plt.show()

In [None]:
from sklearn.metrics.pairwise import rbf_kernel

# RBF similarity of every car's engine size to a reference size of 90.
# The reference now matches the variable name and the value 90 used by the
# other similarity cells (the original passed 100).
# NOTE(review): gamma=0.1 is ten times the 0.01 used by those cells - confirm
# it is intended.
engine_sim_90 = rbf_kernel(cars[["enginesize"]], [[90]], gamma=0.1)

In [None]:
# 500 evenly spaced engine sizes spanning the observed range, as a column.
sizes = np.linspace(cars["enginesize"].min(),
                    cars["enginesize"].max(),
                    500).reshape(-1, 1)
gamma1 = 0.01
gamma2 = 0.003
# RBF similarity to an engine size of 90 for two kernel widths.
rbf1 = rbf_kernel(sizes, [[90]], gamma=gamma1)
rbf2 = rbf_kernel(sizes, [[90]], gamma=gamma2)

fig, ax1 = plt.subplots()

ax1.set_xlabel("Cars Engine Size")
ax1.set_ylabel("Number of cars")
ax1.hist(cars["enginesize"], bins=50)

ax2 = ax1.twinx()  # create a twin axis that shares the same x-axis
color = "blue"
# Build the legend labels from the gamma values so they cannot drift apart.
ax2.plot(sizes, rbf1, color=color, label=f"gamma = {gamma1}")
ax2.plot(sizes, rbf2, color=color, label=f"gamma = {gamma2}", linestyle="--")
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylabel("Size similarity", color=color)

ax2.legend(loc="upper left")
# Saved under a name that describes the figure (engine-size similarity).
fig_save("engine_size_similarity_plot")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Standardise the target by hand. The scaler needs 2-D input, hence to_frame().
target_scaler = StandardScaler()
scaled_labels = target_scaler.fit_transform(cars_labels.to_frame())

# Fit a single-feature linear model on the scaled target.
model = LinearRegression()
model.fit(cars[["enginesize"]], scaled_labels)

some_new_data = cars[["enginesize"]].head(5)  # pretend this is new data

# Predict in scaled space, then map back to the original target scale.
scaled_predictions = model.predict(some_new_data)
predictions = target_scaler.inverse_transform(scaled_predictions)

In [None]:
predictions

In [None]:
from sklearn.compose import TransformedTargetRegressor

model = TransformedTargetRegressor(LinearRegression(),
                                   transformer=StandardScaler())

model.fit(cars[["enginesize"]], cars_labels)
predictions = model.predict(some_new_data)

In [None]:
predictions

### Custom Transformers

In [None]:
from sklearn.metrics.pairwise import rbf_kernel

In [None]:
from sklearn.preprocessing import FunctionTransformer

log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)
log_pop = log_transformer.transform(cars[["enginesize"]])

In [None]:
rbf_transformer = FunctionTransformer(rbf_kernel,
                                      kw_args=dict(Y=[[90.]], gamma=0.01))

size_simil_90 = rbf_transformer.transform(cars[["enginesize"]])

In [None]:
from ml_custom_blocks import StandardScalerClone, ClusterSimilarity

In [None]:
cars

In [None]:
# cluster_sim = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
# similarities = cluster_sim.fit_transform(cars[["enginesize", "horsepower"]],
#                                          sample_weight=cars_labels)

In [None]:
# similarities[:3].round()
# cars["Max cluster similarity"] = similarities.max(axis=1)


In [None]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 7))

# scatter = plt.scatter(
#     x=cars["enginesize"],
#     y=cars["horsepower"],
#     s=cars["curbweight"] / 10,
#     c=cars["Max cluster similarity"],  # <- Now directly passing the Series
#     cmap="jet",
#     alpha=0.6
# )

# # Plot cluster centers
# plt.scatter(
#     cluster_sim.kmeans_.cluster_centers_[:, 0],
#     cluster_sim.kmeans_.cluster_centers_[:, 1],
#     color='black', marker='X', s=200, label='Cluster centers'
# )

# plt.xlabel("Engine Size")
# plt.ylabel("Horsepower")
# plt.colorbar(scatter, label="Max Cluster Similarity")
# plt.legend(loc="upper right")
# plt.title("Car Cluster Visualization")
# plt.grid(True)
# plt.tight_layout()
# plt.show()


### Transformation Pipelines

In [None]:
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
])

In [None]:
from sklearn.pipeline import make_pipeline

num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

In [None]:
from sklearn import set_config

set_config(display='diagram')

num_pipeline

In [None]:
cars_num_prepared = num_pipeline.fit_transform(cars_num)
cars_num_prepared[:2].round(2)

In [None]:
# dumpii = pd.DataFrame(cars_num_prepared, index=cars_num.index, columns=cars_num.columns)

In [None]:
# dumpii[null_rows_idx]

In [None]:
df_cars_num_prepared = pd.DataFrame(cars_num_prepared, columns=num_pipeline.get_feature_names_out(), index=cars_num.index)

In [None]:
df_cars_num_prepared.head(2)

In [None]:
num_pipeline.steps

In [None]:
num_pipeline[1]

In [None]:
num_pipeline[:-1]

In [None]:
num_pipeline.named_steps["simpleimputer"]

In [None]:
num_pipeline.set_params(simpleimputer__strategy="median")

In [None]:
cars_cat.columns

In [None]:
from sklearn.compose import ColumnTransformer

# Numeric feature columns. 'car_ID' is left out on purpose: it is listed with
# the measurements in the original cell but is a row identifier, not a
# property of the car.
num_attribs = ['symboling', 'wheelbase', 'carlength', 'carwidth',
               'carheight', 'curbweight', 'enginesize', 'boreratio', 'stroke',
               'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg']

cat_attribs = ['CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody',
               'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber',
               'fuelsystem']

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

# (name, transformer, columns) tuples, as the ColumnTransformer API documents.
# Columns not listed fall under the default remainder and are dropped.
preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

In [None]:
from sklearn.compose import make_column_selector, make_column_transformer

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object))
)

In [None]:
cars_prepared = preprocessing.fit_transform(cars)

In [None]:
cars_prepared.shape

In [None]:
# ColumnTransformer returns a sparse matrix or a dense array depending on the
# density of the stacked output, so densify only when the result is sparse.
cars_prepared_dense = (
    cars_prepared.toarray() if hasattr(cars_prepared, "toarray") else cars_prepared
)
df_cars_prepared = pd.DataFrame(
    cars_prepared_dense,
    index=cars.index,
    columns=preprocessing.get_feature_names_out(),
)

In [None]:
cars_num

In [None]:
def column_ratio(X):
    """Divide column 0 of a 2-D array by column 1.

    Both columns are selected with list indices, so the result keeps a
    trailing axis of length one (shape ``(n_rows, 1)``).
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator


def ratio_name(function_transformer, feature_names_in):
    """Return the output feature name list for the ratio transformer.

    Both arguments are ignored; the single output column is always "ratio".
    """
    output_names = ["ratio"]
    return output_names


def ratio_pipeline():
    """Build a fresh pipeline: median-impute, take the column ratio, standardise.

    A new pipeline is created on every call, so each use gets its own
    independent estimator instances.
    """
    imputer = SimpleImputer(strategy="median")
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler = StandardScaler()
    return make_pipeline(imputer, ratio_step, scaler)


# Median-impute, take the natural log, then standardise.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)
# Not referenced by the ColumnTransformer below; kept for later experiments.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)
# Applied to every column that no named transformer claims.
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
preprocessing = ColumnTransformer(
    [
        # car_ID is a row identifier. Without this entry it would fall into the
        # remainder pipeline and reach the model as a numeric feature. A
        # pattern selector is used so the step is a no-op if the column is
        # already absent.
        ("drop_id", "drop", make_column_selector(pattern=r"^car_ID$")),
        ("weight_per_engine", ratio_pipeline(), ["curbweight", "enginesize"]),
        ("height_to_width", ratio_pipeline(), ["carheight", "carwidth"]),
        (
            "log",
            log_pipeline,
            [
                "curbweight",
                "peakrpm",
                "wheelbase",
                "carlength",
                "carwidth",
                "carheight",
                "enginesize",
                "highwaympg",
            ],
        ),
        # NOTE(review): this selects every object column, including the raw
        # CarName. The brand clean-up done on cars_cat earlier is not applied
        # here - confirm whether that is intended.
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline
)

In [None]:
cars_prepared = preprocessing.fit_transform(cars)
cars_prepared.shape

In [None]:
preprocessing.get_feature_names_out()

## Train Model

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(cars, cars_labels)

In [None]:
cars_predictions = lin_reg.predict(cars)

In [None]:
from sklearn.metrics import root_mean_squared_error

lin_rmse = root_mean_squared_error(cars_labels, cars_predictions)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(cars, cars_labels)

In [None]:
tree_pred = tree_reg.predict(cars)
tree_rmse = root_mean_squared_error(cars_labels, tree_pred)
tree_rmse

## Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

tree_rmses = -cross_val_score(tree_reg, cars, cars_labels,
                              scoring="neg_root_mean_squared_error", cv=10)

In [None]:
pd.Series(tree_rmses).describe()

In [None]:
lin_rmses = -cross_val_score(lin_reg, cars, cars_labels,
                              scoring="neg_root_mean_squared_error", cv=10)
pd.Series(lin_rmses).describe()

In [None]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))

forest_rmses = -cross_val_score(forest_reg, cars, cars_labels, scoring="neg_root_mean_squared_error", cv=10)

In [None]:
pd.Series(forest_rmses).describe()

In [None]:
forest_reg.fit(cars, cars_labels)
cars_predictions = forest_reg.predict(cars)
forest_rmse = root_mean_squared_error(cars_labels, cars_predictions)
forest_rmse

# Fine Tuning 
## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

full_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('ridge_reg', Ridge())
])
param_grid = {'ridge_reg__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
grid_search = GridSearchCV(full_pipeline, param_grid, cv=3, scoring="neg_root_mean_squared_error")

grid_search.fit(cars, cars_labels)

In [None]:
print(str(full_pipeline.get_params().keys())[:1000] + "...")

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
# Tabulate the grid-search results, best (highest negated RMSE) first.
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res = cv_res.sort_values(by="mean_test_score", ascending=False)

cv_res = cv_res[["param_ridge_reg__alpha", "split0_test_score",
                 "split1_test_score", "split2_test_score", "mean_test_score"]]
score_cols = ["split0", "split1", "split2", "mean_test_rmse"]
# The searched hyperparameter is the Ridge alpha, so label the column that way.
cv_res.columns = ["alpha"] + score_cols
# Scores are negated RMSEs: flip the sign and round to whole units.
cv_res[score_cols] = -cv_res[score_cols].round().astype(np.int64)

cv_res.head()

## Randomized Search

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import reciprocal

param_distribs = {
    'ridge_reg__alpha': reciprocal(0.01, 100)
}


rnd_search = RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3,
    scoring='neg_root_mean_squared_error', random_state=42) 

rnd_search.fit(cars, cars_labels)

In [None]:
# Tabulate the randomized-search results, best (highest negated RMSE) first.
cv_res = pd.DataFrame(rnd_search.cv_results_)
cv_res = cv_res.sort_values(by="mean_test_score", ascending=False)
cv_res = cv_res[["param_ridge_reg__alpha", "split0_test_score",
                 "split1_test_score", "split2_test_score", "mean_test_score"]]
# Defined here as well so the cell does not depend on the grid-search cell.
score_cols = ["split0", "split1", "split2", "mean_test_rmse"]
# The sampled hyperparameter is the Ridge alpha, so label the column that way.
cv_res.columns = ["alpha"] + score_cols
# Scores are negated RMSEs: flip the sign and round to whole units.
cv_res[score_cols] = -cv_res[score_cols].round().astype(np.int64)
cv_res.head()

## Evaluate

In [None]:
X_test = strat_test_set.drop("price", axis=1)
y_test = strat_test_set["price"].copy()

final_model = rnd_search.best_estimator_

final_predictions = final_model.predict(X_test)

final_rmse = root_mean_squared_error(y_test, final_predictions)
print(final_rmse)

In [None]:
y_test

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np


# best_ridge = grid_search.best_estimator_

y_pred = final_model.predict(X_test)

# Use the same RMSE helper as the rest of the notebook. The result is stored
# under its own name because the next cell defines a function called `rmse`;
# a float with that name would shadow it if the cells were re-run out of order.
ridge_test_rmse = root_mean_squared_error(y_test, y_pred)
print("Best Ridge RMSE:", ridge_test_rmse)


In [None]:
from scipy import stats

def rmse(squared_errors):
    """Return the square root of the mean of ``squared_errors``."""
    mean_squared = np.mean(squared_errors)
    return np.sqrt(mean_squared)

# Bootstrap a 95% confidence interval for the test-set RMSE.
confidence = 0.95
squared_errors = (y_pred - y_test) ** 2
boot_result = stats.bootstrap(
    [squared_errors],
    rmse,
    confidence_level=confidence,
    random_state=42,
)
# The interval exposes its bounds as the named fields `low` and `high`.
rmse_lower = boot_result.confidence_interval.low
rmse_upper = boot_result.confidence_interval.high


In [None]:
rmse_lower, rmse_upper

In [None]:
import joblib

joblib.dump(final_model, "my_car_price_model.pkl")

In [None]:
import joblib

final_model_reloaded = joblib.load("my_car_price_model.pkl")

new_data = cars.iloc[:5]
predictions = final_model_reloaded.predict(new_data)

In [None]:
predictions

In [None]:
cars_labels[:5]