In [1]:
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn import set_config

# Global scikit-learn setting: request pandas output from transformers, so the
# prepared features keep column labels instead of being a bare array.
set_config(transform_output="pandas")

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../bmw.csv")

In [3]:
# Show the loaded frame (the last expression of a cell is rendered as its output).
df

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748,98740,8300,High
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671,79219,3428,Low
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991,113265,6994,Low
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255,60971,4047,Low
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131,49898,3080,Low
...,...,...,...,...,...,...,...,...,...,...,...
49995,i3,2014,Asia,Red,Hybrid,Manual,4.6,151030,42932,8182,High
49996,i3,2023,Middle East,Silver,Electric,Manual,4.2,147396,48714,9816,High
49997,5 Series,2010,Middle East,Red,Petrol,Automatic,4.5,174939,46126,8280,High
49998,i3,2020,Asia,White,Electric,Automatic,3.8,3379,58566,9486,High


In [4]:
# One shuffled split holding out 20% of the rows; seeded so the split is repeatable.
# The column to stratify on is supplied later, when .split() is called.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [5]:
# Stratify on the car model so train and test keep the same model mix.
# `split` is configured with n_splits=1, so take its single split directly
# instead of looping over it.
train_index, test_index = next(split.split(df, df["Model"]))

# The splitter yields positional row numbers, so select with .iloc; .loc would
# only pick the right rows while `df` keeps its default 0..n-1 index.
strat_train_df = df.iloc[train_index].copy()

# Persist the held-out rows (target column included) for later use.
df.iloc[test_index].to_csv("../input.csv", index=False)

In [6]:
# Categorical feature columns, defined once; every other feature column is
# treated as numeric below.
cat_attributes = ["Model", "Region", "Color", "Fuel_Type", "Transmission", "Sales_Classification"]

# Read the target from `df` by row label and rebind `strat_train_df` rather than
# dropping in place, so this cell can be re-run without a KeyError on "Price_USD".
bmw_label_df = df.loc[strat_train_df.index, "Price_USD"].copy()
strat_train_df = strat_train_df.drop(columns="Price_USD", errors="ignore")

# Numeric features = remaining columns, in their original order.
num_attributes = [col for col in strat_train_df.columns if col not in cat_attributes]


In [7]:
# Inspect which columns ended up in the numeric feature list.
num_attributes

['Year', 'Engine_Size_L', 'Mileage_KM', 'Sales_Volume']

In [8]:
def construct_pipeline(cat_att, num_att):
    """Build the (unfitted) preprocessing transformer for the feature columns.

    Args:
        cat_att: Column names routed to the categorical branch, which
            one-hot encodes them (dense output, unknown categories ignored).
        num_att: Column names routed to the numeric branch, which applies
            median imputation followed by standard scaling.

    Returns:
        A ColumnTransformer with a "num_pipeline" branch and a
        "cat_pipeline" branch, in that order.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("one_hot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]

    return ColumnTransformer(
        transformers=[
            ("num_pipeline", Pipeline(steps=numeric_steps), num_att),
            ("cat_pipeline", Pipeline(steps=categorical_steps), cat_att),
        ]
    )

In [9]:
# Instantiate the preprocessing transformer for the column lists derived above.
pipeline = construct_pipeline(cat_att=cat_attributes, num_att=num_attributes)

In [10]:
# Fit the preprocessing on the training features and transform them in one step.
df_prepared = pipeline.fit_transform(strat_train_df)

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [24]:
# Random forest baseline: fit on the full prepared training set, then score
# with 10-fold cross-validation.
# NOTE(review): df_prepared was produced by a transformer already fitted on all
# of strat_train_df, so every fold shares preprocessing statistics.
random_forest_model = RandomForestRegressor(random_state=42).fit(df_prepared, bmw_label_df)

# The scorer reports negated RMSE; negate it again to get positive RMSE per fold.
random_forest_rmses = -cross_val_score(
    estimator=random_forest_model,
    X=df_prepared,
    y=bmw_label_df,
    cv=10,
    scoring="neg_root_mean_squared_error",
)

pd.Series(data=random_forest_rmses).describe()

count       10.000000
mean     26351.630636
std        250.414525
min      25969.072235
25%      26163.076481
50%      26449.297848
75%      26520.857401
max      26705.630862
dtype: float64

In [25]:
# Linear regression baseline: fit on the full prepared training set, then score
# with 10-fold cross-validation.
# NOTE(review): df_prepared was produced by a transformer already fitted on all
# of strat_train_df, so every fold shares preprocessing statistics.
lin_model = LinearRegression().fit(df_prepared, bmw_label_df)

# The scorer reports negated RMSE; negate it again to get positive RMSE per fold.
lin_rmses = -cross_val_score(
    estimator=lin_model,
    X=df_prepared,
    y=bmw_label_df,
    cv=10,
    scoring="neg_root_mean_squared_error",
)

pd.Series(data=lin_rmses).describe()

count       10.000000
mean     25995.754371
std        246.866556
min      25631.647258
25%      25784.450641
50%      26057.744005
75%      26152.039160
max      26318.681110
dtype: float64

In [23]:
# Decision tree baseline: fit on the full prepared training set, then score
# with 10-fold cross-validation.
# Seeded like the random forest and the train/test split so that re-running the
# notebook reproduces the same per-fold scores.
dec_model = DecisionTreeRegressor(random_state=42)
dec_model.fit(df_prepared, bmw_label_df)

# The scorer reports negated RMSE; negate it again to get positive RMSE per fold.
dec_rmses = -cross_val_score(dec_model, df_prepared, bmw_label_df,
                             scoring="neg_root_mean_squared_error",
                             cv=10)
pd.Series(dec_rmses).describe()

count       10.000000
mean     37007.276272
std        239.585254
min      36650.419269
25%      36839.217224
50%      36961.523152
75%      37170.857424
max      37458.268845
dtype: float64

In [28]:
# Line up the per-fold RMSE arrays side by side, one column per model.
rmse_by_model = [
    ("Random Forest Regressor", random_forest_rmses),
    ("Linear Regression", lin_rmses),
    ("Decision Tree", dec_rmses),
]
rmse_df = pd.DataFrame(dict(rmse_by_model))

In [30]:
# Summary statistics of the cross-validation RMSE, one column per model.
rmse_df.describe()

Unnamed: 0,Random Forest Regressor,Linear Regression,Decision Tree
count,10.0,10.0,10.0
mean,26351.630636,25995.754371,37007.276272
std,250.414525,246.866556,239.585254
min,25969.072235,25631.647258,36650.419269
25%,26163.076481,25784.450641,36839.217224
50%,26449.297848,26057.744005,36961.523152
75%,26520.857401,26152.03916,37170.857424
max,26705.630862,26318.68111,37458.268845
