In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

bmw_data_set = pd.read_csv('../../../datasets/BMW sales data (2010-2024).csv')
bmw_data_set

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748,98740,8300,High
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671,79219,3428,Low
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991,113265,6994,Low
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255,60971,4047,Low
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131,49898,3080,Low
...,...,...,...,...,...,...,...,...,...,...,...
49995,i3,2014,Asia,Red,Hybrid,Manual,4.6,151030,42932,8182,High
49996,i3,2023,Middle East,Silver,Electric,Manual,4.2,147396,48714,9816,High
49997,5 Series,2010,Middle East,Red,Petrol,Automatic,4.5,174939,46126,8280,High
49998,i3,2020,Asia,White,Electric,Automatic,3.8,3379,58566,9486,High


In [2]:
X = bmw_data_set
y = X['Price_USD']
y2 = X['Sales_Volume']
X.drop(columns=['Price_USD', 'Sales_Volume'], inplace=True)
X

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748,High
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671,Low
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991,Low
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255,Low
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131,Low
...,...,...,...,...,...,...,...,...,...
49995,i3,2014,Asia,Red,Hybrid,Manual,4.6,151030,High
49996,i3,2023,Middle East,Silver,Electric,Manual,4.2,147396,High
49997,5 Series,2010,Middle East,Red,Petrol,Automatic,4.5,174939,High
49998,i3,2020,Asia,White,Electric,Automatic,3.8,3379,High


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=41, test_size=0.4)

In [4]:
X_test, y_test, X_val, y_val = train_test_split(X_test, y_test, random_state=41, test_size=0.5)

In [5]:
col_cat_low_cardinality = [col for col in X_train.columns if X_train[col].dtype == 'object'
                           and X_train[col].unique().size <= 2]
col_cat_low_cardinality

['Transmission', 'Sales_Classification']

In [6]:
col_cat_high_cardinality = [col for col in X_train.columns if X_train[col].dtype == 'object'
                            and X_train[col].unique().size > 2]
col_cat_high_cardinality

['Model', 'Region', 'Color', 'Fuel_Type']

In [13]:
num_cols = ['Mileage_KM']
num_cols

['Mileage_KM']

In [37]:
preprocessor = ColumnTransformer(
    transformers = [
        ('cat_low_card', OrdinalEncoder(dtype=np.int64), col_cat_low_cardinality),
        ('cat_high_card', OneHotEncoder(dtype=np.int64, sparse_output=False), col_cat_high_cardinality),
        ('num', MinMaxScaler(), num_cols)
    ],
    remainder='passthrough'
)
pipeline = Pipeline(
    [('preprocessor', preprocessor)]
)

pipeline.fit(X_train)
X_train_prepared = pipeline.transform(X_train)

In [38]:
X_train_prepared

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.94171534e-01, 2.02100000e+03, 4.70000000e+00],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        6.54907393e-01, 2.01000000e+03, 4.40000000e+00],
       [1.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        1.79964397e-01, 2.02200000e+03, 4.80000000e+00],
       ...,
       [0.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        1.67063365e-01, 2.01900000e+03, 1.80000000e+00],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        5.38918113e-01, 2.01300000e+03, 3.80000000e+00],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        1.26035083e-01, 2.01000000e+03, 3.40000000e+00]])

In [39]:
pipeline.get_feature_names_out()

array(['cat_low_card__Transmission', 'cat_low_card__Sales_Classification',
       'cat_high_card__Model_3 Series', 'cat_high_card__Model_5 Series',
       'cat_high_card__Model_7 Series', 'cat_high_card__Model_M3',
       'cat_high_card__Model_M5', 'cat_high_card__Model_X1',
       'cat_high_card__Model_X3', 'cat_high_card__Model_X5',
       'cat_high_card__Model_X6', 'cat_high_card__Model_i3',
       'cat_high_card__Model_i8', 'cat_high_card__Region_Africa',
       'cat_high_card__Region_Asia', 'cat_high_card__Region_Europe',
       'cat_high_card__Region_Middle East',
       'cat_high_card__Region_North America',
       'cat_high_card__Region_South America',
       'cat_high_card__Color_Black', 'cat_high_card__Color_Blue',
       'cat_high_card__Color_Grey', 'cat_high_card__Color_Red',
       'cat_high_card__Color_Silver', 'cat_high_card__Color_White',
       'cat_high_card__Fuel_Type_Diesel',
       'cat_high_card__Fuel_Type_Electric',
       'cat_high_card__Fuel_Type_Hybrid',
     

In [40]:
X_train_prepared_df = pd.DataFrame(X_train_prepared, columns=pipeline.get_feature_names_out())
X_train_prepared_df

Unnamed: 0,cat_low_card__Transmission,cat_low_card__Sales_Classification,cat_high_card__Model_3 Series,cat_high_card__Model_5 Series,cat_high_card__Model_7 Series,cat_high_card__Model_M3,cat_high_card__Model_M5,cat_high_card__Model_X1,cat_high_card__Model_X3,cat_high_card__Model_X5,...,cat_high_card__Color_Red,cat_high_card__Color_Silver,cat_high_card__Color_White,cat_high_card__Fuel_Type_Diesel,cat_high_card__Fuel_Type_Electric,cat_high_card__Fuel_Type_Hybrid,cat_high_card__Fuel_Type_Petrol,num__Mileage_KM,remainder__Year,remainder__Engine_Size_L
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.394172,2021.0,4.7
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.654907,2010.0,4.4
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.179964,2022.0,4.8
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.191595,2014.0,2.5
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.624750,2010.0,2.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.220883,2022.0,3.4
29996,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.377240,2023.0,4.6
29997,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.167063,2019.0,1.8
29998,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.538918,2013.0,3.8


In [41]:
X_train_prepared_df.describe()

Unnamed: 0,cat_low_card__Transmission,cat_low_card__Sales_Classification,cat_high_card__Model_3 Series,cat_high_card__Model_5 Series,cat_high_card__Model_7 Series,cat_high_card__Model_M3,cat_high_card__Model_M5,cat_high_card__Model_X1,cat_high_card__Model_X3,cat_high_card__Model_X5,...,cat_high_card__Color_Red,cat_high_card__Color_Silver,cat_high_card__Color_White,cat_high_card__Fuel_Type_Diesel,cat_high_card__Fuel_Type_Electric,cat_high_card__Fuel_Type_Hybrid,cat_high_card__Fuel_Type_Petrol,num__Mileage_KM,remainder__Year,remainder__Engine_Size_L
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,0.499367,0.698233,0.092067,0.090933,0.093367,0.086233,0.089967,0.091733,0.089667,0.090933,...,0.1679,0.1707,0.166233,0.244533,0.2526,0.252333,0.250533,0.499389,2017.0231,3.248013
std,0.500008,0.459032,0.289125,0.287519,0.29095,0.280713,0.286139,0.288654,0.285708,0.287519,...,0.373784,0.376253,0.372296,0.429817,0.434511,0.434359,0.433327,0.289846,4.318833,1.009715
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2010.0,1.5
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248069,2013.0,2.4
50%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.498652,2017.0,3.2
75%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.752346,2021.0,4.1
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2024.0,5.0
