In [260]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.pipeline import Pipeline, make_pipeline

In [261]:
df = pd.read_csv('../../data/cars/cars.csv')

## Default cleaning

In [262]:
# I assume that the relationships between card brands and types are different and can't be captured by a single model
df = df.drop_duplicates()
df = df.query('Make == "Volkswagen"')
df = df[df['Model'].str.contains('Golf')]
df = df[df['Model'].isin(['Golf', 'Golf GTI', 'Golf Variant', 'Golf GTE', 'Golf Plus'])]

## Train test split

In [263]:
train, validation = train_test_split(df, test_size=0.25, random_state=42)  

## Cleaning

In [264]:
def clean_data(df):
    df = df.copy()

    # Feature names
    # Convert to dates
    date_features = ['Production_Date', 'Build_Year', 'Technical_Inspection', 'Last_Service']
    for feature in date_features:
        df[feature] = pd.to_datetime(df[feature], format='mixed')

    # Engine_Displacement
    def fix_engine_displacement(row):
        if row['Engine_Displacement'] > 10:
            return row['Engine_Displacement'] / 1000
        return row['Engine_Displacement']
    # Remove other energy
    df['Engine_Displacement'] = df.apply(fix_engine_displacement, axis=1)

    # Fuel Electric -> drop
    df = df[df['Fuel'] != 'Electric']

    # Category
    df['Category'] = df['Category'].replace(
        {'New Registered': "New", 
        'Classic': "Used", 
        'Demo': "Used", 
    })

    # Doors
    df['Doors'] = df['Doors'].replace(
        {2: 3, 
        4: 5, 
    })  

    # Rare Body types: Remove
    rare_body_types = ['SUV/Off-Road/Pick-Up', 'Overig', 'Cabrio', 'Sedan']
    df = df[~df['Body_Type'].isin(rare_body_types)]

    # Drive_Type: missing -> Front
    df['Drive_Type'] = df['Drive_Type'].replace(np.nan, 'Front')

    # Transmission: Semi-automatic -> Automatic
    df['Transmission'] = df['Transmission'].replace('Semi-automatic', 'Automatic')

    current_date = pd.to_datetime('2025-07-01')
    df['Age_Months'] = (current_date.year - df['Production_Date'].dt.year) * 12 + (current_date.month - df['Production_Date'].dt.month)

    del df['Make']
    del df['Seats']
    del df['Non_Smoker_Car']
    del df['Warranty']
    del df['Last_Service']
    del df['Electric_Range']
    del df['Other_Energy_Sources']
    del df['Production_Year']
    del df['Build_Year']
    del df['Technical_Inspection']
    del df['Production_Date']

    categorical_features = ['Model', 'Color', 'Body_Type', 'Drive_Type',
       'Transmission', 'Emission_Class', 'Fuel', 'Category',
       'Interior_Material', 'Interior_Color', 'Paint_Type', 'Doors']
    
    # Convert to categorical
    for feature in categorical_features:
        df[feature] = df[feature].astype('category')

    return df

In [265]:
train_clean = clean_data(train)
validation_clean = clean_data(validation)

In [266]:
train_clean

Unnamed: 0,Model,Color,Body_Type,Drive_Type,Transmission,Mileage,Emission_Class,Power,Engine_Displacement,Fuel,...,Gears,Cylinders,Curb_Weight,CO2_Emission,Interior_Material,Price,Interior_Color,Paint_Type,Fuel_Consumption,Age_Months
253,Golf,Blue,Hatchback,Front,Manual,89961.0,Euro 4,55.0,1.390,Petrol,...,,4.0,1065.0,161.0,,2450.0,Black,Metallic,,284
891,Golf,Black,Hatchback,Front,Manual,253000.0,Euro 5,90.0,1.390,Petrol,...,,4.0,1190.0,144.0,,3999.0,,,,189
9306,Golf GTI,White,Hatchback,Front,Automatic,253775.0,Euro 5,155.0,1.984,Petrol,...,6.0,4.0,1314.0,173.0,Fabric,7945.0,Black,,,186
11,Golf,Black,Hatchback,Front,Manual,168000.0,Euro 2,74.0,1.595,Petrol,...,5.0,4.0,1049.0,185.0,,1250.0,,Other,7.7,313
257,Golf,Blue,Hatchback,Front,Manual,303103.0,Euro 4,77.0,1.595,Petrol,...,5.0,4.0,1107.0,166.0,Velour,1250.0,Beige,Metallic,,281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3742,Golf Variant,Black,Stationwagen,Front,Automatic,178342.0,Euro 6,85.0,0.999,Petrol,...,,3.0,1210.0,103.0,Fabric,10445.0,Black,,,106
896,Golf,Black,Stationwagen,Front,Manual,154000.0,,59.0,,Petrol,...,5.0,,,,Fabric,4500.0,Black,,,192
1730,Golf,Black,Hatchback,Front,Automatic,205000.0,,125.0,,Diesel,...,,,,,,7999.0,Black,Metallic,,168
7347,Golf,Black,Hatchback,Front,Automatic,9179.0,Euro 6d-TEMP,150.0,1.498,Hybrid Petrol,...,6.0,4.0,1549.0,7.0,Fabric,40600.0,Black,Metallic,,4


In [267]:
train_clean.dtypes

Model                  category
Color                  category
Body_Type              category
Drive_Type             category
Transmission           category
Mileage                 float64
Emission_Class         category
Power                   float64
Engine_Displacement     float64
Fuel                   category
Category               category
Doors                  category
Gears                   float64
Cylinders               float64
Curb_Weight             float64
CO2_Emission            float64
Interior_Material      category
Price                   float64
Interior_Color         category
Paint_Type             category
Fuel_Consumption        float64
Age_Months                int32
dtype: object

In [268]:
train_clean.columns

Index(['Model', 'Color', 'Body_Type', 'Drive_Type', 'Transmission', 'Mileage',
       'Emission_Class', 'Power', 'Engine_Displacement', 'Fuel', 'Category',
       'Doors', 'Gears', 'Cylinders', 'Curb_Weight', 'CO2_Emission',
       'Interior_Material', 'Price', 'Interior_Color', 'Paint_Type',
       'Fuel_Consumption', 'Age_Months'],
      dtype='object')

In [269]:
categorical_features = train_clean.select_dtypes(include=['category']).columns.tolist()
numerical_features = [f for f in train_clean.columns if f not in categorical_features + ['Price']]

In [270]:
categorical_features

['Model',
 'Color',
 'Body_Type',
 'Drive_Type',
 'Transmission',
 'Emission_Class',
 'Fuel',
 'Category',
 'Doors',
 'Interior_Material',
 'Interior_Color',
 'Paint_Type']

In [271]:
numerical_features

['Mileage',
 'Power',
 'Engine_Displacement',
 'Gears',
 'Cylinders',
 'Curb_Weight',
 'CO2_Emission',
 'Fuel_Consumption',
 'Age_Months']

In [272]:
X_train, y_train = train_clean[numerical_features + categorical_features], train_clean['Price']
X_validation, y_validation = validation_clean[numerical_features + categorical_features], validation_clean['Price']

## Encoders

### Ordinal encoder + Simple imputer

Using solely pandas

In [273]:
model = LinearRegression()

# # Encoding categorical features with OrdinalEncoder
encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                         unknown_value=np.nan,
                        #  encoded_missing_value=-1
                         )

encoder.fit(X_train[categorical_features])

X_train_clean = pd.DataFrame(encoder.transform(X_train[categorical_features]), 
                              columns=encoder.get_feature_names_out(), 
                              index=X_train.index)

X_validation_clean = pd.DataFrame(encoder.transform(X_validation[categorical_features]),
                                columns=encoder.get_feature_names_out(), 
                                index=X_validation.index)

# Missing values simple imputer
imputer = SimpleImputer(strategy='most_frequent')

imputer.fit(X_train_clean)

X_train_clean = pd.DataFrame(imputer.transform(X_train_clean), 
                            columns=X_train_clean.columns, 
                            index=X_train_clean.index)

X_validation_clean = pd.DataFrame(imputer.transform(X_validation_clean), 
                                columns=X_validation_clean.columns, 
                                index=X_validation_clean.index)

# Missing values numerical features simple  imputer
imputer_num = SimpleImputer(strategy='mean')

imputer_num.fit(X_train[numerical_features])

X_train_clean_num = pd.DataFrame(imputer_num.transform(X_train[numerical_features]), 
                            columns=numerical_features, 
                            index=X_train.index)

X_validation_clean_num = pd.DataFrame(imputer_num.transform(X_validation[numerical_features]), 
                                columns=numerical_features, 
                                index=X_validation.index)

X_train_clean = X_train_clean.join(X_train_clean_num)
X_validation_clean = X_validation_clean.join(X_validation_clean_num)

model.fit(X_train_clean, y_train)

print(model.score(X_train_clean, y_train),
    model.score(X_validation_clean, y_validation))


0.9010417083880922 0.8718489378305095


In [274]:
preprocessing = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('encoder', OrdinalEncoder(
                         handle_unknown='use_encoded_value',
                         unknown_value=np.nan,)),
            ('imputer', SimpleImputer(strategy='most_frequent'))
        ]), categorical_features),
        ('num', SimpleImputer(strategy='mean'), numerical_features)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', LinearRegression())
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))

0.901041708388092 0.8718489378305105


### One hot

In [275]:
model = LinearRegression()

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')), # With most frequent first now
            ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
        ]), categorical_features),
        ('num',  SimpleImputer(strategy='mean'), 
         numerical_features)
    ])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', model)
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))

0.9200499328430262 0.8803179238585175


Replace simple imputer with handle_unkown with ignore

In [276]:
model = LinearRegression()

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
        ('num', SimpleImputer(strategy='mean'), numerical_features)
    ])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', model)
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))

0.928259041814029 0.8803676140474943


Add treshold

In [277]:
model = LinearRegression()

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore', min_frequency=6)),
        ]), categorical_features),
        ('num', SimpleImputer(strategy='mean'), 
         numerical_features)
    ])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', model)
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))

0.9202987049328966 0.860109667213366


### Target encoding

In [278]:
model = LinearRegression()

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('encoder', TargetEncoder(target_type='continuous')),
        ]), categorical_features),
        ('num', SimpleImputer(strategy='mean'), 
         numerical_features)
    ])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', model)
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))

0.906006071728734 0.8717450651102618


Turn smoothing off

In [281]:
model = LinearRegression()

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('encoder', TargetEncoder(target_type='continuous', smooth=0)),
        ]), categorical_features),
        ('num', SimpleImputer(strategy='mean'), 
         numerical_features)
    ])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', model)
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))

0.9062269096189138 0.8756064352292097


## Imputers

### KNN imputer

In [291]:
model = LinearRegression()

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('encoder', TargetEncoder(target_type='continuous', random_state=42)), # Different random states,
        ]), categorical_features),
        ('num', Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('imputer', KNNImputer()),
        ]), numerical_features),
    ])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', model)
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))

0.9092789833737461 0.8692162428972547


Train the KNN on all features

In [296]:
model = LinearRegression()

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', TargetEncoder(target_type='continuous', random_state=42), categorical_features),
        ('num', StandardScaler(), numerical_features),
    ])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('imputer', KNNImputer()),
    ('model', model)
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))

0.9064733814822454 0.8715218232600326


### Iterative imputer

In [297]:
model = LinearRegression()

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('encoder', TargetEncoder(target_type='continuous', random_state=42)),
        ]), categorical_features),
        ('num', Pipeline(steps=[
            ('scaler', StandardScaler()),
        ]), numerical_features),
    ])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('imputing', IterativeImputer()),
    ('model', model)
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))

0.9073823541699946 0.8610081131936762


Now use random forest instead. Which is more flexible than linear model

In [325]:
from sklearn.ensemble import RandomForestRegressor

model = LinearRegression()

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('encoder', TargetEncoder(target_type='continuous', random_state=42)),
        ]), categorical_features),
        ('num', Pipeline(steps=[
            ('scaler', StandardScaler()),
        ]), numerical_features),
    ])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('imputing', IterativeImputer(estimator=RandomForestRegressor(max_depth=10), random_state=42)),
    ('model', model)
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))

0.918508197706179 0.8915994059398358


Now with one hot encoder

In [110]:
from sklearn.ensemble import RandomForestRegressor

model = LinearRegression()

preprocessing = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore', min_frequency=10)),
        ]), categorical_features),
        ('num', Pipeline(steps=[
            ('scaler', StandardScaler()),
        ]), numerical_features),
    ])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('imputing', IterativeImputer(estimator=RandomForestRegressor(max_depth=10), random_state=42)),
    ('model', model)
])

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train),
    pipeline.score(X_validation, y_validation))



0.9263512086035525 0.884709152845709
