In [92]:
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoLars
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion, Pipeline


## Load & prepare data

In [2]:
df_raw = pd.read_csv('car4you_suv_all_final.csv', delimiter=';')
df_raw.head()

Unnamed: 0,web-scraper-order,web-scraper-start-url,price_raw,marke_modell_raw,baujahr_km_treibstoff_getriebe_raw,properties_raw,garage_raw,garage_place_raw,description_raw
0,1652200447-14927,https://www.carforyou.ch/de/auto/suv?page=1205,,,,,,,
1,1652197876-3840,https://www.carforyou.ch/de/auto/suv?page=1667,,,,,,,
2,1652201197-18131,https://www.carforyou.ch/de/auto/suv?page=1071,CHF 80’500,Audi Q5 Sportback 45 TFSI Black Edition quattr...,2021 · 2’900 km · Elektro/Benzin · Automatik,pricecheckMarktpreis,,,
3,1652197512-2221,https://www.carforyou.ch/de/auto/suv?page=1734,CHF 39’990,Audi Q2 35 TDI quattro S-tronic,2020 · 10’000 km · Diesel · Automatik,pricecheckÜber dem Marktpreis,,,[YS] schwarz - felsgrau[GB1] LTE-Unterstützung...
4,1652204391-31474,https://www.carforyou.ch/de/auto/suv?page=515,CHF 73’000,Jaguar E-Pace 2.0 I4 200 R-Dynamic Black AWD,2022 · 45 km · Elektro/Benzin · Automatik,pricecheckÜber dem MarktpreisumbrellaKäuferschutz,Emil Frey Zürich Altstetten,"8048 Zürich, Zürich",


In [33]:
from typing import Dict
import re


def extract_regex(pattern: str, target: str) -> str:
    if not pd.isnull(target) and re.search(pattern, target):
        return re.findall(pattern, target)[0]
    return np.NaN


def extract_manufacturer(cell_content) -> str:
    if not pd.isnull(cell_content):
        return cell_content.split(' ')[0]
    return np.NaN


def extract_year(cell_content: str) -> int:
    year_str = extract_regex(r'[12][0-9]{3}', cell_content)
    return np.NaN if pd.isnull(year_str) else int(year_str)


def extract_km(cell_content: str) -> int:
    km_str = extract_regex(r'\d*’*\d+ km', cell_content)
    if pd.isnull(km_str):
        return np.NaN
    km = km_str.replace('’', '').replace(' km', '')
    return int(km)


def extract_fuel(cell_content: str) -> str:
    if "Elektro/Benzin" in cell_content or "Elektro/Diesel" in cell_content:
        return "Hybrid"
    if "Benzin" in cell_content:
        return "Benzin"
    if "Diesel" in cell_content:
        return "Diesel"
    if "Elektro" in cell_content:
        return "Electro"

    return np.NaN

def extract_transmission(cell_content: str) -> str:
    if "Automatik" in cell_content:
        return "Automate"
    if "Manuell" in cell_content:
        return "Manuel"
    return np.NaN

def extract_price(cell_content: str) -> str:
    if pd.isnull(cell_content):
        return np.NaN
    return str(cell_content).replace('CHF ', '').replace('’', '')

def extract_price_check(cell_content: str) -> str:
    if "Über dem Marktpreis" in cell_content:
        return "Above market"
    if "Unter dem Marktpreis" in cell_content:
        return "Below market"
    if "Marktpreis" in cell_content:
        return "Market"
    return np.NaN

def row_to_dict(row: pd.Series) -> Dict:
    row_dict = {
        'manufacturer': extract_manufacturer(row[3]),
        'year': extract_year(str(row[4])),
        'km': extract_km(str(row[4])),
        'fuel': extract_fuel(str(row[4])),
        'transmission': extract_transmission(str(row[4])),
        'price': extract_price(row[2]),
        'price_check': extract_price_check(str(row[5])),
    }

    return row_dict


In [37]:
data_raw = []
for index, row in df_raw.iterrows():
    data_raw.append(row_to_dict(row))

df = pd.DataFrame(data_raw)
df.head()

Unnamed: 0,manufacturer,year,km,fuel,transmission,price,price_check
0,,,,,,,
1,,,,,,,
2,Audi,2021.0,2900.0,Hybrid,Automate,80500.0,Market
3,Audi,2020.0,10000.0,Diesel,Automate,39990.0,Above market
4,Jaguar,2022.0,45.0,Hybrid,Automate,73000.0,Above market


In [38]:
len(df)

43897

In [42]:
df = df.dropna()
len(df)

12984

In [57]:
df = df.astype({
    'manufacturer': 'category',
    'year': 'int32',
    'km': 'int32',
    'fuel': 'category',
    'transmission': 'category',
    'price':'int32',
    'price_check': 'category'
})
df.head()

Unnamed: 0,manufacturer,year,km,fuel,transmission,price,price_check
2,1,2021,2900,2,0,80500,Market
3,1,2020,10000,1,0,39990,Above market
4,9,2022,45,2,0,73000,Above market
7,28,2021,23336,0,0,29900,Market
9,2,2019,20400,1,0,36900,Market


In [58]:
df.dtypes

manufacturer    category
year               int32
km                 int32
fuel            category
transmission    category
price              int32
price_check     category
dtype: object

In [59]:
df.describe()

Unnamed: 0,year,km,price
count,12984.0,12984.0,12984.0
mean,2017.849045,51624.138324,40839.75439
std,3.352763,41881.072839,23523.815691
min,1985.0,1.0,3900.0
25%,2016.0,14996.75,25000.0
50%,2018.0,43000.0,35900.0
75%,2020.0,82637.75,49890.0
max,2023.0,244000.0,259900.0


In [60]:
df.manufacturer = df.manufacturer.cat.codes
df.transmission = df.transmission.cat.codes
df.fuel = df.fuel.cat.codes
df.price_check = df.price_check.cat.codes

In [67]:
df.price_check.value_counts()

1    7583
0    5401
Name: price_check, dtype: int64

## Outlier analysis

## Data preparation

In [73]:
X = df.drop(columns='price')
y = df.price


## Price prediction

In [85]:
class BenchmarkRegressor:
    def __init__(self):
        pass

    def fit(self, X, y, **kwargs):
        self.mean = y.mean()

    def predict(self, X):
        return [self.mean] * len(X)

    def get_params(self, deep=False):
        return {}

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=21, stratify=df.price_check)

In [90]:
class BenchmarkRegressor:
    def __init__(self):
        pass

    def fit(self, X, y, **kwargs):
        self.mean = y.mean()

    def predict(self, X):
        return [self.mean] * len(X)

    def get_params(self, deep=False):
        return {}

# The following Models are currently computing used the Standard Parameters. Please keep in mind,
# that they need to be optimized using the hyperparameters within the modeling process.

bm_regr = BenchmarkRegressor()
lr_regr = LinearRegression()
ri_regr = Ridge()
br_regr = BayesianRidge()
ls_regr = Lasso()
ll_regr = LassoLars()
en_regr = ElasticNet()
ne_regr = KNeighborsRegressor()
dt_regr = DecisionTreeRegressor()
rf_regr = RandomForestRegressor()
xg_regr = xgb.XGBRegressor()

models = [
    ('Benchmark', bm_regr),
    ('LR', lr_regr),
    ('Ridge', ri_regr),
    ('Bayesian Ridge', br_regr),
    ('Lasso', ls_regr),
    ('LARS Lasso', ll_regr),
    ('Elastic Net', en_regr),
    ('Nearest Neighbors (KNN) regression', ne_regr),
    ('Decision Tree', dt_regr),
    ('Random Forest', rf_regr),
    ('XGBoost', xg_regr),
]

In [93]:

for name, model in models:
    pipelined_model = Pipeline([
        ('pca', PCA(n_components = 4)),
        (name, model)
    ])

    # Training des Models
    pipelined_model.fit(X_train, y_train)

    # Vorhersage des trainierten models auf X_test
    y_hat = pipelined_model.predict(X_test)

    # Berechnung der verschiedenen Messwerte resp. KPI's
    EVS = (explained_variance_score(y_test, y_hat))
    ME = (max_error(y_test, y_hat))
    MAE = (mean_absolute_error(y_test, y_hat))
    MSE = np.sqrt(mean_squared_error(y_test, y_hat, squared=True))
    RMSE = np.sqrt(mean_squared_error(y_test, y_hat, squared=False))
    R2 = r2_score(y_test, y_hat)
    print('Model: ', name, ' | EVS: ', EVS)
    print('Model: ', name, ' | ME: ', ME)
    print('Model: ', name, ' | MAE: ', MAE)
    print('Model: ', name, ' | MSE ', MSE)
    print('Model: ', name, ' | RMSE ', RMSE)
    print('Model: ', name, ' | R2 ', R2)
    print('----------------')


Model:  Benchmark  | EVS:  0.0
Model:  Benchmark  | ME:  209198.93385963223
Model:  Benchmark  | MAE:  17215.943403863104
Model:  Benchmark  | MSE  23877.230271595836
Model:  Benchmark  | RMSE  154.52258822449176
Model:  Benchmark  | R2  -0.0008440165256577981
----------------
Model:  LR  | EVS:  0.25518868161673725
Model:  LR  | ME:  206134.78467719074
Model:  LR  | MAE:  14477.484201493577
Model:  LR  | MSE  20599.704176589537
Model:  LR  | RMSE  143.5259703906911
Model:  LR  | R2  0.2550614161847786
----------------
Model:  Ridge  | EVS:  0.2551907770318995
Model:  Ridge  | ME:  206134.0777601338
Model:  Ridge  | MAE:  14477.484658101212
Model:  Ridge  | MSE  20599.675265388283
Model:  Ridge  | RMSE  143.5258696729906
Model:  Ridge  | R2  0.25506350719094273
----------------
Model:  Bayesian Ridge  | EVS:  0.1825949170847695
Model:  Bayesian Ridge  | ME:  196449.67094845296
Model:  Bayesian Ridge  | MAE:  15599.221677692572
Model:  Bayesian Ridge  | MSE  21580.4704968459
Model:  Bay

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


Model:  Random Forest  | EVS:  0.6737872642885483
Model:  Random Forest  | ME:  142816.34
Model:  Random Forest  | MAE:  8367.090711575109
Model:  Random Forest  | MSE  13635.21320464472
Model:  Random Forest  | RMSE  116.76991566599986
Model:  Random Forest  | R2  0.6736208795035655
----------------
Model:  XGBoost  | EVS:  0.6729767258985558
Model:  XGBoost  | ME:  159862.7265625
Model:  XGBoost  | MAE:  8819.02591246736
Model:  XGBoost  | MSE  13651.48204232706
Model:  XGBoost  | RMSE  116.83955683897068
Model:  XGBoost  | R2  0.672841577185298
----------------
