In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the AutoScout24 Germany listings CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="data/autoscout24-germany-dataset.csv")

In [2]:
# Discard every row that has at least one missing value in any column.
df = df.dropna(axis=0, how="any")

In [3]:
def drop_make(df, column='make', threshold=25, drop_value=('Porsche', 'Aston')):
    """Keep only rows whose ``column`` value is frequent enough and not excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is only read; a new frame is returned.
    column : str
        Column whose values are counted and filtered on.
    threshold : int
        Minimum number of rows a value of ``column`` must have to be kept.
    drop_value : str, iterable of str, or None
        Value(s) of ``column`` to remove regardless of how often they occur.
        A single string is treated as one value; ``None`` removes nothing.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, in their original order and with their original
        index labels.
    """
    # Count the rows per value once and broadcast that count back onto every
    # row, instead of calling a Python lambda for each group. Rows whose value
    # is missing receive no count (NaN) and therefore fail the comparison, so
    # they are removed just like groupby(...).filter(...) removed them.
    occurrences = df[column].map(df[column].value_counts())
    frequent_enough = occurrences.ge(threshold)

    # Normalise drop_value so that a bare string is not rejected by isin()
    # and None simply means "exclude nothing".
    if drop_value is None:
        excluded = []
    elif isinstance(drop_value, str):
        excluded = [drop_value]
    else:
        excluded = list(drop_value)

    not_excluded = ~df[column].isin(excluded)
    return df[frequent_enough & not_excluded]

# Apply the make filter with its default column, threshold and exclusion list.
df = drop_make(df=df)

In [4]:
def drop_model(df, column='model', threshold=5):
    """Keep only rows whose ``column`` value occurs at least ``threshold`` times.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is only read; a new frame is returned.
    column : str
        Column whose values are counted.
    threshold : int
        Minimum number of rows a value of ``column`` must have to be kept.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, in their original order and with their original
        index labels.
    """
    # Vectorised equivalent of groupby(column).filter(lambda g: len(g) >= threshold):
    # count once per value, map the count onto each row, then compare. Rows with a
    # missing value get NaN as their count and are dropped by the comparison.
    occurrences = df[column].map(df[column].value_counts())
    return df[occurrences.ge(threshold)]

# Apply the model filter with its default column and threshold.
df = drop_model(df=df)

In [5]:
# Keep only listings whose fuel is 'Gasoline' or 'Diesel'.
fuel_is_kept = df['fuel'].isin(['Gasoline', 'Diesel'])
df = df[fuel_is_kept]

In [6]:
# Keep only listings whose gear is 'Manual' or 'Automatic'. The gear column
# itself is dropped in a later cell, so this only restricts which rows remain.
gear_is_kept = df['gear'].isin(['Manual', 'Automatic'])
df = df[gear_is_kept]

In [7]:
# Remove the columns that are not used as model features, in a single call.
df = df.drop(columns=['offerType', 'hp', 'gear'])

In [8]:
# Keep listings priced at 100000 or less.
price_within_cap = df['price'].le(100000)
df = df[price_within_cap]

In [9]:
# Keep listings with a mileage of 300000 or less.
mileage_within_cap = df['mileage'].le(300000)
df = df[mileage_within_cap]

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Split the cleaned frame into the feature table and the regression target.
X = df.drop(columns=['price'])
y = df['price']

# Columns that are one-hot encoded; every other column is passed through unchanged.
categorical_columns = ['make', 'model', 'fuel']

# handle_unknown='ignore' makes transform() encode a category that was not seen
# during fit as all zeros instead of raising. This matters because the fitted
# preprocessor is pickled in a later cell for reuse on new inputs; the encoding
# of the data it is fitted on here is not affected.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'
)
X_processed = preprocessor.fit_transform(X)

# Fitted encoder, kept so that its categories_ can be read in a later cell.
one_hot_encoders = preprocessor.named_transformers_['cat']

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameter values to try for the random forest.
param_grid = dict(
    n_estimators=[50],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 3],
)

regressor = RandomForestRegressor(random_state=42)

# 3-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Estimator selected by the search; used for evaluation and saving below.
model = grid_search.best_estimator_

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [None]:
# Report the winning hyper-parameter combination and its cross-validation score.
for label, value in (
    ("Best parameters found: ", grid_search.best_params_),
    ("Best score found: ", grid_search.best_score_),
):
    print(label, value)

In [12]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Score the selected model on the held-out test split.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
# The square root puts the error back into the units of the target (price).
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# The reported figure is the root of the MSE, so it is labelled as RMSE.
print("Root Mean Squared Error:", rmse)
print("R^2 Score:", r2)

Mean Squared Error: 3863.0596885210143
R^2 Score: 0.891761547045046


In [13]:
importances = model.feature_importances_

# The importance vector is read as three consecutive one-hot groups
# (make, model, fuel) followed by the last two columns.
n_make = len(one_hot_encoders.categories_[0])
n_model = len(one_hot_encoders.categories_[1])
model_start = n_make
fuel_start = n_make + n_model

# Sum the per-dummy importances of each categorical column into one number.
make_importance = sum(importances[:model_start])
model_importance = sum(importances[model_start:fuel_start])
fuel_importance = sum(importances[fuel_start:-2])

# NOTE(review): assumes the two trailing columns are mileage then year, in
# that order — confirm against the column order of the feature table.
mileage_importance, year_importance = importances[-2], importances[-1]

feature_importances = list(zip(
    ('mileage', 'make', 'model', 'fuel', 'year'),
    (mileage_importance, make_importance, model_importance, fuel_importance, year_importance),
))

# Most important feature first.
sorted_feature_importances = sorted(
    feature_importances,
    key=lambda pair: pair[1],
    reverse=True,
)

print("\nFeature Importances:")
for feature, importance in sorted_feature_importances:
    print(f"{feature}: {importance}")


Feature Importances:
year: 0.3657515912219595
model: 0.25684136061650703
make: 0.22538208722669043
mileage: 0.10174981400985797
fuel: 0.05027514692498501


In [14]:
import pickle

# Bundle the selected model with the fitted preprocessor so both can be
# restored together from one file.
data = dict(model=model, preprocessor=preprocessor)

with open('saved_model.pkl', mode='wb') as file:
    pickle.dump(data, file)