In [88]:
import pandas as pd
import numpy as np
import joblib

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [89]:
# Shared preprocessing state. The imputers are fitted by prepare_data on the
# training data and reused unchanged when preparing the test data.
scaler = StandardScaler()  # instantiated here; not referenced by the visible cells
imputer = SimpleImputer(strategy='median')  # numeric columns
cat_imputer = SimpleImputer(strategy='most_frequent')  # categorical columns

# Category value -> price-cluster label; filled in by prepare_data (training run).
mark_mapper, model_mapper, fuel_mapper, body_mapper = {}, {}, {}, {}

def create_mapper(df, source_column, result_column, clusters, mapper):
    """Bucket the categories of ``source_column`` into price clusters.

    The mean of ``Cena`` is computed per category, the means are clustered
    with KMeans, and every category is mapped to its cluster label rendered
    as a string. The labels are also written to ``df[result_column]`` in place.

    Args:
        df: Frame holding ``source_column`` and ``Cena``; gains ``result_column``.
        source_column: Categorical column whose values are clustered.
        result_column: Column that receives the per-row cluster label.
        clusters: Number of KMeans clusters.
        mapper: Not read by this function; kept so existing call sites work.

    Returns:
        dict mapping each category value to its cluster label (as ``str``).
    """
    mean_prices = df.groupby(source_column)['Cena'].mean().reset_index()
    labels = KMeans(n_clusters=clusters, random_state=42, n_init=5).fit_predict(
        mean_prices[['Cena']]
    )
    # Labels are stored as strings so get_dummies later treats them as categories.
    category_to_cluster = {
        category: str(label)
        for category, label in zip(mean_prices[source_column], labels)
    }
    df[result_column] = df[source_column].map(category_to_cluster)
    return category_to_cluster

def prepare_data(path, is_train_data):
    """Load a sales-ads CSV and turn it into the model's feature matrix.

    Steps: set mileage to 1 for rows whose ``Stan`` is ``'New'``, flag whether
    a vehicle generation is given, impute categorical (most frequent) and
    numeric (median) columns, replace brand / model / fuel / body type by
    price-cluster labels, and one-hot encode the categorical features.

    On a training run the module-level imputers are fitted and the four
    module-level mappers (``mark_mapper``, ``model_mapper``, ``fuel_mapper``,
    ``body_mapper``) are rebuilt from the prices in the file. On a
    non-training run the already fitted imputers and mappers are only applied,
    so a training run has to happen first.

    Args:
        path: Location of the CSV file to read.
        is_train_data: ``True`` to fit the preprocessing state and also return
            the target; ``False`` to only apply the existing state.

    Returns:
        ``(X, y)`` when ``is_train_data`` is true, where ``y`` is the ``Cena``
        column; otherwise just the feature frame. The one-hot columns depend
        on the categories present in the file, so the caller has to align test
        columns with the training columns.
    """
    global mark_mapper, model_mapper, fuel_mapper, body_mapper

    cat_source_columns = ["Stan", "Rodzaj_paliwa", "Naped", "Skrzynia_biegow", "Typ_nadwozia"]
    cat_result_columns = ["Stan", "Naped", "Skrzynia_biegow", "mark_cluster", "model_cluster",
                          "generation_set", "fuel_cluster", "body_cluster"]
    numerical_cols = ["Przebieg_km", "Rok_produkcji", "Liczba_drzwi", "Moc_KM", "Pojemnosc_cm3", "Emisja_CO2"]

    df = pd.read_csv(path)
    # Runs before imputation, so only rows explicitly marked 'New' are affected.
    df.loc[df['Stan'] == 'New', 'Przebieg_km'] = 1
    df["generation_set"] = df["Generacja_pojazdu"].notna()

    # Fit the imputers only on training data; otherwise reuse the fitted ones.
    if is_train_data:
        cat_values = cat_imputer.fit_transform(df[cat_source_columns])
        num_values = imputer.fit_transform(df[numerical_cols])
    else:
        cat_values = cat_imputer.transform(df[cat_source_columns])
        num_values = imputer.transform(df[numerical_cols])
    # Pass the index explicitly so the imputed values line up with df's rows.
    df[cat_source_columns] = pd.DataFrame(cat_values, columns=cat_source_columns, index=df.index)
    X_num = pd.DataFrame(num_values, columns=numerical_cols, index=df.index)

    if is_train_data:
        # create_mapper also writes the cluster column into df.
        mark_mapper = create_mapper(df, "Marka_pojazdu", "mark_cluster", 5, mark_mapper)
        model_mapper = create_mapper(df, "Model_pojazdu", "model_cluster", 5, model_mapper)
        fuel_mapper = create_mapper(df, "Rodzaj_paliwa", "fuel_cluster", 3, fuel_mapper)
        body_mapper = create_mapper(df, "Typ_nadwozia", "body_cluster", 3, body_mapper)
    else:
        # Each column must be translated with the mapper built from that same
        # column; categories unseen during training map to NaN.
        df["mark_cluster"] = df["Marka_pojazdu"].map(mark_mapper)
        df["model_cluster"] = df["Model_pojazdu"].map(model_mapper)
        df["fuel_cluster"] = df["Rodzaj_paliwa"].map(fuel_mapper)
        df["body_cluster"] = df["Typ_nadwozia"].map(body_mapper)

    features = X_num.join(pd.get_dummies(df[cat_result_columns]))
    if is_train_data:
        return features, df.Cena
    return features

In [90]:
# Build the training features and target from the raw ads, then persist the features.
path = "../data/raw/sales_ads_train.csv"
X, y = prepare_data(path, is_train_data=True)
X.to_csv("../data/processed/X.csv", index=False)

In [91]:
# Reload the feature matrix that the previous cell wrote to disk; y is still
# the target returned by prepare_data.
X = pd.read_csv("../data/processed/X.csv")
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# NOTE(review): prepare_data fitted the imputers and the price-cluster mappers
# on all rows (validation rows and their prices included), so the validation
# score is likely optimistic.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [92]:
# Score the model on the hold-out split.
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the units of the target
r2 = r2_score(y_val, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

Root Mean Squared Error (RMSE): 28363.38
R² Score: 0.8970


In [93]:
# Absolute validation errors; the table is scaled to thousands for readability.
errors = (y_val - y_pred).abs()
df_errors = pd.DataFrame({
    'y_val': y_val,
    'y_pred': y_pred,
    'error': errors,
}) / 1000
# Worst predictions first.
df_errors_sorted = df_errors.sort_values(by='error', ascending=False)
df_errors_sorted.head(10)

Unnamed: 0,y_val,y_pred,error
76328,2583.0,991.90471,1591.09529
8924,225.0,1453.27227,1228.27227
47492,1500.0,539.441554,960.558446
59503,209.0,1074.98249,865.98249
46345,2100.0,1307.35966,792.64034
53600,1799.9,1036.21917,763.68083
47174,239.0,808.73706,569.73706
81809,1569.0,1074.51534,494.48466
96603,176.0,659.76098,483.76098
92309,157.9,635.48463,477.58463


In [94]:
# Reload the raw training ads (path was set when the features were built) so
# individual rows can be inspected with all their original columns.
df = pd.read_csv(path)

In [95]:
# Row 76328 tops the error table above: show the raw ad behind it.
df.loc[df.index == 76328]

Unnamed: 0,ID,Cena,Waluta,Stan,Marka_pojazdu,Model_pojazdu,Wersja_pojazdu,Generacja_pojazdu,Rok_produkcji,Przebieg_km,...,Skrzynia_biegow,Typ_nadwozia,Liczba_drzwi,Kolor,Kraj_pochodzenia,Pierwszy_wlasciciel,Data_pierwszej_rejestracji,Data_publikacji_oferty,Lokalizacja_oferty,Wyposazenie
76328,76329,2583000,PLN,,Porsche,911,,991 (2011-2018),2019.0,40.0,...,Automatic,coupe,2.0,white,Poland,Yes,12/11/2019,30/04/2021,"Warszawska 67 - 61-028 Poznań, Nowe Miasto (Po...",[]


In [96]:
# Refit with the same hyperparameters on every training row (no hold-out) and
# persist the fitted model; this rebinds `model`, replacing the validation fit.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)
joblib.dump(model, "../models/random_forest.joblib")

['../models/random_forest.joblib']

In [97]:
# Prepare the test ads with the preprocessing state fitted on the training data.
path_test = "../data/raw/sales_ads_test.csv"
X_test = prepare_data(path_test, False)
# One-hot columns depend on the categories present in each file, so align the
# test frame with the training layout: add absent columns as False, drop
# columns the model never saw, and use the training column order.
missing_cols = set(X.columns).difference(X_test.columns)
X_test = X_test.reindex(columns=X.columns, fill_value=False)
y_pred_test = model.predict(X_test)

In [98]:
# Submission file: 1-based sequential IDs paired with the predicted prices.
# NOTE(review): IDs are generated by position, not read from the test file —
# confirm the test rows are ordered by ID starting at 1.
df_output = pd.DataFrame({
    'ID': np.arange(len(y_pred_test)) + 1,
    'Cena': y_pred_test,
})

df_output.to_csv("../results/predictions.csv", index=False)