In [165]:
import pandas as pd
import numpy as np
import joblib

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [166]:
# Imputer for the categorical source columns. clean_ds fits it on the training
# frame and reuses the fitted state to transform the test frame.
cat_imputer = SimpleImputer(strategy='most_frequent')

def clean_ds(df, df_test):
    """Impute missing categorical values and align the training rows with the test set.

    Missing values in the categorical source columns are replaced using the
    module-level ``cat_imputer`` (fitted on ``df``, then applied to
    ``df_test``). Training rows whose ``Rodzaj_paliwa`` value does not occur
    in the imputed test frame are dropped.

    Both input frames have their categorical columns overwritten in place.

    Args:
        df: Training frame containing the categorical source columns.
        df_test: Test frame containing the same categorical columns.

    Returns:
        tuple: ``(df, df_test)`` where ``df`` is the filtered training frame
        with a fresh 0..n-1 index and ``df_test`` is the (mutated) test frame.
    """
    cat_source_columns = ["Stan", "Rodzaj_paliwa", "Naped", "Skrzynia_biegow", "Typ_nadwozia"]

    # Column assignment aligns on index labels, so the imputed values must carry
    # the index of the frame they are written back to. Without it, any frame
    # that is not indexed 0..n-1 would receive NaNs or misplaced values.
    df[cat_source_columns] = pd.DataFrame(
        cat_imputer.fit_transform(df[cat_source_columns]),
        columns=cat_source_columns,
        index=df.index,
    )
    df_test[cat_source_columns] = pd.DataFrame(
        cat_imputer.transform(df_test[cat_source_columns]),
        columns=cat_source_columns,
        index=df_test.index,
    )

    # Keep only training rows whose fuel type also appears in the test data.
    test_categories = df_test["Rodzaj_paliwa"].unique()
    df = df[df["Rodzaj_paliwa"].isin(test_categories)].reset_index(drop=True)
    return df, df_test

In [167]:
# Preprocessing objects shared by the training and test passes of prepare_data.
# `imputer` fills missing numeric values with the column median; `scaler` is
# created here but is not referenced by any other cell visible in this notebook.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

# Category -> price-cluster lookups. prepare_data replaces these when it is
# run on training data and reads them when it is run on test data.
mark_mapper = {}
model_mapper = {}
fuel_mapper = {}
body_mapper = {}

def create_mapper(df, source_column, result_column, clusters, mapper):
    """Group the categories of ``source_column`` into price clusters.

    The mean ``Cena`` is computed per category, the means are clustered with
    KMeans, and each category is mapped to its cluster label rendered as a
    string. The labels are whatever ``KMeans.fit_predict`` returns. As a side
    effect, ``df[result_column]`` is set to the cluster label of every row.

    Args:
        df: Training frame with ``source_column`` and ``Cena``; mutated in place.
        source_column: Name of the categorical column to cluster.
        result_column: Name of the column that receives the cluster labels.
        clusters: Requested number of clusters. Capped at the number of
            distinct categories, because KMeans cannot fit more clusters than
            it has samples.
        mapper: Ignored; kept only so existing call sites keep working. A new
            dict is always built and returned.

    Returns:
        dict: Mapping from category value to cluster label (as ``str``).
    """
    avg_price = df.groupby(source_column)['Cena'].mean().reset_index()

    # One sample per category: asking for more clusters than categories would
    # make KMeans raise, so fall back to one cluster per category.
    n_clusters = min(clusters, len(avg_price))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=5)

    avg_price['cluster'] = kmeans.fit_predict(avg_price[['Cena']])
    avg_price['cluster_str'] = avg_price['cluster'].astype(str)

    category_to_cluster = dict(zip(avg_price[source_column], avg_price['cluster_str']))
    df[result_column] = df[source_column].map(category_to_cluster)
    return category_to_cluster

def prepare_data(df, is_train_data):
    """Turn a cleaned ads frame into a model-ready feature matrix.

    Numeric columns are median-imputed with the module-level ``imputer``.
    Make, model, fuel type and body type are replaced by price-cluster labels,
    and the categorical columns are one-hot encoded.

    On training data the imputer and the four cluster mappers are (re)fitted
    and stored in module globals, together with the resulting feature column
    list. On test data the stored state is reused, and the result is reindexed
    to the training columns so the model receives exactly the features it was
    fitted on.

    ``df`` is modified in place: ``Przebieg_km`` is set to 1 for rows whose
    ``Stan`` is ``'New'``, and the ``generation_set`` and ``*_cluster``
    columns are added.

    Args:
        df: Frame produced by ``clean_ds``.
        is_train_data: ``True`` to fit the preprocessing state and return the
            target as well; ``False`` to only transform.

    Returns:
        ``(X, y)`` when ``is_train_data`` is true, otherwise ``X_test``.
    """
    global mark_mapper, model_mapper, fuel_mapper, body_mapper, train_feature_columns

    df.loc[df['Stan'] == 'New', 'Przebieg_km'] = 1
    # Flag whether the listing specifies a vehicle generation at all.
    df["generation_set"] = df["Generacja_pojazdu"].notna()

    cat_result_columns = ["Stan", "Naped", "Skrzynia_biegow", "mark_cluster", "model_cluster", "generation_set", "fuel_cluster", "body_cluster"]
    numerical_cols = ["Przebieg_km", "Rok_produkcji", "Liczba_drzwi", "Moc_KM", "Pojemnosc_cm3", "Emisja_CO2",]

    if is_train_data:
        # Carry df's index so the index-based join below lines rows up even
        # when df is not indexed 0..n-1.
        X_num = pd.DataFrame(
            imputer.fit_transform(df[numerical_cols]),
            columns=numerical_cols,
            index=df.index,
        )

        mark_mapper = create_mapper(df, "Marka_pojazdu", "mark_cluster", 5, mark_mapper)
        model_mapper = create_mapper(df, "Model_pojazdu", "model_cluster", 5, model_mapper)
        fuel_mapper = create_mapper(df, "Rodzaj_paliwa", "fuel_cluster", 3, fuel_mapper)
        body_mapper = create_mapper(df, "Typ_nadwozia", "body_cluster", 3, body_mapper)

        X = X_num.join(pd.get_dummies(df[cat_result_columns]))
        # Remember the training layout so test features can be aligned to it.
        train_feature_columns = X.columns
        y = df.Cena
        return X, y

    X_num = pd.DataFrame(
        imputer.transform(df[numerical_cols]),
        columns=numerical_cols,
        index=df.index,
    )
    # Categories that were not seen during training map to NaN here.
    df["mark_cluster"] = df['Marka_pojazdu'].map(mark_mapper)
    df["model_cluster"] = df['Model_pojazdu'].map(model_mapper)
    df["fuel_cluster"] = df['Rodzaj_paliwa'].map(fuel_mapper)
    df["body_cluster"] = df['Typ_nadwozia'].map(body_mapper)

    X_test = X_num.join(pd.get_dummies(df[cat_result_columns]))

    # One-hot encoding the test frame on its own may produce fewer, extra or
    # differently ordered columns than training. Align to the training layout:
    # dummies absent from the test data become all-False, unseen ones are dropped.
    expected_columns = globals().get("train_feature_columns")
    if expected_columns is not None:
        X_test = X_test.reindex(columns=expected_columns, fill_value=False)
    return X_test

In [168]:
# Raw input files, relative to the notebook location.
path = "../data/raw/sales_ads_train.csv"
path_test = "../data/raw/sales_ads_test.csv"

# Read the training file first, then the test file, and hand both straight
# to clean_ds.
df, df_test = clean_ds(pd.read_csv(path), pd.read_csv(path_test))

In [169]:
# Number of training rows left after clean_ds.
len(df.index)

135395

In [170]:
# Fit the imputer and cluster mappers on the training frame and build the
# feature matrix together with the price target.
X, y = prepare_data(df, is_train_data=True)
# pd.DataFrame.to_csv(X, "../data/processed/X.csv", index=False)

In [171]:
# X = pd.read_csv("../data/processed/X.csv")
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
# NOTE: the *_cluster features were derived by prepare_data from mean 'Cena'
# over the whole training frame, i.e. before this split, so the validation
# rows contributed to them and the scores below may be optimistic.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

In [172]:
# Score the model on the hold-out split.
y_pred = model.predict(X_val)
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print(
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

Root Mean Squared Error (RMSE): 30463.20
R² Score: 0.8817


In [173]:
# Absolute validation error per row, indexed like y_val.
errors = (y_val - y_pred).abs()

# Actual value, prediction and error side by side, all divided by 1000 for
# readability.
df_errors = pd.DataFrame({
    'y_val': y_val,
    'y_pred': y_pred,
    'error': errors,
}) / 1000

# Ten worst predictions first.
df_errors_sorted = df_errors.sort_values(by='error', ascending=False)
df_errors_sorted.head(10)

Unnamed: 0,y_val,y_pred,error
105825,94.0,1892.46191,1798.46191
53598,1799.9,655.66763,1144.23237
32581,1300.0,169.74908,1130.25092
47490,1500.0,486.443969,1013.556031
9072,1800.0,903.99247,896.00753
100988,968.0,122.66291,845.33709
68103,1500.0,674.83206,825.16794
130129,79.999,766.88759,686.88859
86141,1299.0,626.2669,672.7331
36841,780.0,137.79642,642.20358


In [174]:
# Re-read the training CSV to inspect original rows; `df` has been modified
# in place by clean_ds and prepare_data since it was loaded.
df_error = pd.read_csv(path)

In [175]:
# Show the raw record at index label 76328.
# NOTE(review): the index labels in df_errors_sorted come from the cleaned
# frame, which clean_ds filters and then re-indexes. They match raw CSV row
# positions only if clean_ds dropped no rows — confirm before reading a raw
# row as "the" mispredicted listing.
df_error[df_error.index == 76328]

Unnamed: 0,ID,Cena,Waluta,Stan,Marka_pojazdu,Model_pojazdu,Wersja_pojazdu,Generacja_pojazdu,Rok_produkcji,Przebieg_km,...,Skrzynia_biegow,Typ_nadwozia,Liczba_drzwi,Kolor,Kraj_pochodzenia,Pierwszy_wlasciciel,Data_pierwszej_rejestracji,Data_publikacji_oferty,Lokalizacja_oferty,Wyposazenie
76328,76329,2583000,PLN,,Porsche,911,,991 (2011-2018),2019.0,40.0,...,Automatic,coupe,2.0,white,Poland,Yes,12/11/2019,30/04/2021,"Warszawska 67 - 61-028 Poznań, Nowe Miasto (Po...",[]


In [176]:
# Refit the same forest configuration on every training row (training and
# validation splits together) before predicting on the test set.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X, y)
# joblib.dump(model, "../models/random_forest.joblib")

In [None]:
# Build test features with the imputer and cluster mappers fitted on the
# training data.
X_test = prepare_data(df_test, False)

# One-hot encoding the test frame on its own can give a different column set
# or order than the training matrix. Align to X so the model receives exactly
# the features it was fitted on: dummies missing from the test data become
# all-False columns, and columns never seen in training are dropped.
X_test = X_test.reindex(columns=X.columns, fill_value=False)

y_pred_test = model.predict(X_test)

set()


In [178]:
# Use the identifiers shipped with the test file when it has an 'ID' column,
# so each prediction stays attached to its own listing. clean_ds does not drop
# or reorder test rows, so the column lines up with y_pred_test.
# Fall back to sequential 1..n numbering when no such column exists.
if "ID" in df_test.columns:
    test_ids = df_test["ID"].to_numpy()
else:
    test_ids = np.arange(1, len(y_pred_test) + 1)

df_output = pd.DataFrame({
    'ID': test_ids,
    'Cena': y_pred_test
})

df_output.to_csv("../results/predictions.csv", index=False)