In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import random
import os

In [None]:
SEED = 42 
# Python RNG 
random.seed(SEED) 
# NumPy RNG 
np.random.seed(SEED) 
# Optional: full determinism for sklearn parallel algorithms 
os.environ["PYTHONHASHSEED"] = str(SEED) 
os.environ["OMP_NUM_THREADS"] = "1" 
os.environ["MKL_NUM_THREADS"] = "1"

In [38]:
df = pd.read_csv('../data/data_cleaned.csv')

In [39]:
#usuwanie i wypełnianie pustych danych 

missing_data = (df.isnull().sum()/len(df)) *100
missing_data = missing_data[missing_data>0].sort_values(ascending = False).reset_index()
missing_data.columns = ["Cecha","Procent"]


# do modyfikacji w zależności od ostatecznej potrzeby (jeszcze idk jakie będzią ostatecznie)
# w %
col_cutoff_treshold = 48
col_trim_treshhold = 2


cols_to_drop  = missing_data[missing_data['Procent'] > col_cutoff_treshold]['Cecha'].tolist()
cols_to_trim = missing_data[missing_data['Procent'] < col_trim_treshhold]['Cecha'].tolist()

df.drop(columns=cols_to_drop,inplace=True)
df.dropna(subset=cols_to_trim,inplace=True)

In [None]:
unique_treshold = 0.90
# te zmienne wprowadzały szum wcześniej ale jak dojdę do ogarniania feature engineering to coś z nimi zrobię obiecuje
cols_to_drop  = [col for col in df.select_dtypes(include='object').columns if df[col].nunique()/len(df) > unique_treshold or df[col].nunique() == 1 ]
df.drop(columns=['model','model_version','body_color_original'], inplace=True)

df.drop(columns=cols_to_drop,inplace=True)


In [41]:
target_col = 'price'

# podział kolumn ze względu na typ
numeric_cols = df.select_dtypes(include=["float","int"]).columns.tolist()

bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()


#dopiero w tym miejscu żeby na wykresie było widać
numeric_cols.remove(target_col)

In [42]:
def fill_cols(data,numeric_cols,categorical_cols,stats):
    for col in numeric_cols:
        if col == 'gears' or col == 'nr_prev_owners':
            data[col] = data[col].fillna(0)
        else :
            data[col] = data[col].fillna(stats[col])
    for col,_ in categorical_cols:
        data[col] = data[col].fillna(stats[col])
    return data


def preprocess(data,numeric_cols,categorical_cols,scaler,stats):
    data = fill_cols(data,numeric_cols,categorical_cols,stats)
    data[numeric_cols] = scaler.transform(data[numeric_cols])
    for col,vals in categorical_cols:
        data[col] = pd.Categorical(data[col], categories=vals)
    data = pd.get_dummies(data, drop_first=True)
    return data


In [None]:
class Seeded:
    def __init__(self, estimator):
        self.estimator = estimator

    def __call__(self, *args, **kwargs):
        # Case 1: estimator is a class (e.g., RandomForestRegressor)
        if hasattr(self.estimator, "get_params"):
            # Instantiate a temporary object to inspect parameters
            params = self.estimator().get_params()
            if "random_state" in params and "random_state" not in kwargs:
                kwargs["random_state"] = SEED
            return self.estimator(*args, **kwargs)

        # Case 2: estimator is a function (e.g., train_test_split)
        if hasattr(self.estimator, "__code__"):
            if "random_state" in self.estimator.__code__.co_varnames and "random_state" not in kwargs:
                kwargs["random_state"] = SEED
            return self.estimator(*args, **kwargs)

        # Fallback
        return self.estimator(*args, **kwargs)

In [None]:
X, y = df.drop(target_col, axis=1), df[target_col]

TTS = Seeded(train_test_split)
X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.2)
X_subtest, X_valid, y_subtest, y_valid = TTS(X_test, y_test, test_size=0.5)

scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])

categorical_cols_with_vals = [ (col,[val for val in X_train[col].unique().tolist() if pd.notna(val)] ) for col in categorical_cols]

# to robię by uniknąć data leakage (pewnie poprawię potem by ładniej wyglądało)
stats = {col: X_train[col].median() for col in numeric_cols}
stats.update({col: X_train[col].mode()[0] for col in categorical_cols})

# tutaj gotowe dane do karmienia modelu
X_train_preprocessed = preprocess(X_train,numeric_cols,categorical_cols_with_vals,scaler,stats)
X_test_preprocessed = preprocess(X_test,numeric_cols,categorical_cols_with_vals,scaler,stats)
X_subtest_preprocessed = preprocess(X_subtest,numeric_cols,categorical_cols_with_vals,scaler,stats)

In [45]:
baseline = np.ones(len(y))*np.average(y)
mse_baseline = metrics.mean_squared_error(y, baseline)

clf = linear_model.LinearRegression()
clf.fit(X_train_preprocessed, y_train)

y_pred = clf.predict(X_test_preprocessed)
mse_st = metrics.mean_squared_error(y_test, y_pred)

print(mse_baseline / mse_st)

1.7293326026353828


In [None]:
models = {
    "Linear (double split)": linear_model.LinearRegression(),
    "Ridge": Seeded(linear_model.Ridge)(alpha=0.1),
    "Lasso": Seeded(linear_model.Lasso)(alpha=0.01),
    "kNN": KNeighborsRegressor(n_neighbors=50, weights="distance"),
    "ElasticNet": Seeded(linear_model.ElasticNet)(alpha=0.1, l1_ratio=0.5),
    "RandomForestRegressor": Seeded(RandomForestRegressor)(n_estimators=100, max_depth=20)
}

results = {}
for name, model in models.items():
    model.fit(X_train_preprocessed, y_train)
    y_val_pred = model.predict(X_subtest_preprocessed)
    mse_val_best = metrics.mean_squared_error(y_subtest, y_val_pred)
    results[name] = mse_val_best
    print(f"{name}: MSE walidacja = {mse_val_best:.4f}")

best_model_name = min(results, key=results.get)
print(f"Najlepszy model: {best_model_name}")

y_pred = models[best_model_name].predict(X_test_preprocessed)
mse_val_best = metrics.mean_squared_error(y_test, y_pred)
print(mse_st / mse_val_best)

rmse = np.sqrt(mse_val_best)
mae  = metrics.mean_absolute_error(y_test, y_pred)
r2   = metrics.r2_score(y_test, y_pred)
mape = metrics.mean_absolute_percentage_error(y_test, y_pred)

print("\n=== Ewaluacja najlepszego modelu (test) ===")
print(f"MSE:   {mse_val_best:.2f}")
print(f"RMSE:  {rmse:.2f}")
print(f"MAE:   {mae:.2f}")
print(f"R2:    {r2:.4f}")
print(f"MAPE:  {mape*100:.2f}%")

Linear (double split): MSE walidacja = 3714596510.2884
Ridge: MSE walidacja = 3732292913.2306


  model = cd_fast.enet_coordinate_descent(


Lasso: MSE walidacja = 3714604547.3699
kNN: MSE walidacja = 5418547384.7756
ElasticNet: MSE walidacja = 8786372685.9591
RandomForestRegressor: MSE walidacja = 3646416435.8114
Najlepszy model: RandomForestRegressor
1.0426585540942201


In [None]:
### Log-ification of database

results_log = {}
for name, model in models.items():
    model.fit(X_train_preprocessed, np.log1p(y_train))
    y_val_pred_log = model.predict(X_subtest_preprocessed)
    y_val_pred = np.expm1(y_val_pred_log)
    mse_val = metrics.mean_squared_error(y_subtest, y_val_pred)
    results_log[name] = mse_val
    print(f"{name}: MSE walidacja = {mse_val:.4f}")

best_model = models[best_model_name]
print(f"Najlepszy model (skala log.): {best_model_name}")
best_model.fit(X_train_preprocessed, np.log1p(y_train))

y_pred_log = best_model.predict(X_test_preprocessed)
y_pred = np.expm1(y_pred_log)

mse_log = metrics.mean_squared_error(y_test, y_pred)
print("MSE test:", mse_val_best / mse_log)

rmse = np.sqrt(mse_log)
mae  = metrics.mean_absolute_error(y_test, y_pred)
r2   = metrics.r2_score(y_test, y_pred)
mape = metrics.mean_absolute_percentage_error(y_test, y_pred)

print("\n=== Ewaluacja najlepszego modelu (test) ===")
print(f"MSE:   {mse_log:.2f}")
print(f"RMSE:  {rmse:.2f}")
print(f"MAE:   {mae:.2f}")
print(f"R2:    {r2:.4f}")
print(f"MAPE:  {mape*100:.2f}%")

Linear (double split): MSE walidacja = 0.0800
Ridge: MSE walidacja = 0.0801
Lasso: MSE walidacja = 0.1005
kNN: MSE walidacja = 0.0550
ElasticNet: MSE walidacja = 0.1386
RandomForestRegressor: MSE walidacja = 0.0336
Najlepszy model (skala log.): RandomForestRegressor


In [48]:
print (df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 112227 entries, 0 to 118381
Data columns (total 34 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   price                     112227 non-null  float64
 1   price_tax_deductible      112227 non-null  bool   
 2   price_negotiable          112227 non-null  bool   
 3   make                      112227 non-null  object 
 4   mileage_km                112227 non-null  float64
 5   nr_seats                  109296 non-null  float64
 6   nr_doors                  112227 non-null  float64
 7   body_color                101406 non-null  object 
 8   paint_type                88557 non-null   object 
 9   upholstery                87982 non-null   object 
 10  upholstery_color          82242 non-null   object 
 11  power_kw                  112227 non-null  float64
 12  power_hp                  112227 non-null  float64
 13  transmission              112227 non-null  object