In [None]:
import ast
import warnings
from datetime import datetime

import catboost as cb
import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from optuna.integration import CatBoostPruningCallback
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
# Suppress every warning for the rest of the session. This also hides
# deprecation / future-behaviour warnings raised by the cells below.
warnings.filterwarnings('ignore')

In [None]:
# Read the raw train and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv('../data/raw/sales_ads_train.csv'),
    pd.read_csv('../data/raw/sales_ads_test.csv'),
)

In [None]:
# Print the (rows, columns) shape of both loaded frames, one per line.
print(
    f"Train data shape: {train_data.shape}",
    f"Test data shape: {test_data.shape}",
    sep="\n",
)

In [None]:
# Summary statistics of the raw 'Cena' (price) column of the training data.
cena_summary = train_data['Cena'].describe()
print(cena_summary)

In [None]:
# Mark every training row as coming from the original data set, then
# display the frame as the cell output.
train_data.loc[:, 'data_source'] = 'original'
train_data

In [None]:
# Work on an independent (deep) copy so later edits do not touch train_data.
combined_train_data = train_data.copy(deep=True)

In [None]:
# Mark every test row with its own provenance label.
test_data.loc[:, 'data_source'] = 'test'

In [None]:
# Keep the test IDs; they are written to the submission file at the end.
test_ids = test_data['ID'].to_numpy()

# Flag the origin of each row so the stacked frame can be split again later.
for frame, flag in ((combined_train_data, 1), (test_data, 0)):
    frame['is_train'] = flag

# Stack train on top of test with a fresh 0..n-1 index.
all_data = pd.concat([combined_train_data, test_data], ignore_index=True)

In [None]:
# Fixed multiplier applied to prices whose 'Waluta' (currency) is 'EUR'.
KURS_EUR_PLN = 4.5


def przelicz_na_pln(row):
    """Return the row's 'Cena', multiplied by KURS_EUR_PLN for EUR rows.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'Waluta' and 'Cena'.

    Returns
    -------
    The value of row['Cena'], scaled by KURS_EUR_PLN when row['Waluta'] is
    the string 'EUR'; returned unchanged for any other or missing currency.
    """
    waluta = row['Waluta']
    is_eur = pd.notna(waluta) and waluta == 'EUR'
    return row['Cena'] * KURS_EUR_PLN if is_eur else row['Cena']

In [None]:
# Convert every price to PLN row by row; only the two columns the converter
# reads are passed to apply().
all_data['Cena_PLN'] = all_data[['Waluta', 'Cena']].apply(przelicz_na_pln, axis=1)

In [None]:
# Median PLN price over the training rows only.
train_rows = all_data['is_train'].eq(1)
median_price = all_data.loc[train_rows, 'Cena_PLN'].median()

# Every missing Cena_PLN is filled with that median, regardless of is_train.
all_data['Cena_PLN'] = all_data['Cena_PLN'].fillna(value=median_price)

In [None]:
# Model target: log(1 + price in PLN).
all_data['log_Cena'] = all_data['Cena_PLN'].pipe(np.log1p)

In [None]:
# Vehicle age = current calendar year minus production year.
# NOTE(review): the reference year comes from the wall clock, so this feature
# (and everything derived from it) shifts when the notebook is re-run in a
# later year.
current_year = datetime.today().year
all_data['Wiek_pojazdu'] = all_data['Rok_produkcji'].rsub(current_year)

In [None]:
# log(1 + mileage).
all_data['log_Przebieg_km'] = all_data['Przebieg_km'].pipe(np.log1p)

In [None]:

# Engine efficiency: Moc_KM divided by (Pojemnosc_cm3 / 1000).
engine_efficiency = all_data['Moc_KM'] / (all_data['Pojemnosc_cm3'] / 1000)

# A zero denominator produces +/-inf; treat those as missing. The result is
# assigned back explicitly: calling replace(..., inplace=True) on the column
# selection is not guaranteed to modify the frame (it has no effect under
# pandas Copy-on-Write).
engine_efficiency = engine_efficiency.replace([np.inf, -np.inf], np.nan)

# Fill missing values with the median of the finite values.
all_data['Efektywnosc_silnika'] = engine_efficiency.fillna(engine_efficiency.median())

In [None]:

# Average yearly mileage: Przebieg_km divided by vehicle age. An age of 0 is
# replaced by 0.5 so the division stays finite for those rows.
age_for_division = all_data['Wiek_pojazdu'].replace(0, 0.5)
avg_yearly_mileage = all_data['Przebieg_km'] / age_for_division

# Treat any remaining +/-inf as missing. The result is assigned back
# explicitly: replace(..., inplace=True) on the column selection is not
# guaranteed to modify the frame (it has no effect under pandas Copy-on-Write).
avg_yearly_mileage = avg_yearly_mileage.replace([np.inf, -np.inf], np.nan)

# Fill missing values with the median of the finite values.
all_data['Sredni_roczny_przebieg'] = avg_yearly_mileage.fillna(avg_yearly_mileage.median())

In [None]:
def _parse_wyposazenie(value):
    """Turn one raw 'Wyposazenie' cell into a list of equipment entries.

    - a list is returned unchanged (so re-running the cell is harmless);
    - a string starting with '[' is parsed as a Python literal with
      ast.literal_eval, which evaluates literals only and cannot execute
      arbitrary code the way eval() can;
    - anything else (NaN, other strings, unparsable text, a literal that is
      not a list) yields an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value.startswith('['):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return []
        return parsed if isinstance(parsed, list) else []
    return []


if 'Wyposazenie' in all_data.columns:
    # Parse every cell on its own instead of deciding from the type of the
    # first row: a missing first value would otherwise skip parsing entirely
    # and make the len() call below fail on non-list cells.
    all_data['Wyposazenie'] = all_data['Wyposazenie'].apply(_parse_wyposazenie)
    all_data['Liczba_elementow_wyposazenia'] = all_data['Wyposazenie'].apply(len)

    premium_features = [
        'Leather upholstery', 'GPS navigation', 'Heated front seats',
        'Xenon lights', 'LED lights', 'Automatic air conditioning',
        'Panoramic roof', 'Electrically adjustable seats', 'Active cruise control'
    ]
    for feature in premium_features:
        # 1 when any equipment entry contains the feature name as a substring.
        all_data[f'ma_{feature.replace(" ", "_")}'] = all_data['Wyposazenie'].apply(
            lambda items, feature=feature: 1 if any(feature in item for item in items) else 0
        )

In [None]:
# Numeric columns whose gaps are filled with a training-set median.
numeric_cols = [
    'Rok_produkcji', 'Przebieg_km', 'Moc_KM', 'Pojemnosc_cm3',
    'Liczba_drzwi', 'Liczba_elementow_wyposazenia', 'Efektywnosc_silnika',
    'Wiek_pojazdu', 'log_Przebieg_km', 'Sredni_roczny_przebieg',
]

# Medians are taken from original training rows only.
original_train_rows = (all_data['is_train'] == 1) & (all_data['data_source'] == 'original')

for col in numeric_cols:
    # Skip columns that are absent or already complete.
    if col not in all_data.columns or not all_data[col].isnull().any():
        continue
    median_val = all_data.loc[original_train_rows, col].median()
    all_data[col] = all_data[col].fillna(median_val)

In [None]:
# Categorical columns whose gaps become the explicit level 'nieznany'.
categorical_cols = [
    'Stan', 'Marka_pojazdu', 'Model_pojazdu', 'Rodzaj_paliwa',
    'Naped', 'Skrzynia_biegow', 'Typ_nadwozia', 'Kolor', 'Kraj_pochodzenia',
]

for col in categorical_cols:
    if col not in all_data.columns:
        continue
    if all_data[col].isnull().any():
        all_data[col] = all_data[col].fillna('nieznany')

In [None]:
# Mean log-price per 'Marka_pojazdu', computed over all original training rows.
original_train = all_data.loc[(all_data['is_train'] == 1) & (all_data['data_source'] == 'original')]
train_marka_mean_price = original_train.groupby('Marka_pojazdu')['log_Cena'].mean()

# Values without a training mean fall back to the mean of the per-group means.
all_data['Marka_avg_price'] = (
    all_data['Marka_pojazdu']
    .map(train_marka_mean_price)
    .fillna(train_marka_mean_price.mean())
)

In [None]:
# Mean log-price per (Marka_pojazdu, Model_pojazdu) pair, computed over all
# original training rows.
train_model_mean_price = all_data.loc[(all_data['is_train'] == 1) &
                                      (all_data['data_source'] == 'original')].groupby(['Marka_pojazdu', 'Model_pojazdu'])['log_Cena'].mean()

# Attach the pair mean to every row with one left merge instead of a
# Python-level lookup per row. The lookup table has one row per pair, so the
# merge keeps the row count and order of all_data; pairs unseen in training
# come out as NaN.
model_price_lookup = train_model_mean_price.rename('Model_avg_price').reset_index()
all_data['Model_avg_price'] = (
    all_data[['Marka_pojazdu', 'Model_pojazdu']]
    .merge(model_price_lookup, how='left', on=['Marka_pojazdu', 'Model_pojazdu'])['Model_avg_price']
    .to_numpy()
)

# Unseen pairs fall back to the row's Marka_avg_price value.
all_data['Model_avg_price'] = all_data['Model_avg_price'].fillna(all_data['Marka_avg_price'])

In [None]:
# Relative frequency of each 'Kolor' value among the training rows.
color_counts = all_data.loc[all_data['is_train'] == 1, 'Kolor'].value_counts(normalize=True)

# Values never seen in training receive the smallest observed frequency.
all_data['Kolor_freq'] = all_data['Kolor'].map(color_counts).fillna(color_counts.min())

In [None]:
# Interaction feature: vehicle age times log-mileage.
all_data['Wiek_x_Przebieg'] = all_data['Wiek_pojazdu'].mul(all_data['log_Przebieg_km'])

In [None]:
# Interaction feature: Moc_KM times Pojemnosc_cm3, scaled down by 1000.
all_data['Moc_x_Pojemnosc'] = all_data['Moc_KM'].mul(all_data['Pojemnosc_cm3']).div(1000)

In [None]:
if 'Liczba_elementow_wyposazenia' in all_data.columns:
    # The +1 keeps the denominator non-zero when the equipment count is 0.
    equipment_count_plus_one = all_data['Liczba_elementow_wyposazenia'] + 1
    all_data['Wiek_per_Wyposazenie'] = all_data['Wiek_pojazdu'] / equipment_count_plus_one

In [None]:
# 1 where the advert's currency is 'EUR', 0 otherwise (including missing).
all_data['Oryginalnie_EUR'] = all_data['Waluta'].eq('EUR').astype('int64')

In [None]:
# Columns expanded into one indicator column per level.
one_hot_columns = ['Stan', 'Rodzaj_paliwa', 'Naped', 'Skrzynia_biegow', 'Typ_nadwozia']
all_data_encoded = pd.get_dummies(all_data, columns=one_hot_columns)

In [None]:
# Mean-target encoding: each listed categorical column is mapped to the mean
# log-price of its level over all original training rows.
for cat_col in ['Marka_pojazdu', 'Model_pojazdu', 'Kolor', 'Kraj_pochodzenia']:
    if cat_col in all_data.columns:
        target_means = all_data.loc[(all_data['is_train'] == 1) &
                                    (all_data['data_source'] == 'original')].groupby(cat_col)['log_Cena'].mean()
        encoded = all_data[cat_col].map(target_means)
        # Levels unseen in training fall back to the mean of the level means.
        # The filled Series is assigned directly: fillna(..., inplace=True) on
        # a column selection is not guaranteed to modify the frame (it has no
        # effect under pandas Copy-on-Write).
        all_data_encoded[f'{cat_col}_target_enc'] = encoded.fillna(target_means.mean())

In [None]:
# Hand-picked model inputs, grouped by how they were derived.
base_features = [
    # numeric / engineered
    'Wiek_pojazdu', 'log_Przebieg_km', 'Moc_KM', 'Pojemnosc_cm3',
    'Liczba_elementow_wyposazenia', 'Efektywnosc_silnika', 'Sredni_roczny_przebieg',
    'Oryginalnie_EUR',
    # aggregated price / frequency encodings
    'Marka_avg_price', 'Model_avg_price', 'Kolor_freq',
    # equipment flags
    'ma_Leather_upholstery', 'ma_GPS_navigation', 'ma_Heated_front_seats',
    'ma_Xenon_lights', 'ma_LED_lights', 'ma_Automatic_air_conditioning',
    'ma_Panoramic_roof', 'ma_Electrically_adjustable_seats', 'ma_Active_cruise_control',
    # interactions
    'Wiek_x_Przebieg', 'Moc_x_Pojemnosc', 'Wiek_per_Wyposazenie',
    # target encodings
    'Marka_pojazdu_target_enc', 'Model_pojazdu_target_enc',
    'Kolor_target_enc', 'Kraj_pochodzenia_target_enc',
]

# Append every indicator column produced by the one-hot encoding.
one_hot_prefixes = ('Stan_', 'Rodzaj_paliwa_', 'Naped_', 'Skrzynia_biegow_', 'Typ_nadwozia_')
features = list(base_features)
features.extend(col for col in all_data_encoded.columns if col.startswith(one_hot_prefixes))

In [None]:
# Feature matrix for train and test rows together.
X_all = all_data_encoded[features].copy()

# Safety net: report and fill anything still missing after the steps above.
missing = X_all.isnull().sum()
if missing.sum() > 0:
    print(f"Brakujące wartości w danych: {missing[missing > 0]}")

    original_train_rows = (
        (all_data_encoded['is_train'] == 1) & (all_data_encoded['data_source'] == 'original')
    )
    for col in X_all.columns:
        if not X_all[col].isnull().any():
            continue
        if X_all[col].dtype.kind in 'ifc':
            # Numeric column: median of the original training rows.
            median_val = all_data_encoded.loc[original_train_rows, col].median()
            X_all[col] = X_all[col].fillna(median_val)
        else:
            # Non-numeric column: explicit placeholder level.
            X_all[col] = X_all[col].fillna('nieznany')

In [None]:
# Split the stacked feature matrix back into train and test parts.
train_mask = all_data_encoded['is_train'] == 1
test_mask = all_data_encoded['is_train'] == 0

X_train_all = X_all.loc[train_mask]
y_train_all = all_data_encoded.loc[train_mask, 'log_Cena']
X_test = X_all.loc[test_mask]

# Fixed 80/20 hold-out split of the training rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all, y_train_all, random_state=42, test_size=0.2
)

In [None]:
def calculate_rmse(y_true, y_pred):
    """Return the root of mean_squared_error(y_true, y_pred)."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
def objective_catboost(trial):
    """Optuna objective: mean 5-fold RMSE (on the log target) of a CatBoost model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the hyper-parameter suggestions and receives the
        intermediate scores used for pruning.

    Returns
    -------
    float
        Mean of the per-fold validation RMSE values.

    Raises
    ------
    optuna.TrialPruned
        When the pruner stops the trial during the first fold.
    """
    param = {
        "iterations": trial.suggest_int("iterations", 500, 3000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 10.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "verbose": False,
        "random_seed": 42
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    log_rmse_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_train_all)):
        X_train_fold, X_val_fold = X_train_all.iloc[train_idx], X_train_all.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_all.iloc[train_idx], y_train_all.iloc[val_idx]

        # Report intermediate scores from the first fold only: every fold
        # restarts at iteration 0, and a trial accepts one value per step, so
        # later folds would only re-report step numbers already taken.
        pruning_callback = CatBoostPruningCallback(trial, "RMSE") if fold_idx == 0 else None

        model = cb.CatBoostRegressor(**param)
        model.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            callbacks=[pruning_callback] if pruning_callback is not None else None,
            early_stopping_rounds=100,
            verbose=0
        )

        # The callback can only stop training; the pruning decision has to be
        # turned into optuna.TrialPruned explicitly. Without this call a
        # pruned fit would be scored as if it had completed.
        if pruning_callback is not None:
            pruning_callback.check_pruned()

        y_val_pred_log = model.predict(X_val_fold)

        log_rmse = calculate_rmse(y_val_fold, y_val_pred_log)
        log_rmse_scores.append(log_rmse)

    return np.mean(log_rmse_scores)

# Seeded TPE sampler plus a median pruner with 10 warm-up steps.
catboost_sampler = optuna.samplers.TPESampler(seed=42)
catboost_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study_catboost = optuna.create_study(
    direction="minimize",
    sampler=catboost_sampler,
    pruner=catboost_pruner,
)

In [None]:
# Trial budget; the XGBoost and LightGBM studies below reuse the same value.
n_trials = 100
study_catboost.optimize(func=objective_catboost, n_trials=n_trials)

In [None]:
# Best hyper-parameters and score found by the CatBoost study.
best_params_catboost = study_catboost.best_params
print(
    f"Najlepsze parametry CatBoost: {best_params_catboost}",
    f"Najlepszy RMSE CatBoost: {study_catboost.best_value:.6f}",
    sep="\n",
)

In [None]:
def objective_xgboost(trial):
    """Optuna objective: mean 5-fold RMSE (on the log target) of an XGBoost model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the hyper-parameter suggestions.

    Returns
    -------
    float
        Mean of the per-fold validation RMSE values.
    """
    param = dict(
        n_estimators=trial.suggest_int("n_estimators", 500, 2000),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 10),
        subsample=trial.suggest_float("subsample", 0.6, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
        gamma=trial.suggest_float("gamma", 0, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 0, 1.0),
        reg_lambda=trial.suggest_float("reg_lambda", 0, 1.0),
        random_state=42,
    )

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_rmses = []

    for fit_rows, holdout_rows in splitter.split(X_train_all):
        # Fit on four folds, score on the remaining one.
        regressor = xgb.XGBRegressor(**param)
        regressor.fit(X_train_all.iloc[fit_rows], y_train_all.iloc[fit_rows])

        holdout_pred = regressor.predict(X_train_all.iloc[holdout_rows])
        holdout_mse = mean_squared_error(y_train_all.iloc[holdout_rows], holdout_pred)
        fold_rmses.append(np.sqrt(holdout_mse))

    return np.mean(fold_rmses)

# Seed the sampler, as the CatBoost study does, so that a fresh run explores
# the same sequence of hyper-parameters.
study_xgboost = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgboost.optimize(objective_xgboost, n_trials=n_trials)

In [None]:
# Best hyper-parameters and score found by the XGBoost study.
best_params_xgboost = study_xgboost.best_params
print(
    f"Najlepsze parametry XGBoost: {best_params_xgboost}",
    f"Najlepszy RMSE XGBoost: {study_xgboost.best_value:.6f}",
    sep="\n",
)

In [None]:
def objective_lightgbm(trial):
    """Optuna objective: mean 5-fold RMSE (on the log target) of a LightGBM model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the hyper-parameter suggestions.

    Returns
    -------
    float
        Mean of the per-fold validation RMSE values.
    """
    param = dict(
        n_estimators=trial.suggest_int("n_estimators", 500, 2000),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        num_leaves=trial.suggest_int("num_leaves", 20, 150),
        max_depth=trial.suggest_int("max_depth", 3, 12),
        min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 5, 100),
        max_bin=trial.suggest_int("max_bin", 32, 512),
        bagging_fraction=trial.suggest_float("bagging_fraction", 0.6, 1.0),
        bagging_freq=trial.suggest_int("bagging_freq", 1, 10),
        feature_fraction=trial.suggest_float("feature_fraction", 0.6, 1.0),
        lambda_l1=trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        lambda_l2=trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        random_state=42,
    )

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_rmses = []

    for fit_rows, holdout_rows in splitter.split(X_train_all):
        # Fit on four folds, score on the remaining one.
        regressor = lgb.LGBMRegressor(**param)
        regressor.fit(X_train_all.iloc[fit_rows], y_train_all.iloc[fit_rows])

        holdout_pred = regressor.predict(X_train_all.iloc[holdout_rows])
        holdout_mse = mean_squared_error(y_train_all.iloc[holdout_rows], holdout_pred)
        fold_rmses.append(np.sqrt(holdout_mse))

    return np.mean(fold_rmses)

# Seed the sampler, as the CatBoost study does, so that a fresh run explores
# the same sequence of hyper-parameters.
study_lightgbm = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lightgbm.optimize(objective_lightgbm, n_trials=n_trials)

In [None]:
# Best hyper-parameters and score found by the LightGBM study.
best_params_lightgbm = study_lightgbm.best_params
print(
    f"Najlepsze parametry LightGBM: {best_params_lightgbm}",
    f"Najlepszy RMSE LightGBM: {study_lightgbm.best_value:.6f}",
    sep="\n",
)

In [None]:
# study.best_params holds only the values suggested through the trial, so the
# fixed settings used during tuning have to be added back by hand.
final_catboost_params = best_params_catboost.copy()
final_catboost_params['verbose'] = 0
# Same seed as in objective_catboost, matching the random_state=42 passed to
# the final XGBoost and LightGBM models.
final_catboost_params['random_seed'] = 42

In [None]:
# Refit each model on all training rows with its tuned hyper-parameters.
final_catboost_model = cb.CatBoostRegressor(**final_catboost_params)
final_catboost_model.fit(X_train_all, y_train_all)

final_xgboost_model = xgb.XGBRegressor(random_state=42, **best_params_xgboost)
final_xgboost_model.fit(X_train_all, y_train_all)

final_lightgbm_model = lgb.LGBMRegressor(random_state=42, **best_params_lightgbm)
final_lightgbm_model.fit(X_train_all, y_train_all)

In [None]:
# Out-of-fold predictions do not depend on the trial's weights, so they are
# computed once and reused by every trial.
_OOF_CACHE = {}


def _out_of_fold_predictions():
    """Return out-of-fold predictions of the three tuned models.

    For each of 5 shuffled folds, a fresh CatBoost, XGBoost and LightGBM model
    is fitted on the other four folds and predicts the held-out one.

    Returns
    -------
    (oof_preds, val_folds)
        oof_preds maps 'catboost' / 'xgboost' / 'lightgbm' to an array of
        predictions aligned with the rows of X_train_all; val_folds is the
        list of held-out positional index arrays, one per fold.
    """
    if not _OOF_CACHE:
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        n_rows = len(X_train_all)
        oof_preds = {name: np.zeros(n_rows) for name in ("catboost", "xgboost", "lightgbm")}
        val_folds = []

        for train_idx, val_idx in kf.split(X_train_all):
            X_fit, y_fit = X_train_all.iloc[train_idx], y_train_all.iloc[train_idx]
            X_held_out = X_train_all.iloc[val_idx]

            fold_models = {
                "catboost": cb.CatBoostRegressor(**final_catboost_params),
                "xgboost": xgb.XGBRegressor(**best_params_xgboost, random_state=42),
                "lightgbm": lgb.LGBMRegressor(**best_params_lightgbm, random_state=42),
            }
            for name, model in fold_models.items():
                model.fit(X_fit, y_fit)
                oof_preds[name][val_idx] = model.predict(X_held_out)
            val_folds.append(val_idx)

        _OOF_CACHE["oof_preds"] = oof_preds
        _OOF_CACHE["val_folds"] = val_folds

    return _OOF_CACHE["oof_preds"], _OOF_CACHE["val_folds"]


def objective_ensemble_weights(trial):
    """Optuna objective: mean per-fold RMSE of a weighted blend of the three models.

    The blend is scored on out-of-fold predictions. Scoring the models that
    were already fitted on all of X_train_all would measure training error
    and push the weights toward whichever model memorised the data best.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the three raw weights, each in [0.1, 0.7]; they are
        normalised to sum to 1 before blending.

    Returns
    -------
    float
        Mean of the per-fold RMSE values of the blended predictions.
    """
    w1 = trial.suggest_float("catboost_weight", 0.1, 0.7)
    w2 = trial.suggest_float("xgboost_weight", 0.1, 0.7)
    w3 = trial.suggest_float("lightgbm_weight", 0.1, 0.7)

    sum_weights = w1 + w2 + w3
    w1 /= sum_weights
    w2 /= sum_weights
    w3 /= sum_weights

    oof_preds, val_folds = _out_of_fold_predictions()
    y_true = y_train_all.to_numpy()

    y_pred_ensemble = (
        w1 * oof_preds["catboost"]
        + w2 * oof_preds["xgboost"]
        + w3 * oof_preds["lightgbm"]
    )

    rmse_scores = [
        np.sqrt(mean_squared_error(y_true[val_idx], y_pred_ensemble[val_idx]))
        for val_idx in val_folds
    ]

    return np.mean(rmse_scores)

# Seed the sampler, as the CatBoost study does, so that a fresh run explores
# the same sequence of weights.
study_ensemble = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_ensemble.optimize(objective_ensemble_weights, n_trials=200)

In [None]:
# Raw (not yet normalised) weights of the best ensemble trial.
best_weights = study_ensemble.best_params
catboost_weight, xgboost_weight, lightgbm_weight = (
    best_weights["catboost_weight"],
    best_weights["xgboost_weight"],
    best_weights["lightgbm_weight"],
)

In [None]:
# Rescale the three weights so that they add up to 1.
sum_weights = sum((catboost_weight, xgboost_weight, lightgbm_weight))
catboost_weight = catboost_weight / sum_weights
xgboost_weight = xgboost_weight / sum_weights
lightgbm_weight = lightgbm_weight / sum_weights

In [None]:
# Report the normalised weights and the best ensemble score.
print(
    f"Wagi ensembla: CatBoost={catboost_weight:.3f}, XGBoost={xgboost_weight:.3f}, LightGBM={lightgbm_weight:.3f}",
    f"Najlepszy RMSE ensembla: {study_ensemble.best_value:.6f}",
    sep="\n",
)

In [None]:
# Predictions of each final model for the test rows (on the log1p scale of
# the PLN price, which is what the models were fitted on).
y_pred_catboost = final_catboost_model.predict(X_test)
y_pred_xgboost = final_xgboost_model.predict(X_test)
y_pred_lightgbm = final_lightgbm_model.predict(X_test)

# Weighted blend on the log scale, then inverse of log1p.
y_pred_ensemble_log = (
    catboost_weight * y_pred_catboost
    + xgboost_weight * y_pred_xgboost
    + lightgbm_weight * y_pred_lightgbm
)
y_pred_ensemble = np.expm1(y_pred_ensemble_log)

# Boolean mask of test rows whose currency is 'EUR'.
# NOTE(review): this mask is not used anywhere in the visible cells, so the
# submitted prices stay in PLN for every row. Confirm whether EUR rows were
# meant to be converted back with KURS_EUR_PLN.
is_test_row = all_data['is_train'] == 0
test_orig_eur = all_data.loc[is_test_row, 'Waluta'].eq('EUR')

In [None]:
# One row per test ID with the blended price prediction.
submission = pd.DataFrame(
    {'ID': test_ids, 'Cena': y_pred_ensemble},
    columns=['ID', 'Cena'],
)

# Write the submission file without the DataFrame index.
submission_path = 'submit.csv'
submission.to_csv(submission_path, index=False)