In [None]:
from utils.preprocessing import BasicPreprocessPipeline
from scipy.stats import iqr
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from scipy.stats import norm

In [None]:
# Let wide frames render up to 50 columns before pandas truncates them.
pd.options.display.max_columns = 50

In [None]:
# Run both raw CSV files through the shared preprocessing pipeline (test first, then train).
raw_test = pd.read_csv('data/sales_ads_test.csv')
df_test = BasicPreprocessPipeline.run(raw_test)
raw_train = pd.read_csv('data/sales_ads_train.csv')
df_train = BasicPreprocessPipeline.run(raw_train)

# Stack train above test under a fresh 0..N-1 index; reset_index() then also
# stores that position as an 'index' column.
df_data = pd.concat((df_train, df_test), ignore_index=True, copy=True)
df_data = df_data.reset_index()
len(df_test), len(df_train)

## Price investigation
Analysing the distribution of price.

In [None]:
# Distribution of the price
sns.displot(df_train.Cena)
# plt.show() renders the currently open figures; it does not take a figure/grid
# as an argument (its only parameter is `block`).
plt.show()
print(f"Median : {df_train.Cena.quantile(0.5)}")
print(f"Quantile 90% : {df_train.Cena.quantile(0.9)}")
print(f"Top 10 : {df_train.Cena.sort_values(ascending=False).head(10).to_numpy()}")

In [None]:
# The two most expensive listings, i.e. the most severe target outliers.
most_expensive = df_train.sort_values('Cena', ascending=False).head(2)
display(most_expensive)

# Mean price per brand, used to put the outliers into perspective.
mean_price_per_brand = df_train.groupby('Marka_pojazdu').agg({'Cena': 'mean'})
print(f"Average McLaren price : {mean_price_per_brand.loc['McLaren', 'Cena']}")

In [None]:
# Such BMW seems like an anomaly: remove the 6,999,000 listing from both frames.
for frame in (df_train, df_data):
    anomaly_rows = frame.index[frame.Cena == 6999000.0]
    frame.drop(index=anomaly_rows, inplace=True)

In [None]:
# Standardised log-price: z-scores of log(Cena).
x = np.log(df_train.Cena.to_numpy())
x = (x - x.mean()) / (x.std())

# Samples further than three standard deviations from the mean.
outlier_mask = (x < -3) | (x > 3)
outliers = x[outlier_mask]
outlier_prices = df_train.Cena[outlier_mask]

# Empirical density with the standard normal pdf drawn on top for comparison.
sns.displot(x, kde=True, stat="density", bins=60)
normal = np.linspace(min(x), max(x), 100)
y = norm.pdf(normal, 0, 1)
plt.plot(normal, y, 'r-', label='Normal Distribution')

print(f"Amount of outliers 3-sigma rule : {len(outliers)}")
print(f"Outliers extremes : {outlier_prices.min()}, {outlier_prices.max()}")
print(f"Outliers ~3 sigma range : {df_train.Cena.quantile(0.9975)} {df_train.Cena.quantile(0.0025)}")
print(f"Outliers ~4 sigma range : {df_train.Cena.quantile(0.99975)} {df_train.Cena.quantile(0.00025)}")
plt.legend()
plt.show()

In [None]:
# Log-price target. Each frame derives it from its own `Cena` column, so the
# result does not depend on df_train and df_data sharing the same index labels
# (copying df_train's column into df_data relied on label alignment).
df_train['Cena_log'] = np.log(df_train.Cena)
# Rows without a price (NaN in `Cena`) stay NaN.
df_data['Cena_log'] = np.log(df_data.Cena)

Notes: the price distribution is heavily right-skewed — most listings are cheap and a long tail reaches very high prices. There are a few extreme target outliers,
which will be hard to tackle during prediction (probably premium car brands).
Since the top 2 cars are 3 times more expensive than the next most expensive cars, they should
be excluded. They are too extreme an outlier; without augmentation it would be extremely hard
to take them into account. <br/><br/>
Therefore, a log transform can be applied, as the data show patterns of exponential growth typical for sales data. After the log
transformation it can be seen that the prices are in fact close to log-normally distributed. Furthermore, according to the
3-sigma rule we can identify values which would be hard targets for the model, as they fall into the extreme
regions of the normal distribution. This gives us a total of 219 such samples.

## Horsepower investigation
Analysing the correlation between horsepower and other features. Tracking anomalies
based on plots and logical real-world assumptions (a small engine can't have high power, etc.)

In [None]:
sns.displot(df_train.Moc_KM)
# plt.show() renders the open figures; it does not accept a figure/grid argument.
plt.show()
# Labels state the number of values actually printed (head(5)).
print(f"Top 5 horse power : {df_train.Moc_KM.sort_values(ascending=False).head(5).to_list()}")
print(f"Top 5 horse power (Test) : {df_test.Moc_KM.sort_values(ascending=False).head(5).to_list()}")
print(f"Top 5 horse power (Brands) : {df_train.sort_values('Moc_KM', ascending=False).Marka_pojazdu.head(5).to_list()}")

In [None]:
# Strongest cars in each split, for manual inspection.
display(df_test.sort_values('Moc_KM', ascending=False).head(5))
display(df_train.sort_values('Moc_KM', ascending=False).head(5))

# Tukey-style fences for the test-set horsepower (IQR computed once).
power_iqr = iqr(df_test.Moc_KM.values, nan_policy='omit')
print(f"IQR based range {df_test.Moc_KM.quantile(0.25) - 1.5 * power_iqr} : {df_test.Moc_KM.quantile(0.75) + 1.5 * power_iqr}")
sns.lmplot(x="Moc_KM", y="Cena_log", data=df_train, ci=None, line_kws={'color': 'red'})

# Distribution with everything above 800 KM clipped to 800. The clipped values
# live in a standalone Series, so df_train is never modified by this cell.
moc_km_capped = df_train.Moc_KM.clip(upper=800).rename('Moc_KM_capped')
sns.displot(moc_km_capped)
# plt.show() renders the open figures; it does not accept a figure/grid argument.
plt.show()

There are a Suzuki and a Passat with extreme horsepower values, which suggests some potential anomalies.
To fix them, the approach would be
capping the value based on the standard deviation and mean of `Moc_KM` for each `Marka_pojazdu`. This could
help, as it is also applicable to the testing set.
Furthermore, engine volume (`Pojemnosc_cm3`) in the real world is highly correlated with power. Therefore, it could be used for
detecting outliers. Here a simple linear regression line should reveal some anomalies.
<br/><br/>
Anyway, it can be seen that a correlation between price and power exists, but it is messy.

In [None]:
# Brand-size thresholds and the width of the sigma band used for capping.
value_threshold = 100
value_second_threshold = 10
sigma_scalar = 5
n_rows = len(df_data)
print(f"Value threshold : {value_threshold} - ({value_threshold / n_rows * 100:.5f}%)")
print(f"Second value threshold : {value_second_threshold} - ({value_second_threshold / n_rows * 100:.5f}%)")

# Per-brand horsepower statistics (mean, std and the three quartiles), strongest brands first.
funcs = ('mean', 'std', lambda x: x.quantile(0.25), lambda x: x.quantile(0.5), lambda x: x.quantile(0.75))
brand_stats = df_data.groupby('Marka_pojazdu').agg({'Moc_KM': funcs})
power_table = brand_stats.sort_values(('Moc_KM', 'mean'), ascending=False)
power_table.columns = ('mean', 'std', 'Q1', 'Q2', 'Q3')

# Listing counts per brand, ordered like power_table so boolean masks line up.
value_counts = df_data.Marka_pojazdu.value_counts()[power_table.index]
common_brands = power_table[value_counts >= value_threshold]

# Mean horsepower of every brand with at least `value_threshold` listings.
plt.figure(figsize=(14, 6))
plt.bar(common_brands.index, common_brands['mean'])
plt.title('Power based on car brand (Medium Tier Segment)')
plt.show()

def cap_power_per_brand(frame, ceiling=None, floor=None):
    """Return a copy of ``frame`` with ``Moc_KM`` clipped to per-brand limits.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the columns ``Moc_KM`` and ``Marka_pojazdu``.
    ceiling, floor : pd.Series or None
        Upper / lower horsepower limit indexed by brand name. ``None`` disables
        that side of the clipping.

    Rows whose brand is absent from a limit Series, and rows with missing
    horsepower, are left unchanged because comparisons against NaN are False.
    """
    capped = frame.copy()
    brands = capped.Marka_pojazdu.astype(object)
    power = capped.Moc_KM.astype(float)
    capped_power = power
    if ceiling is not None:
        upper = brands.map(ceiling).astype(float)  # NaN for brands without a limit
        capped_power = capped_power.mask(power > upper, upper)
    if floor is not None:
        lower = brands.map(floor).astype(float)
        capped_power = capped_power.mask(power < lower, lower)
    capped['Moc_KM'] = capped_power
    return capped


# Frequent brands: cap at mean + sigma_scalar * std of the brand.
ceiling = common_brands.loc[:, 'mean'] + common_brands.loc[:, 'std'] * sigma_scalar
df_data_capped = cap_power_per_brand(df_data.loc[:, ['Moc_KM', 'Marka_pojazdu', 'Cena_log']], ceiling=ceiling)

# Less frequent brands (between the two count thresholds): clip to median +/- 1.5 * IQR.
# The two brand groups are disjoint, so the second pass cannot touch rows capped by the first.
uncommon_brands = power_table[(value_counts < value_threshold) & (value_counts >= value_second_threshold)]
IQR = uncommon_brands.loc[:, 'Q3'] - uncommon_brands.loc[:, 'Q1']
ceiling = uncommon_brands.loc[:, 'Q2'] + 1.5 * IQR
floor = uncommon_brands.loc[:, 'Q2'] - 1.5 * IQR
df_data_capped = cap_power_per_brand(df_data_capped, ceiling=ceiling, floor=floor)
sns.lmplot(x="Moc_KM", y="Cena_log", data=df_data_capped, ci=None, line_kws={'color': 'red'})

# Recompute the per-brand mean/std on the capped horsepower and redraw the bar chart.
capped_stats = df_data_capped.groupby('Marka_pojazdu').agg({'Moc_KM': ('mean', 'std')})
power_table = capped_stats.sort_values(('Moc_KM', 'mean'), ascending=False)
value_counts = df_data_capped.Marka_pojazdu.value_counts()[power_table.index]
common_brands = power_table[value_counts >= value_threshold]

plt.figure(figsize=(14, 6))
plt.bar(common_brands.index, common_brands[('Moc_KM', 'mean')])
plt.title('Power based on car brand (Medium Tier Segment)')
plt.show()

# Store the capped horsepower next to the raw one and count how many known values changed.
df_data.loc[:, 'Moc_KM_brand_capped'] = df_data_capped.Moc_KM.to_numpy()
power_known = ~df_data.Moc_KM.isna()
power_changed = df_data.Moc_KM != df_data.Moc_KM_brand_capped
total_capped = power_changed[power_known].sum()
print(f"Capped total of {total_capped} samples, which is {total_capped / len(df_data) * 100:.4f}%")

Here values were clipped based on the horsepower statistics of each brand. Brands with a large number of
samples were capped with a sigma rule (mean plus `sigma_scalar` standard deviations) to take the variance of the data
into account, whereas brands with few samples were clipped to an IQR-based range around the median, because their standard deviation is unreliable.

In [None]:
# Manual checking for potential problems and ideas: the 15 strongest cars after capping.
# (A preceding sort whose result was discarded has been removed.)
display(df_data_capped.sort_values('Moc_KM', ascending=False).head(15))

Additionally, the engine volume can be taken into account. Notice that although high-volume engines can have
low power due to a low rpm limit, inefficient combustion and so on, it is very hard to get high power out of
small-volume engines. Therefore, the outliers below the regression line should be taken into account.<br/><br/>
Keep in mind that the volume may also have outliers! Furthermore, in this case horsepower outliers were handled
per category (with the logical assumption that the category has a strong influence on the horsepower). Thus,
it is more probable that the engine volume contains outliers rather than the horsepower.

In [None]:
# Engine volume against raw and brand-capped horsepower, one figure each.
for power_column, plot_title in (
    ("Moc_KM", "Correlation horsepower and engine volume (Train + Test)"),
    ("Moc_KM_brand_capped", "Correlation horsepower:fixed and engine volume (Train + Test)"),
):
    sns.lmplot(x=power_column, y="Pojemnosc_cm3", data=df_data, ci=None, line_kws={'color': 'red'})
    plt.title(plot_title)
    plt.show()

The funny horizontal line patterns on the plot appear because many engines share the same fixed volume but can have
various power outputs depending on compression, turbo etc.

In [None]:
error_threshold = -2_001  # Only for significant outliers
volume_threshold = 1_001  # Only for engines smaller than 1 liter (assumption based on car domain knowledge)
volume_min = 1_000
decay_scalar = 0.5  # How much to diminish the total outliers

# Fit volume ~ bias + slope * power on the rows where both values are known.
mask = ~(df_data.Pojemnosc_cm3.isna() | df_data.Moc_KM_brand_capped.isna())
power_known = df_data.Moc_KM_brand_capped[mask].to_numpy()
volume_known = df_data.Pojemnosc_cm3[mask].to_numpy()
slope, bias = np.polyfit(power_known, volume_known, 1)

# Residuals: strongly negative means the engine is far smaller than its power suggests.
errors = volume_known - (bias + slope * power_known)
# `args` are POSITIONS inside the masked arrays; `indices` are the matching df_data LABELS.
args = np.argwhere((volume_known < volume_threshold) & (errors < error_threshold))
indices = df_data.Moc_KM_brand_capped[mask].index[args.flatten()]

df_data.loc[:, 'Pojemnosc_cm3_capped'] = df_data.Pojemnosc_cm3.to_numpy()
# Move each flagged volume halfway towards the regression estimate. The power
# is looked up by label; indexing the masked NumPy array with `indices`
# would treat labels as positions and read the wrong rows.
predicted_volume = bias + slope * df_data.loc[indices, 'Moc_KM_brand_capped']
df_data.loc[indices, 'Pojemnosc_cm3_capped'] = (df_data.loc[indices, 'Pojemnosc_cm3_capped'] + predicted_volume) / 2

sns.lmplot(x="Moc_KM_brand_capped", y="Pojemnosc_cm3_capped", data=df_data, ci=None, line_kws={'color': 'red'})
plt.title("Correlation horsepower:fixed and engine volume (Train + Test)")
plt.show()
df_data.loc[indices]

In [None]:
# Engine volume against log-price on the training set, with the fitted line in red.
sns.lmplot(data=df_train, x="Pojemnosc_cm3", y="Cena_log", ci=None, line_kws=dict(color='red'))

There are only a few extreme outliers in this small engine volume region. Nonetheless, for a future, more robust
model one fact should be remembered: electric cars like Tesla should have their engine volume deleted or
be marked as electric.

In [None]:
# Largest mileage values. Labels state the number of values printed, and the
# "(Test)" line reads df_test (it previously repeated the combined frame).
print(f"Top 5 mileage : {df_data.Przebieg_km.sort_values(ascending=False).head(5).to_list()}")
print(f"Top 5 mileage (Test): {df_test.Przebieg_km.sort_values(ascending=False).head(5).to_list()}")
print(f"Quantile 90% mileage : {df_data.Przebieg_km.quantile(0.9)}")

# IQR and quartiles are computed once and reused for every fence below.
mileage_iqr = iqr(df_data.Przebieg_km.to_numpy(), nan_policy='omit')
mileage_q1 = df_data.Przebieg_km.quantile(0.25)
mileage_q3 = df_data.Przebieg_km.quantile(0.75)
print(f"IQR based range {mileage_q1 - 1.5 * mileage_iqr} : {mileage_q3 + 1.5 * mileage_iqr}")
max_value = mileage_q3 + 1.5 * mileage_iqr        # upper fence used for capping
super_max_value = mileage_q3 + 5.0 * mileage_iqr  # much wider fence used for clearing
print(f"Amount of IQR outliers : {df_data[df_data.Przebieg_km > max_value].shape[0]}")
sns.lmplot(x="Przebieg_km", y="Cena", data=df_data, ci=None)  # Ci will look ridiculous here

# `_capped`: values above the 1.5*IQR fence are clipped to the fence.
df_data.loc[:, 'Przebieg_km_capped'] = df_data.Przebieg_km.to_numpy()
df_data.loc[df_data.Przebieg_km > max_value, 'Przebieg_km_capped'] = max_value
# `_cleared`: values above the 5*IQR fence are replaced by NaN.
df_data.loc[:, 'Przebieg_km_cleared'] = df_data.Przebieg_km.to_numpy()
df_data.loc[df_data.Przebieg_km > super_max_value, 'Przebieg_km_cleared'] = np.nan
sns.lmplot(x="Przebieg_km_capped", y="Cena", data=df_data, ci=None, line_kws={'color': 'red'})
sns.lmplot(x="Przebieg_km_capped", y="Cena_log", data=df_data, ci=None, line_kws={'color': 'red'})
print(f"Duplicates after clipping : {(df_data.Przebieg_km_capped == max_value).sum()}")
plt.show()

In [None]:
# Histogram of the IQR-capped mileage (train + test) in 100 bins.
sns.histplot(data=df_data, x="Przebieg_km_capped", bins=100)
plt.show()

Here, on the other hand, cutting the mileage based on the IQR might work. It can be seen that the outliers are so large
that it is hard to see the correlation. After capping the values the correlation is much clearer, and it can be seen
that for larger values of mileage it is decent.<br/><br/>
Nonetheless, it will not help with small values.

In [None]:
# The plotted frame is the combined train + test data, so the title says so.
sns.displot(df_data.Rok_produkcji)
plt.title("Year of production distribution (Train + Test)")
plt.show()
# Labels state the number of rows actually shown (head(5)).
print(f"Oldest 5 production years : {df_data.Rok_produkcji.sort_values().head(5).to_list()}")
print("Oldest 5 cars : ")
display(df_data.sort_values('Rok_produkcji').head(5))

sns.lmplot(x="Rok_produkcji", y="Cena_log", data=df_train, ci=None, line_kws={'color': 'red'})
sns.lmplot(x="Rok_produkcji", y="Przebieg_km_cleared", data=df_data, ci=None, line_kws={'color': 'red'})

## Imputation for numerical features

In [None]:
# Rows with missing horsepower but a known brand, and the subset that also lacks the model.
power_missing = df_data.Moc_KM.isna()
brand_known = df_data.Marka_pojazdu.notna()
power_brand_sum = (power_missing & brand_known).sum()
power_model_brand_sum = (power_missing & df_data.Model_pojazdu.isna() & brand_known).sum()
power_brand_sum, power_model_brand_sum

The low ratio above indicates that most of the missing horsepower values could be filled in based on the brand.

In [None]:
np.random.seed(100)  # fixed seed so the random jitter below is reproducible

# Per-brand mean/std of the capped horsepower, and how many listings each brand has.
power = df_data.groupby('Marka_pojazdu').agg({'Moc_KM_brand_capped': ('mean', 'std')})
counts = df_data.Marka_pojazdu.value_counts()
reliable_references = counts.index[counts > 30]


def _sample_brand_power(brand):
    """Brand mean shifted by a uniform draw from [-0.2, 0.2) times the brand std."""
    brand_mean = power.loc[brand, ('Moc_KM_brand_capped', 'mean')]
    jitter = np.random.uniform(-0.2, 0.2)
    return brand_mean + jitter * power.loc[brand, ('Moc_KM_brand_capped', 'std')]


# Rows that lack horsepower but have a brand to borrow statistics from.
mask = df_data.Moc_KM.isna() & df_data.Marka_pojazdu.notna()
display(df_data.loc[mask, ('Moc_KM_brand_capped', 'Marka_pojazdu')].head(5))

# Only impute for brands with more than 30 listings.
mask = df_data.Marka_pojazdu.isin(reliable_references) & mask
df_data.loc[:, 'Moc_KM_filled'] = df_data.Moc_KM_brand_capped.to_numpy()
df_data.loc[mask, 'Moc_KM_filled'] = df_data.loc[mask, 'Marka_pojazdu'].apply(_sample_brand_power)
df_data.loc[mask, ('Moc_KM_filled', 'Marka_pojazdu')].head(5)

In [None]:
# Too extreme outliers make less reliable validation: drop listings priced above 6,000,000.
extreme_price_rows = df_data[df_data.Cena > 6_000_000].index
df_data.drop(index=extreme_price_rows, inplace=True)

In [None]:
# List the columns currently present in the combined frame.
df_data.columns

In [None]:
from tqdm import tqdm
from typing import Any, List

##### For category wrapping or omitting
def cat_wrap(df, column, threshold):
    """Replace rare categories of ``column`` with the label ``'Other'``.

    A category is rare when it occurs fewer than ``threshold`` times.
    ``df`` is modified in place and also returned.
    """
    frequencies = df[column].value_counts()
    rare_values = frequencies.index[(frequencies < threshold).to_numpy()]
    df.loc[df[column].isin(rare_values), column] = 'Other'
    return df

def cat_select(df, column, threshold):
    """Blank out (set to NaN) rare categories of ``column``.

    A category is rare when it occurs fewer than ``threshold`` times.
    ``df`` is modified in place and also returned.
    """
    frequencies = df[column].value_counts()
    rare_values = frequencies.index[(frequencies < threshold).to_numpy()]
    df.loc[df[column].isin(rare_values), column] = np.nan
    return df

def _encode_categories(df, df_unknowns, categories, min_count, stats, features):
    """Add per-category ``Cena`` statistics as new columns for each of ``categories``.

    Categories seen fewer than ``min_count`` times in ``df`` are blanked first, so
    they get no statistics. New column names are appended to ``features``.
    """
    for cat in tqdm(categories):
        df = cat_select(df, cat, min_count)
        table_of_cat_stats = df.groupby(cat).agg({'Cena': stats})
        engineered_features = []
        for stat in stats:
            feature_name = f'{cat}_{stat}'
            stat_series = table_of_cat_stats[('Cena', stat)]
            df.loc[:, feature_name] = df.loc[:, cat].map(stat_series)
            # Unknown frames reuse the statistics learned from `df`; categories
            # that were not seen (or were blanked) map to NaN.
            if df_unknowns is None:
                pass
            elif isinstance(df_unknowns, pd.DataFrame):
                df_unknowns.loc[:, feature_name] = df_unknowns.loc[:, cat].map(stat_series)
            elif isinstance(df_unknowns, list):
                for dfu in df_unknowns:
                    dfu.loc[:, feature_name] = dfu.loc[:, cat].map(stat_series)
            else:
                raise ValueError('Expected unknown data frames to be list or a single data frame.')
            engineered_features.append(feature_name)
        features += engineered_features
    return df


def target_encode(df: pd.DataFrame, df_unknowns: Any, features: List):
    """Target-encode the categorical columns with statistics of ``Cena``.

    The column lists are read from the module-level ``cat_features_high_card``
    and ``cat_features_low_card``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with known prices (needs a ``Cena`` column). Statistics are
        computed on it, and it is modified in place.
    df_unknowns : pd.DataFrame, list of pd.DataFrame, or None
        Frame(s) that receive the same encoded columns without contributing to
        the statistics. Modified in place.
    features : list
        Feature-name list; extended in place with the new column names.

    Returns
    -------
    tuple
        ``(df, df_unknowns)`` with the encoded columns added.

    Raises
    ------
    ValueError
        If ``df_unknowns`` is neither None, a DataFrame nor a list.
    """
    ##### Target Encoding for high cardinality
    df = _encode_categories(df, df_unknowns, cat_features_high_card, 10,
                            ['median', 'max', 'min', 'nunique'], features)
    #### Either One Hot encoding or different Target encoding
    df = _encode_categories(df, df_unknowns, cat_features_low_card, 50,
                            ['median', 'mean', 'std'], features)
    return df, df_unknowns

# Selected features
cat_features_high_card = ['Marka_pojazdu', 'Model_pojazdu', 'Generacja_pojazdu']
cat_features_low_card = ['Naped', 'Skrzynia_biegow', 'Typ_nadwozia', 'Kolor', 'Liczba_drzwi',
                         'Kraj_pochodzenia']
num_features=['Przebieg_km_cleared', 'Moc_KM_brand_capped', 'Pojemnosc_cm3_capped', 'Rok_produkcji']
features = list(num_features)  # independent copy, so extending `features` never alters num_features

# Rows of the combined frame without a price are the test rows; verify they still match df_test.
assert np.all(df_test.ID.to_numpy() == df_data.loc[df_data.Cena.isna(), 'ID'].to_numpy()), "General data frame is incorrectly merged."
test_rows = df_data.Cena.isna()
known_brands = df_train.Marka_pojazdu.unique()

# Fill the missing brands with their equivalents.
# The submission features are built from df_data (see df_submition below), so
# every brand fix is applied to the test rows of df_data as well as to df_test;
# changing df_test alone would never reach the model.
print("Maybach is chainged into Rolls-Royce for testing as it does not appear in training.")
df_test.loc[df_test.Marka_pojazdu == 'Maybach', 'Marka_pojazdu'] = 'Rolls-Royce'
df_data.loc[test_rows & (df_data.Marka_pojazdu == 'Maybach'), 'Marka_pojazdu'] = 'Rolls-Royce'

print("Brands, which are in test, but not train. They will be set to Nan.")
unseen_in_test = ~np.isin(df_test.Marka_pojazdu, known_brands)
print(df_test.loc[unseen_in_test, 'Marka_pojazdu'].to_numpy())
df_test.loc[unseen_in_test, 'Marka_pojazdu'] = np.nan
df_data.loc[test_rows & ~df_data.Marka_pojazdu.isin(known_brands), 'Marka_pojazdu'] = np.nan

# Priced rows are used for training/validation, unpriced rows for the submission.
df = df_data.loc[~df_data.Cena.isna()].copy(deep=True).reset_index()
df_submition = df_data.loc[df_data.Cena.isna()].copy(deep=True).reset_index()

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from math import sqrt
from copy import deepcopy

##### Split the data into K folds, but preserve price distribution
num_folds = 6
target_column='Cena'
df.sort_values(by=target_column, inplace=True, kind='stable')
# Rows are price-sorted, so dealing them out round-robin (position modulo
# num_folds) gives every fold a similar price distribution. Computed in one
# vectorised step instead of rewriting the array while iterating over it.
folds_idx = np.arange(len(df)) % num_folds
folds = []
for i in range(num_folds):
    test_mask = folds_idx == i
    folds.append((np.flatnonzero(~test_mask), np.flatnonzero(test_mask)))
y = df[target_column]

# XGBoost parameters
params = {
    'max_depth': 10,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8,  # Equivalent to rsm
    'min_child_weight': 10,     # Smooth the groups, don't allow luxury cars to overfit
    'random_state': 100,
    'reg_lambda': 10,         # L2 regularization (similar to l2_leaf_reg)
    'objective': 'reg:squarederror',  # 'pseudohubererror'
    'eval_metric': 'rmse',
}

fold_scores = []
hardest_examples = []
best_model = None
# Infinity guarantees the first fold always becomes the initial best model,
# whatever the scale of the target.
best_rmse = float('inf')
safe_num_of_estimators = 0
for i, (train_idx, test_idx) in enumerate(folds):
    print(f"Fold {i+1}/{num_folds}")

    X_train, y_train = df.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = df.iloc[test_idx], y.iloc[test_idx]

    # Encode the categories
    # and don't expose data information from validation to train (No data leakage)
    all_features = deepcopy(features)
    X_train, X_test = target_encode(X_train.copy(deep=True), X_test.copy(deep=True), all_features)
    X_train, X_test = X_train[all_features], X_test[all_features]

    # Create DMatrix objects for XGBoost
    dtrain = xgb.DMatrix(X_train.to_numpy(), label=y_train, feature_names=all_features)
    dtest = xgb.DMatrix(X_test.to_numpy(), label=y_test, feature_names=all_features)

    # Train the model
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=15_000,
        evals=[(dtrain, 'train'), (dtest, 'test')],
        early_stopping_rounds=50,
        verbose_eval=100
    )

    # Get predictions
    train_preds = model.predict(dtrain)
    test_preds = model.predict(dtest)

    # Calculate metrics
    train_rmse = sqrt(mean_squared_error(y_train, train_preds))
    test_rmse = sqrt(mean_squared_error(y_test, test_preds))

    # Save model if it has the lowest RMSE
    if test_rmse < best_rmse:
        best_rmse = test_rmse
        best_model = model
    # Running mean of the early-stopped iteration count over the folds seen so far.
    safe_num_of_estimators = (safe_num_of_estimators * i + model.best_iteration) / (i + 1)

    # Get the five validation examples with the largest squared error.
    # Plain arrays keep everything positional, independent of the frame's index labels.
    test_mse = (y_test.to_numpy() - test_preds) ** 2
    hardest_args = np.argsort(test_mse)[-5:]
    hardest_validation_examples = X_test.iloc[hardest_args].copy(deep=True)
    hardest_validation_examples.loc[:, 'Prediction'] = test_preds[hardest_args]
    hardest_validation_examples.loc[:, 'Cena'] = y_test.iloc[hardest_args].to_numpy()
    hardest_examples.append(hardest_validation_examples)

    print(f"Fold {i+1} - Train RMSE: {train_rmse:.6f}, Test RMSE: {test_rmse:.6f}")
    fold_scores.append((train_rmse, test_rmse))

# Stats on best model
print(f"Best validation RMSE: {best_rmse:.6f}")
print(f"Average number of estimators: {safe_num_of_estimators}")

# Summarize results
train_scores = [score[0] for score in fold_scores]
test_scores = [score[1] for score in fold_scores]

# Cross-validation stats
cv_results = {
    'test-rmse-mean': np.mean(test_scores),
    'test-rmse-std': np.std(test_scores),
    'train-rmse-mean': np.mean(train_scores),
    'train-rmse-std': np.std(train_scores),
}

print("\nCross-Validation Results:")
print(f"Train RMSE: {cv_results['train-rmse-mean']:.6f} ± {cv_results['train-rmse-std']:.6f}")
print(f"Test RMSE: {cv_results['test-rmse-mean']:.6f} ± {cv_results['test-rmse-std']:.6f}")

In [None]:
# Importance chart for `model` (the booster trained most recently), limited to 25 features.
fig, ax = plt.subplots(figsize=(10, 10))
importance_options = dict(
    max_num_features=25,
    importance_type="weight",  # 'gain', 'cover', 'total_gain', 'total_cover'
)
xgb.plot_importance(model, ax=ax, **importance_options)
plt.title("XGBoost Top Features")
plt.show()

## Train the final model
Training the final model is performed on the whole dataset to maximise the gain from the data. In such a case one has
to be extra careful about overfitting.

In [None]:
# Create very small validation set for final training early stopping:
# every `modulo`-th row of the price-sorted frame is held out.
modulo = 1000
df.sort_values(by=target_column, inplace=True, kind='stable')  # Split randomly using modulo, but having even price distribution

test_mask = (np.arange(len(df)) % modulo) == 0
train_idx = np.flatnonzero(~test_mask)
test_idx = np.flatnonzero(test_mask)

y = df[target_column]

X_train, X_test = df.iloc[train_idx], df.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Encode the categories
# and don't expose data information from validation to train (No data leakage)
all_features = deepcopy(features)
unknown_frames = [X_test.copy(deep=True), df_submition.copy(deep=True)]
X_train, (X_test, X_submition) = target_encode(X_train.copy(deep=True), unknown_frames, all_features)
X_train = X_train.loc[:, all_features]
X_test = X_test.loc[:, all_features]
X_submition = X_submition.loc[:, all_features]

# Create DMatrix objects for XGBoost
dtrain = xgb.DMatrix(X_train.to_numpy(), label=y_train, feature_names=all_features)
dtest = xgb.DMatrix(X_test.to_numpy(), label=y_test, feature_names=all_features)

# Train the model
training_options = dict(
    num_boost_round=900,     # To ensure there is no overfitting (based on where model improvement plateaus)
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=50,
    verbose_eval=100,
)
model = xgb.train(params, dtrain, **training_options)

# Predict
dsub = xgb.DMatrix(X_submition.to_numpy(), feature_names=all_features)
predictions = model.predict(dsub)
print(f"Maximal prediction value detected : {predictions.max()}")
print(f"Maximal target in train : {y_train.max()}")
print(f"Maximal target in test : {y_test.max()}")

In [None]:
from utils.preprocessing import preprocess_currency

# Persist the trained booster next to the submission file.
model.save_model('submission_model.json')

# Attach the predictions to a copy of the test frame and pass them through
# preprocess_currency with invert=True.
df_submit = df_test.copy(deep=True).assign(Cena=predictions)
df_submit = preprocess_currency(df_submit, invert=True)

# Drop every original test column except the identifier and the predicted price.
columns_to_drop = [col for col in df_test.columns if col not in ['ID', 'Cena']]
df_submit = df_submit.drop(columns=columns_to_drop)
display(df_submit)
df_submit.to_csv('submission.csv', index=False)

In [None]:
# Test listing that received the second-highest prediction (argsort is ascending, so [-2]).
df_test.iloc[np.argsort(predictions)[-2]]

In [None]:
# Highest predicted value.
np.sort(predictions)[-1]

In [None]:
# Rebuild the submission frame with all columns kept, then compare raw predictions
# with the values returned by preprocess_currency(..., invert=True) for EUR listings.
df_submit = preprocess_currency(df_test.copy(deep=True).assign(Cena=predictions), invert=True)

mask = df_submit.Waluta == 'EUR'
raw_eur_predictions = predictions[mask][:10]
inverted_eur_prices = df_submit.Cena[mask].to_numpy()[:10]
print(raw_eur_predictions, inverted_eur_prices)

In [None]:
# Prediction at position 1 of the prediction array.
predictions[1]