In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/russian-car-plates-prices-prediction/sample_submission.csv
/kaggle/input/russian-car-plates-prices-prediction/supplemental_english.py
/kaggle/input/russian-car-plates-prices-prediction/supplemental_russian.py
/kaggle/input/russian-car-plates-prices-prediction/train.csv
/kaggle/input/russian-car-plates-prices-prediction/test.csv


In [2]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import holidays
from supplemental_english import GOVERNMENT_CODES, REGION_CODES 
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

In [3]:
print("--- XGBoost with Enhanced Features, Robust FE & CV---")

print("\n1. Load Data and Initial Exploration\n" + "="*50)
# Load original training data to derive global statistics
df_train_orig = pd.read_csv(r'/kaggle/input/russian-car-plates-prices-prediction/train.csv')
df_test_orig = pd.read_csv(r'/kaggle/input/russian-car-plates-prices-prediction/test.csv')

# Define global statistics from the original training set
global_mean_price = df_train_orig['price'].mean()
global_median_price = df_train_orig['price'].median()
global_std_price = df_train_orig['price'].std()
global_mean_price_log = np.log1p(global_mean_price)
global_median_price_log = np.log1p(global_median_price)

# Concatenate for consistent feature engineering
df_train_orig['is_train'] = 1
df_test_orig['is_train'] = 0 # Price column will be NaN here
df = pd.concat([df_train_orig, df_test_orig], ignore_index=True)
df['date'] = pd.to_datetime(df['date'])

region_lookup = {}
for region, codes in REGION_CODES.items():
    for code in codes:
        region_lookup[str(code)] = region # Ensure codes are strings for lookup

--- XGBoost with Enhanced Features, Robust FE & CV---

1. Load Data and Initial Exploration


In [4]:
# --- Feature Engineering Functions ---
def parse_plate_components(plate):
    plate = str(plate).upper().strip()
    match = re.match(r'^([ABEKMHOPCTYX])(\d{1,3})([ABEKMHOPCTYX]{2})(\d{2,3})$', plate)
    if match:
        first_letter = match.group(1)
        numbers_str = match.group(2).zfill(3) # Pad numbers to 3 digits
        last_letters_str = match.group(3)
        region_code_str = match.group(4)
        full_letters = first_letter + last_letters_str
        return full_letters, numbers_str, region_code_str, first_letter, last_letters_str

    # Fallback for less standard formats or if the above doesn't match perfectly
    letters_found = ''.join(re.findall(r'[ABEKMHOPCTYX]', plate))
    numbers_found = ''.join(re.findall(r'\d+', plate)) # Find all digit sequences

    parsed_letters = letters_found if letters_found else ''
    
    if len(numbers_found) >= 5: # e.g., 12377 or 123777
        if numbers_found[-3:].isdigit() and (len(numbers_found) - 3) > 0 : # Check for 3-digit region
             parsed_numbers = numbers_found[:-3].zfill(3)[-3:] # take last 3 of number part
             parsed_region = numbers_found[-3:]
        elif numbers_found[-2:].isdigit() and (len(numbers_found) - 2) > 0 : # Check for 2-digit region
             parsed_numbers = numbers_found[:-2].zfill(3)[-3:]
             parsed_region = numbers_found[-2:]
        else: # Default to taking first 3 as number, rest as region if possible
            parsed_numbers = numbers_found[:3].zfill(3)
            parsed_region = numbers_found[3:][-3:] if len(numbers_found[3:]) >=2 else numbers_found[3:]

    elif len(numbers_found) == 3 or len(numbers_found) == 4: # e.g. 123 or 1237 (assume 123 is number, 7 is bad region)
        parsed_numbers = numbers_found[:3].zfill(3)
        parsed_region = numbers_found[3:] if len(numbers_found) > 3 else '' # Region only if extra digits
    elif len(numbers_found) > 0 : # Less than 3 digits, assume all are numbers
        parsed_numbers = numbers_found.zfill(3)[-3:]
        parsed_region = ''
    else:
        parsed_numbers = '000'
        parsed_region = ''
        
    _first_letter = parsed_letters[0] if parsed_letters else ''
    _last_letters = parsed_letters[1:] if len(parsed_letters) > 1 else '' # Simplified, actual last two are harder if format varies

    return parsed_letters, parsed_numbers, parsed_region, _first_letter, _last_letters


def get_government_significance_enhanced(letters, numbers_str, region_code_str):
    if not letters and not numbers_str and not region_code_str:
        return False, 0, 0, 0, "Non-governmental"
    try: numbers_int = int(numbers_str) # numbers_str should already be 3 digits
    except ValueError: numbers_int = -1
    
    # Ensure letters is at least 1 char for gov_letters[0] access
    if not letters: 
        return False, 0, 0, 0, "Non-governmental"

    for (gov_letters, num_range, gov_region), details in GOVERNMENT_CODES.items():
        current_gov_first_letter = gov_letters[0] if isinstance(gov_letters, tuple) else gov_letters[0]
        if (letters == gov_letters and 
            num_range[0] <= numbers_int <= num_range[1] and 
            str(region_code_str) == str(gov_region)):
            agency, forbidden, advantage, significance_val = details[0], details[1], details[2], details[3]
            return True, forbidden, advantage, significance_val, agency
    return False, 0, 0, 0, "Non-governmental"


def categorize_agency(agency_desc):
    agency_desc = str(agency_desc)
    if agency_desc == 'Non-governmental': return 'Non-governmental'
    if 'President' in agency_desc: return 'Presidential'
    if 'Police' in agency_desc.lower() or 'Internal Affairs' in agency_desc: return 'Police/Security'
    if 'Government' in agency_desc and 'Federation Council' not in agency_desc and 'State Duma' not in agency_desc: return 'Government'
    if 'Military' in agency_desc or 'Army' in agency_desc or 'Defense' in agency_desc: return 'Military'
    if 'Federal' in agency_desc or 'FSB' in agency_desc or 'FSO' in agency_desc: return 'Federal Services'
    if 'Judge' in agency_desc or 'Court' in agency_desc or 'Justice' in agency_desc or 'prosecutor' in agency_desc.lower(): return 'Judicial'
    if 'Administration' in agency_desc: return 'Administration'
    if 'Diplomatic' in agency_desc: return 'Diplomatic'
    return 'Other Governmental'

def analyze_number_patterns_enhanced(numbers_str):
    numbers_str = str(numbers_str).zfill(3) # Ensure it's a 3-digit string
    digit_counts = Counter(numbers_str)
    max_repeat = 0
    if numbers_str.isdigit() and digit_counts: # Check if digit_counts is not empty
         max_repeat = max(digit_counts.values())
    
    is_sequential, is_reverse_seq, is_palindrome_flag, has_mirror_effect, is_low_number_flag = False, False, False, False, False
    if numbers_str.isdigit() and len(numbers_str) == 3 : 
        is_sequential = any(numbers_str[i:i+2] in '01234567890' for i in range(len(numbers_str)-1)) or \
                        any(numbers_str[i:i+3] in '012345678901' for i in range(len(numbers_str)-2))
        is_reverse_seq = any(numbers_str[i:i+2] in '98765432109' for i in range(len(numbers_str)-1)) or \
                         any(numbers_str[i:i+3] in '987654321098' for i in range(len(numbers_str)-2))
        is_palindrome_flag = numbers_str == numbers_str[::-1]
        has_mirror_effect = (numbers_str[0] == numbers_str[-1]) or is_palindrome_flag
        try: 
            is_low_number_flag = int(numbers_str) < 100
        except ValueError:
            is_low_number_flag = False
    return max_repeat, is_sequential, is_reverse_seq, is_palindrome_flag, is_low_number_flag, has_mirror_effect

def enrich_date_features(df_to_enrich):
    df_to_enrich['year'] = df_to_enrich['date'].dt.year
    df_to_enrich['month'] = df_to_enrich['date'].dt.month
    df_to_enrich['day'] = df_to_enrich['date'].dt.day
    df_to_enrich['day_of_week'] = df_to_enrich['date'].dt.dayofweek
    df_to_enrich['week_of_year'] = df_to_enrich['date'].dt.isocalendar().week.astype(int)
    df_to_enrich['quarter'] = df_to_enrich['date'].dt.quarter
    if not df_to_enrich['date'].empty and df_to_enrich['date'].notna().all():
         df_to_enrich['total_days'] = (df_to_enrich['date'] - df_to_enrich['date'].min()).dt.days
    else:
         df_to_enrich['total_days'] = 0 
         if df_to_enrich['date'].isna().any():
             print("Warning: NaNs found in 'date' column during total_days calculation. Filling with 0.")
    
    df_to_enrich['day_name'] = df_to_enrich['date'].dt.day_name() # ADDED day_name

    df_to_enrich['is_weekend'] = df_to_enrich['date'].dt.dayofweek.isin([5, 6]).astype(int)
    df_to_enrich['weekday_sin'] = np.sin(2 * np.pi * df_to_enrich['day_of_week'] / 7)
    df_to_enrich['weekday_cos'] = np.cos(2 * np.pi * df_to_enrich['day_of_week'] / 7)
    df_to_enrich['day_sin'] = np.sin(2 * np.pi * df_to_enrich['day'] / 31) 
    df_to_enrich['day_cos'] = np.cos(2 * np.pi * df_to_enrich['day'] / 31)
    df_to_enrich['month_sin'] = np.sin(2 * np.pi * df_to_enrich['month'] / 12)
    df_to_enrich['month_cos'] = np.cos(2 * np.pi * df_to_enrich['month'] / 12)
    
    unique_years = df_to_enrich['year'].dropna().unique() 
    if len(unique_years) > 0 :
        try:
            ru_holidays = holidays.Russia(years=unique_years.astype(int).tolist()) 
            df_to_enrich['is_holiday'] = df_to_enrich['date'].apply(lambda x: x.date() in ru_holidays if pd.notna(x) else False).astype(int)
        except Exception as e:
            print(f"Warning: Could not generate holiday features: {e}. Setting 'is_holiday' to 0.")
            df_to_enrich['is_holiday'] = 0 
    else:
        df_to_enrich['is_holiday'] = 0
    return df_to_enrich

In [5]:
# --- Apply Feature Engineering ---
print("\n2. Applying Feature Engineering...\n" + "="*50)
plate_components_extracted = df['plate'].apply(parse_plate_components)
df['letters'] = [comp[0] for comp in plate_components_extracted]
df['numbers'] = [comp[1] for comp in plate_components_extracted] 
df['region_code'] = [comp[2] for comp in plate_components_extracted]
df['first_letter'] = [comp[3] for comp in plate_components_extracted] # ADDED
df['last_letters'] = [comp[4] for comp in plate_components_extracted] # ADDED

df['numbers'] = df['numbers'].fillna('000').str.zfill(3) 
df['region_code'] = df['region_code'].fillna('0').astype(str) # Ensure string for lookup
df['letters'] = df['letters'].fillna('')
df['first_letter'] = df['first_letter'].fillna('') # ADDED
df['last_letters'] = df['last_letters'].fillna('') # ADDED
df['region_name'] = df['region_code'].map(lambda x: region_lookup.get(str(x), "UnknownRegion"))

gov_features = df.apply(lambda row: get_government_significance_enhanced(row['letters'], row['numbers'], row['region_code']), axis=1)
df['is_govt_plate'] = [feat[0] for feat in gov_features]
df['forbidden_to_buy'] = [feat[1] for feat in gov_features]
df['advantage_on_road'] = [feat[2] for feat in gov_features]
df['significance'] = [feat[3] for feat in gov_features]
df['agency'] = [feat[4] for feat in gov_features]
df['agency_category'] = df['agency'].apply(categorize_agency)

df = enrich_date_features(df) 

prestigious_numbers_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 111, 222, 333, 444, 555, 666, 777, 888, 999, 100, 200, 300, 400, 500, 600, 700, 800, 900]
prestigious_letters_list = ["AAA", "MMM", "EEE", "KKK", "OOO", "PPP", "CCC", "TTT", "XXX"] 
df['is_prestigious_number'] = df['numbers'].apply(lambda x: 1 if x.isdigit() and int(x) in prestigious_numbers_list else 0)
df['is_prestigious_letters'] = df['letters'].apply(lambda x: 1 if x in prestigious_letters_list else 0)

pattern_features = df['numbers'].apply(analyze_number_patterns_enhanced)
df['max_repeated_digits'] = [feat[0] for feat in pattern_features]
df['is_sequential_num'] = [feat[1] for feat in pattern_features] 
df['is_reverse_sequential_num'] = [feat[2] for feat in pattern_features] 
df['is_palindrome_num'] = [feat[3] for feat in pattern_features] 
df['is_low_number'] = [feat[4] for feat in pattern_features]
df['has_mirror_numbers'] = [feat[5] for feat in pattern_features]

df['unique_letters_count'] = df['letters'].apply(lambda x: len(set(x)) if isinstance(x, str) and x else 0)
df['has_repeated_letters'] = df['letters'].apply(lambda x: len(set(x)) < len(x) if isinstance(x, str) and x else False)

numbers_freq_map = df['numbers'].value_counts(normalize=True).to_dict()
df['numbers_freq_enc'] = df['numbers'].map(numbers_freq_map).fillna(0)
df['numbers_log_freq_enc'] = np.log1p(df['numbers_freq_enc'])

train_df_for_encoding = df[df['is_train'] == 1].copy()

# Target Encoding for letters, first_letter, last_letters
if 'price' in train_df_for_encoding.columns and not train_df_for_encoding['price'].isnull().all():
    letter_mean_price_map = train_df_for_encoding.groupby('letters')['price'].mean().apply(np.log1p).to_dict()
    df['letters_mean_price_log'] = df['letters'].map(letter_mean_price_map)
    
    first_letter_mean_price_map = train_df_for_encoding.groupby('first_letter')['price'].mean().apply(np.log1p).to_dict() # ADDED
    df['first_letter_mean_price_log'] = df['first_letter'].map(first_letter_mean_price_map) # ADDED
    
    last_letters_mean_price_map = train_df_for_encoding.groupby('last_letters')['price'].mean().apply(np.log1p).to_dict() # ADDED
    df['last_letters_mean_price_log'] = df['last_letters'].map(last_letters_mean_price_map) # ADDED
else:
    df['letters_mean_price_log'] = np.nan 
    df['first_letter_mean_price_log'] = np.nan # ADDED
    df['last_letters_mean_price_log'] = np.nan # ADDED

df['letters_mean_price_log'] = df['letters_mean_price_log'].fillna(global_mean_price_log)
df['first_letter_mean_price_log'] = df['first_letter_mean_price_log'].fillna(global_mean_price_log) # ADDED
df['last_letters_mean_price_log'] = df['last_letters_mean_price_log'].fillna(global_mean_price_log) # ADDED


df['region_avg_price'] = np.nan
df['region_median_price'] = np.nan
df['region_price_std'] = np.nan
df['region_count'] = np.nan

if ('price' in train_df_for_encoding.columns and \
    not train_df_for_encoding['price'].isnull().all() and \
    'region_name' in train_df_for_encoding.columns and \
    not train_df_for_encoding['region_name'].isnull().all() and \
    len(train_df_for_encoding) > 0):
    try:
        region_stats = train_df_for_encoding.groupby('region_name')['price'].agg(['mean', 'median', 'std', 'count']).reset_index()
        region_stats.columns = ['region_name', 'temp_region_avg_price', 'temp_region_median_price', 'temp_region_price_std', 'temp_region_count']
        if not region_stats.empty:
            df = df.merge(region_stats, on='region_name', how='left')
            df['region_avg_price'] = df['temp_region_avg_price'] 
            df['region_median_price'] = df['temp_region_median_price']
            df['region_price_std'] = df['temp_region_price_std']
            df['region_count'] = df['temp_region_count']
            df.drop(columns=['temp_region_avg_price', 'temp_region_median_price', 
                             'temp_region_price_std', 'temp_region_count'], inplace=True, errors='ignore')
    except Exception as e:
        print(f"Error during region_stats calculation or merge: {e}. Region stats will use global defaults.")
else:
    print("Warning: Insufficient data for region_stats. Using global defaults.")

df['region_avg_price'] = df['region_avg_price'].fillna(global_mean_price)
df['region_median_price'] = df['region_median_price'].fillna(global_median_price)
df['region_price_std'] = df['region_price_std'].fillna(global_std_price if pd.notna(global_std_price) else 0)
df['region_count'] = df['region_count'].fillna(1)

df['region_avg_price_log'] = np.log1p(df['region_avg_price'])
df['region_median_price_log'] = np.log1p(df['region_median_price'])
df['region_avg_price_log'] = df['region_avg_price_log'].fillna(global_mean_price_log)
df['region_median_price_log'] = df['region_median_price_log'].fillna(global_median_price_log)

premium_regions_list = ['Moscow', 'Saint Petersburg', 'Moscow Oblast', 'Leningrad Oblast', 'Republic of Tatarstan', 'Sverdlovsk Oblast', 'Krasnodar Krai']
df['is_premium_region'] = df['region_name'].isin(premium_regions_list).astype(int)

df = df.sort_values(['plate', 'date']) 
df['price_lag_1'] = df.groupby('plate')['price'].shift(1) 
df['plate_listing_count'] = df.groupby('plate').cumcount() + 1
df['price_lag_1'] = df['price_lag_1'].fillna(global_median_price) 
df['price_lag_1_log'] = np.log1p(df['price_lag_1'])

df['prestige_score'] = ((df['is_prestigious_letters'].astype(int) * 3) + \
                        (df['is_prestigious_number'].astype(int) * 2) + \
                        (df['has_repeated_letters'].astype(int) * 1) + \
                        (df['has_mirror_numbers'].astype(int) * 1) + \
                        (df['is_palindrome_num'].astype(int) *1) + \
                        (df['is_sequential_num'].astype(int) * 1) + \
                        (df['significance'].astype(int))) 
df['prestige_rank'] = df['prestige_score'].rank(method='average', pct=True)

df['letters_region_combo'] = df['letters'].astype(str) + "_" + df['region_code'].astype(str)
letters_region_freq_map = df['letters_region_combo'].value_counts(normalize=True).to_dict()
df['letters_region_freq'] = df['letters_region_combo'].map(letters_region_freq_map).fillna(0)

df['is_gov_and_prestige'] = df['is_govt_plate'].astype(int) * df['prestige_score'].astype(int)

df['letters_fillna'] = df['letters'].fillna('') 
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2), min_df=5, max_features=100) 
letter_text_features = vectorizer.fit_transform(df['letters_fillna'])
letter_text_features_df = pd.DataFrame(letter_text_features.toarray(), columns=[f"letter_vec_{i}" for i in vectorizer.get_feature_names_out()])
df = pd.concat([df.reset_index(drop=True), letter_text_features_df.reset_index(drop=True)], axis=1)

df.loc[df['is_train'] == 1, 'price_log'] = np.log1p(df.loc[df['is_train'] == 1, 'price'])
df['price_log'] = df['price_log'].fillna(0)


2. Applying Feature Engineering...


In [6]:
# --- Define Feature Columns ---
base_feature_cols = [
    'letters', 'numbers', 'region_code', 'region_name', 'agency', 'agency_category', 'day_name', # Added day_name
    'first_letter', 'last_letters', # ADDED
    'is_govt_plate', 'forbidden_to_buy', 'advantage_on_road', 'significance', 
    'is_prestigious_number', 'is_prestigious_letters', 'prestige_score', 'prestige_rank', 
    'max_repeated_digits', 'is_sequential_num', 'is_reverse_sequential_num', 'is_palindrome_num', 
    'is_low_number', 'has_repeated_letters', 'unique_letters_count', 'has_mirror_numbers', 
    'year', 'month', 'day', 'day_of_week', 'week_of_year', 'quarter', 'total_days', 'is_weekend', 'is_holiday', 
    'weekday_sin', 'weekday_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 
    'numbers_freq_enc', 'numbers_log_freq_enc', 
    'letters_mean_price_log', 
    'first_letter_mean_price_log', 'last_letters_mean_price_log', # ADDED
    'region_avg_price_log', 'region_median_price_log', 'region_price_std', 'region_count', 'is_premium_region', 
    'price_lag_1_log', 'plate_listing_count', 
    'letters_region_freq', 
    'is_gov_and_prestige'
]
text_feature_cols_final = [col for col in df.columns if col.startswith("letter_vec_")] 
feature_cols = sorted(list(set(base_feature_cols + text_feature_cols_final)))
feature_cols = [col for col in feature_cols if col in df.columns]


# Separate train and test
train_df = df[df['is_train'] == 1].copy()
test_df = df[df['is_train'] == 0].copy()

# --- Data for Models ---
X = train_df[feature_cols].copy() 
y_log = train_df['price_log'].copy()
y_original = train_df['price'].copy() 
X_test_payload_original_features = test_df[feature_cols].copy()


# --- SMAPE Metric ---
def calculate_smape(y_true_orig, y_pred_orig):
    y_true_orig = np.array(y_true_orig)
    y_pred_orig = np.array(y_pred_orig)
    y_pred_orig = np.maximum(y_pred_orig, 0) 
    numerator = np.abs(y_true_orig - y_pred_orig)
    denominator = (np.abs(y_true_orig) + np.abs(y_pred_orig) + 1e-8) 
    return np.mean(2 * numerator / denominator) * 100

# --- Preprocessing ---
categorical_cols_str = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

final_cat_cols = [col for col in categorical_cols_str if col in X.columns]
final_num_cols = [col for col in numerical_cols if col in X.columns and col not in final_cat_cols]

print(f"Final Categorical columns for Ordinal Encoding: {len(final_cat_cols)} ({final_cat_cols[:5]}...)") # Print first 5
print(f"Final Numerical columns for Imputation/Scaling: {len(final_num_cols)} ({final_num_cols[:5]}...)")


numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing_value_placeholder')), # More specific fill_value
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, dtype=np.float32))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, final_num_cols),
        ('cat', categorical_transformer, final_cat_cols)
    ],
    remainder='drop' 
)

Final Categorical columns for Ordinal Encoding: 9 (['agency', 'agency_category', 'day_name', 'first_letter', 'last_letters']...)
Final Numerical columns for Imputation/Scaling: 138 (['advantage_on_road', 'day', 'day_cos', 'day_of_week', 'day_sin']...)


In [7]:
# --- Model Training and Cross-Validation ---
N_SPLITS = 10 
y_bins = KBinsDiscretizer(n_bins=N_SPLITS, encode='ordinal', strategy='quantile', subsample=None) 
y_log_binned = y_bins.fit_transform(y_log.values.reshape(-1, 1)).astype(int).ravel()
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

xgb_params = {
    'n_estimators': 10000, 
    'max_depth': 10, 
    'learning_rate': 0.01852160907217988,
    'subsample': 0.6786672470738663,
    'colsample_bytree': 0.46208650739218005,
    'reg_alpha': 0.017519138973638618,
    'reg_lambda': 0.2839310763317462,
    'gamma': 0.0033995958574628547,
    'objective': 'reg:tweedie', 
    'tweedie_variance_power': 1.0869464555654937, 
    'random_state': 42, 
    'n_jobs': -1, 
    'tree_method': 'hist',
    'early_stopping_rounds': 100 
}

model_name = 'XGBoost'
xgb_oof_preds_log = np.zeros(X.shape[0])
xgb_test_preds_log_sum = np.zeros(X_test_payload_original_features.shape[0])
xgb_fold_scores = []
xgb_feature_importances_list = []

print(f"\nTraining {model_name} with enhanced features...")
for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y_log_binned)):
    print(f"  Fold {fold_idx+1}/{N_SPLITS}")
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold_log, y_val_fold_log = y_log.iloc[train_idx], y_log.iloc[val_idx]
    y_val_fold_orig = y_original.iloc[val_idx] 

    preprocessor_fitted = preprocessor.fit(X_train_fold)
    X_train_fold_processed = preprocessor_fitted.transform(X_train_fold)
    X_val_fold_processed = preprocessor_fitted.transform(X_val_fold)
    X_test_fold_processed = preprocessor_fitted.transform(X_test_payload_original_features)

    current_model = xgb.XGBRegressor(**xgb_params) 
    current_model.fit(X_train_fold_processed, y_train_fold_log,
                      eval_set=[(X_val_fold_processed, y_val_fold_log)],
                      verbose=False) 
    
    fold_val_pred_log = current_model.predict(X_val_fold_processed)
    fold_test_pred_log = current_model.predict(X_test_fold_processed)
    
    processed_feature_names_for_fold = preprocessor_fitted.get_feature_names_out()
    fold_importances = pd.Series(current_model.feature_importances_, index=processed_feature_names_for_fold)
    xgb_feature_importances_list.append(fold_importances)

    xgb_oof_preds_log[val_idx] = fold_val_pred_log
    xgb_test_preds_log_sum += fold_test_pred_log

    fold_val_pred_orig = np.expm1(fold_val_pred_log)
    fold_smape = calculate_smape(y_val_fold_orig, fold_val_pred_orig)
    xgb_fold_scores.append(fold_smape)
    print(f"    Fold SMAPE: {fold_smape:.4f}")

xgb_avg_cv_smape = np.mean(xgb_fold_scores)
print(f"\n  {model_name} Average CV SMAPE (all features): {xgb_avg_cv_smape:.4f}")


Training XGBoost with enhanced features...
  Fold 1/10
    Fold SMAPE: 35.2761
  Fold 2/10
    Fold SMAPE: 34.7346
  Fold 3/10
    Fold SMAPE: 34.6729
  Fold 4/10
    Fold SMAPE: 35.3771
  Fold 5/10
    Fold SMAPE: 35.9054
  Fold 6/10
    Fold SMAPE: 35.3538
  Fold 7/10
    Fold SMAPE: 35.2809
  Fold 8/10
    Fold SMAPE: 35.6031
  Fold 9/10
    Fold SMAPE: 35.2824
  Fold 10/10
    Fold SMAPE: 36.2005

  XGBoost Average CV SMAPE (all features): 35.3687


In [8]:
avg_xgb_importances = pd.Series(dtype='float64') 
if xgb_feature_importances_list:
    try:
        common_index = xgb_feature_importances_list[0].index
        for fi_series in xgb_feature_importances_list[1:]:
            common_index = common_index.intersection(fi_series.index)
        
        reindexed_importances = [fi.reindex(common_index).fillna(0) for fi in xgb_feature_importances_list]
        if reindexed_importances:
             avg_xgb_importances = pd.concat(reindexed_importances, axis=1).mean(axis=1).sort_values(ascending=False)
        else: # Should not happen if xgb_feature_importances_list is not empty
            avg_xgb_importances = pd.Series(dtype='float64')


        print(f"\n  Top 20 {model_name} Features (averaged over folds):")
        print(avg_xgb_importances.head(20))
    except Exception as e:
        print(f"Could not average XGBoost importances due to: {e}.")
        if xgb_feature_importances_list: 
             avg_xgb_importances = xgb_feature_importances_list[-1].sort_values(ascending=False)


xgb_final_test_preds_log = xgb_test_preds_log_sum / N_SPLITS
xgb_final_test_preds_orig = np.expm1(xgb_final_test_preds_log)
xgb_final_test_preds_orig = np.maximum(xgb_final_test_preds_orig, 0) 

submission = pd.DataFrame({'id': test_df['id'], 'price': xgb_final_test_preds_orig}) 
submission_filename = 'submission.csv'
submission.to_csv(submission_filename, index=False)
print(f"\nSubmission file created successfully: {submission_filename}")
print(submission.head())


  Top 20 XGBoost Features (averaged over folds):
num__prestige_score            0.088663
num__prestige_rank             0.071089
num__is_prestigious_letters    0.042509
num__unique_letters_count      0.025870
num__is_prestigious_number     0.024011
num__letters_mean_price_log    0.021013
num__numbers_freq_enc          0.020575
num__significance              0.020538
num__price_lag_1_log           0.019583
num__numbers_log_freq_enc      0.018200
num__advantage_on_road         0.017567
num__year                      0.017231
num__is_gov_and_prestige       0.016129
cat__region_code               0.015352
num__letter_vec_mp             0.013067
num__max_repeated_digits       0.012493
num__letter_vec_am             0.011018
num__region_avg_price_log      0.010459
num__region_count              0.010369
cat__agency_category           0.009383
dtype: float32

Submission file created successfully: submission.csv
       id         price
6   54874  1.033723e+06
12  52711  3.859312e+05
26  52799