In [2]:
import pandas as pd
import numpy as np
import re
import datetime
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# Optional lookup tables from the `supplemental_english` module.
# When that module is not importable, fall back to empty dicts so the rest of
# the script still runs; check_government_code() then finds no entries and
# returns (0, 0, 0) for every plate.
try:
    from supplemental_english import REGION_CODES, GOVERNMENT_CODES
except ImportError:
    REGION_CODES = {}
    GOVERNMENT_CODES = {}

def parse_date(date_str):
    """Parse a raw ``date`` value into a pandas timestamp.

    When the value is a string containing ``' - '``, only the part before the
    first separator is parsed.

    Args:
        date_str: Raw value; a string, a datetime-like object, or a missing
            value (``None`` / NaN / NaT).

    Returns:
        A ``pd.Timestamp`` on success, or ``pd.NaT`` when the value is missing
        or cannot be parsed. ``pd.NaT`` (rather than float NaN) keeps a column
        built from these results datetime-like even if every row is missing.
    """
    if pd.isnull(date_str):
        return pd.NaT
    # Only strings can carry the ' - ' separator; already-parsed datetime
    # objects are passed straight to pd.to_datetime.
    if isinstance(date_str, str) and ' - ' in date_str:
        date_str = date_str.split(' - ')[0].strip()
    try:
        return pd.to_datetime(date_str, errors='coerce')
    except (TypeError, ValueError, OverflowError):
        return pd.NaT

def extract_plate_features(plate):
    """Split a plate string into simple text-derived features.

    Args:
        plate: Plate text such as ``'A123BC77'``. Any non-string value (for
            example the float NaN pandas uses for an empty CSV cell) is
            treated as a missing plate.

    Returns:
        dict with the keys, in this order:
            ``region``: trailing 2-3 digits as a string, or NaN if absent.
            ``number_part``: first run of digits as a string, or NaN if absent.
            ``letters``: the plate with every digit removed.
            ``plate_length``: number of characters in the plate.
        For a missing plate every value is NaN.
    """
    if not isinstance(plate, str):
        # re.search / len would raise TypeError on NaN or None.
        return {
            'region': np.nan,
            'number_part': np.nan,
            'letters': np.nan,
            'plate_length': np.nan,
        }

    region_match = re.search(r'(\d{2,3})$', plate)
    number_match = re.search(r'\d+', plate)
    return {
        'region': region_match.group(1) if region_match else np.nan,
        'number_part': number_match.group(0) if number_match else np.nan,
        'letters': re.sub(r'\d+', '', plate),
        'plate_length': len(plate),
    }

def check_government_code(letters, number_str, region):
    """Look up government-series flags for a plate in ``GOVERNMENT_CODES``.

    ``GOVERNMENT_CODES`` is read as a mapping whose keys are
    ``(letters, (num_min, num_max), region)`` and whose values are 4-tuples;
    the last three items of the value are returned.

    Args:
        letters: Letter part of the plate; must equal the key's letters.
        number_str: Numeric part of the plate, convertible with ``int()``.
        region: Region code; must equal the key's region exactly.
            NOTE(review): callers pass the region as a string - confirm the
            table's region keys use the same type.

    Returns:
        ``(is_forbidden, has_advantage, significance)`` from the first entry
        whose letters and region match and whose inclusive number range
        contains the plate number; ``(0, 0, 0)`` when nothing matches or the
        number cannot be converted to an integer.
    """
    try:
        number_val = int(number_str)
    except (TypeError, ValueError, OverflowError):
        # Missing (None/NaN) or non-numeric number part: no lookup possible.
        return (0, 0, 0)

    for (letters_key, (num_min, num_max), region_key), info in GOVERNMENT_CODES.items():
        if letters_key != letters or region_key != region:
            continue
        if num_min <= number_val <= num_max:
            _, is_forbidden, has_advantage, significance = info
            return (is_forbidden, has_advantage, significance)
    return (0, 0, 0)

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The mean of the per-element ratios multiplied by 100.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0 from `out`, so no 0/0 is ever evaluated (no RuntimeWarning).
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# Competition files as laid out in the Kaggle input directory.
train = pd.read_csv('/kaggle/input/russian-car-plates-prices-prediction/train.csv')
test = pd.read_csv('/kaggle/input/russian-car-plates-prices-prediction/test.csv')
sample_submission = pd.read_csv('/kaggle/input/russian-car-plates-prices-prediction/sample_submission.csv')

# Day zero for the elapsed-days feature.
reference_date = pd.to_datetime("2021-01-01")

# Add the parsed date and its calendar components to both frames in place.
for df in [train, test]:
    parsed = df['date'].apply(parse_date)
    df['parsed_date'] = parsed
    for part in ('year', 'month', 'day', 'weekday'):
        df[part] = getattr(parsed.dt, part)
    df['days_since_2021'] = (parsed - reference_date).dt.days

def add_plate_features(df):
    """Return ``df`` with plate-derived feature columns appended.

    The appended columns are, in order: ``region``, ``number_part``,
    ``letters``, ``plate_length`` (from ``extract_plate_features``) and
    ``is_forbidden``, ``has_advantage``, ``significance_level`` (from
    ``check_government_code``). ``number_part`` is converted to a numeric
    column after the government-code lookup, which receives it as extracted.

    Args:
        df: DataFrame with a ``plate`` column. It is not modified.

    Returns:
        A new DataFrame: the input columns followed by the feature columns,
        on the same index as ``df``.
    """
    plate_columns = ['region', 'number_part', 'letters', 'plate_length']
    gov_columns = ['is_forbidden', 'has_advantage', 'significance_level']

    # Build the feature frames on df's own index: concat(axis=1) aligns on
    # index labels, so a fresh RangeIndex would pair features with the wrong
    # rows whenever df has been filtered, sorted or otherwise re-indexed.
    plate_df = pd.DataFrame(
        [extract_plate_features(plate) for plate in df['plate']],
        index=df.index,
        columns=plate_columns,
    )

    gov_df = pd.DataFrame(
        [
            check_government_code(letters, number_part, region)
            for letters, number_part, region in zip(
                plate_df['letters'], plate_df['number_part'], plate_df['region']
            )
        ],
        index=df.index,
        columns=gov_columns,
    )

    plate_df['number_part'] = pd.to_numeric(plate_df['number_part'], errors='coerce')
    return pd.concat([df, plate_df, gov_df], axis=1)

# Append the plate-derived feature columns to both frames.
train = add_plate_features(train)
test = add_plate_features(test)

# NOTE(review): LabelEncoder is already imported at the top of the file;
# this second import is redundant.
from sklearn.preprocessing import LabelEncoder

# Integer-encode the string features. The encoder is fitted on train and test
# together so every label seen in test has a code; astype(str) turns missing
# values into the literal label 'nan'.
for col in ['region', 'letters']:
    le = LabelEncoder()
    combined = pd.concat([train[col], test[col]], axis=0).astype(str)
    le.fit(combined)
    train[col + '_enc'] = le.transform(train[col].astype(str))
    test[col + '_enc'] = le.transform(test[col].astype(str))

# Cap prices at the 99.5th percentile, then model log1p(price).
price_cap = train['price'].quantile(0.995)
train['price'] = np.where(train['price'] > price_cap, price_cap, train['price'])
train['price_log'] = np.log1p(train['price'])

# Model inputs; the same column list is used for train and test.
features = [
    'days_since_2021',
    'weekday',
    'plate_length',
    'number_part',
    'region_enc',
    'letters_enc',
    'is_forbidden',
    'has_advantage',
    'significance_level'
]
X = train[features]
y_log = train['price_log']
X_test = test[features]

# Time-based hold-out: rows dated before the cutoff train the model, rows on
# or after it validate. Rows whose parsed_date is NaT satisfy neither
# comparison and therefore land in neither subset.
cutoff_date = pd.to_datetime('2024-07-01')
train_mask = train['parsed_date'] < cutoff_date
valid_mask = train['parsed_date'] >= cutoff_date

X_train = X[train_mask]
y_train = y_log[train_mask]
X_valid = X[valid_mask]
y_valid = y_log[valid_mask]

lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_valid, y_valid)

# The 'mae' metric is evaluated on the log1p-transformed target.
params = {
    'objective': 'regression',
    'metric': 'mae',
    'verbosity': -1,
    'seed': 42
}

# Stop when the validation metric has not improved for 50 rounds; log every
# 50 rounds.
callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=50)
]

model = lgb.train(
    params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_valid],
    callbacks=callbacks
)

# Score the hold-out on the original price scale (expm1 inverts log1p).
y_valid_pred_log = model.predict(X_valid, num_iteration=model.best_iteration)
y_valid_pred = np.expm1(y_valid_pred_log)
y_valid_true = np.expm1(y_valid)  

val_smape = smape(y_valid_true.values, y_valid_pred)
print(f"Validation SMAPE (time-based split): {val_smape:.4f}%")

# Refit on all training rows for the number of rounds chosen by early
# stopping; no validation set is used here.
full_train = lgb.Dataset(X, y_log)
model_full = lgb.train(params, full_train, num_boost_round=model.best_iteration)

test_pred_log = model_full.predict(X_test)
test_pred = np.expm1(test_pred_log)

# Write the predictions as id/price pairs.
submission = pd.DataFrame({
    'id': test['id'],
    'price': test_pred
})
submission.to_csv('submission.csv', index=False)
print("Submission file created with time-based split (no lag features)!")

Training until validation scores don't improve for 50 rounds
[50]	valid_0's l1: 0.623546
[100]	valid_0's l1: 0.573657
[150]	valid_0's l1: 0.563555
[200]	valid_0's l1: 0.567905
Early stopping, best iteration is:
[163]	valid_0's l1: 0.560082
Validation SMAPE (time-based split): 51.3600%
Submission file created with time-based split (no lag features)!
