In [None]:
import os
import re
import gc
import numpy as np
import pandas as pd

import optuna
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input, Embedding, Concatenate, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
import tensorflow as tf

from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

import warnings
warnings.filterwarnings('ignore')

# Reduce memory usage by optimizing data types
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` in place and return it.

    Signed-integer columns are narrowed to the smallest of int8/int16/int32
    that holds their min and max; columns whose values do not fit int32 keep
    their dtype instead of being wrapped around. 64-bit float columns are
    narrowed to float32 when their largest magnitude fits. float16 is never
    used: it overflows to ``inf`` above 65504 and keeps only about three
    significant digits. Every other column (object, category, bool, unsigned,
    datetime, pandas extension dtypes) is left untouched.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When True, print the memory footprint before and after.

    Returns:
        The same DataFrame object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy signed-int and float columns are candidates.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue
        if col_type.kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break  # already at least this narrow
                limits = np.iinfo(candidate)
                # For an empty column min/max are NaN, both comparisons are
                # False, and the dtype is kept.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.itemsize > 4:
            largest = df[col].abs().max()  # NaN for an all-NaN column
            # `not (largest > max)` also lets all-NaN columns through.
            if not largest > np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f"Memory usage: {start_mem:.2f} MB -> {end_mem:.2f} MB")
    return df

# Load the data
train = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')

# Optional down-sampling for quick experiments (disabled by default).
# Note: a fraction of 0.0001 keeps 0.01% of the rows, not 1%.
# sample_frac = 0.0001
# train = train.sample(frac=sample_frac, random_state=42)
# test = test.sample(frac=sample_frac, random_state=42)
# train.to_csv('train_sampled.csv', index=False)
# test.to_csv('test_sampled.csv', index=False)
# print(f"Sampled Training Data Shape: {train.shape}")
# print(f"Sampled Test Data Shape: {test.shape}")

# Report the shapes, then preview the first rows of each frame.
print(
    f"Training Data Shape: {train.shape}",
    f"Test Data Shape: {test.shape}",
    train.head(),
    test.head(),
    sep="\n",
)
# Clear memory
gc.collect()

In [None]:
# Fill missing values
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: that form is a chained in-place update of a temporary Series, and
# under pandas Copy-on-Write it leaves the parent DataFrame unchanged.
for df in (train, test):
    df['clean_title'] = df['clean_title'].fillna('No')
    df['accident'] = df['accident'].fillna('None reported')

# Lookup from a value of the 'model' column to a fuel-type label
# ('Gasoline', 'Electric', 'Hybrid', 'Diesel' or 'Hydrogen').
# It is used below only to fill rows whose 'fuel_type' is missing; existing
# non-null 'fuel_type' values are not overwritten, and models absent from this
# table stay NaN.
fuel_type_mapping = {
    '1500 Laramie': 'Gasoline',  # Typically powered by V6 or V8 engines
    '1500 TRX': 'Gasoline',  # Powered by a 6.2L supercharged V8 engine
    '500e Battery Electric': 'Electric',  # Fully electric
    'A4 2.0T Premium': 'Gasoline',  # 2.0L turbocharged inline-four engine
    'A4 2.0T Tech Premium': 'Gasoline',  # 2.0L turbocharged inline-four engine
    'A5 2.0T Premium Plus': 'Gasoline',  # 2.0L turbocharged inline-four engine
    'A7 55 Premium Plus': 'Gasoline',  # 3.0L V6 engine
    'Air Grand Touring': 'Electric',  # Fully electric luxury sedan
    'Air Pure': 'Electric',  # Fully electric luxury sedan
    'AMG G 63 Base': 'Gasoline',  # 4.0L twin-turbo V8 engine
    'AMG GLE AMG GLE 63 S-Model 4MATIC': 'Gasoline',  # 4.0L twin-turbo V8 engine
    'AMG GLS 63 4MATIC': 'Gasoline',  # 4.0L twin-turbo V8 engine
    'Armada Platinum': 'Gasoline',  # 5.6L V8 engine
    'Armada SL': 'Gasoline',  # 5.6L V8 engine
    'Bentayga Activity Edition': 'Gasoline',  # V8 engine
    'Bolt EUV Premier': 'Electric',  # Fully electric
    'Bolt EV LT': 'Electric',  # Fully electric
    'Bronco': 'Gasoline',  # Varies by trim, typically a V6 engine
    'Bronco Outer Banks': 'Gasoline',  # 2.3L turbocharged I4 engine
    'Bronco Sport Big Bend': 'Gasoline',  # 1.5L turbocharged I3 engine
    'Bronco Wildtrak Advanced': 'Gasoline',  # 2.7L twin-turbo V6
    'bZ4X Limited': 'Electric',  # Fully electric
    'C40 Recharge Pure Electric Twin Ultimate': 'Electric',  # Fully electric
    'CC Sport': 'Gasoline',  # 2.0L turbocharged I4 engine
    'Challenger SRT Demon': 'Gasoline',  # 6.2L supercharged V8 engine
    'Charger GT': 'Gasoline',  # 3.6L V6 engine
    'Cooper S Base': 'Gasoline',  # 2.0L turbocharged I4 engine
    'Corvette Stingray w/2LT': 'Gasoline',  # 6.2L V8 engine
    'CT5-V Blackwing': 'Gasoline',  # 6.2L supercharged V8 engine
    'E-Class E 400 4MATIC': 'Gasoline',  # 3.0L V6 engine
    'e-Golf SE': 'Electric',  # Fully electric
    'EQS 450 4MATIC': 'Electric',  # Fully electric
    'EQS 450+ Base': 'Electric',  # Fully electric
    'Escalade Sport Platinum': 'Gasoline',  # 6.2L V8 engine
    'e-tron Premium': 'Electric',  # Fully electric
    'e-tron Prestige': 'Electric',  # Fully electric
    'EV6 GT-Line': 'Electric',  # Fully electric
    'EV6 Wind': 'Electric',  # Fully electric
    'Evora 400 Base': 'Gasoline',  # 3.5L V6 engine
    'Expedition Timberline': 'Gasoline',  # 3.5L V6 engine
    'F12berlinetta Base': 'Gasoline',  # 6.3L V12 engine
    'F-150 Lariat': 'Gasoline',  # V6 or V8 engines
    'F-150 Lightning LARIAT': 'Electric',  # Fully electric
    'F-150 Lightning XLT': 'Electric',  # Fully electric
    'F-150 SVT Raptor': 'Gasoline',  # 3.5L twin-turbo V6
    'F-150 XLT': 'Gasoline',  # V6 or V8 engines
    'F-250 King Ranch': 'Diesel',  # 6.7L Power Stroke turbo diesel
    'Flying Spur V8': 'Gasoline',  # 4.0L twin-turbo V8 engine
    'Forte LXS': 'Gasoline',  # 2.0L inline-four engine
    'ForTwo Pure': 'Electric',  # Fully electric
    'F-PACE S': 'Gasoline',  # 3.0L supercharged V6 engine
    'F-TYPE R': 'Gasoline',  # 5.0L supercharged V8 engine
    'G8 GT': 'Gasoline',  # 6.0L V8 engine
    'G90 3.3T Premium': 'Gasoline',  # 3.3L twin-turbo V6 engine
    'Gallardo LP550-2': 'Gasoline',  # 5.2L V10 engine
    'Gladiator Mojave': 'Gasoline',  # 3.6L V6 engine
    'GLS 450 Base 4MATIC': 'Gasoline',  # 3.0L V6 engine
    'Grecale Modena': 'Gasoline',  # 3.0L twin-turbo V6 engine
    'GT-R Premium': 'Gasoline',  # 3.8L twin-turbo V6 engine
    'GV70 3.5T Sport': 'Gasoline',  # 3.5L twin-turbo V6 engine
    'Hardtop Cooper': 'Gasoline',  # 1.5L turbocharged I3 engine
    'HUMMER EV Edition 1': 'Electric',  # Fully electric
    'i3 120Ah w/Range Extender': 'Hybrid',  # Plug-in hybrid
    'i3 94 Ah': 'Electric',  # Fully electric
    'i3 Base': 'Electric',  # Fully electric
    'i3 Base w/Range Extender': 'Hybrid',  # Plug-in hybrid
    'ID.4 Pro S': 'Electric',  # Fully electric
    'ILX Technology Plus Package': 'Gasoline',  # 2.4L inline-four engine
    'IONIQ 5 SE': 'Electric',  # Fully electric
    'K5 GT-Line': 'Gasoline',  # 1.6L turbocharged I4 engine
    'Kona EV SEL': 'Electric',  # Fully electric
    'Lancer DE': 'Gasoline',  # 2.0L inline-four engine
    'Leaf S': 'Electric',  # Fully electric
    'Leaf SL': 'Electric',  # Fully electric
    'Leaf SV PLUS': 'Electric',  # Fully electric
    'LYRIQ Luxury': 'Electric',  # Fully electric
    'M3 Competition xDrive': 'Gasoline',  # 3.0L twin-turbo I6 engine
    'M3 CS': 'Gasoline',  # 3.0L twin-turbo I6 engine
    'M340 i xDrive': 'Gasoline',  # 3.0L I6 engine
    'M440 i': 'Gasoline',  # 3.0L turbo I6 engine
    'M8 Competition': 'Gasoline',  # 4.4L twin-turbo V8 engine
    'Macan S': 'Gasoline',  # 3.0L turbo V6 engine
    'MDX 3.5L w/Advance & Entertainment Pkgs': 'Gasoline',  # 3.5L V6 engine
    'MDX w/Technology Package': 'Gasoline',  # 3.5L V6 engine
    'Mirai Base': 'Hydrogen',  # Hydrogen fuel cell
    'Mirai Limited': 'Hydrogen',  # Hydrogen fuel cell
    'Model 3': 'Electric',  # Fully electric
    'Model 3 Base': 'Electric',  # Fully electric
    'Model 3 Long Range': 'Electric',
    'Model 3 Mid Range': 'Electric',
    'Model 3 Performance': 'Electric',
    'Model 3 Standard Range': 'Electric',
    'Model 3 Standard Range Plus': 'Electric',
    'Model S 100D': 'Electric',
    'Model S 70D': 'Electric',
    'Model S 75D': 'Electric',
    'Model S 85': 'Electric',
    'Model S 85D': 'Electric',
    'Model S 90D': 'Electric',
    'Model S Long Range': 'Electric',
    'Model S Long Range Plus': 'Electric',
    'Model S P100D': 'Electric',
    'Model S Performance': 'Electric',
    'Model S Plaid': 'Electric',
    'Model X 100D': 'Electric',
    'Model X 75D': 'Electric',
    'Model X Base': 'Electric',
    'Model X Long Range': 'Electric',
    'Model X Long Range Plus': 'Electric',
    'Model X P100D': 'Electric',
    'Model X P90D': 'Electric',
    'Model X Performance': 'Electric',
    'Model X Plaid': 'Electric',
    'Model Y Long Range': 'Electric',
    'Model Y Performance': 'Electric',
    'Mustang Mach-E California Route 1': 'Electric',
    'Mustang Mach-E GT': 'Electric',
    'Mustang Mach-E Premium': 'Electric',
    'Mustang Mach-E Select': 'Electric',
    'Niro EV EX': 'Electric',
    'Niro Plug-In Hybrid EX Premium': 'Hybrid',
    'NV200 SV': 'Gasoline',
    'Pacifica Launch Edition': 'Hybrid',
    'Palisade Limited': 'Gasoline',
    'Panamera Base': 'Gasoline',
    'Passat 2.0T R-Line': 'Gasoline',
    'Passat 2.0T SE': 'Gasoline',
    'Prius v Three': 'Hybrid',
    'Q3 45 S line Premium': 'Gasoline',
    'Q3 45 S line Premium Plus': 'Gasoline',
    'Q4 e-tron 50 Premium Plus': 'Electric',
    'Q4 e-tron Sportback Premium': 'Electric',
    'Q5 2.0T Premium Plus': 'Gasoline',
    'Q5 40 Premium': 'Gasoline',
    'Q5 S line Premium Plus': 'Gasoline',
    'Q7 3.0T Prestige': 'Gasoline',
    'Q7 Premium Plus': 'Gasoline',
    'Q8 55 Premium': 'Gasoline',
    'Q8 55 Premium Plus': 'Gasoline',
    'QX60 Base': 'Gasoline',
    'R1S Adventure Package': 'Electric',
    'R1S Launch Edition': 'Electric',
    'R1T Launch Edition': 'Electric',
    'R8 5.2': 'Gasoline',
    'R8 5.2 V10 plus': 'Gasoline',
    'RAV4 Base': 'Gasoline',
    'RAV4 TRD Off Road': 'Gasoline',
    'Revero Base': 'Hybrid',
    'Rogue SL': 'Gasoline',
    'Romeo Giulia Quadrifoglio': 'Gasoline',
    'Rover Range Rover Evoque S': 'Gasoline',
    'Rover Range Rover P530 SE LWB 7 Seat': 'Gasoline',
    'Rover Range Rover Sport Supercharged': 'Gasoline',
    'Rover Range Rover Velar SVAutobiography Dynamic Edition': 'Gasoline',
    'RS 3 2.5T': 'Gasoline',
    'S5 3.0T Premium Plus': 'Gasoline',
    'S5 3.0T Prestige': 'Gasoline',
    'S90 T5 Momentum': 'Gasoline',
    'S-Class S 560 4MATIC': 'Gasoline',
    'S-Class S 63 AMG': 'Gasoline',
    'Sentra SR': 'Gasoline',
    'Sierra 1500 Denali': 'Gasoline',
    'Solstice GXP': 'Gasoline',
    'Sonata Hybrid Limited': 'Hybrid',
    'Sorento EX': 'Gasoline',
    'Tahoe LTZ': 'Gasoline',
    'Taurus SHO': 'Gasoline',
    'Taycan': 'Electric',
    'Taycan 4S': 'Electric',
    'Taycan Base': 'Electric',
    'Taycan Turbo': 'Electric',
    'Titan SV': 'Gasoline',
    'TLX Type S w/Performance Tire': 'Gasoline',
    'Transit Connect XLT': 'Gasoline',
    'Tundra Hybrid TRD Pro': 'Hybrid',
    'Tundra Limited': 'Gasoline',
    'Veloster Turbo R-Spec': 'Gasoline',
    'Wagoneer Series III': 'Gasoline',
    'Wrangler 80th Anniversary': 'Gasoline',
    'Wrangler Unlimited Rubicon': 'Gasoline',
}


# Fill missing fuel_type values with the fuel type looked up from the model name
for df in (train, test):
    fuel_from_model = df['model'].map(fuel_type_mapping)
    df['fuel_type'] = df['fuel_type'].fillna(fuel_from_model)

# Preview both frames after the fill, separated by a rule.
print(train.head(), '======================================', test.head(), sep='\n')

# Feature extraction from 'engine' column
def extract_hp(engine):
    """Return the number written directly before ``HP`` in the engine text.

    The argument is stringified first, so non-string input (e.g. NaN) is
    accepted. Returns a float, or NaN when no ``<number>HP`` token is found.
    """
    found = re.search(r'(\d+(\.\d+)?)HP', str(engine))
    if found is None:
        return np.nan
    return float(found.group(1))

def extract_displacement(engine):
    """Return the decimal displacement written as ``<x.y>L`` or ``<x.y> Liter``.

    Only values with a decimal point are recognised. Returns a float, or NaN
    when neither form appears in the stringified input.
    """
    found = re.search(r'(\d+\.\d+)L|(\d+\.\d+) Liter', str(engine))
    if not found:
        return np.nan
    # Exactly one of the two alternatives captured the number.
    litres = found.group(1) or found.group(2)
    return float(litres)

def extract_engine_type(engine):
    """Return the first layout token (``V<n>``, ``I<n>``, ``Flat <n>``, ``Straight <n>``).

    Falls back to ``'Unknown'`` when the stringified input has no such token.
    """
    found = re.search(r'(V\d+|I\d+|Flat \d+|Straight \d+)', str(engine))
    if found is None:
        return 'Unknown'
    return found.group(1)

def extract_cylinder_count(engine):
    """Return the integer written before `` Cylinder`` in the engine text.

    Returns NaN when the stringified input has no ``<n> Cylinder`` phrase.
    """
    found = re.search(r'(\d+) Cylinder', str(engine))
    if found is None:
        return np.nan
    return int(found.group(1))

def extract_fuel_type_engine(engine):
    """Return the first fuel keyword found in the engine text.

    Keywords are tried in a fixed order (Gasoline, Diesel, Electric, Hybrid,
    Flex Fuel), so text containing several of them resolves to the earliest
    in that order. Returns ``'Unknown'`` when none is present.
    """
    text = str(engine)
    candidates = ('Gasoline', 'Diesel', 'Electric', 'Hybrid', 'Flex Fuel')
    return next((label for label in candidates if label in text), 'Unknown')

# Derive one feature column per extractor from the raw 'engine' text.
for df in (train, test):
    for feature_name, extractor in (
        ('Horsepower', extract_hp),
        ('Displacement', extract_displacement),
        ('Engine_Type', extract_engine_type),
        ('Cylinder_Count', extract_cylinder_count),
        ('Fuel_Type_Engine', extract_fuel_type_engine),
    ):
        df[feature_name] = df['engine'].apply(extractor)

# Impute missing 'Horsepower' and 'Cylinder_Count' values using KNNImputer
impute_cols = ['Horsepower', 'Cylinder_Count']
imputer = KNNImputer(n_neighbors=5)

# Fit on train and test together for consistent imputation. Only the two
# columns being imputed are stacked, so the other columns are never copied.
combined = pd.concat([train[impute_cols], test[impute_cols]], ignore_index=True)
imputed = imputer.fit_transform(combined)

# Split back by row position. This avoids depending on 'price' being non-null
# in every train row, and on index-aligned assignment over a stacked frame
# whose index labels repeat between train and test.
n_train = len(train)
train[impute_cols] = imputed[:n_train]
test[impute_cols] = imputed[n_train:]

# Drop 'engine' and 'model' columns
train.drop(columns=['engine', 'model'], inplace=True, errors='ignore')
test.drop(columns=['engine', 'model'], inplace=True, errors='ignore')

# Clear memory
del combined, imputed
gc.collect()

# Encode 'accident' and 'clean_title' as 0/1.
# .map() is used instead of .replace(): replacing strings with ints relies on
# pandas silently downcasting the object-dtype result, which is deprecated and
# otherwise leaves object columns. It also matches how 'transmission' is
# encoded further down.
# Note: a value missing from a mapping becomes NaN rather than staying a string.
accident_encoding = {'At least 1 accident or damage reported': 1, 'None reported': 0}
clean_title_encoding = {'Yes': 1, 'No': 0}
for df in (train, test):
    df['accident'] = df['accident'].map(accident_encoding)
    df['clean_title'] = df['clean_title'].map(clean_title_encoding)

# Process 'transmission' column
# Lookup from the stripped, lower-cased transmission text to one of the coarse
# labels 'Automatic', 'Tiptronic', 'Manual', 'Variator' or 'Other'.
transmission_mapping = {
    '6-speed a/t': 'Automatic',
    '8-speed automatic': 'Automatic',
    'automatic': 'Automatic',
    '7-speed a/t': 'Automatic',
    'a/t': 'Automatic',
    'transmission w/dual shift mode': 'Automatic',
    '9-speed automatic': 'Automatic',
    '10-speed automatic': 'Automatic',
    '1-speed a/t': 'Automatic',
    '2-speed a/t': 'Automatic',
    '2-speed automatic': 'Automatic',
    '4-speed a/t': 'Automatic',
    '5-speed automatic': 'Automatic',
    '4-speed automatic': 'Automatic',
    '6-speed automatic': 'Automatic',
    '9-speed a/t': 'Automatic',
    '10-speed a/t': 'Automatic',
    '7-speed automatic': 'Automatic',
    '6-speed electronically controlled automatic with o': 'Automatic',
    'single-speed fixed gear': 'Automatic',
    '7-speed dct automatic': 'Automatic',
    '10-speed automatic with overdrive': 'Automatic',
    'automatic, 9-spd 9g-tronic': 'Automatic',
    'automatic, 8-spd': 'Automatic',
    'automatic, 8-spd sport w/sport & manual modes': 'Automatic',
    'automatic, 8-spd pdk dual-clutch': 'Automatic',
    'automatic, 8-spd m steptronic w/drivelogic, sport & manual modes': 'Automatic',
    'automatic, 8-spd dual-clutch': 'Automatic',
    'transmission overdrive switch': 'Automatic',

    '7-speed automatic with auto-shift': 'Tiptronic',
    '5-speed a/t': 'Tiptronic',
    '7-speed a/t tiptronic': 'Tiptronic',
    '8-speed at': 'Tiptronic',
    # '8-speed a/t' used to be listed twice: as 'Automatic' in the group above
    # and as 'Tiptronic' here. A dict literal keeps the last value for a
    # repeated key, so the effective mapping was always 'Tiptronic'; the
    # shadowed 'Automatic' entry has been removed.
    # NOTE(review): confirm 'Tiptronic' is the intended category, since the
    # other "<n>-speed a/t" keys, apart from '5-speed a/t', map to 'Automatic'.
    '8-speed a/t': 'Tiptronic',

    '6-speed m/t': 'Manual',
    '7-speed m/t': 'Manual',
    '6-speed manual': 'Manual',
    '5-speed m/t': 'Manual',
    'manual': 'Manual',
    '7-speed manual': 'Manual',
    '8-speed manual': 'Manual',
    'm/t': 'Manual',
    '6 speed at/mt': 'Manual',
    '6 speed mt': 'Manual',

    'automatic cvt': 'Variator',
    'cvt transmission': 'Variator',
    'cvt-f': 'Variator',

    'variable': 'Variator',
    'f': 'Other',
    '7-speed': 'Other',
    '6-speed': 'Other',
    '2': 'Other',
    '–': 'Other',
    'scheduled for or in production': 'Other'
}

def simplify_transmission(transmission):
    """Collapse a transmission description to one of five coarse labels.

    The keywords 'Automatic', 'Manual', 'Tiptronic' and 'Variator' are searched
    for in that order, ignoring case, and the first hit is returned. Matching
    is case-insensitive because callers lower-case the column before applying
    this function; a case-sensitive test would only ever recognise values that
    were already rewritten to a capitalised label.

    Args:
        transmission: Transmission text; may be a non-string such as NaN.

    Returns:
        'Automatic', 'Manual', 'Tiptronic', 'Variator' or 'Other'. Non-string
        input and text without any keyword yield 'Other'.
    """
    if not isinstance(transmission, str):
        # Missing values arrive as float NaN; `'x' in nan` would raise TypeError.
        return 'Other'
    text = transmission.lower()
    for label in ('Automatic', 'Manual', 'Tiptronic', 'Variator'):
        if label.lower() in text:
            return label
    return 'Other'

# Normalise the raw text, translate known values through the lookup table,
# then collapse whatever remains to a coarse label.
for df in (train, test):
    df['transmission'] = (
        df['transmission']
        .str.strip()
        .str.lower()
        .replace(transmission_mapping)
        .apply(simplify_transmission)
    )

# Encode 'transmission' numerically
transmission_encoding = {'Automatic': 1, 'Tiptronic': 2, 'Manual': 3, 'Variator': 4, 'Other': 5}
for df in (train, test):
    df["transmission"] = df["transmission"].map(transmission_encoding)

for df in (train, test):
    # Drop 'Fuel Type' column (a no-op when the column is absent)
    df.drop('Fuel Type', axis=1, inplace=True, errors='ignore')

    # Impute missing values in 'Displacement' with that frame's own mean.
    # The result is assigned back: fillna(inplace=True) on df['Displacement']
    # is a chained in-place update that leaves the DataFrame unchanged under
    # pandas Copy-on-Write.
    df['Displacement'] = df['Displacement'].fillna(df['Displacement'].mean())

# Encode categorical variables as integer codes.
# Each encoder is fitted on train and test together, so a label that occurs
# only in the test set still has a code.
categorical_columns = ['brand', 'fuel_type', 'ext_col', 'int_col', 'Engine_Type', 'Fuel_Type_Engine']

for col in categorical_columns:
    train_labels = train[col].astype(str)
    test_labels = test[col].astype(str)
    le = LabelEncoder()
    le.fit(pd.concat([train_labels, test_labels], axis=0))
    train[col] = le.transform(train_labels)
    test[col] = le.transform(test_labels)

# Scale numerical features: statistics are learned on train, then applied to both frames
numerical_features = ['Horsepower', 'Displacement', 'Cylinder_Count', 'milage', 'model_year']

scaler = StandardScaler()
scaler.fit(train[numerical_features])
train[numerical_features] = scaler.transform(train[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])

# Clear memory
gc.collect()

# Prepare data for modeling: 'price' is the target, and 'id' is dropped from the features
y = train['price']
X = train.drop(['id', 'price'], axis=1)
X_test = test.drop(['id'], axis=1)

# Set up cross-validation: 5 shuffled folds with a fixed seed
folds = KFold(shuffle=True, n_splits=5, random_state=42)

In [None]:
# Preview the modelling inputs. Each head() is printed explicitly: a bare
# expression is only displayed when it is the last statement of a cell, so the
# original mid-cell `X.head()` calls produced no output.
print('===========================================================================')
print(X.head())
print('===========================================================================')
print(y.head())
print('===========================================================================')
print(X_test.head())
print('===========================================================================')
print(folds)
print('===========================================================================')

In [None]:
# Function to optimize and train models with Optuna
def optimize_model(model_name, X, y, n_trials=10):
    """Tune one gradient-boosting regressor with Optuna and return its parameters.

    Each trial samples a hyperparameter set, trains the model on every split
    of the module-level ``folds`` splitter, and is scored by the RMSE of the
    resulting out-of-fold predictions.

    Args:
        model_name: ``'catboost'``, ``'lightgbm'`` or ``'xgboost'``.
        X: Feature DataFrame (rows are selected positionally with ``.iloc``).
        y: Target Series aligned with ``X``.
        n_trials: Number of Optuna trials to run.

    Returns:
        dict: the fixed settings used during tuning (objective, seed,
        verbosity, ...) merged with the best sampled hyperparameters, so the
        result can be passed straight to the model constructor and reproduces
        the tuned configuration.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Settings that are not tuned. They are part of the returned dict;
    # returning only ``study.best_params`` would silently drop them.
    fixed_params = {
        'catboost': {
            'eval_metric': 'RMSE',
            'loss_function': 'RMSE',
            'verbose': False,
            'task_type': 'CPU',
            'random_seed': 42,
        },
        'lightgbm': {
            'objective': 'regression',
            'metric': 'rmse',
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'random_state': 42,
        },
        'xgboost': {
            'objective': 'reg:squarederror',
            'random_state': 42,
            'tree_method': 'hist',
            'predictor': 'cpu_predictor',
        },
    }
    if model_name not in fixed_params:
        raise ValueError(f"Unknown model_name: {model_name!r}")

    def suggest_params(trial):
        """Sample the tunable hyperparameters for ``model_name``."""
        if model_name == 'catboost':
            return {
                'iterations': trial.suggest_int('iterations', 1000, 3000),
                'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
                'depth': trial.suggest_int('depth', 4, 10),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10, log=True),
                'random_strength': trial.suggest_float('random_strength', 0, 10),
                'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10),
            }
        if model_name == 'lightgbm':
            # NOTE(review): LightGBM appears to apply `subsample` only when
            # `subsample_freq` > 0; with the default this parameter may have
            # no effect. Confirm against the LightGBM docs.
            return {
                'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),
                'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
                'num_leaves': trial.suggest_int('num_leaves', 31, 256),
                'max_depth': trial.suggest_int('max_depth', 4, 12),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10, log=True),
            }
        return {
            'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
            'max_depth': trial.suggest_int('max_depth', 4, 12),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 1e-4, 10, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10, log=True),
        }

    def fit_model(params, X_train, y_train, X_val, y_val):
        """Fit one model of the requested family on a single fold."""
        if model_name == 'catboost':
            model = CatBoostRegressor(**params)
            model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=False)
        elif model_name == 'lightgbm':
            model = LGBMRegressor(**params)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
        else:
            # Early stopping is configured on the estimator: newer XGBoost
            # releases no longer accept it as a fit() argument.
            model = XGBRegressor(early_stopping_rounds=100, **params)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        return model

    def objective(trial):
        params = {**fixed_params[model_name], **suggest_params(trial)}
        oof_preds = np.zeros(len(X))
        for train_idx, val_idx in folds.split(X, y):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
            model = fit_model(params, X_train, y_train, X_val, y_val)
            oof_preds[val_idx] = model.predict(X_val)
        return np.sqrt(mean_squared_error(y, oof_preds))

    # Seed the sampler so the search is repeatable, like the models themselves.
    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials)
    return {**fixed_params[model_name], **study.best_params}

# Optimize models
best_params_cat = optimize_model('catboost', X, y)
best_params_lgb = optimize_model('lightgbm', X, y)
best_params_xgb = optimize_model('xgboost', X, y)

# Optimize Neural Network with Optuna
def optimize_nn(X, y, n_trials=2):
    """Tune the dense network's hyperparameters with Optuna.

    Each trial samples an architecture and training setup, trains a freshly
    built network on every split of the module-level ``folds`` splitter, and
    is scored by the RMSE of the out-of-fold predictions.

    A new model is built for every fold. Reusing one model across folds keeps
    training the same weights, so later folds would be validated on rows the
    network has already been fitted on and the score would be optimistic.

    Args:
        X: Feature DataFrame; converted to a float32 array once.
        y: Target Series aligned with ``X``.
        n_trials: Number of Optuna trials to run.

    Returns:
        dict: the best trial's parameters (``num_layers``, ``units``,
        ``activation``, ``learning_rate``, ``batch_size``, ``dropout_rate``).
    """
    # Convert once; every fold slices these arrays positionally.
    X_np = X.values.astype(np.float32)
    y_np = np.asarray(y)

    def build_model(num_layers, units, activation, dropout_rate, learning_rate):
        """Build and compile a stack of Dense -> BatchNorm -> Dropout blocks."""
        inputs = Input(shape=(X_np.shape[1],))
        x = inputs
        for _ in range(num_layers):
            x = Dense(units, activation=activation)(x)
            x = BatchNormalization()(x)
            x = Dropout(dropout_rate)(x)
        outputs = Dense(1)(x)
        model = Model(inputs, outputs)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mean_squared_error')
        return model

    def objective(trial):
        # Hyperparameters to tune
        num_layers = trial.suggest_int('num_layers', 2, 5)
        units = trial.suggest_int('units', 128, 512)
        activation = trial.suggest_categorical('activation', ['relu', 'elu', 'selu'])
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
        batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])
        dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.2)

        oof_preds = np.zeros(len(X))
        for train_idx, val_idx in folds.split(X, y):
            # Fresh session and fresh weights for every fold.
            K.clear_session()
            model = build_model(num_layers, units, activation, dropout_rate, learning_rate)
            model.fit(
                X_np[train_idx], y_np[train_idx],
                validation_data=(X_np[val_idx], y_np[val_idx]),
                epochs=100,
                batch_size=batch_size,
                verbose=0,
                callbacks=[EarlyStopping(patience=10, restore_best_weights=True)],
            )
            oof_preds[val_idx] = model.predict(X_np[val_idx]).flatten()
        return np.sqrt(mean_squared_error(y, oof_preds))

    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

best_params_nn = optimize_nn(X, y)

# Retrain models with best parameters and get out-of-fold predictions
def train_and_predict(model_name, X, y, X_test, params):
    """Cross-validate one model family and average its test predictions.

    For every split of the module-level ``folds`` splitter a model is fitted
    on the training part, used to predict the held-out part (out-of-fold
    predictions) and to predict ``X_test``. The test predictions are averaged
    over the folds. The overall out-of-fold RMSE is printed.

    Args:
        model_name: ``'catboost'``, ``'lightgbm'`` or ``'xgboost'``.
        X: Training features (rows are selected positionally with ``.iloc``).
        y: Training target aligned with ``X``.
        X_test: Test features with the same columns as ``X``.
        params: Keyword arguments for the model constructor.

    Returns:
        tuple: ``(oof_preds, test_preds)`` as 1-D NumPy arrays of lengths
        ``len(X)`` and ``len(X_test)``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name not in ('catboost', 'lightgbm', 'xgboost'):
        # Without this check `model` would be undefined, or left over from an
        # earlier call at notebook scope.
        raise ValueError(f"Unknown model_name: {model_name!r}")

    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))
    for train_idx, val_idx in folds.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        if model_name == 'catboost':
            model = CatBoostRegressor(**params)
            model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=False)
        elif model_name == 'lightgbm':
            model = LGBMRegressor(**params)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
        else:
            # Early stopping is configured on the estimator: newer XGBoost
            # releases no longer accept it as a fit() argument. A value already
            # present in `params` takes precedence over the default of 100.
            model = XGBRegressor(**{'early_stopping_rounds': 100, **params})
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        oof_preds[val_idx] = model.predict(X_val)
        test_preds += model.predict(X_test) / folds.n_splits
    rmse = np.sqrt(mean_squared_error(y, oof_preds))
    print(f"{model_name} CV RMSE: {rmse}")
    return oof_preds, test_preds

oof_preds_cat, test_preds_cat = train_and_predict('catboost', X, y, X_test, best_params_cat)
oof_preds_lgb, test_preds_lgb = train_and_predict('lightgbm', X, y, X_test, best_params_lgb)
oof_preds_xgb, test_preds_xgb = train_and_predict('xgboost', X, y, X_test, best_params_xgb)

# Neural Network training
def build_nn_model(params):
    """Build and compile the dense regression network described by ``params``.

    The input width is taken from the module-level ``X``. ``params`` must
    provide ``num_layers``, ``units``, ``activation``, ``dropout_rate`` and
    ``learning_rate``. Each hidden block is Dense -> BatchNormalization ->
    Dropout, followed by a single linear output unit; the model is compiled
    with Adam and mean squared error.
    """
    inputs = Input(shape=(X.shape[1],))
    hidden = inputs
    for _ in range(params['num_layers']):
        hidden = Dense(params['units'], activation=params['activation'])(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(params['dropout_rate'])(hidden)
    outputs = Dense(1)(hidden)
    network = Model(inputs, outputs)
    optimizer = tf.keras.optimizers.Adam(params['learning_rate'])
    network.compile(optimizer=optimizer, loss='mean_squared_error')
    return network

# Cross-validated training of the tuned network: collect out-of-fold
# predictions and average the test predictions over the folds.
oof_preds_nn = np.zeros(len(X))
test_preds_nn = np.zeros(len(X_test))
# The test matrix is identical for every fold, so it is converted once.
X_test_np = X_test.values.astype(np.float32)
for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
    K.clear_session()
    model = build_nn_model(best_params_nn)
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    X_train_np = X_train.values.astype(np.float32)
    X_val_np = X_val.values.astype(np.float32)
    early_stop = EarlyStopping(patience=10, restore_best_weights=True)
    model.fit(
        X_train_np, y_train,
        validation_data=(X_val_np, y_val),
        epochs=100,
        batch_size=best_params_nn['batch_size'],
        callbacks=[early_stop],
        verbose=0,
    )
    oof_preds_nn[val_idx] = model.predict(X_val_np).flatten()
    test_preds_nn += model.predict(X_test_np).flatten() / folds.n_splits
rmse_nn = np.sqrt(mean_squared_error(y, oof_preds_nn))
print(f"Neural Network CV RMSE: {rmse_nn}")

# Create meta-features for stacking: one column per base model, holding its
# out-of-fold predictions (train) or fold-averaged predictions (test).
meta_columns = ['catboost', 'lightgbm', 'xgboost', 'nn']
X_meta = pd.DataFrame(dict(zip(
    meta_columns,
    (oof_preds_cat, oof_preds_lgb, oof_preds_xgb, oof_preds_nn),
)))
X_test_meta = pd.DataFrame(dict(zip(
    meta_columns,
    (test_preds_cat, test_preds_lgb, test_preds_xgb, test_preds_nn),
)))

# Optimize meta-model with Optuna
def optimize_meta(X_meta, y, n_trials=2):
    """Tune the Ridge stacker's ``alpha`` with Optuna.

    Each trial samples ``alpha`` on a log scale from [1e-4, 10], fits a Ridge
    model on every split of the module-level ``folds`` splitter, and is scored
    by the RMSE of the out-of-fold predictions.

    Args:
        X_meta: DataFrame of base-model predictions, one column per model.
        y: Target Series aligned with ``X_meta``.
        n_trials: Number of Optuna trials to run.

    Returns:
        dict: the best trial's parameters, i.e. ``{'alpha': ...}``.
    """
    def objective(trial):
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        alpha = trial.suggest_float('alpha', 1e-4, 10, log=True)
        oof_meta = np.zeros(len(y))
        for train_idx, val_idx in folds.split(X_meta, y):
            model_meta = Ridge(alpha=alpha)
            model_meta.fit(X_meta.iloc[train_idx], y.iloc[train_idx])
            oof_meta[val_idx] = model_meta.predict(X_meta.iloc[val_idx])
        return np.sqrt(mean_squared_error(y, oof_meta))

    # Seed the sampler so the search is repeatable.
    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

best_params_meta = optimize_meta(X_meta, y)
best_alpha = best_params_meta['alpha']

# Score the meta-model on out-of-fold predictions. Predicting X_meta with a
# model fitted on that same X_meta gives an in-sample error, not a
# cross-validated one, so the reported "CV RMSE" would be too optimistic.
oof_meta = np.zeros(len(y))
for train_idx, val_idx in folds.split(X_meta, y):
    fold_meta = Ridge(alpha=best_alpha)
    fold_meta.fit(X_meta.iloc[train_idx], y.iloc[train_idx])
    oof_meta[val_idx] = fold_meta.predict(X_meta.iloc[val_idx])
rmse_meta = np.sqrt(mean_squared_error(y, oof_meta))
print(f"Meta-model CV RMSE: {rmse_meta}")

# Train final meta-model on all rows and predict the test set
model_meta = Ridge(alpha=best_alpha)
model_meta.fit(X_meta, y)
final_predictions = model_meta.predict(X_test_meta)

In [None]:
# Create submission file: the test ids paired with the stacked predictions
submission = test[['id']].assign(price=final_predictions)
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")
submission.head()
