# Regression of Used Car Prices

## Reported metric changed from `log1p` RMSE to raw-scale `RMSE` for both Keras and Keras Tuner (the training loss is still MSE on the `log1p` target)

## Keras - 1 Attempt

## Attempt 1 

In [None]:

# Reproducibility
import os, random, time, json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# One shared seed for the Python, NumPy and TensorFlow random generators.
seed = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(seed)

# Read the competition files. A single row of the sample submission is enough
# because only its column names are used below.
train_df = pd.read_csv('playground-series-s4e9/train.csv.zip')
test_df = pd.read_csv('playground-series-s4e9/test.csv.zip')
sample_sub = pd.read_csv('playground-series-s4e9/sample_submission.csv.zip', nrows=1)

# The submission layout defines the schema: first column is the ID,
# every remaining column is a target.
cols = list(sample_sub.columns)
id_col, target_columns = cols[0], cols[1:]

# Work on a copy so train_df itself stays untouched.
df = train_df.copy()

# Targets as floats; log1p-compress them only when none of them is negative.
y_values = df[target_columns].astype(float).values
y_enc = np.log1p(y_values) if np.all(y_values >= 0) else y_values

# Everything that is neither a target nor the ID is a feature.
X = df.drop(columns=target_columns + [id_col], errors='ignore')

# 80/20 hold-out split, reproducible through the shared seed.
X_train, X_val, y_train, y_val = train_test_split(X, y_enc, test_size=0.2, random_state=seed)

# Defensive pass: remove the ID column from either split if it is still present.
for df_ in (X_train, X_val):
    if id_col not in df_.columns:
        continue
    df_.drop(columns=[id_col], inplace=True)

# Columns that contain nothing but NaN in the training split carry no signal;
# remove them from both splits.
all_missing = X_train.columns[X_train.isna().all().to_numpy()].tolist()
for _split in (X_train, X_val):
    _split.drop(columns=all_missing, inplace=True)

# Partition the features by dtype. Only categoricals with at most 50 distinct
# training values are kept for one-hot encoding.
numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(X_train.select_dtypes(include=['object', 'category']).columns)
_n_levels = X_train[cat_features].nunique()
low_cardinality = [c for c in cat_features if _n_levels[c] <= 50]

# Numeric columns: median imputation followed by standardisation.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical columns: mode imputation followed by dense one-hot encoding;
# categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])
# Columns not listed in either group are not passed to the model.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', cat_pipeline, low_cardinality),
])

# Learn the preprocessing on the training split only, then apply it to validation.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# Model architecture determination
# The network shape is derived from the size of the preprocessed training matrix.
n_samples, n_features = X_train_proc.shape
n_targets = y_train.shape[1] if y_train.ndim > 1 else 1

if n_samples < 10000 or n_features < 100:
    # Few rows or few features: two modest layers, no batch normalisation.
    layer_sizes = [min(n_features * 2, 128), min(n_features, 64)]
    use_bn = False
    drop_rates = [0.3, 0.3]
else:
    # Larger problems: a funnel of up to four layers scaled from the feature
    # count, capped at 1024 units; layers narrower than 16 units are skipped.
    sizes = [n_features * i for i in (2, 1, 0.5, 0.25)]
    layer_sizes = [int(min(s, 1024)) for s in sizes if min(s, 1024) >= 16]
    use_bn = True
    drop_rates = [0.4] * len(layer_sizes)

# Build model
model = Sequential()
# Declare the input shape with an explicit Input layer. Passing `input_dim=` to
# the first Dense layer is deprecated and emits a warning in current Keras.
model.add(tf.keras.Input(shape=(n_features,)))
for size, rate in zip(layer_sizes, drop_rates):
    model.add(Dense(size, activation='relu'))
    if use_bn:
        model.add(BatchNormalization())
    model.add(Dropout(rate))
# Linear head: one unit per regression target.
model.add(Dense(n_targets, activation='linear'))

# Upper bound applied to log-space predictions before expm1. It matches the
# clip used when the submission is built and stops early, wildly wrong
# predictions from overflowing float32 to inf in the reported metrics.
_MAX_LOG_PRED = 20.0


def _to_real_scale(y_true_log, y_pred_log):
    """Map log1p-space targets and predictions back to the original scale.

    Predictions are capped at ``_MAX_LOG_PRED`` before ``expm1``; targets are
    converted unchanged.
    """
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(tf.minimum(y_pred_log, _MAX_LOG_PRED))
    return y_true, y_pred


def mse_real(y_true_log, y_pred_log):
    """Mean squared error on the original scale for one batch.

    Both arguments are expected in log1p space, i.e. this metric is only
    meaningful when the target was log1p-encoded.
    """
    y_true, y_pred = _to_real_scale(y_true_log, y_pred_log)
    return tf.reduce_mean(tf.square(y_true - y_pred))


def rmse_real(y_true_log, y_pred_log):
    """Root mean squared error on the original scale for one batch.

    NOTE: this is a per-batch value. If Keras averages it over the batches of
    an epoch, the logged figure is a mean of batch RMSEs and will generally be
    lower than sqrt of the logged ``mse_real``.
    """
    y_true, y_pred = _to_real_scale(y_true_log, y_pred_log)
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# Optimise MSE on the encoded target; the two custom metrics report the error
# after mapping back through expm1.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=[rmse_real, mse_real])

# Stop after 10 epochs without improvement (restoring the best weights) and
# keep the best model seen so far on disk.
_early_stop = EarlyStopping(patience=10, restore_best_weights=True, verbose=1)
_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, verbose=1)
callbacks = [_early_stop, _checkpoint]

_fit_options = dict(
    validation_data=(X_val_proc, y_val),
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
    verbose=2,
)

# Time only the fit call itself.
start_time = time.time()
history = model.fit(X_train_proc, y_train, **_fit_options)
duration = time.time() - start_time

# Logging results
# EarlyStopping is configured with restore_best_weights=True and, by default,
# monitors val_loss. Report the epoch with the lowest val_loss rather than the
# last epoch, so the logged numbers describe the weights that are kept.
_history = history.history
_best_epoch = int(np.argmin(_history['val_loss']))


def _json_safe(value):
    """Return ``value`` as a float, or None when it is inf/NaN.

    json.dump would otherwise write Infinity/NaN, which is not valid JSON.
    """
    value = float(value)
    return value if np.isfinite(value) else None


# The "loss" entries hold the original-scale MSE, the "RMSE" entries its
# per-batch-averaged root counterpart.
results = {
    'training_loss': _json_safe(_history['mse_real'][_best_epoch]),
    'validation_loss': _json_safe(_history['val_mse_real'][_best_epoch]),
    'training_RMSE': _json_safe(_history['rmse_real'][_best_epoch]),
    'validation_RMSE': _json_safe(_history['val_rmse_real'][_best_epoch])
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# Prediction & Submission
# The submission needs the test IDs; fail with a clear message instead of a
# NameError further down if the column is absent.
if id_col not in test_df.columns:
    raise KeyError(f"ID column {id_col!r} is missing from the test set")
test_ids = test_df[id_col]

# Apply exactly the column removals used for training, then the fitted preprocessor.
X_test = test_df.drop(columns=target_columns + [id_col], errors='ignore')
X_test = X_test.drop(columns=all_missing, errors='ignore')
X_test_proc = preprocessor.transform(X_test)
raw_preds = model.predict(X_test_proc)

# Undo the target encoding if, and only if, it was applied: the same test on
# y_values that decided whether log1p was used. Deciding from the sign of the
# predictions would leave the whole submission in log space as soon as a
# single prediction is negative.
target_was_logged = bool(np.all(y_values >= 0))
if target_was_logged:
    # Clip to [0, 20] in log space: the lower bound keeps predictions
    # non-negative (all training targets were), the upper bound avoids overflow.
    final = np.expm1(np.clip(raw_preds, a_min=0, a_max=20))
else:
    final = raw_preds

if final.ndim == 1:
    final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 1: val_loss improved from inf to 0.75041, saving model to best_model.h5




2357/2357 - 4s - 2ms/step - loss: 12.3391 - mse_real: 71144010940416.0000 - rmse_real: 3462145.2500 - val_loss: 0.7504 - val_mse_real: 5853377024.0000 - val_rmse_real: 57668.3789
Epoch 2/100

Epoch 2: val_loss improved from 0.75041 to 0.54282, saving model to best_model.h5




2357/2357 - 4s - 2ms/step - loss: 5.8454 - mse_real: 1114419625984.0000 - rmse_real: 657254.7500 - val_loss: 0.5428 - val_mse_real: 5615367168.0000 - val_rmse_real: 55212.7969
Epoch 3/100

Epoch 3: val_loss improved from 0.54282 to 0.45569, saving model to best_model.h5




2357/2357 - 4s - 2ms/step - loss: 3.5368 - mse_real: 43524018176.0000 - rmse_real: 185697.3125 - val_loss: 0.4557 - val_mse_real: 5485157888.0000 - val_rmse_real: 53866.5781
Epoch 4/100

Epoch 4: val_loss improved from 0.45569 to 0.37745, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 2.1124 - mse_real: 13807226880.0000 - rmse_real: 103741.1250 - val_loss: 0.3774 - val_mse_real: 5388912640.0000 - val_rmse_real: 52784.5508
Epoch 5/100

Epoch 5: val_loss improved from 0.37745 to 0.33741, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 1.1412 - mse_real: 7928892928.0000 - rmse_real: 71278.3125 - val_loss: 0.3374 - val_mse_real: 5271377408.0000 - val_rmse_real: 51529.6211
Epoch 6/100

Epoch 6: val_loss improved from 0.33741 to 0.31136, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 0.5927 - mse_real: 6472240640.0000 - rmse_real: 58241.6016 - val_loss: 0.3114 - val_mse_real: 5149201920.0000 - val_rmse_real: 50217.9570
Epoch 7/100

Epoch 7: val_loss improved from 0.31136 to 0.30957, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 0.3941 - mse_real: 6081110016.0000 - rmse_real: 53577.2578 - val_loss: 0.3096 - val_mse_real: 5095639552.0000 - val_rmse_real: 49656.1680
Epoch 8/100

Epoch 8: val_loss improved from 0.30957 to 0.30434, saving model to best_model.h5




2357/2357 - 3s - 1ms/step - loss: 0.3574 - mse_real: 5993613312.0000 - rmse_real: 52478.4844 - val_loss: 0.3043 - val_mse_real: 5073657856.0000 - val_rmse_real: 49443.0898
Epoch 9/100

Epoch 9: val_loss did not improve from 0.30434
2357/2357 - 3s - 1ms/step - loss: 0.3496 - mse_real: 5966213120.0000 - rmse_real: 52144.4258 - val_loss: 0.3079 - val_mse_real: 5053559296.0000 - val_rmse_real: 49250.4297
Epoch 10/100

Epoch 10: val_loss did not improve from 0.30434
2357/2357 - 3s - 1ms/step - loss: 0.3470 - mse_real: 5961902592.0000 - rmse_real: 52032.2461 - val_loss: 0.3114 - val_mse_real: 5075383808.0000 - val_rmse_real: 49463.0547
Epoch 11/100

Epoch 11: val_loss did not improve from 0.30434
2357/2357 - 3s - 1ms/step - loss: 0.3462 - mse_real: 5953558528.0000 - rmse_real: 51985.5664 - val_loss: 0.3116 - val_mse_real: 5067457024.0000 - val_rmse_real: 49389.9883
Epoch 12/100

Epoch 12: val_loss did not improve from 0.30434
2357/2357 - 3s - 1ms/step - loss: 0.3461 - mse_real: 5956694016.00

## Keras Tuner - 1 Attempt

## Attempt 1 

In [None]:
# Reproducibility
import os, random, time, json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Make the run repeatable: the same seed feeds Python, NumPy and TensorFlow.
seed = 42
for _set_seed in (random.seed, np.random.seed, tf.random.set_seed):
    _set_seed(seed)

# Competition data. The sample submission is read for its header only,
# so one row suffices.
train_df = pd.read_csv('playground-series-s4e9/train.csv.zip')
test_df = pd.read_csv('playground-series-s4e9/test.csv.zip')
sample_sub = pd.read_csv('playground-series-s4e9/sample_submission.csv.zip', nrows=1)

# First submission column is the ID; the rest are the targets to predict.
cols = list(sample_sub.columns)
id_col, target_columns = cols[0], cols[1:]

# Copy so that later manipulation never touches train_df.
df = train_df.copy()

# Float targets, log1p-encoded unless a negative value is present.
y_values = df[target_columns].astype(float).values
y_enc = np.log1p(y_values) if np.all(y_values >= 0) else y_values

# Feature matrix: drop targets and ID (silently skipping absent columns).
X = df.drop(columns=target_columns + [id_col], errors='ignore')

# Seeded 80/20 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(X, y_enc, test_size=0.2, random_state=seed)

# Defensive pass: strip the ID column from either split if it survived.
for df_ in (X_train, X_val):
    if id_col not in df_.columns:
        continue
    df_.drop(columns=[id_col], inplace=True)

# Training columns made up entirely of NaN are removed from both splits.
all_missing = X_train.columns[X_train.isna().all().to_numpy()].tolist()
for _split in (X_train, X_val):
    _split.drop(columns=all_missing, inplace=True)

# Group features by dtype; categoricals are kept only when they have at most
# 50 distinct values in the training split.
numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(X_train.select_dtypes(include=['object', 'category']).columns)
_n_levels = X_train[cat_features].nunique()
low_cardinality = [c for c in cat_features if _n_levels[c] <= 50]

# Numeric branch: median fill, then zero-mean/unit-variance scaling.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: most-frequent fill, then dense one-hot columns; unknown
# categories at transform time are ignored.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])
# Only the two listed column groups are forwarded to the model.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', cat_pipeline, low_cardinality),
])

# Fit on the training split, reuse the fitted transformer for validation.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# Shape information used when building the network.
n_samples, n_features = X_train_proc.shape
n_targets = 1 if y_train.ndim <= 1 else y_train.shape[1]

import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

# Both callbacks watch the validation loss: one halts training after 10 stale
# epochs and restores the best weights, the other saves the best model to disk.
_monitor = {'monitor': 'val_loss'}
early_stopping = EarlyStopping(patience=10, restore_best_weights=True, **_monitor)
checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, **_monitor)

# Width of the preprocessed feature matrix, i.e. the network's input size.
_, n_features = X_train_proc.shape

# Upper bound applied to log-space predictions before expm1. It matches the
# clip used when the submission is built and stops wildly wrong predictions
# from overflowing float32 to inf in the reported metrics.
_MAX_LOG_PRED = 20.0


def _to_real_scale(y_true_log, y_pred_log):
    """Map log1p-space targets and predictions back to the original scale.

    Predictions are capped at ``_MAX_LOG_PRED`` before ``expm1``; targets are
    converted unchanged.
    """
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(tf.minimum(y_pred_log, _MAX_LOG_PRED))
    return y_true, y_pred


def mse_real(y_true_log, y_pred_log):
    """Mean squared error on the original scale for one batch.

    Both arguments are expected in log1p space, i.e. this metric is only
    meaningful when the target was log1p-encoded.
    """
    y_true, y_pred = _to_real_scale(y_true_log, y_pred_log)
    return tf.reduce_mean(tf.square(y_true - y_pred))


def rmse_real(y_true_log, y_pred_log):
    """Root mean squared error on the original scale for one batch.

    NOTE: this is a per-batch value. If Keras averages it over the batches of
    an epoch, the logged figure is a mean of batch RMSEs and will generally be
    lower than sqrt of the logged ``mse_real``.
    """
    y_true, y_pred = _to_real_scale(y_true_log, y_pred_log)
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# HyperModel
class MyHyperModel(kt.HyperModel):
    """Search space for a fully connected regression network.

    Tuned hyperparameters: number of hidden layers, units per layer (shared by
    all layers), dropout rate, optimizer name and learning rate.
    """

    # Optimizer constructors selectable through the 'optimizer' hyperparameter.
    _OPTIMIZERS = {'adam': tf.keras.optimizers.Adam}

    def build(self, hp):
        """Build and compile a model for the hyperparameter values in ``hp``.

        Reads the module-level ``n_features`` (input width) and ``n_targets``
        (output width). Returns the compiled Sequential model.
        """
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024)
        drop = hp.Float('dropout', 0.0, 0.5)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        model = Sequential()
        # Explicit Input layer instead of the deprecated `input_dim=` argument.
        model.add(Input(shape=(n_features,)))
        for _ in range(layers):
            model.add(Dense(units, activation='relu'))
            model.add(Dropout(drop))
        # One linear output per regression target, as in the plain Keras model.
        model.add(Dense(n_targets, activation='linear'))

        # Instantiate the optimizer with the sampled learning rate. Passing only
        # the optimizer name would leave 'learning_rate' without any effect on
        # the trial, so the tuner would be searching a dead dimension.
        optimizer = self._OPTIMIZERS[opt](learning_rate=lr)
        model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[rmse_real, mse_real])
        return model

# Search settings shared with the final retraining run.
bs = 32  # batch size
ep = 20  # epochs per trial

# Bayesian optimisation over MyHyperModel's search space, minimising val_loss
# across 10 trials of one execution each. overwrite=False keeps any existing
# 'bayesian_tuner' project state.
tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    overwrite=False,
    project_name='bayesian_tuner',
)

# Use the explicit hold-out set when there is one; otherwise let Keras carve
# 20% off the training data.
_search_options = dict(batch_size=bs, epochs=ep, callbacks=[early_stopping, checkpoint])
if y_val is None:
    _search_options['validation_split'] = 0.2
else:
    _search_options['validation_data'] = (X_val_proc, y_val)

tuner.search(X_train_proc, y_train, **_search_options)

# Rebuild a fresh, untrained model from the best hyperparameters found.
_best_hp = tuner.get_best_hyperparameters(1)[0]
model = tuner.hypermodel.build(_best_hp)

# Same callbacks and data as the search, but with a budget of up to 100 epochs.
_fit_options = dict(
    epochs=100,
    batch_size=bs,
    callbacks=[early_stopping, checkpoint],
    verbose=2,
)
if y_val is None:
    _fit_options['validation_split'] = 0.2
else:
    _fit_options['validation_data'] = (X_val_proc, y_val)

# Time only the retraining fit.
start_time = time.time()
history = model.fit(X_train_proc, y_train, **_fit_options)
duration = time.time() - start_time


# Logging results
# early_stopping monitors val_loss with restore_best_weights=True. Report the
# epoch with the lowest val_loss rather than the last epoch, so the logged
# numbers describe the weights that are kept.
_history = history.history
_best_epoch = int(np.argmin(_history['val_loss']))


def _json_safe(value):
    """Return ``value`` as a float, or None when it is inf/NaN.

    json.dump would otherwise write Infinity/NaN, which is not valid JSON.
    """
    value = float(value)
    return value if np.isfinite(value) else None


# The "loss" entries hold the original-scale MSE, the "RMSE" entries its
# per-batch-averaged root counterpart.
results = {
    'training_loss': _json_safe(_history['mse_real'][_best_epoch]),
    'validation_loss': _json_safe(_history['val_mse_real'][_best_epoch]),
    'training_RMSE': _json_safe(_history['rmse_real'][_best_epoch]),
    'validation_RMSE': _json_safe(_history['val_rmse_real'][_best_epoch])
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# Prediction & Submission
# The submission needs the test IDs; fail with a clear message instead of a
# NameError further down if the column is absent.
if id_col not in test_df.columns:
    raise KeyError(f"ID column {id_col!r} is missing from the test set")
test_ids = test_df[id_col]

# Apply exactly the column removals used for training, then the fitted preprocessor.
X_test = test_df.drop(columns=target_columns + [id_col], errors='ignore')
X_test = X_test.drop(columns=all_missing, errors='ignore')
X_test_proc = preprocessor.transform(X_test)
raw_preds = model.predict(X_test_proc)

# Undo the target encoding if, and only if, it was applied: the same test on
# y_values that decided whether log1p was used. Deciding from the sign of the
# predictions would leave the whole submission in log space as soon as a
# single prediction is negative.
target_was_logged = bool(np.all(y_values >= 0))
if target_was_logged:
    # Clip to [0, 20] in log space: the lower bound keeps predictions
    # non-negative (all training targets were), the upper bound avoids overflow.
    final = np.expm1(np.clip(raw_preds, a_min=0, a_max=20))
else:
    final = raw_preds

if final.ndim == 1:
    final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Trial 10 Complete [00h 06m 56s]
val_loss: 0.2960398197174072

Best val_loss So Far: 0.29548490047454834
Total elapsed time: 01h 12m 03s
Epoch 1/100




4714/4714 - 25s - 5ms/step - loss: 0.5426 - mse_real: 19172701023995166720.0000 - rmse_real: 63838284.0000 - val_loss: 0.3063 - val_mse_real: 4925497856.0000 - val_rmse_real: 43360.3750
Epoch 2/100
4714/4714 - 22s - 5ms/step - loss: 0.3697 - mse_real: 5990876160.0000 - rmse_real: 47075.2734 - val_loss: 0.3247 - val_mse_real: 4925643264.0000 - val_rmse_real: 43508.1875
Epoch 3/100
4714/4714 - 19s - 4ms/step - loss: 0.3542 - mse_real: 5944964608.0000 - rmse_real: 46533.6992 - val_loss: 0.3119 - val_mse_real: 5144035840.0000 - val_rmse_real: 44871.5352
Epoch 4/100




4714/4714 - 19s - 4ms/step - loss: 0.3371 - mse_real: 5916884992.0000 - rmse_real: 46091.7656 - val_loss: 0.3036 - val_mse_real: 5115154432.0000 - val_rmse_real: 44498.7148
Epoch 5/100




4714/4714 - 19s - 4ms/step - loss: 0.3297 - mse_real: 5901241856.0000 - rmse_real: 45843.6250 - val_loss: 0.2992 - val_mse_real: 5052323328.0000 - val_rmse_real: 43952.2188
Epoch 6/100
4714/4714 - 19s - 4ms/step - loss: 0.3282 - mse_real: 5895203328.0000 - rmse_real: 45743.3906 - val_loss: 0.2996 - val_mse_real: 5074844160.0000 - val_rmse_real: 44106.4375
Epoch 7/100




4714/4714 - 19s - 4ms/step - loss: 0.3275 - mse_real: 6334534144.0000 - rmse_real: 46041.8242 - val_loss: 0.2984 - val_mse_real: 5046730752.0000 - val_rmse_real: 43879.6328
Epoch 8/100
4714/4714 - 19s - 4ms/step - loss: 0.3282 - mse_real: 35256508416.0000 - rmse_real: 48195.7812 - val_loss: 0.3021 - val_mse_real: 5043734016.0000 - val_rmse_real: 43862.6367
Epoch 9/100




4714/4714 - 19s - 4ms/step - loss: 0.3260 - mse_real: 206684094464.0000 - rmse_real: 52200.8125 - val_loss: 0.2979 - val_mse_real: 5046713856.0000 - val_rmse_real: 43860.0586
Epoch 10/100
4714/4714 - 19s - 4ms/step - loss: 0.3243 - mse_real: 5889111040.0000 - rmse_real: 45621.6250 - val_loss: 0.2986 - val_mse_real: 5057114624.0000 - val_rmse_real: 43981.4844
Epoch 11/100
4714/4714 - 22s - 5ms/step - loss: 0.3243 - mse_real: 5888558592.0000 - rmse_real: 45612.1875 - val_loss: 0.2984 - val_mse_real: 5019281920.0000 - val_rmse_real: 43655.7344
Epoch 12/100




4714/4714 - 24s - 5ms/step - loss: 0.3236 - mse_real: 5881847808.0000 - rmse_real: 45615.9648 - val_loss: 0.2979 - val_mse_real: 5053080064.0000 - val_rmse_real: 43917.9727
Epoch 13/100
4714/4714 - 19s - 4ms/step - loss: 0.3222 - mse_real: 5877618176.0000 - rmse_real: 45570.4023 - val_loss: 0.3002 - val_mse_real: 5052841472.0000 - val_rmse_real: 43934.1055
Epoch 14/100




4714/4714 - 19s - 4ms/step - loss: 0.3367 - mse_real: inf - rmse_real: inf - val_loss: 0.2973 - val_mse_real: 5051807232.0000 - val_rmse_real: 43904.5469
Epoch 15/100
4714/4714 - 21s - 4ms/step - loss: 0.3209 - mse_real: 6154395648.0000 - rmse_real: 45757.2773 - val_loss: 0.2973 - val_mse_real: 5021539328.0000 - val_rmse_real: 43657.0742
Epoch 16/100
4714/4714 - 18s - 4ms/step - loss: 0.3208 - mse_real: 5882513408.0000 - rmse_real: 45519.6992 - val_loss: 0.3023 - val_mse_real: 5106527232.0000 - val_rmse_real: 44417.0000
Epoch 17/100
4714/4714 - 18s - 4ms/step - loss: 0.3199 - mse_real: 5876698112.0000 - rmse_real: 45511.1758 - val_loss: 0.2999 - val_mse_real: 5076005888.0000 - val_rmse_real: 44124.5391
Epoch 18/100
4714/4714 - 19s - 4ms/step - loss: 0.3197 - mse_real: 5876789248.0000 - rmse_real: 45519.0508 - val_loss: 0.2983 - val_mse_real: 5067098112.0000 - val_rmse_real: 44028.4297
Epoch 19/100
4714/4714 - 19s - 4ms/step - loss: 0.3189 - mse_real: 5876754432.0000 - rmse_real: 45482.

In [None]:
# Wall-clock seconds measured with time.time() around the most recent model.fit call.
print(duration)

736.234264
