# Backpack Prediction Challenge


## Changed the reported metric from `log1p RMSE` to raw-scale `RMSE` for both Keras and Keras Tuner

## Keras - 1 Attempt 

## Attempt 1


In [None]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import json
import time
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras import Model

# Reproducibility: seed the Python, NumPy and TensorFlow RNGs with one value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# 1. Data Loading & Split
# Both training CSVs are read here and stacked into one frame further down.
train_files = [
    'playground-series-s5e2/train.csv',
    'playground-series-s5e2/training_extra.csv',
]
train_dfs = list(map(pd.read_csv, train_files))
# Competition test set; predictions for it are written out at the end.
df_test = pd.read_csv('playground-series-s5e2/test.csv')
# Only the header of the sample submission is read (nrows=0): its first
# column is taken as the id, every remaining column as a target.
sub_sample = pd.read_csv('playground-series-s5e2/sample_submission.csv', nrows=0)
_header = list(sub_sample.columns)
id_col = _header[0]
target_columns = _header[1:]

# Stack the training frames with a fresh 0..n-1 index.
df = pd.concat(train_dfs, ignore_index=True)

# 2. Target Encoding (Regression)
# Targets are trained in log1p space whenever every value is non-negative;
# otherwise they are left on their raw scale.
y_values = df[target_columns].astype(float).values
if np.all(y_values >= 0):
    y_enc = np.log1p(y_values)
else:
    y_enc = y_values

# 3. Features & IDs
# errors='ignore' lets the same drop list be used on the test frame even if
# it does not contain the target columns.
_non_feature_cols = [*target_columns, id_col]
X = df.drop(columns=_non_feature_cols, errors='ignore')
X_train = X.copy()
y_train = y_enc
test_ids = df_test[id_col]
# NOTE: despite the name, X_val holds the *test* features; there are no
# labels for it, hence y_val is None.
X_val = df_test.drop(columns=_non_feature_cols, errors='ignore')
y_val = None

# 4. Feature Engineering
# Remove columns that are entirely missing in the training data, then give
# the test frame exactly the same columns in the same order.
X_train = X_train.dropna(axis=1, how='all')
X_val = X_val[X_train.columns]

# Split columns by dtype. Only categoricals with at most 50 distinct values
# are kept for one-hot encoding; columns in neither list are not passed to
# the ColumnTransformer below.
categorical_cols = list(
    X_train.select_dtypes(include=['object', 'category']).columns
)
low_card_cats = [col for col in categorical_cols if X_train[col].nunique() <= 50]
numeric_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)

# 5. Preprocessing Pipeline
# Numeric: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical: most-frequent imputation followed by dense one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, low_card_cats),
])

# Fit on the training features only, then apply the fitted transform to test.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# 6. Model Architecture Selection
n_samples, n_features = X_train_proc.shape
n_targets = len(target_columns)

# Small or narrow problems get a compact two-layer net without batch norm;
# everything else gets a tapering stack (2x, 1x, 0.5x, 0.25x the input
# width, keeping only layers of at least 16 units) with batch norm.
_is_small_problem = n_samples < 10000 or n_features < 100
if _is_small_problem:
    hidden_layers = [
        int(min(n_features * 2, 128)),
        int(min(n_features, 64)),
    ]
    use_bn, dropout_rate = False, 0.3
else:
    hidden_layers = [
        int(n_features * factor)
        for factor in (2, 1, 0.5, 0.25)
        if n_features * factor >= 16
    ]
    use_bn, dropout_rate = True, 0.4


def _hidden_block(tensor, width):
    """Apply Dense(relu) -> optional BatchNormalization -> Dropout."""
    tensor = Dense(width, activation='relu')(tensor)
    if use_bn:
        tensor = BatchNormalization()(tensor)
    return Dropout(dropout_rate)(tensor)


# Build the model: input -> hidden blocks -> linear output, one unit per target.
inputs = Input(shape=(n_features,))
x = inputs
for units in hidden_layers:
    x = _hidden_block(x, units)
outputs = Dense(n_targets, activation='linear')(x)
model = Model(inputs, outputs)

def mse_real(y_true_log, y_pred_log):
    """Return the mean squared error after undoing the log1p transform.

    Both tensors are mapped back with ``expm1`` before the squared error is
    averaged over every element of the batch.

    Args:
        y_true_log: Ground-truth values in log1p space.
        y_pred_log: Predicted values in log1p space.

    Returns:
        A scalar tensor holding the raw-scale MSE.
    """
    residual = tf.math.expm1(y_true_log) - tf.math.expm1(y_pred_log)
    return tf.reduce_mean(tf.square(residual))
mse_real.__name__ = 'mse_real'

def rmse_real(y_true_log, y_pred_log):
    """Return the root mean squared error after undoing the log1p transform.

    Both tensors are mapped back with ``expm1``; the squared error is averaged
    over every element of the batch and then square-rooted.

    NOTE(review): when this function is passed via ``metrics=``, the value
    Keras logs per epoch appears to be an average of these per-batch RMSEs,
    which is not the same as the RMSE over the whole epoch -- confirm before
    comparing it to a leaderboard score.

    Args:
        y_true_log: Ground-truth values in log1p space.
        y_pred_log: Predicted values in log1p space.

    Returns:
        A scalar tensor holding the raw-scale RMSE of the batch.
    """
    residual = tf.math.expm1(y_true_log) - tf.math.expm1(y_pred_log)
    mean_sq_err = tf.reduce_mean(tf.square(residual))
    return tf.sqrt(mean_sq_err)
rmse_real.__name__ = 'rmse_real'

# 7. Compile
# The loss is MSE on the encoded (log1p) target; the raw-scale metrics are
# tracked alongside it for reporting only.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[rmse_real, mse_real],
)

# 8. Callbacks & Training
# Stop after 5 epochs without val_loss improvement and roll back to the best
# weights; also persist the best model to disk as it is found.
_early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)
_best_checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)
callbacks = [_early_stop, _best_checkpoint]

# NOTE(review): validation_split is documented to take the *last* fraction of
# the arrays without shuffling -- here that is the tail of the concatenated
# training files. Confirm this hold-out is representative.
_fit_options = {
    'validation_split': 0.2,
    'epochs': 100,
    'batch_size': 64,
    'callbacks': callbacks,
    'verbose': 2,
}
start_time = time.time()
history = model.fit(X_train_proc, y_train, **_fit_options)
duration = time.time() - start_time

# 9. Evaluation & Logging
# The EarlyStopping callback used for training has restore_best_weights=True,
# so the model that goes on to predict carries the weights of the epoch with
# the lowest val_loss -- not those of the last epoch. Report that epoch's
# metrics so the JSON describes the model actually used.
_val_losses = history.history['val_loss']
_best_epoch = _val_losses.index(min(_val_losses))

# float() guarantees plain Python floats for json.dump.
training_loss = float(history.history['mse_real'][_best_epoch])
training_rmse = float(history.history['rmse_real'][_best_epoch])
validation_loss = float(history.history['val_mse_real'][_best_epoch])
validation_rmse = float(history.history['val_rmse_real'][_best_epoch])
results = {
    'training_loss': training_loss,
    'training_rmse': training_rmse,
    'validation_loss': validation_loss,
    'validation_rmse': validation_rmse,
    'training_duration': duration
}
with open('Keras/results.json', 'w') as f:
    json.dump(results, f)

# 10. Prediction & Submission
raw_preds = model.predict(X_val_proc)
final = raw_preds
# Undo the target encoding using the SAME condition that decided whether
# log1p was applied to the training targets. Testing the sign of the
# predictions instead would skip the inverse transform whenever a single
# prediction is negative in log space, leaving the whole submission on the
# wrong scale.
if np.all(y_values >= 0):
    final = np.expm1(final)
# A 1-D prediction vector becomes a single-column matrix so it lines up with
# the target column list.
if final.ndim == 1:
    final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=target_columns)
# reset_index keeps the ids positionally aligned with the prediction rows.
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('Keras/submission_result.csv', index=False)


2025-07-09 21:25:04.887946: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Epoch 1/100

Epoch 1: val_loss improved from inf to 0.33582, saving model to best_model.h5




49929/49929 - 70s - 1ms/step - loss: 0.4076 - mse_real: 2322.6353 - rmse_real: 44.5129 - val_loss: 0.3358 - val_mse_real: 1644.1116 - val_rmse_real: 40.4637
Epoch 2/100

Epoch 2: val_loss improved from 0.33582 to 0.33579, saving model to best_model.h5




49929/49929 - 67s - 1ms/step - loss: 0.3356 - mse_real: 1637.5887 - rmse_real: 40.3873 - val_loss: 0.3358 - val_mse_real: 1641.7657 - val_rmse_real: 40.4351
Epoch 3/100

Epoch 3: val_loss improved from 0.33579 to 0.33577, saving model to best_model.h5




49929/49929 - 69s - 1ms/step - loss: 0.3355 - mse_real: 1637.4263 - rmse_real: 40.3849 - val_loss: 0.3358 - val_mse_real: 1641.9222 - val_rmse_real: 40.4370
Epoch 4/100

Epoch 4: val_loss did not improve from 0.33577
49929/49929 - 70s - 1ms/step - loss: 0.3355 - mse_real: 1637.3541 - rmse_real: 40.3839 - val_loss: 0.3358 - val_mse_real: 1645.2397 - val_rmse_real: 40.4775
Epoch 5/100

Epoch 5: val_loss did not improve from 0.33577
49929/49929 - 69s - 1ms/step - loss: 0.3355 - mse_real: 1637.3877 - rmse_real: 40.3846 - val_loss: 0.3358 - val_mse_real: 1643.6863 - val_rmse_real: 40.4585
Epoch 6/100

Epoch 6: val_loss did not improve from 0.33577
49929/49929 - 70s - 1ms/step - loss: 0.3355 - mse_real: 1637.3088 - rmse_real: 40.3837 - val_loss: 0.3358 - val_mse_real: 1643.7808 - val_rmse_real: 40.4598
Epoch 7/100

Epoch 7: val_loss did not improve from 0.33577
49929/49929 - 68s - 1ms/step - loss: 0.3355 - mse_real: 1637.2535 - rmse_real: 40.3830 - val_loss: 0.3358 - val_mse_real: 1644.6272 

# Keras Tuner - 1 Attempt

## Attempt 1

In [None]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import json
import time
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras import Model

# Reproducibility: seed the Python, NumPy and TensorFlow RNGs with one value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# 1. Data Loading & Split
# Both training CSVs are read here and stacked into one frame further down.
train_files = [
    'playground-series-s5e2/train.csv',
    'playground-series-s5e2/training_extra.csv',
]
train_dfs = list(map(pd.read_csv, train_files))
# Competition test set; predictions for it are written out at the end.
df_test = pd.read_csv('playground-series-s5e2/test.csv')
# Only the header of the sample submission is read (nrows=0): its first
# column is taken as the id, every remaining column as a target.
sub_sample = pd.read_csv('playground-series-s5e2/sample_submission.csv', nrows=0)
_header = list(sub_sample.columns)
id_col = _header[0]
target_columns = _header[1:]

# Stack the training frames with a fresh 0..n-1 index.
df = pd.concat(train_dfs, ignore_index=True)

# 2. Target Encoding (Regression)
# Targets are trained in log1p space whenever every value is non-negative;
# otherwise they are left on their raw scale.
y_values = df[target_columns].astype(float).values
if np.all(y_values >= 0):
    y_enc = np.log1p(y_values)
else:
    y_enc = y_values

# 3. Features & IDs
# errors='ignore' lets the same drop list be used on the test frame even if
# it does not contain the target columns.
_non_feature_cols = [*target_columns, id_col]
X = df.drop(columns=_non_feature_cols, errors='ignore')
X_train = X.copy()
y_train = y_enc
test_ids = df_test[id_col]
# NOTE: despite the name, X_val holds the *test* features; there are no
# labels for it, hence y_val is None.
X_val = df_test.drop(columns=_non_feature_cols, errors='ignore')
y_val = None

# 4. Feature Engineering
# Remove columns that are entirely missing in the training data, then give
# the test frame exactly the same columns in the same order.
X_train = X_train.dropna(axis=1, how='all')
X_val = X_val[X_train.columns]

# Split columns by dtype. Only categoricals with at most 50 distinct values
# are kept for one-hot encoding; columns in neither list are not passed to
# the ColumnTransformer below.
categorical_cols = list(
    X_train.select_dtypes(include=['object', 'category']).columns
)
low_card_cats = [col for col in categorical_cols if X_train[col].nunique() <= 50]
numeric_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)

# 5. Preprocessing Pipeline
# Numeric: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical: most-frequent imputation followed by dense one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, low_card_cats),
])

# Fit on the training features only, then apply the fitted transform to test.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# 6. Model Architecture Selection
# Heuristic layer sizing carried over from the plain-Keras cell. In the code
# visible in this cell the tuner-built model does not read hidden_layers,
# use_bn or dropout_rate; only n_features and n_targets are of further use.
n_samples, n_features = X_train_proc.shape
n_targets = len(target_columns)

_is_small_problem = n_samples < 10000 or n_features < 100
if _is_small_problem:
    # Compact two-layer net, capped at 128 and 64 units, no batch norm.
    hidden_layers = [
        int(min(n_features * 2, 128)),
        int(min(n_features, 64)),
    ]
    use_bn, dropout_rate = False, 0.3
else:
    # Tapering stack; layers narrower than 16 units are skipped.
    hidden_layers = [
        int(n_features * factor)
        for factor in (2, 1, 0.5, 0.25)
        if n_features * factor >= 16
    ]
    use_bn, dropout_rate = True, 0.4

# Build the model using Keras Tuner
import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Callbacks shared by the tuner search and the final retraining run:
# stop after 10 epochs without val_loss improvement (rolling back to the best
# weights) and keep the best model seen so far on disk.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True,
)

# Input width of the network = number of columns after preprocessing.
_, n_features = X_train_proc.shape

def mse_real(y_true_log, y_pred_log):
    """Return the mean squared error after undoing the log1p transform.

    Both tensors are mapped back with ``expm1`` before the squared error is
    averaged over every element of the batch.

    Args:
        y_true_log: Ground-truth values in log1p space.
        y_pred_log: Predicted values in log1p space.

    Returns:
        A scalar tensor holding the raw-scale MSE.
    """
    residual = tf.math.expm1(y_true_log) - tf.math.expm1(y_pred_log)
    return tf.reduce_mean(tf.square(residual))
mse_real.__name__ = 'mse_real'

def rmse_real(y_true_log, y_pred_log):
    """Return the root mean squared error after undoing the log1p transform.

    Both tensors are mapped back with ``expm1``; the squared error is averaged
    over every element of the batch and then square-rooted.

    NOTE(review): when this function is passed via ``metrics=``, the value
    Keras logs per epoch appears to be an average of these per-batch RMSEs,
    which is not the same as the RMSE over the whole epoch -- confirm before
    comparing it to a leaderboard score.

    Args:
        y_true_log: Ground-truth values in log1p space.
        y_pred_log: Predicted values in log1p space.

    Returns:
        A scalar tensor holding the raw-scale RMSE of the batch.
    """
    residual = tf.math.expm1(y_true_log) - tf.math.expm1(y_pred_log)
    mean_sq_err = tf.reduce_mean(tf.square(residual))
    return tf.sqrt(mean_sq_err)
rmse_real.__name__ = 'rmse_real'


class MyHyperModel(kt.HyperModel):
    """Keras Tuner hypermodel for a fully connected regression network.

    The search space covers the number of hidden layers, a single width
    shared by all hidden layers, the dropout rate, the optimizer name and the
    learning rate. The input width comes from the module-level ``n_features``
    and the output width from the module-level ``n_targets``.
    """

    def build(self, hp):
        """Build and compile one candidate model.

        Args:
            hp: The ``HyperParameters`` object Keras Tuner passes in; the
                hyperparameter values are sampled from it.

        Returns:
            A compiled ``Model`` trained with MSE on the encoded target and
            reporting ``rmse_real`` / ``mse_real`` as metrics.
        """
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024, 64)
        drop = hp.Float('dropout', 0.0, 0.5, 0.1)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        # Instantiate the optimizer so the sampled learning rate is actually
        # applied. Passing only the optimizer *name* to compile() would use
        # the optimizer's default learning rate and make the 'learning_rate'
        # hyperparameter a no-op in the search.
        optimizer_classes = {'adam': tf.keras.optimizers.Adam}
        optimizer = optimizer_classes[opt](learning_rate=lr)

        inputs = Input(shape=(n_features,))
        x = inputs
        for _ in range(layers):
            x = Dense(units, activation='relu')(x)
            x = Dropout(drop)(x)
        # One linear output unit per target column, matching the plain-Keras
        # model instead of hard-coding a single output.
        outputs = Dense(n_targets, activation='linear')(x)
        model = Model(inputs, outputs)
        model.compile(
            optimizer=optimizer,
            loss='mean_squared_error',
            metrics=[rmse_real, mse_real],
        )
        return model

# Initialize the Bayesian tuner
bs = 32  # batch size used for both the search and the final retraining
ep = 20  # maximum epochs per tuner trial

# Ten Bayesian-optimisation trials, one training run each, ranked by val_loss.
# NOTE(review): with overwrite=False an existing 'bayesian_tuner' project
# directory is presumably reused rather than restarted -- confirm this is
# intended when the search space changes.
tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    overwrite=False,
    project_name='bayesian_tuner',
)

# Validate on explicit data when labels for X_val_proc exist; otherwise let
# Keras carve a 20% split out of the training arrays.
_search_options = {
    'batch_size': bs,
    'epochs': ep,
    'callbacks': [early_stopping, checkpoint],
}
if y_val is None:
    _search_options['validation_split'] = 0.2
else:
    _search_options['validation_data'] = (X_val_proc, y_val)
tuner.search(X_train_proc, y_train, **_search_options)

# Rebuild a fresh, untrained model from the best hyperparameters found.
_best_hp = tuner.get_best_hyperparameters(1)[0]
model = tuner.hypermodel.build(_best_hp)

# Retrain the model with the original callbacks and data, for up to 100
# epochs. Validation is chosen the same way as during the search: explicit
# data when labels exist, otherwise a 20% split.
_fit_options = {
    'epochs': 100,
    'batch_size': bs,
    'callbacks': [early_stopping, checkpoint],
    'verbose': 2,
}
if y_val is None:
    _fit_options['validation_split'] = 0.2
else:
    _fit_options['validation_data'] = (X_val_proc, y_val)

start_time = time.time()  # Start timing
history = model.fit(X_train_proc, y_train, **_fit_options)
end_time = time.time()  # End timing
duration = end_time - start_time  # Wall-clock seconds spent in fit()

# 9. Evaluation & Logging
# The early_stopping callback used for retraining has
# restore_best_weights=True, so the model that goes on to predict carries the
# weights of the epoch with the lowest val_loss -- not those of the last
# epoch. Report that epoch's metrics so the JSON describes the model
# actually used.
_val_losses = history.history['val_loss']
_best_epoch = _val_losses.index(min(_val_losses))

# float() guarantees plain Python floats for json.dump.
training_loss = float(history.history['mse_real'][_best_epoch])
training_rmse = float(history.history['rmse_real'][_best_epoch])
validation_loss = float(history.history['val_mse_real'][_best_epoch])
validation_rmse = float(history.history['val_rmse_real'][_best_epoch])
results = {
    'training_loss': training_loss,
    'training_rmse': training_rmse,
    'validation_loss': validation_loss,
    'validation_rmse': validation_rmse,
    'training_duration': duration
}

with open('results.json', 'w') as f:
    json.dump(results, f)

# 10. Prediction & Submission
raw_preds = model.predict(X_val_proc)
final = raw_preds
# Undo the target encoding using the SAME condition that decided whether
# log1p was applied to the training targets. Testing the sign of the
# predictions instead would skip the inverse transform whenever a single
# prediction is negative in log space, leaving the whole submission on the
# wrong scale.
if np.all(y_values >= 0):
    final = np.expm1(final)
# A 1-D prediction vector becomes a single-column matrix so it lines up with
# the target column list.
if final.ndim == 1:
    final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=target_columns)
# reset_index keeps the ids positionally aligned with the prediction rows.
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Trial 10 Complete [01h 17m 39s]
val_loss: 0.33597391843795776

Best val_loss So Far: 0.335781455039978
Total elapsed time: 14h 03m 27s
Epoch 1/100




99858/99858 - 194s - 2ms/step - loss: 0.3556 - mse_real: 2132.3987 - rmse_real: 41.5539 - val_loss: 0.3360 - val_mse_real: 1641.9176 - val_rmse_real: 40.3551
Epoch 2/100
99858/99858 - 185s - 2ms/step - loss: 0.3357 - mse_real: 1638.3668 - rmse_real: 40.3152 - val_loss: 0.3360 - val_mse_real: 1642.1677 - val_rmse_real: 40.3581
Epoch 3/100
99858/99858 - 178s - 2ms/step - loss: 0.3357 - mse_real: 1638.3527 - rmse_real: 40.3149 - val_loss: 0.3360 - val_mse_real: 1642.1677 - val_rmse_real: 40.3581
Epoch 4/100
99858/99858 - 194s - 2ms/step - loss: 0.3357 - mse_real: 1638.3419 - rmse_real: 40.3148 - val_loss: 0.3360 - val_mse_real: 1642.1677 - val_rmse_real: 40.3581
Epoch 5/100
99858/99858 - 184s - 2ms/step - loss: 0.3357 - mse_real: 1638.3419 - rmse_real: 40.3148 - val_loss: 0.3360 - val_mse_real: 1642.1677 - val_rmse_real: 40.3581
Epoch 6/100
99858/99858 - 178s - 2ms/step - loss: 0.3357 - mse_real: 1638.3419 - rmse_real: 40.3148 - val_loss: 0.3360 - val_mse_real: 1642.1677 - val_rmse_real: 