# Tabular Playground Series - Aug 2021


## Metric changed from `log1p` RMSE to raw RMSE (original target scale) for both Keras and Keras Tuner

## Keras - 1 Attempt

## Attempt 1

In [None]:

# 1. Reproducibility
import random
import os
import numpy as np
import pandas as pd
import json
import time
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError, SparseTopKCategoricalAccuracy
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

# One seed value drives Python's, NumPy's and TensorFlow's random generators.
seed = 42
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed)

# 2. Data Loading
id_col = 'id'
target_columns = ['loss']
train_df = pd.read_csv('tabular-playground-series-aug-2021/train.csv.zip')
df_test = pd.read_csv('tabular-playground-series-aug-2021/test.csv.zip')

# 3. Target Encoding for regression
# Train in log1p space when every target value is non-negative; otherwise
# keep the raw values.
y_values = train_df[target_columns].astype(float).values
if np.all(y_values >= 0):
    y_enc = np.log1p(y_values)
else:
    y_enc = y_values

# 4. Features
# Everything except the target and the id column is a feature. The test ids
# are kept aside for the submission file.
non_feature_cols = target_columns + [id_col]
X = train_df.drop(columns=non_feature_cols, errors='ignore')
X_test = df_test.drop(columns=non_feature_cols, errors='ignore')
test_ids = df_test[id_col]

# 4a. Drop columns with all missing values
# Emptiness is judged on the training frame only; the same columns are then
# removed from both frames so they stay aligned.
empty_mask = X.isna().all()
all_missing = X.columns[empty_mask].tolist()
for frame in (X, X_test):
    frame.drop(columns=all_missing, inplace=True)

# 4b. Identify and drop categorical columns with high cardinality
# More than 50 distinct training values counts as high cardinality.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
high_card_cols = [name for name in cat_cols if X[name].nunique() > 50]
for frame in (X, X_test):
    frame.drop(columns=high_card_cols, inplace=True)

# 5. Preprocessing Pipeline
# Column lists are recomputed here because section 4 may have dropped columns.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=['number']).columns.tolist()

# Numeric columns: median imputation, then standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical columns: most-frequent imputation, then dense one-hot encoding.
# Categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])

# Fit and transform data
# Statistics are learned from the training frame only and reused for test.
X_train_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# 6. Model Architecture for continuous regression
# Size heuristic: a wide, batch-normalised four-layer stack is used only when
# there are at least 10000 rows AND at least 100 features; otherwise a small
# two-layer network without batch norm.
n_samples, n_features = X_train_proc.shape
is_large_problem = n_samples >= 10000 and n_features >= 100
if is_large_problem:
    sizes = [int(min(n_features * factor, 1024)) for factor in (2, 1, 0.5, 0.25)]
    hidden_sizes = [width for width in sizes if width >= 16]
    dropout_rate, use_bn = 0.4, True
else:
    hidden_sizes = [min(n_features * 2, 128), min(n_features, 64)]
    dropout_rate, use_bn = 0.3, False

# Dense -> (BatchNorm) -> Dropout per hidden layer, then one linear output unit.
keras_layers = tf.keras.layers
inputs = tf.keras.Input(shape=(n_features,))
x = inputs
for width in hidden_sizes:
    x = keras_layers.Dense(width, activation='relu')(x)
    if use_bn:
        x = keras_layers.BatchNormalization()(x)
    x = keras_layers.Dropout(dropout_rate)(x)
outputs = keras_layers.Dense(1, activation='linear')(x)
model = tf.keras.Model(inputs, outputs)

def mse_real(y_true_log, y_pred_log):
    """Return the mean squared error after undoing the log1p encoding.

    Both tensors are passed through ``expm1`` before the error is taken, so
    they are treated as log1p-encoded values.
    """
    residual = tf.math.expm1(y_true_log) - tf.math.expm1(y_pred_log)
    return tf.reduce_mean(tf.square(residual))
mse_real.__name__ = 'mse_real'

def rmse_real(y_true_log, y_pred_log):
    """Return the root mean squared error after undoing the log1p encoding.

    Both tensors are passed through ``expm1`` before the error is taken, so
    they are treated as log1p-encoded values.

    NOTE(review): used as a function metric, this is evaluated on each batch.
    The epoch figure Keras logs is presumably an average of those per-batch
    RMSEs, which is not the RMSE over the whole epoch. The training log is
    consistent with that: rmse_real 8.52 vs sqrt(mse_real 73.99) ~= 8.60.
    """
    residual = tf.math.expm1(y_true_log) - tf.math.expm1(y_pred_log)
    mean_squared = tf.reduce_mean(tf.square(residual))
    return tf.sqrt(mean_squared)
rmse_real.__name__ = 'rmse_real'

# Optimise plain MSE on the encoded target; the *_real metrics report the
# error after expm1 is applied to targets and predictions.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[rmse_real, mse_real],
)

# 7. Callbacks & Training
# Stop after 10 epochs without improvement and roll back to the best weights;
# also keep the best model seen so far on disk.
early_stop = EarlyStopping(patience=10, restore_best_weights=True, verbose=1)
best_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, verbose=1)
callbacks = [early_stop, best_checkpoint]

fit_options = dict(
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
    verbose=2,
)

# Time only the fit call.
start_time = time.time()
history = model.fit(X_train_proc, y_enc, **fit_options)
duration = time.time() - start_time

# 8. Evaluation & Logging
hist = history.history
# EarlyStopping(restore_best_weights=True) leaves the model at the epoch with
# the lowest val_loss, so log that epoch's metrics. The last epoch trained
# belongs to weights that were discarded.
best_epoch = int(np.argmin(hist['val_loss']))
results = {
    'training_loss': float(hist['mse_real'][best_epoch]),
    'validation_loss': float(hist['val_mse_real'][best_epoch]),
    'training_RMSE': float(hist['rmse_real'][best_epoch]),
    'validation_RMSE': float(hist['val_rmse_real'][best_epoch]),
}
# Create the output directory first so a fresh checkout does not fail here
# after the whole training run has already completed.
os.makedirs('Keras', exist_ok=True)
with open('Keras/results.json', 'w') as f:
    json.dump(results, f)

# 9. Prediction & Submission
raw_preds = model.predict(X_test_proc)
final = raw_preds
# Undo log1p exactly when the target was encoded with it (same condition as
# in the target-encoding step). Deciding from the sign of the predictions
# would leave the whole submission in log space as soon as one prediction
# dipped below zero, and would wrongly transform predictions for a target
# that was never encoded.
if np.all(y_values >= 0):
    # Cap the exponent so a runaway prediction cannot overflow.
    final = np.expm1(np.clip(final, a_min=None, a_max=20))
if final.ndim == 1:
    final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
# Make sure the output directory exists before writing.
os.makedirs('Keras', exist_ok=True)
submission.to_csv('Keras/submission_result.csv', index=False)


2025-07-09 22:23:16.805563: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752099796.829364  922662 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752099796.836460  922662 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752099796.857670  922662 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752099796.857697  922662 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752099796.857699  922662 computation_placer.cc:177] computation placer alr

Epoch 1/100

Epoch 1: val_loss improved from inf to 1.23649, saving model to best_model.h5




3125/3125 - 13s - 4ms/step - loss: 1.2209 - mse_real: 73.9877 - rmse_real: 8.5200 - val_loss: 1.2184 - val_mse_real: 77.4115 - val_rmse_real: 8.7068
Epoch 3/100

Epoch 3: val_loss improved from 1.21839 to 1.21103, saving model to best_model.h5




3125/3125 - 12s - 4ms/step - loss: 1.2050 - mse_real: 73.2687 - rmse_real: 8.4778 - val_loss: 1.2110 - val_mse_real: 76.6270 - val_rmse_real: 8.6623
Epoch 4/100

Epoch 4: val_loss improved from 1.21103 to 1.20415, saving model to best_model.h5




3125/3125 - 12s - 4ms/step - loss: 1.1975 - mse_real: 72.8648 - rmse_real: 8.4544 - val_loss: 1.2041 - val_mse_real: 75.9260 - val_rmse_real: 8.6222
Epoch 5/100

Epoch 5: val_loss improved from 1.20415 to 1.19617, saving model to best_model.h5




3125/3125 - 13s - 4ms/step - loss: 1.1927 - mse_real: 72.6599 - rmse_real: 8.4422 - val_loss: 1.1962 - val_mse_real: 74.7014 - val_rmse_real: 8.5519
Epoch 6/100

Epoch 6: val_loss improved from 1.19617 to 1.19380, saving model to best_model.h5




3125/3125 - 13s - 4ms/step - loss: 1.1864 - mse_real: 72.3574 - rmse_real: 8.4241 - val_loss: 1.1938 - val_mse_real: 74.5614 - val_rmse_real: 8.5437
Epoch 7/100

Epoch 7: val_loss improved from 1.19380 to 1.19228, saving model to best_model.h5




3125/3125 - 13s - 4ms/step - loss: 1.1827 - mse_real: 72.2032 - rmse_real: 8.4152 - val_loss: 1.1923 - val_mse_real: 74.2101 - val_rmse_real: 8.5234
Epoch 8/100

Epoch 8: val_loss improved from 1.19228 to 1.19173, saving model to best_model.h5




3125/3125 - 13s - 4ms/step - loss: 1.1781 - mse_real: 71.9939 - rmse_real: 8.4030 - val_loss: 1.1917 - val_mse_real: 73.6760 - val_rmse_real: 8.4925
Epoch 9/100

Epoch 9: val_loss did not improve from 1.19173
3125/3125 - 13s - 4ms/step - loss: 1.1751 - mse_real: 71.8568 - rmse_real: 8.3950 - val_loss: 1.1926 - val_mse_real: 73.6867 - val_rmse_real: 8.4931
Epoch 10/100

Epoch 10: val_loss did not improve from 1.19173
3125/3125 - 13s - 4ms/step - loss: 1.1727 - mse_real: 71.7252 - rmse_real: 8.3872 - val_loss: 1.1921 - val_mse_real: 73.8186 - val_rmse_real: 8.5008
Epoch 11/100

Epoch 11: val_loss did not improve from 1.19173
3125/3125 - 12s - 4ms/step - loss: 1.1696 - mse_real: 71.5848 - rmse_real: 8.3787 - val_loss: 1.1956 - val_mse_real: 73.9565 - val_rmse_real: 8.5088
Epoch 12/100

Epoch 12: val_loss did not improve from 1.19173
3125/3125 - 12s - 4ms/step - loss: 1.1688 - mse_real: 71.5291 - rmse_real: 8.3755 - val_loss: 1.1937 - val_mse_real: 73.7933 - val_rmse_real: 8.4991
Epoch 13/

## Keras Tuner - 1 Attempt

## Attempt 1

In [None]:
# 1. Reproducibility
import random
import os
import numpy as np
import pandas as pd
import json
import time
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError, SparseTopKCategoricalAccuracy
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

# One seed value drives Python's, NumPy's and TensorFlow's random generators.
seed = 42
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed)

# 2. Data Loading
id_col = 'id'
target_columns = ['loss']
train_df = pd.read_csv('tabular-playground-series-aug-2021/train.csv.zip')
df_test = pd.read_csv('tabular-playground-series-aug-2021/test.csv.zip')

# 3. Target Encoding for regression
# Train in log1p space when every target value is non-negative; otherwise
# keep the raw values.
y_values = train_df[target_columns].astype(float).values
if np.all(y_values >= 0):
    y_enc = np.log1p(y_values)
else:
    y_enc = y_values

# 4. Features
# Everything except the target and the id column is a feature. The test ids
# are kept aside for the submission file.
non_feature_cols = target_columns + [id_col]
X = train_df.drop(columns=non_feature_cols, errors='ignore')
X_test = df_test.drop(columns=non_feature_cols, errors='ignore')
test_ids = df_test[id_col]

# 4a. Drop columns with all missing values
# Emptiness is judged on the training frame only; the same columns are then
# removed from both frames so they stay aligned.
empty_mask = X.isna().all()
all_missing = X.columns[empty_mask].tolist()
for frame in (X, X_test):
    frame.drop(columns=all_missing, inplace=True)

# 4b. Identify and drop categorical columns with high cardinality
# More than 50 distinct training values counts as high cardinality.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
high_card_cols = [name for name in cat_cols if X[name].nunique() > 50]
for frame in (X, X_test):
    frame.drop(columns=high_card_cols, inplace=True)

# 5. Preprocessing Pipeline
# Column lists are recomputed here because section 4 may have dropped columns.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=['number']).columns.tolist()

# Numeric columns: median imputation, then standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical columns: most-frequent imputation, then dense one-hot encoding.
# Categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])

# Fit and transform data
# Statistics are learned from the training frame only and reused for test.
X_train_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# 6. Model Architecture for continuous regression
# Size heuristic: a wide four-layer, batch-normalised configuration is chosen
# only when there are at least 10000 rows AND at least 100 features;
# otherwise a small two-layer one without batch norm.
# NOTE(review): hidden_sizes, dropout_rate and use_bn are not referenced by
# the hypermodel in this cell, which samples its own depth, width and
# dropout. Confirm whether this heuristic is still needed here.
n_samples, n_features = X_train_proc.shape
is_large_problem = n_samples >= 10000 and n_features >= 100
if is_large_problem:
    sizes = [int(min(n_features * factor, 1024)) for factor in (2, 1, 0.5, 0.25)]
    hidden_sizes = [width for width in sizes if width >= 16]
    dropout_rate, use_bn = 0.4, True
else:
    hidden_sizes = [min(n_features * 2, 128), min(n_features, 64)]
    dropout_rate, use_bn = 0.3, False

import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model

# Define early stopping and model checkpoint
# Both callbacks watch the validation loss: training stops after 10 epochs
# without improvement and rolls back to the best weights, and the best model
# seen so far is written to disk.
monitored_quantity = 'val_loss'
early_stopping = EarlyStopping(
    monitor=monitored_quantity,
    patience=10,
    restore_best_weights=True,
)
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor=monitored_quantity,
    save_best_only=True,
)

# Input dimension
# Width of the preprocessed feature matrix; read by the hypermodel's input layer.
n_features = X_train_proc.shape[1]

def mse_real(y_true_log, y_pred_log):
    """Return the mean squared error after undoing the log1p encoding.

    Both tensors are passed through ``expm1`` before the error is taken, so
    they are treated as log1p-encoded values.
    """
    residual = tf.math.expm1(y_true_log) - tf.math.expm1(y_pred_log)
    return tf.reduce_mean(tf.square(residual))
mse_real.__name__ = 'mse_real'

def rmse_real(y_true_log, y_pred_log):
    """Return the root mean squared error after undoing the log1p encoding.

    Both tensors are passed through ``expm1`` before the error is taken, so
    they are treated as log1p-encoded values.

    NOTE(review): used as a function metric, this is evaluated on each batch.
    The epoch figure Keras logs is presumably an average of those per-batch
    RMSEs, which is not the RMSE over the whole epoch. The training log is
    consistent with that: rmse_real 8.59 vs sqrt(mse_real 76.90) ~= 8.77.
    """
    residual = tf.math.expm1(y_true_log) - tf.math.expm1(y_pred_log)
    mean_squared = tf.reduce_mean(tf.square(residual))
    return tf.sqrt(mean_squared)
rmse_real.__name__ = 'rmse_real'


# HyperModel
class MyHyperModel(kt.HyperModel):
    """Search space for a fully connected regression network.

    Every hidden layer is Dense -> BatchNormalization -> Dropout with the
    same width. The tuner samples the depth, width, dropout rate and
    learning rate; activation and optimizer are single-choice placeholders.
    The input width comes from the module-level ``n_features``.
    """

    def build(self, hp):
        """Build and compile one candidate model.

        Args:
            hp: KerasTuner ``HyperParameters`` object used to sample values.

        Returns:
            A compiled Keras model with a single linear output unit,
            trained on MSE and reporting ``mse_real`` / ``rmse_real``.
        """
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024, 64)
        act = hp.Choice('activation', ['relu'])
        drop = hp.Float('dropout', 0.0, 0.5, 0.1)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        inputs = Input(shape=(n_features,))
        x = inputs
        for _ in range(layers):
            x = Dense(units, activation=act)(x)
            x = BatchNormalization()(x)
            x = Dropout(drop)(x)
        outputs = Dense(1, activation='linear')(x)
        model = Model(inputs, outputs)

        # Passing only the optimizer *name* to compile() builds it with its
        # default learning rate, so the sampled `learning_rate` would have no
        # effect on training. Instantiate the optimizer with it instead.
        optimizer_classes = {'adam': tf.keras.optimizers.Adam}
        optimizer = optimizer_classes[opt](learning_rate=lr)

        model.compile(
            optimizer=optimizer,
            loss='mean_squared_error',
            metrics=[mse_real, rmse_real],
        )
        return model

# Tuner setup
bs = 32  # Example batch size, adjust as needed
ep = 100  # Example epochs, adjust as needed

# Bayesian search that minimises validation loss: 10 trials, one training
# run per trial.
# NOTE(review): with overwrite=False an existing 'bayesian_tuner' project
# directory is presumably reused rather than restarted; confirm this is
# wanted when the search space changes.
tuner_settings = dict(
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    overwrite=False,
    project_name='bayesian_tuner',
)
tuner = kt.BayesianOptimization(MyHyperModel(), **tuner_settings)

# Hyperparameter search, validated on a 20% split of the training data.
# The former `y_val` branch validated on the *test* features against a
# `y_test` that is never defined in this script (the test set has no labels),
# so it could only raise a NameError. It has been removed.
tuner.search(
    X_train_proc, y_enc,
    validation_split=0.2,
    batch_size=bs, epochs=ep,
    callbacks=[early_stopping, checkpoint]
)

# Rebuild a fresh, untrained model from the best hyperparameters found.
best_hp = tuner.get_best_hyperparameters(1)[0]
model = tuner.hypermodel.build(best_hp)

start_time = time.time()

# Retrain model with original callbacks and data
# Validation uses a 20% split of the training data. The former `y_val` branch
# validated on the test features against an undefined `y_test` and could only
# raise a NameError, so it has been removed. The epoch budget reuses `ep` to
# stay in step with the search.
history = model.fit(
    X_train_proc, y_enc,
    validation_split=0.2,
    epochs=ep, batch_size=bs,
    callbacks=[early_stopping, checkpoint],
    verbose=2
)

# Wall-clock seconds spent in the final fit only (the search is not included).
duration = time.time() - start_time

# 8. Evaluation & Logging
hist = history.history
# EarlyStopping(restore_best_weights=True) leaves the model at the epoch with
# the lowest val_loss, so log that epoch's metrics. The last epoch trained
# belongs to weights that were discarded.
best_epoch = int(np.argmin(hist['val_loss']))
results = {
    'training_loss': float(hist['mse_real'][best_epoch]),
    'validation_loss': float(hist['val_mse_real'][best_epoch]),
    'training_RMSE': float(hist['rmse_real'][best_epoch]),
    'validation_RMSE': float(hist['val_rmse_real'][best_epoch]),
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# 9. Prediction & Submission
raw_preds = model.predict(X_test_proc)
final = raw_preds
# Undo log1p exactly when the target was encoded with it (same condition as
# in the target-encoding step). Deciding from the sign of the predictions
# would leave the whole submission in log space as soon as one prediction
# dipped below zero, and would wrongly transform predictions for a target
# that was never encoded.
if np.all(y_values >= 0):
    # Cap the exponent so a runaway prediction cannot overflow.
    final = np.expm1(np.clip(final, a_min=None, a_max=20))
if final.ndim == 1:
    final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Trial 10 Complete [00h 07m 49s]
val_loss: 1.2046785354614258

Best val_loss So Far: 1.199518084526062
Total elapsed time: 01h 23m 53s
Epoch 1/100




6250/6250 - 41s - 7ms/step - loss: 1.3402 - mse_real: 76.8970 - rmse_real: 8.5880 - val_loss: 1.2173 - val_mse_real: 74.5659 - val_rmse_real: 8.4673
Epoch 2/100




6250/6250 - 37s - 6ms/step - loss: 1.2205 - mse_real: 73.4444 - rmse_real: 8.4042 - val_loss: 1.2041 - val_mse_real: 73.5331 - val_rmse_real: 8.4074
Epoch 3/100




6250/6250 - 38s - 6ms/step - loss: 1.2076 - mse_real: 73.0739 - rmse_real: 8.3823 - val_loss: 1.2036 - val_mse_real: 74.5721 - val_rmse_real: 8.4664
Epoch 4/100




6250/6250 - 37s - 6ms/step - loss: 1.1970 - mse_real: 72.7041 - rmse_real: 8.3596 - val_loss: 1.2027 - val_mse_real: 74.8107 - val_rmse_real: 8.4800
Epoch 5/100
6250/6250 - 37s - 6ms/step - loss: 1.1864 - mse_real: 72.2902 - rmse_real: 8.3354 - val_loss: 1.2105 - val_mse_real: 76.0896 - val_rmse_real: 8.5525
Epoch 6/100
6250/6250 - 38s - 6ms/step - loss: 1.1770 - mse_real: 71.8918 - rmse_real: 8.3116 - val_loss: 1.2095 - val_mse_real: 76.1003 - val_rmse_real: 8.5527
Epoch 7/100
6250/6250 - 37s - 6ms/step - loss: 1.1699 - mse_real: 71.5707 - rmse_real: 8.2931 - val_loss: 1.2073 - val_mse_real: 75.7180 - val_rmse_real: 8.5310
Epoch 8/100
6250/6250 - 36s - 6ms/step - loss: 1.1627 - mse_real: 71.2016 - rmse_real: 8.2715 - val_loss: 1.2074 - val_mse_real: 75.6720 - val_rmse_real: 8.5287
Epoch 9/100
6250/6250 - 36s - 6ms/step - loss: 1.1566 - mse_real: 70.9180 - rmse_real: 8.2544 - val_loss: 1.2140 - val_mse_real: 75.4003 - val_rmse_real: 8.5136
Epoch 10/100
6250/6250 - 38s - 6ms/step - loss

In [None]:
# Seconds elapsed around the final model.fit call (difference of two time.time() readings).
print(duration)

525.935126
