## Tabular Playground Series - Jun 2022

## Keras - 1 Attempt 

## Attempt 1

In [None]:

# Reproducibility
import os, random, time, json
import numpy as np
import pandas as pd
tf_import_error = False
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
except ImportError:
    tf_import_error = True
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Set seeds
random.seed(42)
np.random.seed(42)
if not tf_import_error:
    tf.random.set_seed(42)

# Load data
print("Loading data...")
df = pd.read_csv('data.csv.zip')
df_test = pd.read_csv('sample_submission.csv.zip')

# Infer identifiers
id_col = 'row-col'
target_col = 'value'

# Identify continuous features
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Drop columns with all missing values
df = df.dropna(axis=1, how='all')
num_cols = [c for c in num_cols if c in df.columns]

# Preprocessing pipeline for numeric features
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Fit and transform training numeric data
print("Preprocessing numeric features...")
X_train_num = num_pipeline.fit_transform(df[num_cols])

# Autoencoder model definition
n_features = X_train_num.shape[1]
input_layer = keras.Input(shape=(n_features,))
# Since n_features < 100, use small architecture
dense1 = layers.Dense(min(n_features*2,128), activation='relu')(input_layer)
drop1 = layers.Dropout(0.3)(dense1)
dense2 = layers.Dense(min(n_features,64), activation='relu')(drop1)
drop2 = layers.Dropout(0.3)(dense2)
dense3 = layers.Dense(min(n_features*2,128), activation='relu')(drop2)
drop3 = layers.Dropout(0.3)(dense3)
output_layer = layers.Dense(n_features, activation='linear')(drop3)
model = keras.Model(inputs=input_layer, outputs=output_layer)

model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'),
             tf.keras.metrics.MeanAbsoluteError(name='mae')]
)
model.summary()

# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss')
callbacks = [early_stop, checkpoint]

# Training
t_start = time.time()
history = model.fit(
    X_train_num, X_train_num,
    validation_split=0.2,
    epochs=100,
    batch_size=128,
    callbacks=callbacks,
    verbose=2
)
t_duration = time.time() - t_start
print(f"Training completed in {t_duration:.2f} seconds.")

# Reconstruct and impute missing values
print("Reconstructing and imputing missing values...")
X_train_pred = model.predict(X_train_num)
# Inverse scaling
X_pred_unscaled = num_pipeline.named_steps['scaler'].inverse_transform(X_train_pred)

df_imputed = df.copy()
for i, col in enumerate(num_cols):
    mask = df[col].isna().values
    if mask.any():
        df_imputed.loc[mask, col] = X_pred_unscaled[mask, i]
# Generate submission based on sample_submission
print("Generating submission file...")
predictions = []
for rc in df_test[id_col]:
    row_str, col_name = rc.split('-')
    row_idx = int(row_str)
    val = df_imputed.at[row_idx, col_name]
    predictions.append(val)

submission = pd.DataFrame({id_col: df_test[id_col], target_col: predictions})
submission.to_csv('submission_result.csv', index=False)

# Save training results
print("Saving results...")
results = {
    'training_loss': history.history['loss'][-1],
    'training_rmse': history.history['rmse'][-1],
    'training_mae': history.history['mae'][-1],
    'validation_loss': history.history['val_loss'][-1],
    'validation_rmse': history.history['val_rmse'] if 'val_rmse' in history.history else history.history['val_rmse'][-1],
    'validation_mae': history.history['val_mae'][-1]
}
with open('results.json', 'w') as f:
    json.dump(results, f)

print("Done.")

2025-07-10 13:21:31.132557: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752153691.157185  966351 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752153691.165268  966351 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752153691.187270  966351 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752153691.187298  966351 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752153691.187300  966351 computation_placer.cc:177] computation placer alr

Loading data...
Preprocessing numeric features...


2025-07-10 13:21:56.725679: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Epoch 1/100




6250/6250 - 14s - 2ms/step - loss: 0.7957 - mae: 0.7047 - rmse: 0.8920 - val_loss: 0.6824 - val_mae: 0.6488 - val_rmse: 0.8261
Epoch 2/100




6250/6250 - 13s - 2ms/step - loss: 0.7670 - mae: 0.6911 - rmse: 0.8758 - val_loss: 0.6699 - val_mae: 0.6427 - val_rmse: 0.8184
Epoch 3/100




6250/6250 - 13s - 2ms/step - loss: 0.7582 - mae: 0.6859 - rmse: 0.8707 - val_loss: 0.6641 - val_mae: 0.6397 - val_rmse: 0.8149
Epoch 4/100
6250/6250 - 13s - 2ms/step - loss: 0.7528 - mae: 0.6827 - rmse: 0.8677 - val_loss: 0.6672 - val_mae: 0.6415 - val_rmse: 0.8169
Epoch 5/100
6250/6250 - 13s - 2ms/step - loss: 0.7499 - mae: 0.6807 - rmse: 0.8660 - val_loss: 0.6678 - val_mae: 0.6422 - val_rmse: 0.8172
Epoch 6/100
6250/6250 - 14s - 2ms/step - loss: 0.7479 - mae: 0.6794 - rmse: 0.8648 - val_loss: 0.6690 - val_mae: 0.6431 - val_rmse: 0.8179
Epoch 7/100
6250/6250 - 13s - 2ms/step - loss: 0.7462 - mae: 0.6783 - rmse: 0.8638 - val_loss: 0.6702 - val_mae: 0.6436 - val_rmse: 0.8187
Epoch 8/100
6250/6250 - 13s - 2ms/step - loss: 0.7446 - mae: 0.6775 - rmse: 0.8629 - val_loss: 0.6712 - val_mae: 0.6447 - val_rmse: 0.8192
Training completed in 105.77 seconds.
Reconstructing and imputing missing values...
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 639us/step
Generating s

In [2]:
print(t_duration)

105.77217316627502


## Keras Tuner - 4 Attempts

## Attempt 1 

In [None]:
# Reproducibility
import os, random, time, json
import numpy as np
import pandas as pd
tf_import_error = False
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
except ImportError:
    tf_import_error = True
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Set seeds
random.seed(42)
np.random.seed(42)
if not tf_import_error:
    tf.random.set_seed(42)

# Load data
print("Loading data...")
df = pd.read_csv('tabular-playground-series-jun-2022/data.csv.zip')
df_test = pd.read_csv('tabular-playground-series-jun-2022/sample_submission.csv.zip')

# Infer identifiers
id_col = 'row-col'
target_col = 'value'

# Identify continuous features
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Drop columns with all missing values
df = df.dropna(axis=1, how='all')
num_cols = [c for c in num_cols if c in df.columns]

# Preprocessing pipeline for numeric features
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Fit and transform training numeric data
print("Preprocessing numeric features...")
X_train_num = num_pipeline.fit_transform(df[num_cols])

# Keras-Tuner model definition
import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Define early stopping and model checkpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

n_features = X_train_num.shape[1]

class MyHyperModel(kt.HyperModel):
    def build(self, hp):
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024, step=64)
        act = hp.Choice('activation', ['relu'])
        drop = hp.Float('dropout', 0.0, 0.5, step=0.1)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        inputs = Input(shape=(n_features,))
        x = inputs
        for _ in range(layers):
            x = Dense(units, activation=act)(x)
            x = Dropout(drop)(x)
        output_layer = Dense(n_features, activation='linear')(x)
        model = Model(inputs, output_layer)
        model.compile(optimizer=opt, loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'), tf.keras.metrics.MeanAbsoluteError(name='mae')])
        return model

# Initialize the tuner
bs = 32  # batch size
ep = 20  # epochs

tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=1,
    executions_per_trial=1,
    seed=42,
    overwrite=True,
    project_name='bayesian_tuner'
)

if y_val is not None:
    tuner.search(
        X_train_num, y_train,
        validation_data=(X_val_num, y_val),
        batch_size=bs, epochs=1,
        callbacks=[early_stopping, checkpoint]
    )
else:
    tuner.search(
        X_train_num, y_train,
        validation_split=0.2,
        batch_size=bs, epochs=1,
        callbacks=[early_stopping, checkpoint]
    )

model = tuner.hypermodel.build(tuner.get_best_hyperparameters(1)[0])

if y_val is not None:
    history = model.fit(
        X_train_num, y_train,
        validation_data=(X_val_num, y_val),
        epochs=1, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )
else:
    history = model.fit(
        X_train_num, y_train,
        validation_split=0.2,
        epochs=1, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )

# Reconstruct and impute missing values
print("Reconstructing and imputing missing values...")
X_train_pred = model.predict(X_train_num)
# Inverse scaling
X_pred_unscaled = num_pipeline.named_steps['scaler'].inverse_transform(X_train_pred)

df_imputed = df.copy()
for i, col in enumerate(num_cols):
    mask = df[col].isna().values
    if mask.any():
        df_imputed.loc[mask, col] = X_pred_unscaled[mask, i]
# Generate submission based on sample_submission
print("Generating submission file...")
predictions = []
for rc in df_test[id_col]:
    row_str, col_name = rc.split('-')
    row_idx = int(row_str)
    val = df_imputed.at[row_idx, col_name]
    predictions.append(val)

submission = pd.DataFrame({id_col: df_test[id_col], target_col: predictions})
submission.to_csv('submission_result.csv', index=False)

# Save training results
print("Saving results...")
results = {
    'training_loss': history.history['loss'][-1],
    'training_rmse': history.history['rmse'][-1],
    'training_mae': history.history['mae'][-1],
    'validation_loss': history.history['val_loss'][-1],
    'validation_rmse': history.history['val_rmse'] if 'val_rmse' in history.history else history.history['val_rmse'][-1],
    'validation_mae': history.history['val_mae'][-1]
}
with open('results.json', 'w') as f:
    json.dump(results, f)

print("Done.")

Loading data...
Preprocessing numeric features...


NameError: name 'y_val' is not defined

## Attempt 2

In [None]:
# Reproducibility
import os, random, time, json
import numpy as np
import pandas as pd
tf_import_error = False
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
except ImportError:
    tf_import_error = True
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Set seeds
random.seed(42)
np.random.seed(42)
if not tf_import_error:
    tf.random.set_seed(42)

# Load data
print("Loading data...")
df = pd.read_csv('tabular-playground-series-jun-2022/data.csv.zip')
df_test = pd.read_csv('tabular-playground-series-jun-2022/sample_submission.csv.zip')

# Infer identifiers
id_col = 'row-col'
target_col = 'value'

# Identify continuous features
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Drop columns with all missing values
df = df.dropna(axis=1, how='all')
num_cols = [c for c in num_cols if c in df.columns]

# Preprocessing pipeline for numeric features
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Fit and transform training numeric data
print("Preprocessing numeric features...")
X_train_num = num_pipeline.fit_transform(df[num_cols])
y_train = df[target_col].values  # Define y_train
X_val_num = None  # Define X_val_num for validation split
y_val = None  # Define y_val for validation split

# Keras-Tuner model definition
import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Define early stopping and model checkpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

n_features = X_train_num.shape[1]

class MyHyperModel(kt.HyperModel):
    def build(self, hp):
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024, step=64)
        act = hp.Choice('activation', ['relu'])
        drop = hp.Float('dropout', 0.0, 0.5, step=0.1)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        inputs = Input(shape=(n_features,))
        x = inputs
        for _ in range(layers):
            x = Dense(units, activation=act)(x)
            x = Dropout(drop)(x)
        output_layer = Dense(n_features, activation='linear')(x)
        model = Model(inputs, output_layer)
        model.compile(optimizer=opt, loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'), tf.keras.metrics.MeanAbsoluteError(name='mae')])
        return model

# Initialize the tuner
bs = 32  # batch size
ep = 20  # epochs

tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=1,
    executions_per_trial=1,
    seed=42,
    overwrite=True,
    project_name='bayesian_tuner'
)

if y_val is not None:
    tuner.search(
        X_train_num, y_train,
        validation_data=(X_val_num, y_val),
        batch_size=bs, epochs=1,
        callbacks=[early_stopping, checkpoint]
    )
else:
    tuner.search(
        X_train_num, y_train,
        validation_split=0.2,
        batch_size=bs, epochs=1,
        callbacks=[early_stopping, checkpoint]
    )

model = tuner.hypermodel.build(tuner.get_best_hyperparameters(1)[0])

# Start timing the training process
start_time = time.time()

if y_val is not None:
    history = model.fit(
        X_train_num, y_train,
        validation_data=(X_val_num, y_val),
        epochs=1, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )
else:
    history = model.fit(
        X_train_num, y_train,
        validation_split=0.2,
        epochs=1, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )

# End timing the training process
end_time = time.time()
training_time = end_time - start_time

# Reconstruct and impute missing values
print("Reconstructing and imputing missing values...")
X_train_pred = model.predict(X_train_num)
# Inverse scaling
X_pred_unscaled = num_pipeline.named_steps['scaler'].inverse_transform(X_train_pred)

df_imputed = df.copy()
for i, col in enumerate(num_cols):
    mask = df[col].isna().values
    if mask.any():
        df_imputed.loc[mask, col] = X_pred_unscaled[mask, i]
# Generate submission based on sample_submission
print("Generating submission file...")
predictions = []
for rc in df_test[id_col]:
    row_str, col_name = rc.split('-')
    row_idx = int(row_str)
    val = df_imputed.at[row_idx, col_name]
    predictions.append(val)

submission = pd.DataFrame({id_col: df_test[id_col], target_col: predictions})
submission.to_csv('submission_result.csv', index=False)

# Save training results
print("Saving results...")
results = {
    'training_loss': history.history['loss'][-1],
    'training_rmse': history.history['rmse'][-1],
    'training_mae': history.history['mae'][-1],
    'validation_loss': history.history['val_loss'][-1],
    'validation_rmse': history.history['val_rmse'][-1] if 'val_rmse' in history.history else None,
    'validation_mae': history.history['val_mae'][-1],
    'training_time': training_time  # Save training time
}
with open('results.json', 'w') as f:
    json.dump(results, f)

print("Done.")

Loading data...
Preprocessing numeric features...


KeyError: 'value'

## Attempt 3

In [None]:
# Reproducibility
import os, random, time, json
import numpy as np
import pandas as pd
tf_import_error = False
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
except ImportError:
    tf_import_error = True
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Set seeds
random.seed(42)
np.random.seed(42)
if not tf_import_error:
    tf.random.set_seed(42)

# Load data
print("Loading data...")
df = pd.read_csv('tabular-playground-series-jun-2022/data.csv.zip')
df_test = pd.read_csv('tabular-playground-series-jun-2022/sample_submission.csv.zip')

# Infer identifiers
id_col = 'row-col'
target_col = 'value'

# Check if target column exists in the test data
if target_col not in df_test.columns:
    raise KeyError(f"Target column '{target_col}' not found in the sample submission dataframe.")

# Identify continuous features
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Drop columns with all missing values
df = df.dropna(axis=1, how='all')
num_cols = [c for c in num_cols if c in df.columns]

# Preprocessing pipeline for numeric features
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Fit and transform training numeric data
print("Preprocessing numeric features...")
X_train_num = num_pipeline.fit_transform(df[num_cols])
y_train = df_test[target_col].values  # Define y_train from test data
X_val_num = None  # Define X_val_num for validation split
y_val = None  # Define y_val for validation split

# Keras-Tuner model definition
import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Define early stopping and model checkpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

n_features = X_train_num.shape[1]

class MyHyperModel(kt.HyperModel):
    def build(self, hp):
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024, step=64)
        act = hp.Choice('activation', ['relu'])
        drop = hp.Float('dropout', 0.0, 0.5, step=0.1)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        inputs = Input(shape=(n_features,))
        x = inputs
        for _ in range(layers):
            x = Dense(units, activation=act)(x)
            x = Dropout(drop)(x)
        output_layer = Dense(1, activation='linear')(x)  # Change output layer to have 1 unit
        model = Model(inputs, output_layer)
        model.compile(optimizer=opt, loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'), tf.keras.metrics.MeanAbsoluteError(name='mae')])
        return model

# Initialize the tuner
bs = 32  # batch size
ep = 20  # epochs

tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=1,
    executions_per_trial=1,
    seed=42,
    overwrite=True,
    project_name='bayesian_tuner'
)

if y_val is not None:
    tuner.search(
        X_train_num, y_train,
        validation_data=(X_val_num, y_val),
        batch_size=bs, epochs=1,
        callbacks=[early_stopping, checkpoint]
    )
else:
    tuner.search(
        X_train_num, y_train,
        validation_split=0.2,
        batch_size=bs, epochs=1,
        callbacks=[early_stopping, checkpoint]
    )

model = tuner.hypermodel.build(tuner.get_best_hyperparameters(1)[0])

# Start timing the training process
start_time = time.time()

if y_val is not None:
    history = model.fit(
        X_train_num, y_train,
        validation_data=(X_val_num, y_val),
        epochs=1, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )
else:
    history = model.fit(
        X_train_num, y_train,
        validation_split=0.2,
        epochs=1, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )

# End timing the training process
end_time = time.time()
training_time = end_time - start_time

# Reconstruct and impute missing values
print("Reconstructing and imputing missing values...")
X_train_pred = model.predict(X_train_num)
# Inverse scaling
X_pred_unscaled = num_pipeline.named_steps['scaler'].inverse_transform(X_train_pred)

df_imputed = df.copy()
for i, col in enumerate(num_cols):
    mask = df[col].isna().values
    if mask.any():
        df_imputed.loc[mask, col] = X_pred_unscaled[mask, i]
# Generate submission based on sample_submission
print("Generating submission file...")
predictions = []
for rc in df_test[id_col]:
    row_str, col_name = rc.split('-')
    row_idx = int(row_str)
    val = df_imputed.at[row_idx, col_name]
    predictions.append(val)

submission = pd.DataFrame({id_col: df_test[id_col], target_col: predictions})
submission.to_csv('submission_result.csv', index=False)

# Save training results
print("Saving results...")
results = {
    'training_loss': history.history['loss'][-1],
    'training_rmse': history.history['rmse'][-1],
    'training_mae': history.history['mae'][-1],
    'validation_loss': history.history['val_loss'][-1],
    'validation_rmse': history.history['val_rmse'][-1] if 'val_rmse' in history.history else None,
    'validation_mae': history.history['val_mae'][-1],
    'training_time': training_time  # Save training time
}
with open('results.json', 'w') as f:
    json.dump(results, f)

print("Done.")

Trial 1 Complete [00h 00m 52s]
val_loss: 3.261597602785028e-14

Best val_loss So Far: 3.261597602785028e-14
Total elapsed time: 00h 00m 52s




25000/25000 - 50s - 2ms/step - loss: 1.6314e-05 - mae: 3.5707e-04 - rmse: 0.0040 - val_loss: 5.9116e-12 - val_mae: 2.4315e-06 - val_rmse: 2.4314e-06
Reconstructing and imputing missing values...
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 639us/step


ValueError: non-broadcastable output operand with shape (1000000,1) doesn't match the broadcast shape (1000000,81)

## Attempt 4

In [1]:
# Reproducibility
import os, random, time, json
import numpy as np
import pandas as pd
tf_import_error = False
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
except ImportError:
    tf_import_error = True
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Set seeds
random.seed(42)
np.random.seed(42)
if not tf_import_error:
    tf.random.set_seed(42)

# Load data
print("Loading data...")
df = pd.read_csv('data.csv.zip')
df_test = pd.read_csv('sample_submission.csv.zip')

# Infer identifiers
id_col = 'row-col'
target_col = 'value'

# Check if target column exists in the test data
if target_col not in df_test.columns:
    raise KeyError(f"Target column '{target_col}' not found in the sample submission dataframe.")

# Identify continuous features
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Drop columns with all missing values
df = df.dropna(axis=1, how='all')
num_cols = [c for c in num_cols if c in df.columns]

# Preprocessing pipeline for numeric features
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Fit and transform training numeric data
print("Preprocessing numeric features...")
X_train_num = num_pipeline.fit_transform(df[num_cols])
y_train = df_test[target_col].values  # Define y_train from test data
X_val_num = None  # Define X_val_num for validation split
y_val = None  # Define y_val for validation split

# Keras-Tuner model definition
import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Define early stopping and model checkpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

n_features = X_train_num.shape[1]

class MyHyperModel(kt.HyperModel):
    def build(self, hp):
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024, step=64)
        act = hp.Choice('activation', ['relu'])
        drop = hp.Float('dropout', 0.0, 0.5, step=0.1)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        inputs = Input(shape=(n_features,))
        x = inputs
        for _ in range(layers):
            x = Dense(units, activation=act)(x)
            x = Dropout(drop)(x)
        output_layer = Dense(1, activation='linear')(x)  # Change output layer to have 1 unit
        model = Model(inputs, output_layer)
        model.compile(optimizer=opt, loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'), tf.keras.metrics.MeanAbsoluteError(name='mae')])
        return model

# Initialize the tuner
bs = 32  # batch size
ep = 20  # epochs

tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    overwrite=True,
    project_name='bayesian_tuner'
)

if y_val is not None:
    tuner.search(
        X_train_num, y_train,
        validation_data=(X_val_num, y_val),
        batch_size=bs, epochs=ep,
        callbacks=[early_stopping, checkpoint]
    )
else:
    tuner.search(
        X_train_num, y_train,
        validation_split=0.2,
        batch_size=bs, epochs=ep,
        callbacks=[early_stopping, checkpoint]
    )

model = tuner.hypermodel.build(tuner.get_best_hyperparameters(1)[0])

# Start timing the training process
start_time = time.time()

if y_val is not None:
    history = model.fit(
        X_train_num, y_train,
        validation_data=(X_val_num, y_val),
        epochs=100, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )
else:
    history = model.fit(
        X_train_num, y_train,
        validation_split=0.2,
        epochs=100, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )

# End timing the training process
end_time = time.time()
training_time = end_time - start_time

# Reconstruct and impute missing values
print("Reconstructing and imputing missing values...")
X_train_pred = model.predict(X_train_num)
# Inverse scaling
X_pred_unscaled = num_pipeline.named_steps['scaler'].inverse_transform(np.repeat(X_train_pred, n_features, axis=1))

df_imputed = df.copy()
for i, col in enumerate(num_cols):
    mask = df[col].isna().values
    if mask.any():
        df_imputed.loc[mask, col] = X_pred_unscaled[mask, i]
# Generate submission based on sample_submission
print("Generating submission file...")
predictions = []
for rc in df_test[id_col]:
    row_str, col_name = rc.split('-')
    row_idx = int(row_str)
    val = df_imputed.at[row_idx, col_name]
    predictions.append(val)

submission = pd.DataFrame({id_col: df_test[id_col], target_col: predictions})
submission.to_csv('submission_result.csv', index=False)

# Save training results
print("Saving results...")
results = {
    'training_loss': history.history['loss'][-1],
    'training_rmse': history.history['rmse'][-1],
    'training_mae': history.history['mae'][-1],
    'validation_loss': history.history['val_loss'][-1],
    'validation_rmse': history.history['val_rmse'][-1] if 'val_rmse' in history.history else None,
    'validation_mae': history.history['val_mae'][-1],
    'training_time': training_time  # Save training time
}
with open('results.json', 'w') as f:
    json.dump(results, f)

print("Done.")

Trial 10 Complete [00h 39m 40s]
val_loss: 2.292622398850879e-22

Best val_loss So Far: 0.0
Total elapsed time: 05h 30m 06s
Epoch 1/100




25000/25000 - 100s - 4ms/step - loss: 0.0013 - mae: 0.0050 - rmse: 0.0361 - val_loss: 2.3745e-10 - val_mae: 1.5296e-05 - val_rmse: 1.5409e-05
Epoch 2/100




25000/25000 - 104s - 4ms/step - loss: 1.9702e-08 - mae: 1.4401e-05 - rmse: 1.4037e-04 - val_loss: 1.4026e-21 - val_mae: 3.7449e-11 - val_rmse: 3.7451e-11
Epoch 3/100
25000/25000 - 92s - 4ms/step - loss: 3.1575e-09 - mae: 8.0853e-06 - rmse: 5.6192e-05 - val_loss: 1.7438e-18 - val_mae: 1.3206e-09 - val_rmse: 1.3205e-09
Epoch 4/100
25000/25000 - 83s - 3ms/step - loss: 1.0912e-09 - mae: 7.7536e-06 - rmse: 3.3033e-05 - val_loss: 2.3983e-12 - val_mae: 1.5488e-06 - val_rmse: 1.5486e-06
Epoch 5/100
25000/25000 - 82s - 3ms/step - loss: 1.8032e-09 - mae: 7.7024e-06 - rmse: 4.2464e-05 - val_loss: 2.2981e-09 - val_mae: 4.7935e-05 - val_rmse: 4.7938e-05
Epoch 6/100
25000/25000 - 82s - 3ms/step - loss: 1.1831e-09 - mae: 7.1936e-06 - rmse: 3.4396e-05 - val_loss: 1.7892e-20 - val_mae: 1.3375e-10 - val_rmse: 1.3376e-10
Epoch 7/100
25000/25000 - 84s - 3ms/step - loss: 2.6238e-09 - mae: 8.4071e-06 - rmse: 5.1224e-05 - val_loss: 1.0325e-12 - val_mae: 1.0161e-06 - val_rmse: 1.0161e-06
Epoch 8/100




25000/25000 - 99s - 4ms/step - loss: 4.7698e-09 - mae: 6.9596e-06 - rmse: 6.9064e-05 - val_loss: 3.1318e-28 - val_mae: 1.7696e-14 - val_rmse: 1.7697e-14
Epoch 9/100
25000/25000 - 87s - 3ms/step - loss: 9.8873e-10 - mae: 7.1351e-06 - rmse: 3.1444e-05 - val_loss: 8.8613e-19 - val_mae: 9.4133e-10 - val_rmse: 9.4134e-10
Epoch 10/100
25000/25000 - 92s - 4ms/step - loss: 1.6177e-09 - mae: 7.1248e-06 - rmse: 4.0220e-05 - val_loss: 1.3361e-24 - val_mae: 1.1559e-12 - val_rmse: 1.1559e-12
Epoch 11/100
25000/25000 - 96s - 4ms/step - loss: 1.0952e-09 - mae: 7.3512e-06 - rmse: 3.3094e-05 - val_loss: 3.2377e-22 - val_mae: 1.7993e-11 - val_rmse: 1.7994e-11
Epoch 12/100
25000/25000 - 90s - 4ms/step - loss: 1.0238e-09 - mae: 7.2997e-06 - rmse: 3.1997e-05 - val_loss: 3.3752e-25 - val_mae: 5.8092e-13 - val_rmse: 5.8097e-13
Epoch 13/100
25000/25000 - 95s - 4ms/step - loss: 8.3824e-10 - mae: 7.4283e-06 - rmse: 2.8952e-05 - val_loss: 9.9247e-23 - val_mae: 9.9619e-12 - val_rmse: 9.9623e-12
Epoch 14/100
25000



25000/25000 - 96s - 4ms/step - loss: 8.1059e-10 - mae: 7.5536e-06 - rmse: 2.8471e-05 - val_loss: 6.6034e-30 - val_mae: 2.5697e-15 - val_rmse: 2.5697e-15
Epoch 19/100
25000/25000 - 99s - 4ms/step - loss: 8.0869e-10 - mae: 7.5721e-06 - rmse: 2.8437e-05 - val_loss: 8.0671e-25 - val_mae: 8.9825e-13 - val_rmse: 8.9817e-13
Epoch 20/100
25000/25000 - 99s - 4ms/step - loss: 8.3784e-10 - mae: 7.5306e-06 - rmse: 2.8945e-05 - val_loss: 8.3632e-14 - val_mae: 2.8918e-07 - val_rmse: 2.8919e-07
Epoch 21/100
25000/25000 - 98s - 4ms/step - loss: 7.9404e-10 - mae: 7.5836e-06 - rmse: 2.8179e-05 - val_loss: 2.2045e-24 - val_mae: 1.4848e-12 - val_rmse: 1.4848e-12
Epoch 22/100
25000/25000 - 84s - 3ms/step - loss: 8.1956e-10 - mae: 7.5995e-06 - rmse: 2.8628e-05 - val_loss: 1.6140e-24 - val_mae: 1.2704e-12 - val_rmse: 1.2704e-12
Epoch 23/100
25000/25000 - 91s - 4ms/step - loss: 1.0234e-09 - mae: 7.2188e-06 - rmse: 3.1991e-05 - val_loss: 2.7571e-16 - val_mae: 1.6604e-08 - val_rmse: 1.6604e-08
Epoch 24/100




25000/25000 - 93s - 4ms/step - loss: 1.3039e-09 - mae: 7.1929e-06 - rmse: 3.6110e-05 - val_loss: 0.0000e+00 - val_mae: 0.0000e+00 - val_rmse: 0.0000e+00
Epoch 25/100
25000/25000 - 99s - 4ms/step - loss: 9.0556e-10 - mae: 6.9018e-06 - rmse: 3.0093e-05 - val_loss: 2.3404e-20 - val_mae: 1.5298e-10 - val_rmse: 1.5298e-10
Epoch 26/100
25000/25000 - 99s - 4ms/step - loss: 8.2250e-10 - mae: 7.7547e-06 - rmse: 2.8679e-05 - val_loss: 3.0559e-09 - val_mae: 5.5283e-05 - val_rmse: 5.5280e-05
Epoch 27/100
25000/25000 - 92s - 4ms/step - loss: 1.4010e-09 - mae: 7.0487e-06 - rmse: 3.7430e-05 - val_loss: 0.0000e+00 - val_mae: 1.5604e-20 - val_rmse: 0.0000e+00
Epoch 28/100
25000/25000 - 97s - 4ms/step - loss: 8.0021e-10 - mae: 7.6853e-06 - rmse: 2.8288e-05 - val_loss: 1.6498e-23 - val_mae: 4.0620e-12 - val_rmse: 4.0618e-12
Epoch 29/100
25000/25000 - 102s - 4ms/step - loss: 1.0634e-09 - mae: 7.2130e-06 - rmse: 3.2609e-05 - val_loss: 1.2435e-18 - val_mae: 1.1151e-09 - val_rmse: 1.1151e-09
Epoch 30/100
250

In [None]:
#Training Time saved in results.json