In [2]:
import random, json, time
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout

# Seed the Python, NumPy and TensorFlow random number generators with the
# same value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# Load data
# The sample submission defines the schema: its first column is the row id
# and every remaining column is a prediction target.
sample_sub = pd.read_csv('playground-series-s5e5/sample_submission.csv')
id_col, *target_columns = sample_sub.columns

df_train = pd.read_csv('playground-series-s5e5/train.csv')
df_test = pd.read_csv('playground-series-s5e5/test.csv')

# Work on a copy so df_train keeps the data exactly as it was loaded.
df = df_train.copy()

# Regression target, modelled in log1p space.
# NOTE(review): log1p presumes every target value is >= 0 -- this is not
# checked here; confirm against the data.
y_values = df[target_columns].astype(float).to_numpy()
y_enc = np.log1p(y_values)

# Feature frames: everything except the id and target columns. Columns that
# are absent from a frame are skipped silently (errors='ignore').
non_feature_cols = [*target_columns, id_col]
X = df.drop(columns=non_feature_cols, errors='ignore')
X_val = df_test.drop(columns=non_feature_cols, errors='ignore')

# No hold-out set is carved out here: all of train is used for fitting, and
# X_val holds the *test* features, which have no labels (hence y_val = None).
X_train = X.copy()
y_train = y_enc
y_val = None

# Row identifiers for each frame.
train_ids = df[id_col]
test_ids = df_test[id_col]

# Feature engineering: remove columns that hold no values at all in train.
is_all_missing = X_train.isna().all()
all_missing = [col for col, empty in is_all_missing.items() if empty]
for frame in (X_train, X_val):
    # errors='ignore' tolerates a column that is absent from this frame.
    frame.drop(columns=all_missing, inplace=True, errors='ignore')

# Categorical handling: object/category columns with more than 50 distinct
# values in train are dropped from both frames.
categorical = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
cardinality = X_train[categorical].nunique()
high_card = cardinality[cardinality > 50].index.tolist()
for frame in (X_train, X_val):
    frame.drop(columns=high_card, inplace=True, errors='ignore')

# Preprocessing pipeline
# Column groups are derived from X_train as it stands at this point, split
# by dtype into numeric and object/category columns.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.to_list()
cat_features = X_train.select_dtypes(include=['object', 'category']).columns.to_list()

# Numeric columns: fill gaps with the median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array. handle_unknown='ignore' is set so categories
# not seen during fit do not raise at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_features),
    ('cat', cat_pipeline, cat_features),
])

# Fit on the training frame only; X_val is transformed with the fitted
# preprocessor.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# Model: an MLP with two hidden layers whose widths scale with the number of
# processed features, capped at 128 and 64 units respectively.
n_samples, n_features = X_train_proc.shape
units1 = min(n_features * 2, 128)
units2 = min(n_features, 64)

inputs = Input(shape=(n_features,))
x = inputs
for hidden_units in (units1, units2):
    # Each hidden stage is a ReLU Dense layer followed by 30% dropout.
    # Dense is constructed before Dropout, matching the layer creation order
    # of a hand-unrolled stack.
    x = Dense(hidden_units, activation='relu')(x)
    x = Dropout(0.3)(x)
# One linear output unit per target column.
outputs = Dense(len(target_columns), activation='linear')(x)
model = Model(inputs, outputs)

# Compile: optimise MSE; track RMSE and MAE alongside it.
tracked_metrics = [
    RootMeanSquaredError(name='rmse'),
    MeanAbsoluteError(name='mae'),
]
model.compile(optimizer='adam', loss='mean_squared_error', metrics=tracked_metrics)

# Callbacks & Training
# Both callbacks monitor val_loss: early stopping uses a patience of 10
# epochs with restore_best_weights=True, and the checkpoint writes to
# 'model_best.h5' with save_best_only=True.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint('model_best.h5', monitor='val_loss', save_best_only=True)
callbacks = [early_stopping, checkpoint]

start_time = time.time()
fit_options = dict(epochs=100, batch_size=64, callbacks=callbacks, verbose=2)
if y_val is None:
    # No labelled validation set: validate on a 20% validation_split of the
    # training arrays.
    fit_options['validation_split'] = 0.2
else:
    fit_options['validation_data'] = (X_val_proc, y_val)
history = model.fit(X_train_proc, y_train, **fit_options)
# Wall-clock training time in seconds.
duration = time.time() - start_time

# Evaluation & Logging
hist = history.history

# Training uses EarlyStopping(monitor='val_loss', restore_best_weights=True),
# so the model that goes on to predict carries the weights of the epoch with
# the lowest val_loss, not those of the last epoch run. Report metrics from
# that same epoch so results.json describes the model actually used.
val_losses = np.asarray(hist['val_loss'], dtype=float)
if np.isfinite(val_losses).any():
    # First occurrence of the minimum, ignoring NaN epochs.
    best_epoch = int(np.nanargmin(val_losses))
else:
    # No usable validation loss at all: fall back to the final epoch.
    best_epoch = len(val_losses) - 1

# NOTE: the '*_accuracy' keys hold RMSE values (this is a regression task);
# the key names are kept unchanged for whatever consumes results.json.
results = {
    'training_accuracy': float(hist['rmse'][best_epoch]),
    'training_loss': float(hist['loss'][best_epoch]),
    'validation_accuracy': float(hist['val_rmse'][best_epoch]),
    'validation_loss': float(hist['val_loss'][best_epoch])
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# Prediction & Submission
raw_preds = model.predict(X_val_proc)

# The model was trained on log1p(target). The code that builds the target
# states the values are >= 0, which makes every valid log-space value >= 0,
# but the linear output layer is unbounded and can undershoot. Clip at 0
# before inverting so expm1 never yields a negative prediction.
final = np.expm1(np.clip(raw_preds, 0.0, None))
if final.ndim == 1:
    # Guarantee a 2-D (rows, targets) array for the DataFrame constructor.
    final = final.reshape(-1, 1)

submission = pd.DataFrame(final, columns=target_columns)
# Put the id column first; reset the index so ids align positionally with
# the prediction rows.
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Epoch 1/100




9375/9375 - 12s - 1ms/step - loss: 1.1184 - mae: 0.7020 - rmse: 1.0576 - val_loss: 0.1161 - val_mae: 0.2319 - val_rmse: 0.3408
Epoch 2/100




9375/9375 - 11s - 1ms/step - loss: 0.1690 - mae: 0.2813 - rmse: 0.4111 - val_loss: 0.0895 - val_mae: 0.2125 - val_rmse: 0.2991
Epoch 3/100
9375/9375 - 11s - 1ms/step - loss: 0.1613 - mae: 0.2749 - rmse: 0.4016 - val_loss: 0.0996 - val_mae: 0.2212 - val_rmse: 0.3156
Epoch 4/100
9375/9375 - 11s - 1ms/step - loss: 0.1597 - mae: 0.2729 - rmse: 0.3996 - val_loss: 0.1386 - val_mae: 0.2598 - val_rmse: 0.3723
Epoch 5/100
9375/9375 - 11s - 1ms/step - loss: 0.1590 - mae: 0.2722 - rmse: 0.3987 - val_loss: 0.2138 - val_mae: 0.3036 - val_rmse: 0.4624
Epoch 6/100
9375/9375 - 11s - 1ms/step - loss: 0.1509 - mae: 0.2642 - rmse: 0.3884 - val_loss: 0.1937 - val_mae: 0.2894 - val_rmse: 0.4402
Epoch 7/100
9375/9375 - 11s - 1ms/step - loss: 0.1516 - mae: 0.2643 - rmse: 0.3894 - val_loss: 0.1977 - val_mae: 0.3071 - val_rmse: 0.4446
Epoch 8/100
9375/9375 - 20s - 2ms/step - loss: 0.1519 - mae: 0.2642 - rmse: 0.3897 - val_loss: 0.1819 - val_mae: 0.2799 - val_rmse: 0.4266
Epoch 9/100
9375/9375 - 11s - 1ms/step 