# Forecasting Sticker Sales



## Keras - 4 Attempt, no results in the submission file


# Attempt 1

In [3]:
import os
import random
import numpy as np
import pandas as pd
import time
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

# 1. Reproducibility
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
numpy_seed = np.random.seed(seed)
tf.random.set_seed(seed)

# 2. Load data
train_df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
submission_template = pd.read_csv('sample_submission.csv')

# Infer id and target
id_col = 'id'
target_col = 'num_sold'

# 3. Encode target for continuous regression
y_values = train_df[[target_col]].astype(float).values
if np.all(y_values >= 0):
    y_enc = np.log1p(y_values)
else:
    y_enc = y_values

# Prepare training features
X = train_df.drop(columns=[target_col, id_col], errors='ignore')
train_ids = train_df[id_col]

# Prepare test features
test_ids = df_test[id_col]
X_test = df_test.drop(columns=[target_col, id_col], errors='ignore')

# 4. Feature engineering: drop all-missing columns
X = X.dropna(axis=1, how='all')
X_test = X_test[X.columns]

# Extract date features
if 'date' in X.columns:
    for df_ in [X, X_test]:
        df_['date'] = pd.to_datetime(df_['date'])
        df_['year'] = df_['date'].dt.year
        df_['month'] = df_['date'].dt.month
        df_['day'] = df_['date'].dt.day
        df_['weekday'] = df_['date'].dt.dayofweek
    X = X.drop(columns=['date'])
    X_test = X_test.drop(columns=['date'])

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
high_card_cols = [col for col in cat_cols if X[col].nunique() > 50]
# Drop high cardinality
X = X.drop(columns=high_card_cols)
X_test = X_test.drop(columns=high_card_cols)
cat_cols = [c for c in cat_cols if c not in high_card_cols]
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# 5. Preprocessing pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# 6. Model architecture for continuous regression
input_dim = X_proc.shape[1]
if X_proc.shape[0] < 10000 or input_dim < 100:
    units1 = min(input_dim*2, 128)
    units2 = min(input_dim, 64)
    dropout_rate = 0.3
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(units1, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(units2, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(1, activation='linear')
    ])
else:
    units = [min(int(input_dim * i), 1024) for i in (2, 1, 0.5, 0.25)]
    units = [u for u in units if u >= 16]
    layers = [tf.keras.layers.Input(shape=(input_dim,))]
    for u in units:
        layers.append(tf.keras.layers.Dense(u, activation='relu'))
        layers.append(tf.keras.layers.BatchNormalization())
        layers.append(tf.keras.layers.Dropout(0.4))
    layers.append(tf.keras.layers.Dense(1, activation='linear'))
    model = tf.keras.Sequential(layers)


def mse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.reduce_mean(tf.square(y_true - y_pred))
mse_real.__name__ = 'mse_real'      

def rmse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))
rmse_real.__name__ = 'rmse_real'


# 7. Compile the model
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[mse_real,rmse_real]
)

# 8. Callbacks and training
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)
]
start_time = time.time()
history = model.fit(
    X_proc, y_enc,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
    verbose=2
)
duration = time.time() - start_time

# 9. Evaluation and logging
results = {
    'training_loss': history.history['mse_real'][-1],
    'validation_loss': history.history['val_mse_real'][-1],
    'training_RMSE': history.history['rmse_real'][-1],
    'validation_RMSE': history.history['val_rmse_real'][-1]
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# 10. Prediction and submission
raw_preds = model.predict(X_test_proc)
final = raw_preds
if np.all(final >= 0):
    final = np.expm1(final)
final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=[target_col])
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)


Epoch 1/100
2877/2877 - 5s - 2ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 2/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 3/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 4/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 5/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 6/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 7/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 8/100
2877/287

# Attempt 2

In [9]:
import os
import random
import numpy as np
import pandas as pd
import time
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

# 1. Reproducibility
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
numpy_seed = np.random.seed(seed)
tf.random.set_seed(seed)

# 2. Load data
train_df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
submission_template = pd.read_csv('sample_submission.csv')

# Infer id and target
id_col = 'id'
target_col = 'num_sold'

# 3. Encode target for continuous regression
y_values = train_df[[target_col]].astype(float).values
if np.all(y_values >= 0):
    y_enc = np.log1p(y_values)
else:
    y_enc = y_values

# Prepare training features
X = train_df.drop(columns=[target_col, id_col], errors='ignore')
train_ids = train_df[id_col]

# Prepare test features
test_ids = df_test[id_col]
X_test = df_test.drop(columns=[target_col, id_col], errors='ignore')

# 4. Feature engineering: drop all-missing columns
X = X.dropna(axis=1, how='all')
X_test = X_test[X.columns]

# Extract date features
if 'date' in X.columns:
    for df_ in [X, X_test]:
        df_['date'] = pd.to_datetime(df_['date'])
        df_['year'] = df_['date'].dt.year
        df_['month'] = df_['date'].dt.month
        df_['day'] = df_['date'].dt.day
        df_['weekday'] = df_['date'].dt.dayofweek
    X = X.drop(columns=['date'])
    X_test = X_test.drop(columns=['date'])

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
high_card_cols = [col for col in cat_cols if X[col].nunique() > 50]
# Drop high cardinality
X = X.drop(columns=high_card_cols)
X_test = X_test.drop(columns=high_card_cols)
cat_cols = [c for c in cat_cols if c not in high_card_cols]
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# 5. Preprocessing pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# 6. Model architecture for continuous regression
input_dim = X_proc.shape[1]
if X_proc.shape[0] < 10000 or input_dim < 100:
    units1 = min(input_dim*2, 128)
    units2 = min(input_dim, 64)
    dropout_rate = 0.3
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(units1, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(units2, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(1, activation='linear')
    ])
else:
    units = [min(int(input_dim * i), 1024) for i in (2, 1, 0.5, 0.25)]
    units = [u for u in units if u >= 16]
    layers = [tf.keras.layers.Input(shape=(input_dim,))]
    for u in units:
        layers.append(tf.keras.layers.Dense(u, activation='relu'))
        layers.append(tf.keras.layers.BatchNormalization())
        layers.append(tf.keras.layers.Dropout(0.4))
    layers.append(tf.keras.layers.Dense(1, activation='linear'))
    model = tf.keras.Sequential(layers)

def mse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.reduce_mean(tf.square(y_true - y_pred))
mse_real.__name__ = 'mse_real'      

def rmse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))
rmse_real.__name__ = 'rmse_real'

# 7. Compile the model
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[mse_real, rmse_real]
)

# 8. Callbacks and training
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)
]
start_time = time.time()
history = model.fit(
    X_proc, y_enc,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
    verbose=2
)
duration = time.time() - start_time

# 9. Evaluation and logging
results = {
    'training_rmse': history.history['rmse_real'][-1],  # Changed to RMSE for accuracy
    'training_loss': history.history['mse_real'][-1],
    'validation_rmse': history.history['val_rmse_real'][-1],  # Changed to RMSE for accuracy
    'validation_loss': history.history['val_mse_real'][-1],
    'training_duration': duration
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# 10. Prediction and submission
raw_preds = model.predict(X_test_proc)
final = raw_preds
if np.all(final >= 0):
    final = np.expm1(final)
final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=[target_col])
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Epoch 1/100
2877/2877 - 6s - 2ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 2/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 3/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 4/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 5/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 6/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 7/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 8/100
2877/287

In [8]:
print(duration)

44.87995481491089


# Attempt 3


Hint: 
```bash
The code doesnt provide output into the submission file, all metrics show NAN during training, check data preprocessing

```

In [10]:
import os
import random
import numpy as np
import pandas as pd
import time
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# 1. Reproducibility
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# 2. Load data
train_df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
submission_template = pd.read_csv('sample_submission.csv')

# Infer id and target
id_col = 'id'
target_col = 'num_sold'

# 3. Encode target for continuous regression
y_values = train_df[[target_col]].astype(float).values
if np.all(y_values >= 0):
    y_enc = np.log1p(y_values)
else:
    y_enc = y_values

# Prepare training features
X = train_df.drop(columns=[target_col, id_col], errors='ignore')
train_ids = train_df[id_col]

# Prepare test features
test_ids = df_test[id_col]
X_test = df_test.drop(columns=[target_col, id_col], errors='ignore')

# 4. Feature engineering: drop all-missing columns
X = X.dropna(axis=1, how='all')
X_test = X_test[X.columns]

# Extract date features
if 'date' in X.columns:
    for df_ in [X, X_test]:
        df_['date'] = pd.to_datetime(df_['date'])
        df_['year'] = df_['date'].dt.year
        df_['month'] = df_['date'].dt.month
        df_['day'] = df_['date'].dt.day
        df_['weekday'] = df_['date'].dt.dayofweek
    X = X.drop(columns=['date'])
    X_test = X_test.drop(columns=['date'])

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
high_card_cols = [col for col in cat_cols if X[col].nunique() > 50]
# Drop high cardinality
X = X.drop(columns=high_card_cols)
X_test = X_test.drop(columns=high_card_cols)
cat_cols = [c for c in cat_cols if c not in high_card_cols]
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# 5. Preprocessing pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# 6. Model architecture for continuous regression
input_dim = X_proc.shape[1]
if X_proc.shape[0] < 10000 or input_dim < 100:
    units1 = min(input_dim*2, 128)
    units2 = min(input_dim, 64)
    dropout_rate = 0.3
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(units1, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(units2, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(1, activation='linear')
    ])
else:
    units = [min(int(input_dim * i), 1024) for i in (2, 1, 0.5, 0.25)]
    units = [u for u in units if u >= 16]
    layers = [tf.keras.layers.Input(shape=(input_dim,))]
    for u in units:
        layers.append(tf.keras.layers.Dense(u, activation='relu'))
        layers.append(tf.keras.layers.BatchNormalization())
        layers.append(tf.keras.layers.Dropout(0.4))
    layers.append(tf.keras.layers.Dense(1, activation='linear'))
    model = tf.keras.Sequential(layers)

def mse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.reduce_mean(tf.square(y_true - y_pred))
mse_real.__name__ = 'mse_real'      

def rmse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))
rmse_real.__name__ = 'rmse_real'

# 7. Compile the model
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[mse_real, rmse_real]
)

# 8. Callbacks and training
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)
]
start_time = time.time()
history = model.fit(
    X_proc, y_enc,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
    verbose=2
)
duration = time.time() - start_time

# 9. Evaluation and logging
results = {
    'training_rmse': history.history['rmse_real'][-1],  # Changed to RMSE for accuracy
    'training_loss': history.history['mse_real'][-1],
    'validation_rmse': history.history['val_rmse_real'][-1],  # Changed to RMSE for accuracy
    'validation_loss': history.history['val_mse_real'][-1]
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# 10. Prediction and submission
raw_preds = model.predict(X_test_proc)
final = raw_preds
if np.all(final >= 0):
    final = np.expm1(final)
final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=[target_col])
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Epoch 1/100
2877/2877 - 5s - 2ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 2/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 3/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 4/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 5/100
2877/2877 - 4s - 2ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 6/100
2877/2877 - 5s - 2ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 7/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 8/100
2877/287

# Attempt 4 

Hint:

```bash
I have a Keras regression script that uses scikit‑learn pipelines:
– ColumnTransformer with numeric + categorical pipelines (OneHotEncoder)
– custom metrics in model.compile
It runs but prints no metrics and writes no output files.
Possible culprits: wrong OneHotEncoder arg (sparse_output vs sparse), empty num/cat lists after preprocessing, or target y shaped (n,1) instead of (n,).
List the quickest code tweaks to make the model train visibly and save results.
```

In [12]:
import os
import random
import numpy as np
import pandas as pd
import time
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

# 1. Reproducibility
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
numpy_seed = np.random.seed(seed)
tf.random.set_seed(seed)

# 2. Load data
train_df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
submission_template = pd.read_csv('sample_submission.csv')

# Infer id and target
id_col = 'id'
target_col = 'num_sold'

# 3. Encode target for continuous regression
y_values = train_df[[target_col]].astype(float).values
if np.all(y_values >= 0):
    y_enc = np.log1p(y_values)
else:
    y_enc = y_values

# Prepare training features
X = train_df.drop(columns=[target_col, id_col], errors='ignore')
train_ids = train_df[id_col]

# Prepare test features
test_ids = df_test[id_col]
X_test = df_test.drop(columns=[target_col, id_col], errors='ignore')

# 4. Feature engineering: drop all-missing columns
X = X.dropna(axis=1, how='all')
X_test = X_test[X.columns]

# Extract date features
if 'date' in X.columns:
    for df_ in [X, X_test]:
        df_['date'] = pd.to_datetime(df_['date'])
        df_['year'] = df_['date'].dt.year
        df_['month'] = df_['date'].dt.month
        df_['day'] = df_['date'].dt.day
        df_['weekday'] = df_['date'].dt.dayofweek
    X = X.drop(columns=['date'])
    X_test = X_test.drop(columns=['date'])

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
high_card_cols = [col for col in cat_cols if X[col].nunique() > 50]
# Drop high cardinality
X = X.drop(columns=high_card_cols)
X_test = X_test.drop(columns=high_card_cols)
cat_cols = [c for c in cat_cols if c not in high_card_cols]
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# 5. Preprocessing pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))  
])
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# 6. Model architecture for continuous regression
input_dim = X_proc.shape[1]
if X_proc.shape[0] < 10000 or input_dim < 100:
    units1 = min(input_dim*2, 128)
    units2 = min(input_dim, 64)
    dropout_rate = 0.3
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(units1, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(units2, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(1, activation='linear')
    ])
else:
    units = [min(int(input_dim * i), 1024) for i in (2, 1, 0.5, 0.25)]
    units = [u for u in units if u >= 16]
    layers = [tf.keras.layers.Input(shape=(input_dim,))]
    for u in units:
        layers.append(tf.keras.layers.Dense(u, activation='relu'))
        layers.append(tf.keras.layers.BatchNormalization())
        layers.append(tf.keras.layers.Dropout(0.4))
    layers.append(tf.keras.layers.Dense(1, activation='linear'))
    model = tf.keras.Sequential(layers)


def mse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.reduce_mean(tf.square(y_true - y_pred))
mse_real.__name__ = 'mse_real'      

def rmse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))
rmse_real.__name__ = 'rmse_real'


# 7. Compile the model
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[mse_real, rmse_real]
)

# 8. Callbacks and training
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)
]
start_time = time.time()
history = model.fit(
    X_proc, y_enc,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
    verbose=2
)
duration = time.time() - start_time

# 9. Evaluation and logging
results = {
    'training_rmse': history.history['rmse_real'][-1],  # Changed to RMSE for accuracy
    'training_loss': history.history['mse_real'][-1],
    'validation_rmse': history.history['val_rmse_real'][-1],  # Changed to RMSE for accuracy
    'validation_loss': history.history['val_mse_real'][-1]
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# 10. Prediction and submission
raw_preds = model.predict(X_test_proc)
final = raw_preds
if np.all(final >= 0):
    final = np.expm1(final)
final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=[target_col])
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Epoch 1/100
2877/2877 - 5s - 2ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 2/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 3/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 4/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 5/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 6/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 7/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 8/100
2877/287

In [14]:
print(duration)

44.002779960632324


## Keras Tuner - 4 Attempt, same error

In [15]:
import os
import random
import numpy as np
import pandas as pd
import time
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential

# 1. Reproducibility
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
numpy_seed = np.random.seed(seed)
tf.random.set_seed(seed)

# 2. Load data
train_df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
submission_template = pd.read_csv('sample_submission.csv')

# Infer id and target
id_col = 'id'
target_col = 'num_sold'

# 3. Encode target for continuous regression
y_values = train_df[[target_col]].astype(float).values
if np.all(y_values >= 0):
    y_enc = np.log1p(y_values)
else:
    y_enc = y_values

# Prepare training features
X = train_df.drop(columns=[target_col, id_col], errors='ignore')
train_ids = train_df[id_col]

# Prepare test features
test_ids = df_test[id_col]
X_test = df_test.drop(columns=[target_col, id_col], errors='ignore')

# 4. Feature engineering: drop all-missing columns
X = X.dropna(axis=1, how='all')
X_test = X_test[X.columns]

# Extract date features
if 'date' in X.columns:
    for df_ in [X, X_test]:
        df_['date'] = pd.to_datetime(df_['date'])
        df_['year'] = df_['date'].dt.year
        df_['month'] = df_['date'].dt.month
        df_['day'] = df_['date'].dt.day
        df_['weekday'] = df_['date'].dt.dayofweek
    X = X.drop(columns=['date'])
    X_test = X_test.drop(columns=['date'])

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
high_card_cols = [col for col in cat_cols if X[col].nunique() > 50]
# Drop high cardinality
X = X.drop(columns=high_card_cols)
X_test = X_test.drop(columns=high_card_cols)
cat_cols = [c for c in cat_cols if c not in high_card_cols]
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# 5. Preprocessing pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# 6. Model architecture for continuous regression
n_features = X_proc.shape[1]

# Define early stopping and checkpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

def mse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.reduce_mean(tf.square(y_true - y_pred))
mse_real.__name__ = 'mse_real'      

def rmse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))
rmse_real.__name__ = 'rmse_real'

class MyHyperModel(kt.HyperModel):
    def build(self, hp):
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024)
        act = hp.Choice('activation', ['relu'])
        drop = hp.Float('dropout', 0.0, 0.5)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        model = Sequential()
        model.add(Input(shape=(n_features,)))
        for _ in range(layers):
            model.add(Dense(units, activation=act))
            model.add(BatchNormalization())
            model.add(Dropout(drop))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer=opt, loss='mean_squared_error', metrics=[rmse_real,mse_real])
        return model

# Initialize the tuner
bs = 32  # batch size
ep = 100  # epochs

tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    overwrite=False,
    project_name='bayesian_tuner'
)

if y_enc is not None:
    tuner.search(
        X_proc, y_enc,
        validation_split=0.2,
        batch_size=bs, epochs=ep,
        callbacks=[early_stopping, checkpoint]
    )
else:
    tuner.search(
        X_proc, y_enc,
        validation_split=0.2,
        batch_size=bs, epochs=ep,
        callbacks=[early_stopping, checkpoint]
    )

model = tuner.hypermodel.build(tuner.get_best_hyperparameters(1)[0])

if y_enc is not None:
    history = model.fit(
        X_proc, y_enc,
        validation_split=0.2,
        epochs=100, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )
else:
    history = model.fit(
        X_proc, y_enc,
        validation_split=0.2,
        epochs=100, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )

# 9. Evaluation and logging
hist = history.history
results = {
    'training_rmse': history.history['rmse_real'][-1],  # Changed to RMSE for accuracy
    'training_loss': history.history['mse_real'][-1],
    'validation_rmse': history.history['val_rmse_real'][-1],  # Changed to RMSE for accuracy
    'validation_loss': history.history['val_mse_real'][-1]
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# 10. Prediction and submission
raw_preds = model.predict(X_test_proc)
final = raw_preds
if np.all(final >= 0):
    final = np.expm1(final)
final = final.reshape(-1, 1)
submission = pd.DataFrame(final, columns=[target_col])
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Trial 10 Complete [00h 03m 08s]
val_loss: nan

Best val_loss So Far: nan
Total elapsed time: 13m 08s
Epoch 1/100
2877/2877 - 5s - 2ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 2/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 3/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 4/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 5/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 6/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_real: nan - rmse_real: nan - val_loss: nan - val_mse_real: nan - val_rmse_real: nan
Epoch 7/100
2877/2877 - 4s - 1ms/step - loss: nan - mse_rea