In [18]:
import numpy as np, pandas as pd, random, os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K

# =========================
# CONFIG (update paths)
# =========================
TRAIN_CSV = "/kaggle/input/amazon-ml-challenge-2025/student_resource/dataset/train.csv"
TEXT_COL = "catalog_content"
COST_COL = "price"
SEED = 42
BATCH = 64
EPOCHS = 30            
PRETRAIN_EPOCHS = 15    
FINETUNE_EPOCHS = EPOCHS - PRETRAIN_EPOCHS
MAX_VOCAB = 50000
EMB_DIM = 200
MAX_SEQ = 120
LSTM_UNITS = 256
DROPOUT_RATE = 0.3

# =========================
# load data + tokenize
# =========================
df = pd.read_csv(TRAIN_CSV)
texts = df[TEXT_COL].fillna("").astype(str).tolist()
prices = df[COST_COL].astype(float).values
# Ensure non-negative prices (if any negatives exist, clip — adjust if needed)
prices = np.maximum(prices, 0.0)

tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_VOCAB, oov_token="[UNK]")
tokenizer.fit_on_texts(texts)
seqs = tokenizer.texts_to_sequences(texts)
padded = keras.preprocessing.sequence.pad_sequences(seqs, maxlen=MAX_SEQ, padding="post", truncating="post")

train_idx, val_idx = train_test_split(np.arange(len(padded)), test_size=0.10, random_state=SEED)
X_train, X_val = padded[train_idx], padded[val_idx]
y_train, y_val = prices[train_idx], prices[val_idx]

vocab_size = min(MAX_VOCAB, len(tokenizer.word_index) + 1)

tf.keras.backend.clear_session()

# =========================
# Model definition
# =========================
text_input = layers.Input(shape=(MAX_SEQ,), dtype="int32", name="text_input")
emb = layers.Embedding(input_dim=vocab_size, output_dim=EMB_DIM, mask_zero=True)(text_input)

x = layers.Bidirectional(layers.LSTM(LSTM_UNITS, return_sequences=True, dropout=0.2))(emb)
x2 = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.2))(x)

class AttentionPooling(layers.Layer):
    def __init__(self, hidden_units=128):
        super().__init__()
        self.W = layers.Dense(hidden_units, activation="tanh")
        self.v = layers.Dense(1, activation=None)
    def call(self, inputs):
        # inputs: (B, T, D)
        score = self.W(inputs)            
        score = self.v(score)            
        weights = tf.nn.softmax(score, axis=1)  
        out = tf.reduce_sum(weights * inputs, axis=1)  
        return out

# single attention call (removed the duplicate)
att = AttentionPooling(hidden_units=128)(x2)  

# residual MLP with projection to match dims
h = layers.LayerNormalization()(att)               # shape (None, D)
h1 = layers.Dense(256, activation="relu")(h)      # (None, 256)
h1 = layers.Dropout(DROPOUT_RATE)(h1)

h2_dense = layers.Dense(128, activation="relu")(h1)   # (None, 128)
h_proj = layers.Dense(128, activation=None)(h)        # project original h -> 128
h2 = layers.Add()([h2_dense, h_proj])
h2 = layers.LayerNormalization()(h2)

# final head: produce non-negative price prediction using softplus (keeps predictions >= 0)
out_linear = layers.Dense(1, activation="linear")(h2)
out = layers.Activation(tf.nn.softplus, name="price_pred")(out_linear)  # positive outputs

model = keras.Model(inputs=text_input, outputs=out)
model.summary()

# =========================
# Losses / metrics
# =========================
def smape_loss(eps=1e-3):
    """SMAPE loss (percent). Use with original-scale targets."""
    def loss(y_true, y_pred):
        num = 2.0 * K.abs(y_pred - y_true)
        den = K.abs(y_true) + K.abs(y_pred) + eps
        sm = num / den
        return 100.0 * K.mean(sm)
    return loss

def smape_tf(y_true, y_pred):
    eps = K.epsilon()
    num = 2.0 * K.abs(y_pred - y_true)
    den = K.abs(y_true) + K.abs(y_pred) + eps
    sm = num / den
    return K.mean(sm) * 100.0

# small helper for numpy SMAPE/MAE printing after epoch
def smape_np(y_true, y_pred, eps=1e-12):
    return 100.0 * np.mean(2.0 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + eps))

def mae_np(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

# =========================
# Callbacks: monitoring original-scale metrics per epoch
# =========================
class OrigMetrics(keras.callbacks.Callback):
    def __init__(self, X_val, y_val):
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val
    def on_epoch_end(self, epoch, logs=None):
        preds = self.model.predict(self.X_val, verbose=0).reshape(-1)
        # ensure non-negative
        preds = np.maximum(preds, 0.0)
        val_mae = mae_np(self.y_val, preds)
        val_sm = smape_np(self.y_val, preds)
        print(f"  -> val_mae_orig: {val_mae:.4f}  val_smape_orig: {val_sm:.4f}")

# Shared callbacks for both phases (we will modify monitors for finetune)
common_cbs = [OrigMetrics(X_val, y_val)]

# =========================
# TRAINING: Phase 1 (warm-start on MSE)
# =========================
initial_lr = 1e-3
opt = keras.optimizers.Adam(learning_rate=initial_lr, clipnorm=1.0)
model.compile(optimizer=opt, loss="mse", metrics=[smape_tf, "mae"])

print("=== Phase 1: warm-start on MSE (original-scale targets) ===")
history1 = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=PRETRAIN_EPOCHS,
    batch_size=BATCH,
    callbacks=common_cbs,
    verbose=2
)

# =========================
# TRAINING: Phase 2 (fine-tune on SMAPE)
# =========================
finetune_lr = 1e-4
opt2 = keras.optimizers.Adam(learning_rate=finetune_lr, clipnorm=1.0)
model.compile(optimizer=opt2, loss=smape_loss(eps=1e-3), metrics=[smape_tf, "mae"])

# callbacks tuned to SMAPE behavior
cbs_phase2 = [
    keras.callbacks.ReduceLROnPlateau(monitor="val_smape_orig", factor=0.5, patience=3, min_lr=1e-6, verbose=1),
    OrigMetrics(X_val, y_val)
]

print("=== Phase 2: fine-tune on SMAPE loss ===")
history2 = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=FINETUNE_EPOCHS,
    batch_size=BATCH,
    callbacks=cbs_phase2,
    verbose=2
)

# =========================
# Final evaluation on validation set
# =========================
val_pred = model.predict(X_val).reshape(-1)
val_pred = np.maximum(val_pred, 0.0)
print("Final Val MAE (orig):", mae_np(y_val, val_pred))
print("Final Val SMAPE (orig):", smape_np(y_val, val_pred))




=== Phase 1: warm-start on MSE (original-scale targets) ===
Epoch 1/15
  -> val_mae_orig: 14.9284  val_smape_orig: 64.2888
1055/1055 - 76s - 72ms/step - loss: 1003.9085 - mae: 16.2558 - smape_tf: 79.8631 - val_loss: 889.3145 - val_mae: 14.9179 - val_smape_tf: 79.3393
Epoch 2/15
  -> val_mae_orig: 13.4590  val_smape_orig: 59.4455
1055/1055 - 63s - 60ms/step - loss: 843.5683 - mae: 13.9427 - smape_tf: 82.0411 - val_loss: 677.7247 - val_mae: 13.4893 - val_smape_tf: 84.2269
Epoch 3/15
  -> val_mae_orig: 13.1142  val_smape_orig: 61.7007
1055/1055 - 63s - 60ms/step - loss: 687.6113 - mae: 11.9877 - smape_tf: 83.2663 - val_loss: 617.0581 - val_mae: 13.1375 - val_smape_tf: 81.2817
Epoch 4/15
  -> val_mae_orig: 12.4856  val_smape_orig: 56.4697
1055/1055 - 63s - 60ms/step - loss: 585.7637 - mae: 10.7794 - smape_tf: 83.3662 - val_loss: 603.3726 - val_mae: 12.4192 - val_smape_tf: 83.9614
Epoch 5/15
  -> val_mae_orig: 13.7965  val_smape_orig: 62.2565
1055/1055 - 63s - 60ms/step - loss: 517.0195 - m

  callback.on_epoch_end(epoch, logs)


  -> val_mae_orig: 12.0932  val_smape_orig: 54.0424
1055/1055 - 77s - 73ms/step - loss: 31.3444 - mae: 4.9404 - smape_tf: 87.7092 - val_loss: 54.1518 - val_mae: 12.1206 - val_smape_tf: 88.6555 - learning_rate: 1.0000e-04
Epoch 2/15
  -> val_mae_orig: 12.1211  val_smape_orig: 53.7314
1055/1055 - 63s - 60ms/step - loss: 29.8652 - mae: 4.6674 - smape_tf: 88.1550 - val_loss: 53.7187 - val_mae: 12.0547 - val_smape_tf: 88.7214 - learning_rate: 1.0000e-04
Epoch 3/15
  -> val_mae_orig: 12.0494  val_smape_orig: 53.3081
1055/1055 - 63s - 60ms/step - loss: 28.8623 - mae: 4.4854 - smape_tf: 88.2785 - val_loss: 53.2936 - val_mae: 12.0306 - val_smape_tf: 88.3584 - learning_rate: 1.0000e-04
Epoch 4/15
  -> val_mae_orig: 12.1283  val_smape_orig: 53.4068
1055/1055 - 63s - 60ms/step - loss: 28.0332 - mae: 4.3433 - smape_tf: 88.6389 - val_loss: 53.4475 - val_mae: 12.1559 - val_smape_tf: 87.8990 - learning_rate: 1.0000e-04
Epoch 5/15
  -> val_mae_orig: 12.0913  val_smape_orig: 53.2150
1055/1055 - 63s - 60

In [19]:
TEST_CSV  = "/kaggle/input/amazon-ml-challenge-2025/student_resource/dataset/test.csv"
ID_COL = "sample_id"
BATCH_SIZE = 64


test_df = pd.read_csv(TEST_CSV)
print("Test rows:", len(test_df))
assert ID_COL in test_df.columns and TEXT_COL in test_df.columns

# tokenize and pad test texts
test_texts = test_df[TEXT_COL].fillna("").astype(str).tolist()
test_seqs = tokenizer.texts_to_sequences(test_texts)
test_padded = keras.preprocessing.sequence.pad_sequences(test_seqs, maxlen=MAX_SEQ, padding="post", truncating="post")

# batch predict
preds = []
for i in range(0, len(test_padded), BATCH_SIZE):
    batch = test_padded[i : i + BATCH_SIZE]
    p = model.predict(batch, verbose=0).reshape(-1)
    preds.extend(p.tolist())

submission = pd.DataFrame({
    ID_COL: test_df[ID_COL].tolist(),
    "price": preds
})
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv, shape:", submission.shape)


print(submission.head())

Test rows: 75000
Saved submission.csv, shape: (75000, 2)
   sample_id      price
0     100179  18.345997
1     245611   8.406756
2     146263  13.300120
3      95658   7.719113
4      36806  16.706221
