In [1]:
!pip uninstall -y pillow
!pip install pillow==10.3.0
!pip install -U sentence-transformers transformers lightgbm xgboost tqdm


Found existing installation: pillow 11.3.0
Uninstalling pillow-11.3.0:
  Successfully uninstalled pillow-11.3.0
[0mCollecting pillow==10.3.0
  Downloading pillow-10.3.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Downloading pillow-10.3.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pillow
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
dopamine-rl 4.1.2 requires gymnasium>=1.0.0, but you have gymnasium 0.29.0 which is incompatible.
bigframes 2.12.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.31.0, but you have google-cloud-bigquery 3.25.0 which is incompatible.
bigframe

In [2]:
# Configuration for Kaggle
import os
KAGGLE_INPUT_DIR = '/kaggle/input/infiniper'   # change if folder name differs
KAGGLE_WORKING_DIR = '/kaggle/working/'

DATA_DIR = os.path.join(KAGGLE_INPUT_DIR, 'dataset')
TEXT_EMB_DIR = os.path.join(KAGGLE_INPUT_DIR, 'embeddings_text')
IMAGE_EMB_DIR = os.path.join(KAGGLE_INPUT_DIR, 'embeddings_image')

print("Paths:")
print(" DATA_DIR:", DATA_DIR)
print(" TEXT_EMB_DIR:", TEXT_EMB_DIR)
print(" IMAGE_EMB_DIR:", IMAGE_EMB_DIR)
print(" WORKING_DIR:", KAGGLE_WORKING_DIR)

# Imports
import os, sys, re, gc, math, time
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# SMAPE
def smape(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom = np.where(denom == 0, eps, denom)
    return np.mean(np.abs(y_pred - y_true) / denom) * 100.0

# Utility: load numpy with fallback
def load_npy_or_fail(path):
    if os.path.exists(path):
        return np.load(path)
    else:
        raise FileNotFoundError(f"Required file missing: {path}")


Paths:
 DATA_DIR: /kaggle/input/infiniper/dataset
 TEXT_EMB_DIR: /kaggle/input/infiniper/embeddings_text
 IMAGE_EMB_DIR: /kaggle/input/infiniper/embeddings_image
 WORKING_DIR: /kaggle/working/


In [3]:
# Load CSVs
print("Loading CSVs...")
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test  = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATA_DIR, 'sample_test.csv'))
print("Train/Test/Sample shapes:", train.shape, test.shape, sample_test.shape)

# Load embeddings (npy files). Adjust file names if different
print("Loading embeddings...")
train_text_emb = load_npy_or_fail(os.path.join(TEXT_EMB_DIR, 'train_text_emb.npy'))
test_text_emb  = load_npy_or_fail(os.path.join(TEXT_EMB_DIR, 'test_text_emb.npy'))
sample_text_emb= load_npy_or_fail(os.path.join(TEXT_EMB_DIR, 'sample_text_emb.npy'))

train_img_emb = load_npy_or_fail(os.path.join(IMAGE_EMB_DIR, 'train_img_emb.npy'))
test_img_emb  = load_npy_or_fail(os.path.join(IMAGE_EMB_DIR, 'test_img_emb.npy'))
sample_img_emb= load_npy_or_fail(os.path.join(IMAGE_EMB_DIR, 'sample_img_emb.npy'))

print("Text emb shapes:", train_text_emb.shape, test_text_emb.shape, sample_text_emb.shape)
print("Image emb shapes:", train_img_emb.shape, test_img_emb.shape, sample_img_emb.shape)


Loading CSVs...
Train/Test/Sample shapes: (75000, 4) (75000, 3) (100, 3)
Loading embeddings...
Text emb shapes: (75000, 384) (75000, 384) (100, 384)
Image emb shapes: (75000, 512) (75000, 512) (100, 512)


In [4]:
# Ensure catalog_content present and compute simple features
for df, name in [(train,'train'), (test,'test'), (sample_test,'sample_test')]:
    if 'catalog_content' not in df.columns:
        df['catalog_content'] = ''
    df['catalog_content'] = df['catalog_content'].fillna('')
    df['char_len'] = df['catalog_content'].str.len()
    df['word_count'] = df['catalog_content'].str.split().apply(len)
    df['num_digits'] = df['catalog_content'].str.count(r'\d').fillna(0).astype(int)

# parse Value and Unit (simple regex)
def parse_value_unit(text):
    val = np.nan; unit = ''
    if not isinstance(text, str): return val, unit
    m = re.search(r'Value[:\s]*([0-9]+(?:\.[0-9]+)?)', text, flags=re.I)
    if m:
        try: val = float(m.group(1))
        except: val = np.nan
    m2 = re.search(r'Unit[:\s]*([A-Za-z0-9% /._-]+)', text, flags=re.I)
    if m2:
        unit = m2.group(1).strip()
    return val, unit

# Run parsing with tqdm
for df, name in [(train,'train'), (test,'test'), (sample_test,'sample_test')]:
    vals = []
    units = []
    for txt in tqdm(df['catalog_content'].tolist(), desc=f'parse {name}', ncols=100):
        v,u = parse_value_unit(txt)
        vals.append(v); units.append(u)
    df['value_extracted'] = vals
    df['unit_extracted'] = units

print("Sample feature columns:")
display(train[['sample_id','value_extracted','unit_extracted','char_len','word_count']].head())


parse train: 100%|█████████████████████████████████████████| 75000/75000 [00:01<00:00, 49684.01it/s]
parse test: 100%|██████████████████████████████████████████| 75000/75000 [00:01<00:00, 51395.54it/s]
parse sample_test: 100%|███████████████████████████████████████| 100/100 [00:00<00:00, 41901.14it/s]

Sample feature columns:





Unnamed: 0,sample_id,value_extracted,unit_extracted,char_len,word_count
0,33127,72.0,Fl Oz,91,18
1,198967,32.0,Ounce,511,80
2,261251,11.4,Ounce,328,59
3,55858,11.25,Ounce,1318,211
4,292686,12.0,Count,155,28


In [5]:
from sklearn.preprocessing import StandardScaler

# numeric features
def numeric_array(df):
    return df[['char_len','word_count','num_digits','value_extracted']].fillna(0).to_numpy(dtype=float)

# Stack features: [text_emb, img_emb, numeric]
def build_X(text_emb, img_emb, df):
    num = numeric_array(df)
    return np.hstack([text_emb, img_emb, num])

# Build
X_train_raw = build_X(train_text_emb, train_img_emb, train)
X_test_raw  = build_X(test_text_emb, test_img_emb, test)
X_sample_raw= build_X(sample_text_emb, sample_img_emb, sample_test)
y = train['price'].values
y_log = np.log1p(y)

print("Raw shapes:", X_train_raw.shape, X_test_raw.shape, X_sample_raw.shape)

# Scale for MLP only (fit on train)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test  = scaler.transform(X_test_raw)
X_sample= scaler.transform(X_sample_raw)

# Keep raw for LightGBM since tree-based models don't need scaling
X_train_lgb = X_train_raw
X_test_lgb  = X_test_raw
X_sample_lgb= X_sample_raw

print("Prepared X (MLP scaled) shape:", X_train.shape)


Raw shapes: (75000, 900) (75000, 900) (100, 900)
Prepared X (MLP scaled) shape: (75000, 900)


In [7]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import gc
from tqdm import tqdm

# ---------- Robust SMAPE eval (handles both (preds, dataset) and (labels, preds)) ----------
def lgb_smape_eval(a, b):
    """
    Robust wrapper for LightGBM custom evaluation.
    Accepts either:
      - (preds, dataset) where dataset has get_label()
      - (labels, preds) where both are numpy arrays (sklearn wrapper may call like this)
    Returns ('SMAPE', value, False) which LightGBM accepts.
    """
    # Case 1: b is a Dataset-like object (has get_label)
    if hasattr(b, 'get_label'):
        y_pred_log = np.asarray(a)
        y_true_log = np.asarray(b.get_label())
    else:
        # Case 2: sklearn wrapper often calls func(labels, preds)
        # Here 'a' are labels and 'b' are preds (both numpy arrays)
        # But to be safe, detect which arg is preds by shape: preds is often float vector (but same shape)
        # We'll assume a = labels, b = preds (this matches LightGBM's sklearn wrapper behavior)
        y_true_log = np.asarray(a)
        y_pred_log = np.asarray(b)

    # Now both are log1p(target) because we train on log1p(y)
    # Convert back to original scale before SMAPE
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)

    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom = np.where(denom == 0, 1e-8, denom)
    val = np.mean(np.abs(y_pred - y_true) / denom)
    # Return triple (name, value, is_higher_better) -> False because lower SMAPE is better
    return 'SMAPE', val, False

# ---------- LightGBM params & CV setup ----------
params = {
    'objective':'regression', 'metric':'None',
    'learning_rate': 0.02, 'n_estimators':5000, 'num_leaves':512,
    'min_data_in_leaf':30, 'feature_fraction':0.8, 'bagging_fraction':0.8, 'bagging_freq':5,
    'lambda_l1':0.5, 'lambda_l2':0.5, 'n_jobs':-1, 'verbosity':-1
}

# Stratify by log price bins
n_bins = 10
try:
    bins = pd.qcut(np.log1p(train['price']), q=n_bins, labels=False, duplicates='drop')
except Exception:
    bins = pd.cut(np.log1p(train['price']), bins=n_bins, labels=False)
bins = np.array(bins, dtype=int)

seeds = [42, 2023]   # bagging / seeds
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Storage
preds_oof_log = np.zeros(len(train))
preds_test_log = np.zeros(len(test))
preds_sample_log = np.zeros(len(sample_test))

fold_idx = 0
fold_smape_list = []

# Quick shape sanity
print("Shapes: X_train:", X_train_lgb.shape, "X_test:", X_test_lgb.shape, "y:", len(y))

for train_idx, val_idx in tqdm(skf.split(X_train_lgb, bins), total=n_splits, desc='LGB CV folds', ncols=100):
    fold_idx += 1
    print(f"\n--- Fold {fold_idx} ---")
    X_tr, X_val = X_train_lgb[train_idx], X_train_lgb[val_idx]
    y_tr_log, y_val_log = np.log1p(y[train_idx]), np.log1p(y[val_idx])

    fold_val_preds_log = np.zeros(len(val_idx))
    fold_test_preds_log = np.zeros(X_test_lgb.shape[0])
    fold_sample_preds_log = np.zeros(X_sample_lgb.shape[0])

    for seed in seeds:
        print(f" Training seed {seed} ...")
        params['random_state'] = seed
        model = lgb.LGBMRegressor(**params)

        # Fit using sklearn wrapper and our robust custom eval
        model.fit(
            X_tr, y_tr_log,
            eval_set=[(X_val, y_val_log)],
            eval_metric=lgb_smape_eval,
            callbacks=[lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=200)]
        )

        # Predictions (log scale)
        val_pred_log = model.predict(X_val, num_iteration=model.best_iteration_)
        test_pred_log = model.predict(X_test_lgb, num_iteration=model.best_iteration_)
        sample_pred_log = model.predict(X_sample_lgb, num_iteration=model.best_iteration_)

        fold_val_preds_log += val_pred_log
        fold_test_preds_log += test_pred_log
        fold_sample_preds_log += sample_pred_log

        del model
        gc.collect()

    # Average over seeds
    fold_val_preds_log /= len(seeds)
    fold_test_preds_log /= len(seeds)
    fold_sample_preds_log /= len(seeds)

    preds_oof_log[val_idx] = fold_val_preds_log
    preds_test_log += fold_test_preds_log / n_splits
    preds_sample_log += fold_sample_preds_log / n_splits

    # Compute fold SMAPE on original scale
    fold_val_orig = np.expm1(fold_val_preds_log)
    fold_true_orig = np.expm1(y_val_log)
    fold_smape = smape(fold_true_orig, fold_val_orig)
    fold_smape_list.append(fold_smape)
    print(f" Fold {fold_idx} SMAPE: {fold_smape:.4f}%")

# Final inverse transform
preds_val = np.expm1(preds_oof_log)
preds_test = np.expm1(preds_test_log)
preds_sample = np.expm1(preds_sample_log)

cv_smape = smape(y, preds_val)
print(f"\nCV OOF SMAPE: {cv_smape:.4f}%")
print("Per-fold SMAPEs:", fold_smape_list)


Shapes: X_train: (75000, 900) X_test: (75000, 900) y: 75000


LGB CV folds:   0%|                                                           | 0/5 [00:00<?, ?it/s]


--- Fold 1 ---
 Training seed 42 ...
Training until validation scores don't improve for 200 rounds
[200]	valid_0's SMAPE: 0.561945
[400]	valid_0's SMAPE: 0.544401
[600]	valid_0's SMAPE: 0.538629
[800]	valid_0's SMAPE: 0.536217
[1000]	valid_0's SMAPE: 0.53496
[1200]	valid_0's SMAPE: 0.534297
[1400]	valid_0's SMAPE: 0.53395
[1600]	valid_0's SMAPE: 0.533769
[1800]	valid_0's SMAPE: 0.533581
[2000]	valid_0's SMAPE: 0.533485
[2200]	valid_0's SMAPE: 0.533422
[2400]	valid_0's SMAPE: 0.533369
[2600]	valid_0's SMAPE: 0.533337
[2800]	valid_0's SMAPE: 0.53332
[3000]	valid_0's SMAPE: 0.533301
[3200]	valid_0's SMAPE: 0.533282
[3400]	valid_0's SMAPE: 0.533268
[3600]	valid_0's SMAPE: 0.533264
[3800]	valid_0's SMAPE: 0.533253
[4000]	valid_0's SMAPE: 0.533243
[4200]	valid_0's SMAPE: 0.533231
[4400]	valid_0's SMAPE: 0.53323
[4600]	valid_0's SMAPE: 0.533226
[4800]	valid_0's SMAPE: 0.533223
[5000]	valid_0's SMAPE: 0.533216
Did not meet early stopping. Best iteration is:
[5000]	valid_0's SMAPE: 0.533216
 T

LGB CV folds:  20%|█████████                                    | 1/5 [1:21:30<5:26:01, 4890.50s/it]

 Fold 1 SMAPE: 53.1606%

--- Fold 2 ---
 Training seed 42 ...
Training until validation scores don't improve for 200 rounds
[200]	valid_0's SMAPE: 0.557245
[400]	valid_0's SMAPE: 0.539846
[600]	valid_0's SMAPE: 0.534096
[800]	valid_0's SMAPE: 0.531489
[1000]	valid_0's SMAPE: 0.530301
[1200]	valid_0's SMAPE: 0.529713
[1400]	valid_0's SMAPE: 0.529344
[1600]	valid_0's SMAPE: 0.529142
[1800]	valid_0's SMAPE: 0.529005
[2000]	valid_0's SMAPE: 0.528902
[2200]	valid_0's SMAPE: 0.528861
[2400]	valid_0's SMAPE: 0.528806
[2600]	valid_0's SMAPE: 0.528782
[2800]	valid_0's SMAPE: 0.528751
[3000]	valid_0's SMAPE: 0.528731
[3200]	valid_0's SMAPE: 0.528713
[3400]	valid_0's SMAPE: 0.528704
[3600]	valid_0's SMAPE: 0.528694
[3800]	valid_0's SMAPE: 0.528689
[4000]	valid_0's SMAPE: 0.528683
[4200]	valid_0's SMAPE: 0.528674
[4400]	valid_0's SMAPE: 0.528665
[4600]	valid_0's SMAPE: 0.528662
[4800]	valid_0's SMAPE: 0.528655
[5000]	valid_0's SMAPE: 0.528652
Did not meet early stopping. Best iteration is:
[4974]	

LGB CV folds:  40%|██████████████████                           | 2/5 [2:36:44<3:53:27, 4669.31s/it]

 Fold 2 SMAPE: 52.5957%

--- Fold 3 ---
 Training seed 42 ...
Training until validation scores don't improve for 200 rounds
[200]	valid_0's SMAPE: 0.558453
[400]	valid_0's SMAPE: 0.540671
[600]	valid_0's SMAPE: 0.534977
[800]	valid_0's SMAPE: 0.532207
[1000]	valid_0's SMAPE: 0.530897
[1200]	valid_0's SMAPE: 0.53022
[1400]	valid_0's SMAPE: 0.529897
[1600]	valid_0's SMAPE: 0.529714
[1800]	valid_0's SMAPE: 0.529546
[2000]	valid_0's SMAPE: 0.529448
[2200]	valid_0's SMAPE: 0.529385
[2400]	valid_0's SMAPE: 0.529331
[2600]	valid_0's SMAPE: 0.529303
[2800]	valid_0's SMAPE: 0.529282
[3000]	valid_0's SMAPE: 0.529261
[3200]	valid_0's SMAPE: 0.529247
[3400]	valid_0's SMAPE: 0.529238
[3600]	valid_0's SMAPE: 0.529229
[3800]	valid_0's SMAPE: 0.529222
[4000]	valid_0's SMAPE: 0.529216
[4200]	valid_0's SMAPE: 0.529211
[4400]	valid_0's SMAPE: 0.529208
Early stopping, best iteration is:
[4308]	valid_0's SMAPE: 0.529207
 Training seed 2023 ...
Training until validation scores don't improve for 200 rounds
[

LGB CV folds:  60%|███████████████████████████                  | 3/5 [3:53:59<2:35:06, 4653.21s/it]

 Fold 3 SMAPE: 52.7756%

--- Fold 4 ---
 Training seed 42 ...
Training until validation scores don't improve for 200 rounds
[200]	valid_0's SMAPE: 0.552103
[400]	valid_0's SMAPE: 0.535477
[600]	valid_0's SMAPE: 0.530069
[800]	valid_0's SMAPE: 0.527488
[1000]	valid_0's SMAPE: 0.526299
[1200]	valid_0's SMAPE: 0.525596
[1400]	valid_0's SMAPE: 0.525234
[1600]	valid_0's SMAPE: 0.525026
[1800]	valid_0's SMAPE: 0.524865
[2000]	valid_0's SMAPE: 0.524784
[2200]	valid_0's SMAPE: 0.524701
[2400]	valid_0's SMAPE: 0.524651
[2600]	valid_0's SMAPE: 0.524613
[2800]	valid_0's SMAPE: 0.524586
[3000]	valid_0's SMAPE: 0.524568
[3200]	valid_0's SMAPE: 0.524558
[3400]	valid_0's SMAPE: 0.52455
[3600]	valid_0's SMAPE: 0.524542
[3800]	valid_0's SMAPE: 0.524535
[4000]	valid_0's SMAPE: 0.524527
[4200]	valid_0's SMAPE: 0.524521
[4400]	valid_0's SMAPE: 0.524517
[4600]	valid_0's SMAPE: 0.524511
[4800]	valid_0's SMAPE: 0.524509
[5000]	valid_0's SMAPE: 0.524502
Did not meet early stopping. Best iteration is:
[4998]	v

LGB CV folds:  80%|████████████████████████████████████         | 4/5 [5:15:06<1:18:57, 4737.91s/it]

 Fold 4 SMAPE: 52.2571%

--- Fold 5 ---
 Training seed 42 ...
Training until validation scores don't improve for 200 rounds
[200]	valid_0's SMAPE: 0.557567
[400]	valid_0's SMAPE: 0.54008
[600]	valid_0's SMAPE: 0.534528
[800]	valid_0's SMAPE: 0.531903
[1000]	valid_0's SMAPE: 0.53068
[1200]	valid_0's SMAPE: 0.530059
[1400]	valid_0's SMAPE: 0.529731
[1600]	valid_0's SMAPE: 0.529506
[1800]	valid_0's SMAPE: 0.52934
[2000]	valid_0's SMAPE: 0.529226
[2200]	valid_0's SMAPE: 0.529154
[2400]	valid_0's SMAPE: 0.529097
[2600]	valid_0's SMAPE: 0.529047
[2800]	valid_0's SMAPE: 0.529029
[3000]	valid_0's SMAPE: 0.529008
[3200]	valid_0's SMAPE: 0.528991
[3400]	valid_0's SMAPE: 0.528974
[3600]	valid_0's SMAPE: 0.528966
[3800]	valid_0's SMAPE: 0.528961
[4000]	valid_0's SMAPE: 0.528952
[4200]	valid_0's SMAPE: 0.528947
[4400]	valid_0's SMAPE: 0.528942
[4600]	valid_0's SMAPE: 0.528935
[4800]	valid_0's SMAPE: 0.52893
[5000]	valid_0's SMAPE: 0.528927
Did not meet early stopping. Best iteration is:
[4995]	vali

LGB CV folds: 100%|███████████████████████████████████████████████| 5/5 [6:37:35<00:00, 4771.15s/it]

 Fold 5 SMAPE: 52.5952%

CV OOF SMAPE: 52.6768%
Per-fold SMAPEs: [53.16061619771864, 52.59566677781164, 52.77564688312958, 52.257131477181886, 52.595165530223106]





In [8]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Device and multi-GPU support
gpu_count = torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device, "GPU count:", gpu_count)

# Simple MLP class
class MLPreg(nn.Module):
    def __init__(self, in_dim, hidden=[1024,512,256], dropout=0.3):
        super().__init__()
        layers = []
        prev = in_dim
        for h in hidden:
            layers.append(nn.Linear(prev, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.ReLU(inplace=True))
            layers.append(nn.Dropout(dropout))
            prev = h
        layers.append(nn.Linear(prev, 1))
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        return self.net(x).squeeze(-1)

# Training util
def train_mlp_one_fold(X_tr, y_tr_log, X_val, y_val_log, in_dim, seed=42,
                       epochs=30, batch_size=1024, lr=1e-3, patience=5):
    torch.manual_seed(seed)
    model = MLPreg(in_dim).to(device)
    if gpu_count > 1:
        model = nn.DataParallel(model)
    optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    criterion = nn.L1Loss()  # MAE on log scale
    train_ds = TensorDataset(torch.from_numpy(X_tr.astype(np.float32)), torch.from_numpy(y_tr_log.astype(np.float32)))
    val_ds   = TensorDataset(torch.from_numpy(X_val.astype(np.float32)), torch.from_numpy(y_val_log.astype(np.float32)))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
    best_val = 1e9; best_epoch = -1; best_state=None; wait=0
    for ep in range(epochs):
        model.train()
        train_loss = 0.0
        for xb, yb in train_loader:
            xb = xb.to(device); yb = yb.to(device)
            optim.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optim.step()
            train_loss += loss.item() * xb.size(0)
        train_loss /= len(train_loader.dataset)
        # val
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device); yb = yb.to(device)
                pred = model(xb)
                loss = criterion(pred, yb)
                val_loss += loss.item() * xb.size(0)
        val_loss /= len(val_loader.dataset)
        # print progress
        print(f" Epoch {ep+1}/{epochs} | train MAE(log): {train_loss:.6f} | val MAE(log): {val_loss:.6f}")
        # early stop
        if val_loss < best_val - 1e-6:
            best_val = val_loss; best_epoch = ep; best_state = {k:v.cpu() for k,v in model.state_dict().items()}; wait=0
        else:
            wait += 1
            if wait >= patience:
                print(" Early stopping triggered.")
                break
    # load best
    model.load_state_dict(best_state)
    # predict
    def predict_np(X):
        model.eval()
        preds = []
        bs = 1024
        with torch.no_grad():
            for i in range(0, X.shape[0], bs):
                xb = torch.from_numpy(X[i:i+bs].astype(np.float32)).to(device)
                preds.append(model(xb).detach().cpu().numpy())
        return np.concatenate(preds).reshape(-1)
    return model, predict_np

# KFold for MLP
from sklearn.model_selection import KFold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_pred_mlp_log = np.zeros(len(train))
pred_test_mlp_log = np.zeros(len(test))
pred_sample_mlp_log = np.zeros(len(sample_test))

fold = 0
in_dim = X_train.shape[1]
for tr_idx, val_idx in tqdm(kf.split(X_train), total=n_splits, desc='MLP CV folds', ncols=120):
    fold+=1
    print(f"\nMLP Fold {fold}")
    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr_log, y_val_log = y_log[tr_idx], y_log[val_idx]
    model, pred_fn = train_mlp_one_fold(X_tr, y_tr_log, X_val, y_val_log,
                                        in_dim=in_dim, seed=42+fold, epochs=50, batch_size=2048, lr=1e-3, patience=6)
    # predict
    oof_pred_mlp_log[val_idx] = pred_fn(X_val)
    pred_test_mlp_log += pred_fn(X_test) / n_splits
    pred_sample_mlp_log += pred_fn(X_sample) / n_splits
    # compute fold smape (on orig scale)
    fold_smape = smape(np.expm1(y_val_log), np.expm1(oof_pred_mlp_log[val_idx]))
    print(f" MLP Fold {fold} SMAPE: {fold_smape:.4f}%")
    # free GPU memory if possible
    del model; gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()

oof_pred_mlp = np.expm1(oof_pred_mlp_log)
pred_test_mlp = np.expm1(pred_test_mlp_log)
pred_sample_mlp = np.expm1(pred_sample_mlp_log)

print("MLP CV OOF SMAPE:", smape(y, oof_pred_mlp))


Device: cuda GPU count: 2


MLP CV folds:   0%|                                                                               | 0/5 [00:00<?, ?it/s]


MLP Fold 1
 Epoch 1/50 | train MAE(log): 1.384583 | val MAE(log): 1.198231
 Epoch 2/50 | train MAE(log): 0.676849 | val MAE(log): 0.590060
 Epoch 3/50 | train MAE(log): 0.620285 | val MAE(log): 0.604934
 Epoch 4/50 | train MAE(log): 0.595244 | val MAE(log): 0.581982
 Epoch 5/50 | train MAE(log): 0.574793 | val MAE(log): 0.565367
 Epoch 6/50 | train MAE(log): 0.559439 | val MAE(log): 0.565507
 Epoch 7/50 | train MAE(log): 0.541983 | val MAE(log): 0.558111
 Epoch 8/50 | train MAE(log): 0.530541 | val MAE(log): 0.562904
 Epoch 9/50 | train MAE(log): 0.516340 | val MAE(log): 0.544009
 Epoch 10/50 | train MAE(log): 0.505742 | val MAE(log): 0.540821
 Epoch 11/50 | train MAE(log): 0.491636 | val MAE(log): 0.546589
 Epoch 12/50 | train MAE(log): 0.482345 | val MAE(log): 0.546942
 Epoch 13/50 | train MAE(log): 0.472169 | val MAE(log): 0.541584
 Epoch 14/50 | train MAE(log): 0.462495 | val MAE(log): 0.542608
 Epoch 15/50 | train MAE(log): 0.452763 | val MAE(log): 0.537582
 Epoch 16/50 | train M

MLP CV folds:  20%|██████████████▏                                                        | 1/5 [00:54<03:38, 54.69s/it]

 MLP Fold 1 SMAPE: 52.3712%

MLP Fold 2
 Epoch 1/50 | train MAE(log): 1.348852 | val MAE(log): 1.070431
 Epoch 2/50 | train MAE(log): 0.676784 | val MAE(log): 0.586850
 Epoch 3/50 | train MAE(log): 0.622875 | val MAE(log): 0.592987
 Epoch 4/50 | train MAE(log): 0.593306 | val MAE(log): 0.559240
 Epoch 5/50 | train MAE(log): 0.576215 | val MAE(log): 0.548739
 Epoch 6/50 | train MAE(log): 0.556954 | val MAE(log): 0.550796
 Epoch 7/50 | train MAE(log): 0.543279 | val MAE(log): 0.544451
 Epoch 8/50 | train MAE(log): 0.527679 | val MAE(log): 0.550272
 Epoch 9/50 | train MAE(log): 0.514044 | val MAE(log): 0.538024
 Epoch 10/50 | train MAE(log): 0.504791 | val MAE(log): 0.531981
 Epoch 11/50 | train MAE(log): 0.491958 | val MAE(log): 0.536093
 Epoch 12/50 | train MAE(log): 0.482653 | val MAE(log): 0.535767
 Epoch 13/50 | train MAE(log): 0.468815 | val MAE(log): 0.525764
 Epoch 14/50 | train MAE(log): 0.463251 | val MAE(log): 0.529534
 Epoch 15/50 | train MAE(log): 0.452081 | val MAE(log): 0.5

MLP CV folds:  40%|████████████████████████████▍                                          | 2/5 [01:53<02:50, 56.91s/it]

 MLP Fold 2 SMAPE: 51.4164%

MLP Fold 3
 Epoch 1/50 | train MAE(log): 1.416945 | val MAE(log): 1.179944
 Epoch 2/50 | train MAE(log): 0.672823 | val MAE(log): 0.611814
 Epoch 3/50 | train MAE(log): 0.614358 | val MAE(log): 0.568079
 Epoch 4/50 | train MAE(log): 0.590684 | val MAE(log): 0.551207
 Epoch 5/50 | train MAE(log): 0.573918 | val MAE(log): 0.549584
 Epoch 6/50 | train MAE(log): 0.556973 | val MAE(log): 0.559423
 Epoch 7/50 | train MAE(log): 0.540290 | val MAE(log): 0.539862
 Epoch 8/50 | train MAE(log): 0.524483 | val MAE(log): 0.543905
 Epoch 9/50 | train MAE(log): 0.509255 | val MAE(log): 0.545150
 Epoch 10/50 | train MAE(log): 0.500002 | val MAE(log): 0.537605
 Epoch 11/50 | train MAE(log): 0.492756 | val MAE(log): 0.538928
 Epoch 12/50 | train MAE(log): 0.480973 | val MAE(log): 0.538684
 Epoch 13/50 | train MAE(log): 0.471932 | val MAE(log): 0.532348
 Epoch 14/50 | train MAE(log): 0.461839 | val MAE(log): 0.527084
 Epoch 15/50 | train MAE(log): 0.452762 | val MAE(log): 0.5

MLP CV folds:  60%|██████████████████████████████████████████▌                            | 3/5 [02:17<01:24, 42.06s/it]

 MLP Fold 3 SMAPE: 52.7250%

MLP Fold 4
 Epoch 1/50 | train MAE(log): 1.425834 | val MAE(log): 1.285322
 Epoch 2/50 | train MAE(log): 0.661848 | val MAE(log): 0.627251
 Epoch 3/50 | train MAE(log): 0.614595 | val MAE(log): 0.599901
 Epoch 4/50 | train MAE(log): 0.591679 | val MAE(log): 0.557321
 Epoch 5/50 | train MAE(log): 0.569837 | val MAE(log): 0.563097
 Epoch 6/50 | train MAE(log): 0.554981 | val MAE(log): 0.548382
 Epoch 7/50 | train MAE(log): 0.540622 | val MAE(log): 0.544144
 Epoch 8/50 | train MAE(log): 0.528057 | val MAE(log): 0.541986
 Epoch 9/50 | train MAE(log): 0.511815 | val MAE(log): 0.544009
 Epoch 10/50 | train MAE(log): 0.500247 | val MAE(log): 0.527734
 Epoch 11/50 | train MAE(log): 0.490283 | val MAE(log): 0.530020
 Epoch 12/50 | train MAE(log): 0.478578 | val MAE(log): 0.535270
 Epoch 13/50 | train MAE(log): 0.467935 | val MAE(log): 0.516675
 Epoch 14/50 | train MAE(log): 0.462735 | val MAE(log): 0.519130
 Epoch 15/50 | train MAE(log): 0.451136 | val MAE(log): 0.5

MLP CV folds:  80%|████████████████████████████████████████████████████████▊              | 4/5 [02:41<00:34, 34.72s/it]

 MLP Fold 4 SMAPE: 51.6580%

MLP Fold 5
 Epoch 1/50 | train MAE(log): 1.386627 | val MAE(log): 1.055416
 Epoch 2/50 | train MAE(log): 0.678534 | val MAE(log): 0.591036
 Epoch 3/50 | train MAE(log): 0.626863 | val MAE(log): 0.581930
 Epoch 4/50 | train MAE(log): 0.601963 | val MAE(log): 0.562463
 Epoch 5/50 | train MAE(log): 0.579324 | val MAE(log): 0.551381
 Epoch 6/50 | train MAE(log): 0.560079 | val MAE(log): 0.566916
 Epoch 7/50 | train MAE(log): 0.548390 | val MAE(log): 0.543303
 Epoch 8/50 | train MAE(log): 0.533517 | val MAE(log): 0.544364
 Epoch 9/50 | train MAE(log): 0.520534 | val MAE(log): 0.539819
 Epoch 10/50 | train MAE(log): 0.508407 | val MAE(log): 0.551492
 Epoch 11/50 | train MAE(log): 0.496616 | val MAE(log): 0.544622
 Epoch 12/50 | train MAE(log): 0.482742 | val MAE(log): 0.533874
 Epoch 13/50 | train MAE(log): 0.473548 | val MAE(log): 0.536029
 Epoch 14/50 | train MAE(log): 0.464694 | val MAE(log): 0.544246
 Epoch 15/50 | train MAE(log): 0.453943 | val MAE(log): 0.5

MLP CV folds: 100%|███████████████████████████████████████████████████████████████████████| 5/5 [03:23<00:00, 40.62s/it]

 MLP Fold 5 SMAPE: 51.9865%
MLP CV OOF SMAPE: 52.03141081160807





In [11]:
# OOF arrays currently: oof_pred_lgb (from LGB), oof_pred_mlp
# But ensure they are defined:
assert 'oof_pred_lgb' in globals() or 'oof_pred_lgb' in locals()
# oof_pred_lgb computed earlier, oof_pred_mlp computed from MLP

# Grid search blending weight for LGB (w) and MLP (1-w)
best_w = None
best_score = 1e9
for w in tqdm(np.linspace(0,1,101), desc='blend grid'):
    pred = w * oof_pred_lgb + (1-w) * oof_pred_mlp
    score = smape(y, pred)
    if score < best_score:
        best_score = score
        best_w = w
print(f"Best blend weight for LGB = {best_w:.2f}, CV SMAPE = {best_score:.4f}%")

# Compose test & sample preds
pred_test_blend = best_w * pred_test_lgb + (1-best_w) * pred_test_mlp
pred_sample_blend = best_w * pred_sample_lgb + (1-best_w) * pred_sample_mlp

# Clip to positive values
pred_test_blend = np.clip(pred_test_blend, 0.01, None)
pred_sample_blend = np.clip(pred_sample_blend, 0.01, None)

print("Final blended CV SMAPE (on train OOF):", best_score)


AssertionError: 

In [12]:
# Correcting errors
# Robust blending helper: find/load OOF + test preds for LGB and MLP, then blend
import os, sys
import numpy as np
from tqdm import tqdm

WORK_DIR = '/kaggle/working' if 'KAGGLE_WORKING_DIR' not in globals() else KAGGLE_WORKING_DIR
# list of candidate variable names in memory for each required array
cands = {
    'oof_lgb_orig': ['oof_pred_lgb', 'oof_pred_lgb_orig', 'oof_pred_lgb_final', 'oof_pred_lgb_final_orig'],
    'oof_lgb_log' : ['oof_pred_log', 'preds_oof_log', 'preds_oof_log', 'oof_pred_log'],
    'pred_test_lgb_orig': ['pred_test_lgb', 'pred_test_lgb_orig', 'pred_test_lgb_final'],
    'pred_test_lgb_log' : ['pred_test_log', 'preds_test_log', 'pred_test_log'],
    'oof_mlp_orig' : ['oof_pred_mlp', 'oof_pred_mlp_orig', 'oof_pred_mlp_final'],
    'oof_mlp_log'  : ['oof_pred_mlp_log', 'oof_pred_mlp_log_final'],
    'pred_test_mlp_orig': ['pred_test_mlp', 'pred_test_mlp_orig', 'pred_test_mlp_final'],
    'pred_test_mlp_log' : ['pred_test_mlp_log', 'pred_test_mlp_log_final']
}

# candidate filenames to try loading from WORK_DIR
file_cands = {
    'oof_lgb_orig': [os.path.join(WORK_DIR, 'oof_pred_lgb.npy'), os.path.join(WORK_DIR, 'oof_pred_lgb_orig.npy'), os.path.join(WORK_DIR, 'oof_pred_lgb_final.npy')],
    'oof_lgb_log' : [os.path.join(WORK_DIR, 'oof_pred_log.npy'), os.path.join(WORK_DIR, 'preds_oof_log.npy')],
    'pred_test_lgb_orig': [os.path.join(WORK_DIR, 'pred_test_lgb.npy'), os.path.join(WORK_DIR, 'pred_test_lgb_orig.npy')],
    'pred_test_lgb_log' : [os.path.join(WORK_DIR, 'pred_test_log.npy'), os.path.join(WORK_DIR, 'preds_test_log.npy')],
    'oof_mlp_orig' : [os.path.join(WORK_DIR, 'oof_pred_mlp.npy'), os.path.join(WORK_DIR, 'oof_pred_mlp_orig.npy')],
    'oof_mlp_log'  : [os.path.join(WORK_DIR, 'oof_pred_mlp_log.npy')],
    'pred_test_mlp_orig': [os.path.join(WORK_DIR, 'pred_test_mlp.npy')],
    'pred_test_mlp_log' : [os.path.join(WORK_DIR, 'pred_test_mlp_log.npy')]
}

# utility to find variable in memory
def find_in_memory(names):
    for n in names:
        if n in globals():
            return globals()[n]
        if n in locals():
            return locals()[n]
    return None

# utility to try load npy files
def try_load(paths):
    for p in paths:
        if p and os.path.exists(p):
            try:
                arr = np.load(p)
                print(f"Loaded {p} shape={arr.shape}")
                return arr
            except Exception as e:
                print(f"Failed loading {p}: {e}")
    return None

# Attempt to populate arrays (original scale)
def get_array(key):
    # try orig variables in memory
    arr = find_in_memory(cands.get(key, []))
    if arr is not None:
        print(f"Found {key} in memory (orig) shape={np.array(arr).shape}")
        return np.array(arr)
    # try log-named in memory and convert if found
    log_key = key.replace('_orig','_log')
    arr_log = find_in_memory(cands.get(log_key, []))
    if arr_log is not None:
        arr_log = np.array(arr_log)
        print(f"Found {log_key} in memory shape={arr_log.shape} -> converting expm1")
        return np.expm1(arr_log)
    # try file loads (orig)
    arr = try_load(file_cands.get(key, []))
    if arr is not None:
        return arr
    # try file loads (log) and convert
    arr_log = try_load(file_cands.get(log_key, []))
    if arr_log is not None:
        try:
            return np.expm1(arr_log)
        except:
            return None
    # nothing found
    return None

# fetch arrays
oof_lgb = get_array('oof_lgb_orig')
pred_test_lgb = get_array('pred_test_lgb_orig')
oof_mlp = get_array('oof_mlp_orig')
pred_test_mlp = get_array('pred_test_mlp_orig')

# Try alternative names if above failed (older notebook variations)
if oof_lgb is None:
    # try older variable preds_oof_log -> expm1
    if 'preds_oof_log' in globals():
        oof_lgb = np.expm1(np.array(globals()['preds_oof_log']))
        print("Recovered oof_lgb from preds_oof_log (converted expm1).")
if oof_mlp is None:
    if 'oof_pred_mlp' in globals():
        oof_mlp = np.array(globals()['oof_pred_mlp'])
        print("Recovered oof_mlp from oof_pred_mlp in memory.")

# sanity checks: lengths
missing = []
if oof_lgb is None:
    missing.append('oof_pred_lgb (OOF from LightGBM)')
if oof_mlp is None:
    missing.append('oof_pred_mlp (OOF from MLP)')
if pred_test_lgb is None:
    missing.append('pred_test_lgb (test preds from LightGBM)')
if pred_test_mlp is None:
    missing.append('pred_test_mlp (test preds from MLP)')

if missing:
    print("Could not find all required arrays. Missing:", missing)
    print("Possible fixes:")
    print(" - Re-run the LightGBM training cell to recreate oof_pred_lgb / pred_test_lgb,")
    print(" - Re-run the MLP training cell to recreate oof_pred_mlp / pred_test_mlp,")
    print(" - Or load saved .npy files (e.g., np.load('/kaggle/working/oof_pred_lgb.npy')) into memory before blending.")
    raise RuntimeError("Required arrays missing. See message above.")

# Final shape checks and proceed with blending
print("Shapes: oof_lgb:", oof_lgb.shape, "oof_mlp:", oof_mlp.shape,
      "pred_test_lgb:", pred_test_lgb.shape, "pred_test_mlp:", pred_test_mlp.shape)

# If sample predictions exist load them too (optional)
# We'll attempt to find sample preds similarly, but not required for blending weights.
# Compute best blend weight on OOF by grid search
best_w = None
best_score = 1e9
for w in tqdm(np.linspace(0,1,101), desc='blend grid', ncols=100):
    blended = w * oof_lgb + (1.0 - w) * oof_mlp
    score = smape(y, blended)   # assumes y (train prices) present in memory
    if score < best_score:
        best_score = score
        best_w = w

print(f"Best blend weight for LGB = {best_w:.2f}, CV SMAPE = {best_score:.4f}%")

# Compose test predictions
pred_test_blend = best_w * pred_test_lgb + (1.0 - best_w) * pred_test_mlp
pred_test_blend = np.clip(pred_test_blend, 0.01, None)

# Save final blended predictions (change path if needed)
out_path = os.path.join(WORK_DIR, 'test_out_blended.csv')
import pandas as pd
if 'test' in globals():
    out_df = pd.DataFrame({'sample_id': test['sample_id'], 'price': pred_test_blend})
    out_df.to_csv(out_path, index=False)
    print("Saved blended test predictions to:", out_path)
else:
    print("Warning: 'test' dataframe not in memory; not saving CSV. You can save pred_test_blend manually.")

# If sample predictions exist similarly attempt to blend and save
# Try to fetch sample preds (optional)
def try_get_sample_preds():
    for name in ['pred_sample_lgb', 'pred_sample_lgb_orig', 'pred_sample_lgb_final', 'pred_sample_lgb.npy']:
        if name in globals(): return globals()[name]
    for p in [os.path.join(WORK_DIR, 'pred_sample_lgb.npy'), os.path.join(WORK_DIR,'pred_sample_blend.npy')]:
        if os.path.exists(p):
            return np.load(p)
    return None

# End
print("Blending done. CV blended SMAPE:", best_score)


Found oof_lgb_log in memory shape=(75000,) -> converting expm1
Found pred_test_lgb_log in memory shape=(75000,) -> converting expm1
Found oof_mlp_orig in memory (orig) shape=(75000,)
Found pred_test_mlp_orig in memory (orig) shape=(75000,)
Shapes: oof_lgb: (75000,) oof_mlp: (75000,) pred_test_lgb: (75000,) pred_test_mlp: (75000,)


blend grid: 100%|███████████████████████████████████████████████| 101/101 [00:00<00:00, 1024.51it/s]

Best blend weight for LGB = 0.01, CV SMAPE = 52.0207%





Saved blended test predictions to: /kaggle/working/test_out_blended.csv
Blending done. CV blended SMAPE: 52.02073513764387


In [14]:
# Again correcting errors
# Robust saver: create pred_sample_blend if missing, then save test_out and sample_test_out
import os, numpy as np, pandas as pd
WORK_DIR = '/kaggle/working' if 'KAGGLE_WORKING_DIR' not in globals() else KAGGLE_WORKING_DIR

def find_in_memory(names):
    for n in names:
        if n in globals():
            return globals()[n]
    return None

def try_load(paths):
    for p in paths:
        if p and os.path.exists(p):
            try:
                arr = np.load(p)
                print(f"Loaded {p} shape={arr.shape}")
                return arr
            except Exception as e:
                print(f"Failed loading {p}: {e}")
    return None

# 1) If pred_sample_blend exists, use it
pred_sample_blend = find_in_memory(['pred_sample_blend','pred_sample_blend_final','pred_sample_blend.npy'])
if pred_sample_blend is not None:
    print("Using pred_sample_blend from memory.")
else:
    print("pred_sample_blend not found in memory. Attempting to construct it...")

    # 2) Try to find pred_sample_lgb and pred_sample_mlp
    cand_lgb = find_in_memory(['pred_sample_lgb','pred_sample_lgb_orig','pred_sample_lgb_final','pred_sample_lgb.npy'])
    if cand_lgb is None:
        cand_lgb = try_load([os.path.join(WORK_DIR,'pred_sample_lgb.npy'), os.path.join(WORK_DIR,'pred_sample_lgb_orig.npy'), os.path.join(WORK_DIR,'pred_sample_blend.npy')])
    if cand_lgb is not None:
        pred_sample_lgb = np.array(cand_lgb)
        print("Found pred_sample_lgb shape:", pred_sample_lgb.shape)
    else:
        pred_sample_lgb = None
        print("pred_sample_lgb not found.")

    cand_mlp = find_in_memory(['pred_sample_mlp','pred_sample_mlp_orig','pred_sample_mlp.npy'])
    if cand_mlp is None:
        cand_mlp = try_load([os.path.join(WORK_DIR,'pred_sample_mlp.npy')])
    if cand_mlp is not None:
        pred_sample_mlp = np.array(cand_mlp)
        print("Found pred_sample_mlp shape:", pred_sample_mlp.shape)
    else:
        pred_sample_mlp = None
        print("pred_sample_mlp not found.")

    # 3) Determine best_w (blend weight). Prefer existing best_w, otherwise compute from OOFs, otherwise default 0.5
    best_w_use = None
    if 'best_w' in globals():
        best_w_use = globals()['best_w']
        print("Using best_w from memory:", best_w_use)
    else:
        # try to compute best_w from OOF arrays if available
        oof_lgb = find_in_memory(['oof_pred_lgb','oof_pred_lgb_orig','oof_pred_lgb_final'])
        if oof_lgb is None:
            # try to load saved npy
            oof_lgb = try_load([os.path.join(WORK_DIR,'oof_pred_lgb.npy'), os.path.join(WORK_DIR,'oof_pred_lgb_orig.npy')])
        oof_mlp = find_in_memory(['oof_pred_mlp','oof_pred_mlp_orig'])
        if oof_mlp is None:
            oof_mlp = try_load([os.path.join(WORK_DIR,'oof_pred_mlp.npy')])

        if (oof_lgb is not None) and (oof_mlp is not None) and ('y' in globals()):
            oof_lgb = np.array(oof_lgb).reshape(-1)
            oof_mlp = np.array(oof_mlp).reshape(-1)
            y_arr = np.array(globals()['y']).reshape(-1)
            # grid search small set for best_w
            best_w_use = 0.0
            best_score = 1e9
            for w in np.linspace(0,1,101):
                blended = w * oof_lgb + (1 - w) * oof_mlp
                score = smape(y_arr, blended)
                if score < best_score:
                    best_score = score
                    best_w_use = w
            print(f"Computed best_w from OOFs: {best_w_use:.2f} (OOF SMAPE {best_score:.4f}%)")
        else:
            best_w_use = 0.5
            print("Could not compute best_w from OOFs. Falling back to best_w = 0.5 (equal blend).")

    # 4) Construct pred_sample_blend using available preds
    if (pred_sample_lgb is not None) and (pred_sample_mlp is not None):
        pred_sample_blend = best_w_use * pred_sample_lgb + (1 - best_w_use) * pred_sample_mlp
        print("Constructed pred_sample_blend by blending LGB and MLP sample preds.")
    elif (pred_sample_lgb is not None):
        pred_sample_blend = pred_sample_lgb
        print("Only LGB sample preds found; using those as pred_sample_blend.")
    elif (pred_sample_mlp is not None):
        pred_sample_blend = pred_sample_mlp
        print("Only MLP sample preds found; using those as pred_sample_blend.")
    else:
        # 5) FALLBACK: use group median per unit_extracted or global median
        print("No model sample predictions found. Using fallback: group median or global median.")
        # compute group median from train if possible
        if 'train' in globals() and 'sample_test' in globals() and 'price' in train.columns:
            try:
                grp_med = train.groupby('unit_extracted')['price'].median().to_dict()
                default_med = train['price'].median()
                sample_units = sample_test['unit_extracted'].fillna('').map(lambda x: x if x!='' else None)
                pred_sample_blend = []
                for u in sample_test['unit_extracted'].fillna(''):
                    val = grp_med.get(u, default_med)
                    pred_sample_blend.append(val)
                pred_sample_blend = np.array(pred_sample_blend)
                print("Fallback: used unit_extracted group medians (with global median fallback).")
            except Exception as e:
                print("Group median fallback failed:", e)
                default_med = float(train['price'].median()) if ('train' in globals() and 'price' in train.columns) else 10.0
                pred_sample_blend = np.full(len(sample_test), default_med)
                print("Fallback: using global median:", default_med)
        else:
            # no train available? just use global value 10.0
            pred_sample_blend = np.full(len(sample_test), 10.0)
            print("No train available: fallback to constant 10.0 predictions.")

# 6) Ensure arrays are numpy and clipped positive
pred_test_blend = np.array(globals().get('pred_test_blend')) if 'pred_test_blend' in globals() else None
if pred_test_blend is None:
    # try find test blend or construct from other test preds if available
    if 'pred_test_blend' in globals():
        pred_test_blend = np.array(globals()['pred_test_blend'])
    elif ('pred_test_lgb' in globals()) and ('pred_test_mlp' in globals()):
        bw = globals().get('best_w', 0.5)
        pred_test_blend = bw * np.array(globals()['pred_test_lgb']) + (1-bw) * np.array(globals()['pred_test_mlp'])
        print("Constructed pred_test_blend from pred_test_lgb and pred_test_mlp using best_w.")
    else:
        # try loading from files
        pred_test_blend = try_load([os.path.join(WORK_DIR,'pred_test_blend.npy'), os.path.join(WORK_DIR,'pred_test_blend.csv')])
        if pred_test_blend is None:
            raise RuntimeError("pred_test_blend not found. You must produce pred_test_blend or pred_test_lgb/pred_test_mlp first.")

pred_test_blend = np.clip(np.array(pred_test_blend).reshape(-1), 0.01, None)
pred_sample_blend = np.clip(np.array(pred_sample_blend).reshape(-1), 0.01, None)

# 7) Save outputs
out_test = pd.DataFrame({'sample_id': test['sample_id'], 'price': pred_test_blend})
out_sample = pd.DataFrame({'sample_id': sample_test['sample_id'], 'price': pred_sample_blend})

test_out_path = os.path.join(WORK_DIR, 'test_out.csv')
sample_out_path = os.path.join(WORK_DIR, 'sample_test_out.csv')

out_test.to_csv(test_out_path, index=False)
out_sample.to_csv(sample_out_path, index=False)

print("Saved outputs to:", test_out_path, sample_out_path)
print("test_out preview:")
display(out_test.head())
print("sample_test_out preview:")
display(out_sample.head())


pred_sample_blend not found in memory. Attempting to construct it...
pred_sample_lgb not found.
Found pred_sample_mlp shape: (100,)
Using best_w from memory: 0.01
Only MLP sample preds found; using those as pred_sample_blend.
Saved outputs to: /kaggle/working/test_out.csv /kaggle/working/sample_test_out.csv
test_out preview:


Unnamed: 0,sample_id,price
0,100179,14.519507
1,245611,14.131482
2,146263,19.992171
3,95658,10.229068
4,36806,30.934236


sample_test_out preview:


Unnamed: 0,sample_id,price
0,217392,51.338554
1,209156,16.353568
2,262333,3.250908
3,295979,17.740098
4,50604,17.147069


In [13]:
out_test = pd.DataFrame({'sample_id': test['sample_id'], 'price': pred_test_blend})
out_sample = pd.DataFrame({'sample_id': sample_test['sample_id'], 'price': pred_sample_blend})

test_out_path = os.path.join(KAGGLE_WORKING_DIR, 'test_out.csv')
sample_out_path = os.path.join(KAGGLE_WORKING_DIR, 'sample_test_out.csv')

out_test.to_csv(test_out_path, index=False)
out_sample.to_csv(sample_out_path, index=False)
print("Saved outputs to:", test_out_path, sample_out_path)
display(out_test.head())
display(out_sample.head())


NameError: name 'pred_sample_blend' is not defined

In [None]:
np.save(os.path.join(KAGGLE_WORKING_DIR, 'oof_pred_lgb.npy'), oof_pred_lgb)
np.save(os.path.join(KAGGLE_WORKING_DIR, 'oof_pred_mlp.npy'), oof_pred_mlp)
np.save(os.path.join(KAGGLE_WORKING_DIR, 'pred_test_blend.npy'), pred_test_blend)
np.save(os.path.join(KAGGLE_WORKING_DIR, 'pred_sample_blend.npy'), pred_sample_blend)
print("Saved OOF/pred arrays for debugging.")
