In [2]:
# STEP 1: Setup & Imports
# =======================

!pip install lightgbm --quiet

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
from scipy import sparse
import time

print("‚úÖ Libraries loaded successfully!")


‚úÖ Libraries loaded successfully!
‚úÖ Libraries loaded successfully!


In [3]:
# 0) Install (only if needed) and imports
# ------------------------------------------------
!pip install -q lightgbm tqdm scikit-learn joblib

import os, time, re, gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
import lightgbm as lgb
import joblib

# 1) Config & Paths (edit your dataset path here)
# ------------------------------------------------
DATA_DIR = "/content/drive/MyDrive/dataset"   # <-- change if needed
TRAIN_F = os.path.join(DATA_DIR, "train.csv")
TEST_F  = os.path.join(DATA_DIR, "test.csv")
OUTPUT_PRED = "test_out.csv"
ARTIFACTS_F = "model_artifacts.joblib"

RANDOM_STATE = 42
N_FOLDS = 5               # 5-fold CV as requested
TFIDF_MAX_WORD = 10000    # increased TF-IDF
TFIDF_MAX_CHAR = 3000
CHAR_NGRAM = (2,4)
WORD_NGRAM = (1,2)

LGB_NUM_BOOST_ROUND = 1000
EARLY_STOPPING_ROUNDS = 50

# Ensemble / tuning: two light configs (fast and robust)
LGB_CONFIGS = [
    {"name":"fast","num_leaves":31,"learning_rate":0.05,"n_jobs":2},
    {"name":"robust","num_leaves":64,"learning_rate":0.03,"n_jobs":2}
]

# Safety note displayed at runtime
print("CONFIG: TFIDF_WORD =", TFIDF_MAX_WORD, ", TFIDF_CHAR =", TFIDF_MAX_CHAR)
print("CONFIG: 5-fold CV, ensemble of", len(LGB_CONFIGS), "LightGBM models")
print()
DATA_DIR = "/content/drive/MyDrive/amazonimages"  # <-- change this
TRAIN_F = os.path.join(DATA_DIR, "train.csv")
TEST_F  = os.path.join(DATA_DIR, "test.csv")


CONFIG: TFIDF_WORD = 10000 , TFIDF_CHAR = 3000
CONFIG: 5-fold CV, ensemble of 2 LightGBM models



In [4]:
# 2) Mount Drive (if using Colab) - run once
# ------------------------------------------------
try:
    from google.colab import drive
    drive.mount('/content/drive')
except Exception:
    # Not in Colab or already mounted
    pass


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# 3) Utility functions
# ------------------------------------------------
def timer(start, text="Elapsed"):
    return f"{text}: {time.time()-start:.1f}s"

def smape(y_true, y_pred):
    y_true = np.array(y_true, dtype=np.float64)
    y_pred = np.array(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom[denom == 0] = 1e-8
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0

def safe_text(x):
    if pd.isna(x):
        return ""
    x = str(x).lower()
    x = re.sub(r"http\S+|www\S+", "", x)
    x = re.sub(r"[^a-z0-9\s]", " ", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x

# Step 3 is very fast; negligible time (~0.1s)


In [6]:
# 4) Load data
# ------------------------------------------------
# Check what files are present in the folder
!ls /content/drive/MyDrive/amazonimages   # <-- should list train.csv & test.csv

t0 = time.time()

# Update paths to your actual CSV files
TRAIN_F = "/content/drive/MyDrive/amazonimages/train.csv"
TEST_F  = "/content/drive/MyDrive/amazonimages/test.csv"

print("Loading data from:", TRAIN_F, TEST_F)
train = pd.read_csv(TRAIN_F)
test = pd.read_csv(TEST_F)
print("Loaded shapes -> train:", train.shape, " test:", test.shape)
print("Crash risk: ‚úÖ Very low (CSV load only).")

t1 = time.time()
print(f"Step 4 time: {t1-t0:.1f}s")




test.csv  train.csv
Loading data from: /content/drive/MyDrive/amazonimages/train.csv /content/drive/MyDrive/amazonimages/test.csv
Loaded shapes -> train: (75000, 4)  test: (75000, 3)
Crash risk: ‚úÖ Very low (CSV load only).
Step 4 time: 3.5s


In [7]:
# 5) Basic cleaning & tuned text fields
# ------------------------------------------------
t0 = time.time()
train['clean_text'] = train['catalog_content'].apply(safe_text)
test['clean_text']  = test['catalog_content'].apply(safe_text)
print("Cleaned text fields.")

t1 = time.time()
print(f"Step 5 time: {t1-t0:.1f}s")
print("Crash risk: ‚úÖ Very low.")


Cleaned text fields.
Step 5 time: 16.9s
Crash risk: ‚úÖ Very low.


In [8]:
# =========================
# Step 6: Numeric/text features (vectorized)
# =========================
t0 = time.time()

def extract_text_numeric_fast(df, src_col='catalog_content'):
    s = df[src_col].fillna("").astype(str)

    # Basic counts
    len_text = s.str.len().astype(np.int32)
    num_words = s.str.count(r'\S+').astype(np.int32)          # count words fast
    num_digits = s.str.count(r'\d').astype(np.int32)
    num_commas = s.str.count(',').astype(np.int32)
    # Use non-capturing group (?:...) to avoid warning
    has_unit = s.str.contains(r'\b(?:ml|g|kg|l|oz|pack|pcs|pair)\b', flags=re.IGNORECASE).astype(np.int8)

    # Ratios (safe division)
    digits_per_word = (num_digits / num_words.replace(0,1)).astype(np.float32)
    chars_per_word  = (len_text / num_words.replace(0,1)).astype(np.float32)

    return pd.DataFrame({
        'len_text': len_text,
        'num_words': num_words,
        'num_digits': num_digits,
        'num_commas': num_commas,
        'has_unit': has_unit,
        'digits_per_word': digits_per_word,
        'chars_per_word': chars_per_word
    })

# Apply to train and test
num_feat_train = extract_text_numeric_fast(train, 'catalog_content')
num_feat_test  = extract_text_numeric_fast(test, 'catalog_content')

print("Numeric text features shape:", num_feat_train.shape)
print(timer(t0, "Step 6 time"))
print("Crash risk: ‚úÖ Very low (vectorized, CPU-friendly).")


Numeric text features shape: (75000, 7)
Step 6 time: 15.7s
Crash risk: ‚úÖ Very low (vectorized, CPU-friendly).


In [9]:
t0 = time.time()

def extract_image_url_features_fast(df, col='image_link'):
    # Ensure Series
    s = df[col] if isinstance(df[col], pd.Series) else pd.Series(df[col])
    s = s.fillna('').astype(str)

    # Basic URL patterns
    img_len = s.str.len().astype(np.int32)
    img_https = s.str.startswith('https').astype(np.int8)
    img_num_digits = s.str.count(r'\d').astype(np.int32)
    img_num_letters = s.str.count(r'[A-Za-z]').astype(np.int32)
    img_num_slash = s.str.count('/').astype(np.int32)
    img_num_dash = s.str.count('-').astype(np.int32)
    img_jpg = s.str.lower().str.contains('.jpg').astype(np.int8)
    img_png = s.str.lower().str.contains('.png').astype(np.int8)

    # Extract filename part
    file_part = s.str.extract(r'/([^/]+\.(?:jpg|png))', expand=False)
    file_part = file_part.fillna('')
    file_len = file_part.str.len().astype(np.int32)
    file_digits = file_part.str.count(r'\d').astype(np.int32)
    file_letters = file_part.str.count(r'[A-Za-z]').astype(np.int32)

    # Last 2 chars ASCII sum
    last2 = file_part.str[-2:].fillna('00')
    last2_code = (last2.str[0].apply(ord) + last2.str[1].apply(ord)).astype(np.int32)

    return pd.DataFrame({
        'img_len': img_len,
        'img_https': img_https,
        'img_num_digits': img_num_digits,
        'img_num_letters': img_num_letters,
        'img_num_slash': img_num_slash,
        'img_num_dash': img_num_dash,
        'img_jpg': img_jpg,
        'img_png': img_png,
        'img_file_len': file_len,
        'img_file_digits': file_digits,
        'img_file_letters': file_letters,
        'img_last2_code': last2_code
    })

# Apply to train and test
img_feat_train = extract_image_url_features_fast(train)
img_feat_test  = extract_image_url_features_fast(test)

print("Image URL features shape:", img_feat_train.shape)
print(timer(t0, "Step 7 time"))
print("Crash risk: ‚úÖ Very low (vectorized, CPU-friendly).")


Image URL features shape: (75000, 12)
Step 7 time: 4.1s
Crash risk: ‚úÖ Very low (vectorized, CPU-friendly).


In [10]:
from sklearn.feature_extraction.text import HashingVectorizer

t0 = time.time()
print("Using HashingVectorizer for CPU-friendly TF-IDF...")

# Word-level
hv_word = HashingVectorizer(
    n_features=10000,   # same as TF-IDF max_features
    ngram_range=(1,2),
    alternate_sign=False,  # positive values only
    norm='l2',
    dtype=np.float32
)

# Char-level
hv_char = HashingVectorizer(
    n_features=3000,
    analyzer='char',
    ngram_range=(2,4),
    alternate_sign=False,
    norm='l2',
    dtype=np.float32
)

# Transform train + test
Xw_train = hv_word.transform(train['clean_text'])
Xw_test  = hv_word.transform(test['clean_text'])
Xc_train = hv_char.transform(train['clean_text'])
Xc_test  = hv_char.transform(test['clean_text'])

print("HashingVectorizer shapes -> word:", Xw_train.shape, " char:", Xc_train.shape)
print(timer(t0, "Step 8 time"))
print("Crash risk: ‚úÖ Very low (fast + memory-efficient)")


Using HashingVectorizer for CPU-friendly TF-IDF...
HashingVectorizer shapes -> word: (75000, 10000)  char: (75000, 3000)
Step 8 time: 242.9s
Crash risk: ‚úÖ Very low (fast + memory-efficient)


In [11]:
from scipy.sparse import hstack, csr_matrix
import gc
import time

t0 = time.time()
print("Step 9: Stacking all features into sparse matrix...")

# -----------------------------
# Numeric features ‚Üí sparse
# -----------------------------
num_train_sp = csr_matrix(num_feat_train.values.astype(np.float32))
num_test_sp  = csr_matrix(num_feat_test.values.astype(np.float32))

# -----------------------------
# Image URL features ‚Üí sparse
# -----------------------------
img_train_sp = csr_matrix(img_feat_train.values.astype(np.float32))
img_test_sp  = csr_matrix(img_feat_test.values.astype(np.float32))

# -----------------------------
# Stack: TF-IDF + numeric + image
# -----------------------------
X_train = hstack([Xw_train, Xc_train, num_train_sp, img_train_sp], format='csr')
X_test  = hstack([Xw_test, Xc_test, num_test_sp, img_test_sp], format='csr')

print("Combined feature shapes -> X_train:", X_train.shape, " X_test:", X_test.shape)
print(timer(t0, "Step 9 time"))

# -----------------------------
# Clean up to free memory
# -----------------------------
del num_train_sp, num_test_sp, img_train_sp, img_test_sp
gc.collect()

print("Crash risk: ‚úÖ Very low (sparse, CPU-friendly, ready for training)")


Step 9: Stacking all features into sparse matrix...
Combined feature shapes -> X_train: (75000, 13019)  X_test: (75000, 13019)
Step 9 time: 3.2s
Crash risk: ‚úÖ Very low (sparse, CPU-friendly, ready for training)


In [12]:
import numpy as np
import time

t0 = time.time()
print("Step 10: Preparing target variable...")

# Fill missing prices and ensure float32
y = train['price'].fillna(0).values.astype(np.float32)

# Clip negative values to 0 (just in case)
y = np.maximum(y, 0.0)

# Apply log1p transform for skew reduction
y_log = np.log1p(y)

print("Target prepared: original shape =", y.shape)
print("Sample log1p values:", y_log[:5])
print(timer(t0, "Step 10 time"))
print("Crash risk: ‚úÖ Very low")


Step 10: Preparing target variable...
Target prepared: original shape = (75000,)
Sample log1p values: [1.773256  2.6475923 1.088562  3.4448953 4.2119794]
Step 10 time: 0.0s
Crash risk: ‚úÖ Very low


In [13]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import KFold
import numpy as np
import gc
import time

t0_total = time.time()
N_FOLDS = 5
RANDOM_STATE = 42
TEST_BATCH_SIZE = 10000  # safer for weaker CPUs

# Ensemble configs
LGB_CONFIGS = [
    {"name": "fast", "num_leaves": 31, "learning_rate": 0.05, "n_jobs": 2},
    {"name": "robust", "num_leaves": 64, "learning_rate": 0.03, "n_jobs": 2}
]

# Containers for OOF and test predictions
oof_preds_all = np.zeros(len(train), dtype=np.float64)
test_preds_all = np.zeros(X_test.shape[0], dtype=np.float64)

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# SMAPE function
def smape(y_true, y_pred):
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom[denom == 0] = 1e-8
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0

# -----------------------------
# Start CPU-safe ensemble training
# -----------------------------
for cfg_idx, cfg in enumerate(LGB_CONFIGS, 1):
    print(f"\n=== Config {cfg_idx}/{len(LGB_CONFIGS)}: {cfg['name']} ===")
    oof_preds = np.zeros(len(train), dtype=np.float64)
    test_preds = np.zeros(X_test.shape[0], dtype=np.float64)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train), 1):
        print(f"\n-- Fold {fold}/{N_FOLDS} --")
        t_fold = time.time()

        # -----------------------------
        # Train on current fold
        # -----------------------------
        X_tr, X_val = X_train[train_idx], X_train[valid_idx]
        y_tr, y_val = y_log[train_idx], y_log[valid_idx]

        lgb_params = {
            "objective": "regression",
            "metric": "rmse",
            "boosting_type": "gbdt",
            "learning_rate": cfg['learning_rate'],
            "num_leaves": cfg['num_leaves'],
            "feature_fraction": 0.8,
            "bagging_fraction": 0.9,
            "bagging_freq": 1,
            "verbose": -1,
            "seed": RANDOM_STATE,
            "n_jobs": cfg['n_jobs']
        }

        dtrain = lgb.Dataset(X_tr, label=y_tr)
        dvalid = lgb.Dataset(X_val, label=y_val)

        # -----------------------------
        # Early stopping callback
        # -----------------------------
        model = lgb.train(
            lgb_params,
            dtrain,
            num_boost_round=1000,
            valid_sets=[dtrain, dvalid],
            valid_names=['train','valid'],
            callbacks=[
                early_stopping(stopping_rounds=50),
                log_evaluation(period=100)  # prints progress every 100 rounds
            ]
        )

        # -----------------------------
        # Validation predictions
        # -----------------------------
        val_pred_log = model.predict(X_val, num_iteration=model.best_iteration)
        oof_preds[valid_idx] = np.expm1(val_pred_log)

        # -----------------------------
        # Test predictions in chunks
        # -----------------------------
        test_pred_log = np.zeros(X_test.shape[0], dtype=np.float32)
        for start in range(0, X_test.shape[0], TEST_BATCH_SIZE):
            end = min(start + TEST_BATCH_SIZE, X_test.shape[0])
            test_pred_log[start:end] = model.predict(X_test[start:end], num_iteration=model.best_iteration)
        test_preds += np.expm1(test_pred_log)

        print(f"Fold {fold} done. Fold time: {time.time()-t_fold:.1f}s")

        # -----------------------------
        # Free memory
        # -----------------------------
        del X_tr, X_val, y_tr, y_val, dtrain, dvalid, model, val_pred_log, test_pred_log
        gc.collect()

    # Average test predictions over folds
    test_preds /= N_FOLDS

    # Accumulate ensemble
    oof_preds_all += oof_preds
    test_preds_all += test_preds

    # SMAPE for this config
    config_smape = smape(y, oof_preds)
    print(f"Config '{cfg['name']}' OOF SMAPE: {config_smape:.4f}%")

# -----------------------------
# Final ensemble average across configs
# -----------------------------
oof_preds_all /= len(LGB_CONFIGS)
test_preds_all /= len(LGB_CONFIGS)

final_smape = smape(y, oof_preds_all)
print("\nFinal ensemble OOF SMAPE: {:.4f}%".format(final_smape))
print("Step 11 total time: {:.1f}s (~{:.1f} min)".format(time.time()-t0_total, (time.time()-t0_total)/60))

# -----------------------------
# Safety notes (comments only)
# -----------------------------
# 1) Keep TEST_BATCH_SIZE <= 15k (or 10k if RAM is low)
# 2) Close other heavy programs while training
# 3) Sparse matrices + float32 save memory
# 4) 5-fold CV is a good tradeoff between accuracy and speed
# 5) Expected OOF SMAPE with this setup: ~40‚Äì45%
# 6) CPU-friendly: no step should crash your system



=== Config 1/2: fast ===

-- Fold 1/5 --
Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 0.714817	valid's rmse: 0.760125
[200]	train's rmse: 0.662427	valid's rmse: 0.734221
[300]	train's rmse: 0.627348	valid's rmse: 0.723591
[400]	train's rmse: 0.599106	valid's rmse: 0.717126
[500]	train's rmse: 0.574095	valid's rmse: 0.712499
[600]	train's rmse: 0.552088	valid's rmse: 0.709302
[700]	train's rmse: 0.53158	valid's rmse: 0.706656
[800]	train's rmse: 0.512998	valid's rmse: 0.704324
[900]	train's rmse: 0.495443	valid's rmse: 0.702808
[1000]	train's rmse: 0.479188	valid's rmse: 0.701475
Did not meet early stopping. Best iteration is:
[1000]	train's rmse: 0.479188	valid's rmse: 0.701475
Fold 1 done. Fold time: 2624.0s

-- Fold 2/5 --
Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 0.718503	valid's rmse: 0.745107
[200]	train's rmse: 0.666453	valid's rmse: 0.718768
[300]	train's rmse: 0.630971	valid's rmse: 0.708521
[400]	train

KeyboardInterrupt: 

In [None]:
# ================================================
# Step 11b ‚Äì Mini Config 2 (3 folds x 600 rounds)
# ================================================
from sklearn.model_selection import KFold
import lightgbm as lgb
import numpy as np
import gc
import time

# 3-fold CV, max 600 rounds
MINI_N_FOLDS = 3
MINI_MAX_ROUNDS = 600

kf_mini = KFold(n_splits=MINI_N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# mini Config 2
mini_cfg = {"name":"robust_mini","num_leaves":64,"learning_rate":0.03,"n_jobs":2}

oof_preds_mini = np.zeros(len(train), dtype=np.float64)
test_preds_mini = np.zeros(X_test.shape[0], dtype=np.float64)

fold = 0
t_total = time.time()
for train_idx, valid_idx in kf_mini.split(X_train):
    fold += 1
    t_fold = time.time()
    print(f"\n-- Mini Fold {fold}/{MINI_N_FOLDS} --")

    X_tr = X_train[train_idx]
    X_val = X_train[valid_idx]
    y_tr = y_log[train_idx]
    y_val = y_log[valid_idx]

    lgb_params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "learning_rate": mini_cfg['learning_rate'],
        "num_leaves": mini_cfg['num_leaves'],
        "max_depth": -1,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.9,
        "bagging_freq": 1,
        "verbose": -1,
        "seed": RANDOM_STATE,
        "n_jobs": mini_cfg['n_jobs']
    }

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dvalid = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=MINI_MAX_ROUNDS,
        valid_sets=[dtrain, dvalid],
        valid_names=['train','valid'],
        verbose_eval=100
    )

    # Predict log1p space -> convert back
    val_pred_log = model.predict(X_val, num_iteration=model.best_iteration)
    oof_preds_mini[valid_idx] = np.expm1(val_pred_log)

    test_pred_log = model.predict(X_test, num_iteration=model.best_iteration)
    test_preds_mini += np.expm1(test_pred_log)

    print(f"Mini Fold {fold} done. Fold time: {time.time()-t_fold:.1f}s")

    # free memory
    del X_tr, X_val, y_tr, y_val, dtrain, dvalid, model, val_pred_log, test_pred_log
    gc.collect()

# average test predictions over folds
test_preds_mini /= MINI_N_FOLDS

# combine with Config 1 predictions (already done)
final_test_preds = (test_preds_all + test_preds_mini) / 2.0  # simple average ensemble

# OOF SMAPE for mini Config 2
def smape(y_true, y_pred):
    y_true = np.array(y_true, dtype=np.float64)
    y_pred = np.array(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom[denom == 0] = 1e-8
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0

mini_oof_smape = smape(y, oof_preds_mini)
final_oof_smape = smape(y, (oof_preds_all + oof_preds_mini)/2.0)
print(f"\nMini Config 2 OOF SMAPE: {mini_oof_smape:.4f}%")
print(f"Final ensemble OOF SMAPE: {final_oof_smape:.4f}%")
print(f"Mini 3-fold training done in {time.time()-t_total:.1f}s (~{(time.time()-t_total)/60:.1f} min)")


In [15]:
# =========================================================
# Step 12) Save artifacts safely (vectorizers + metadata)
# =========================================================
import joblib, os, gc

t0 = time.time()
print("\n=== Step 12: Saving artifacts safely ===")

try:
    # Create artifacts dictionary (no large models inside)
    artifacts = {
        "tfidf_word": tfidf_word,
        "tfidf_char": tfidf_char,
        "num_feat_columns": list(num_feat_train.columns),
        "img_feat_columns": list(img_feat_train.columns),
        "lgb_configs": LGB_CONFIGS,
        "final_oof_smape": final_smape,
        "random_state": RANDOM_STATE,
        "n_folds": N_FOLDS
    }

    # Save using joblib (compressed for size efficiency)
    joblib.dump(artifacts, ARTIFACTS_F, compress=3)

    # Verification step
    if os.path.exists(ARTIFACTS_F):
        size_mb = os.path.getsize(ARTIFACTS_F) / (1024 * 1024)
        print(f"‚úÖ Artifacts saved successfully ‚Üí {ARTIFACTS_F} ({size_mb:.2f} MB)")
    else:
        print("‚ö† Warning: Artifacts file not found after save attempt!")

except Exception as e:
    print("‚ùå Error saving artifacts:", str(e))
    # Backup attempt
    backup_path = ARTIFACTS_F.replace(".joblib", "_backup.joblib")
    try:
        joblib.dump(artifacts, backup_path, compress=1)
        print(f"üü° Saved backup artifacts to {backup_path}")
    except Exception as e2:
        print("‚ùå Backup save also failed:", str(e2))

# Memory cleanup
gc.collect()
print(f"Step 12 completed in {time.time() - t0:.2f}s")
print("Crash risk: ‚úÖ Very low (small dictionary, no large model objects).")



=== Step 12: Saving artifacts safely ===
‚ùå Error saving artifacts: name 'tfidf_word' is not defined
‚ùå Backup save also failed: name 'artifacts' is not defined
Step 12 completed in 1.97s
Crash risk: ‚úÖ Very low (small dictionary, no large model objects).


In [16]:
# =========================================================
# Step 13) Create and save submission (no dependency on Step 12)
# =========================================================
import pandas as pd, numpy as np, os, time

t0 = time.time()
print("\n=== Step 13: Creating and saving submission file (independent) ===")

try:
    # --- Safety checks ---
    if 'test' not in globals() or 'test_preds_all' not in globals():
        raise RuntimeError("Required variables not found in memory: please ensure 'test' and 'test_preds_all' exist.")

    # --- Create submission DataFrame ---
    submission = pd.DataFrame({
        "sample_id": test['sample_id'].values,
        "price": np.maximum(test_preds_all, 0.0)
    })

    # --- Save submission to CSV ---
    submission.to_csv("test_out.csv", index=False)

    # --- Verify file saved ---
    if os.path.exists("test_out.csv"):
        size_kb = os.path.getsize("test_out.csv") / 1024
        print(f"‚úÖ Submission saved successfully ‚Üí test_out.csv  ({size_kb:.1f} KB)")
        print(f"üìÑ Shape: {submission.shape}")
        print("\nTop 5 rows preview:")
        display(submission.head())
    else:
        print("‚ö† Warning: CSV not found after saving attempt!")

except Exception as e:
    print("‚ùå Submission creation failed:", str(e))
    # Try backup
    try:
        submission.to_csv("test_out_backup.csv", index=False)
        print("üü° Backup saved to test_out_backup.csv")
    except:
        print("Backup save also failed.")

print(f"Step 13 completed in {time.time()-t0:.2f}s")
print("üöÄ You can now upload 'test_out.csv' to the Amazon ML Challenge portal.")



=== Step 13: Creating and saving submission file (independent) ===
‚úÖ Submission saved successfully ‚Üí test_out.csv  (1835.5 KB)
üìÑ Shape: (75000, 2)

Top 5 rows preview:


Unnamed: 0,sample_id,price
0,100179,15.800496
1,245611,10.833378
2,146263,25.940461
3,95658,10.765281
4,36806,33.907073


Step 13 completed in 0.55s
üöÄ You can now upload 'test_out.csv' to the Amazon ML Challenge portal.


In [17]:
import numpy as np

def smape(y_true, y_pred):
    y_true = np.array(y_true, dtype=np.float64)
    y_pred = np.array(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom[denom == 0] = 1e-8
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0

# Calculate
final_smape_check = smape(y, oof_preds_all)
print(f"‚úÖ Final SMAPE on training (OOF): {final_smape_check:.4f}%")


‚úÖ Final SMAPE on training (OOF): 53.2082%


In [18]:
!ls -lh /content | grep test_out.csv


-rw-r--r-- 1 root root 1.8M Oct 13 14:38 test_out.csv


In [19]:
from google.colab import files
files.download("/content/test_out.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
def smape(y_true, y_pred):
    y_true = np.array(y_true, dtype=np.float64)
    y_pred = np.array(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom[denom == 0] = 1e-8
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0


In [21]:
final_smape = smape(y, oof_preds_all)
print("Final OOF SMAPE:", round(final_smape, 4), "%")
print("Estimated accuracy:", round(100 - final_smape, 2), "%")


Final OOF SMAPE: 53.2082 %
Estimated accuracy: 46.79 %
