In [None]:
# Run in a single code cell
!pip uninstall -y pillow
!pip install pillow==10.3.0
!pip install -U sentence-transformers transformers lightgbm xgboost scikit-learn tqdm requests
# If you want to use GPU optimizations (optional): accelerate/bitsandbytes not required here


Found existing installation: pillow 11.3.0
Uninstalling pillow-11.3.0:
  Successfully uninstalled pillow-11.3.0
Collecting pillow==10.3.0
  Downloading pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Downloading pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pillow
Successfully installed pillow-10.3.0


Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m110.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests-2.32.5-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: requests, scikit-learn
  Attempting uninstall: requests
    Found existing installation: requests 2.32.4
    Uninstalling requests-2.32.4:
      Successfully uninstalled requests-2.32.4
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-l

In [None]:
import os, zipfile, tarfile
from pathlib import Path

ROOT = '/content'

# Unzip infiniper.zip when the archive is present. Extraction is unconditional,
# so files already under ROOT are overwritten.
zip_path = os.path.join(ROOT, 'infiniper.zip')
if os.path.exists(zip_path):
    print("Extracting infiniper.zip ...")
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall(ROOT)
else:
    print("infiniper.zip not found at", zip_path)

# Extract processed_images.tar (contains processed_images folder or images tar)
tar_path = os.path.join(ROOT, 'processed_images.tar')
if os.path.exists(tar_path):
    print("Extracting processed_images.tar ...")
    with tarfile.open(tar_path, 'r') as tar:
        # Use an explicit extraction filter where the interpreter supports it:
        # 'data' refuses absolute paths / path traversal and avoids the
        # DeprecationWarning that a filter-less extractall() emits on 3.12+.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(ROOT, filter='data')
        else:
            tar.extractall(ROOT)
else:
    print("processed_images.tar not found at", tar_path)

# List dataset files; the rest of the notebook cannot run without this folder.
data_dir = os.path.join(ROOT, 'dataset')
print("Dataset folder exists?", os.path.exists(data_dir))
print("Dataset listing:")
if os.path.exists(data_dir):
    print(os.listdir(data_dir))
else:
    raise FileNotFoundError("Please upload the dataset folder into Colab /content/dataset")


Extracting infiniper.zip ...
Extracting processed_images.tar ...


  tar.extractall(ROOT)


Dataset folder exists? True
Dataset listing:
['sample_test.csv', 'train.csv', 'sample_test_out.csv', 'test.csv']


In [None]:
# Imports
import os, sys, re, gc
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# SMAPE
def smape(y_true, y_pred, eps=1e-8):
    """Symmetric mean absolute percentage error, expressed in percent.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions (same shape as ``y_true``).
        eps: Replacement denominator used where both values are zero.

    Returns:
        Mean of ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)`` times 100.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    half_sum = (np.abs(actual) + np.abs(predicted)) / 2.0
    # A zero denominator only happens when both values are zero; swap in eps
    # so the division is defined (the numerator is zero there as well).
    safe_half_sum = np.where(half_sum == 0, eps, half_sum)
    ratios = np.abs(predicted - actual) / safe_half_sum
    return np.mean(ratios) * 100.0


In [None]:
DATA_DIR = '/content/dataset'

# Read the three splits from DATA_DIR, in the order train / test / sample_test.
train, test, sample_test = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_test.csv')
)

print("Shapes:", train.shape, test.shape, sample_test.shape)
print("Train price stats:")
print(train['price'].describe())

# Preview the first two rows of every split
display(train.head(2), test.head(2), sample_test.head(2))


Shapes: (75000, 4) (75000, 3) (100, 3)
Train price stats:
count    75000.000000
mean        23.647654
std         33.376932
min          0.130000
25%          6.795000
50%         14.000000
75%         28.625000
max       2796.000000
Name: price, dtype: float64


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12


Unnamed: 0,sample_id,catalog_content,image_link
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...


Unnamed: 0,sample_id,catalog_content,image_link
0,217392,Item Name: Gift Basket Village Gourmet Meat an...,https://m.media-amazon.com/images/I/91GB1wC6Ob...
1,209156,"Item Name: NPG Dried Lotus Seeds 16 Oz, Uncook...",https://m.media-amazon.com/images/I/81VnzF1vkv...


In [None]:
import math
from tqdm import tqdm
tqdm.pandas()

def parse_value_unit(text):
    """Extract the numeric ``Value`` and the ``Unit`` label from catalog text.

    The first ``Value <number>`` and the first ``Unit <label>`` occurrences are
    used (case-insensitive; the separator may be any mix of colons and
    whitespace). The keywords must stand alone as words, so substrings of
    words such as "United" or "Community" are not mistaken for the Unit field.

    Args:
        text: Raw catalog text. Non-string input (e.g. NaN) counts as missing.

    Returns:
        tuple: ``(value, unit)`` where ``value`` is a float (``np.nan`` when
        no value is found) and ``unit`` is the stripped label (``""`` when
        no unit is found).
    """
    if not isinstance(text, str):
        return np.nan, ""

    val = np.nan
    m = re.search(r'\bValue[:\s]*([0-9]+(?:\.[0-9]+)?)', text, flags=re.IGNORECASE)
    if m:
        try:
            val = float(m.group(1))
        except ValueError:
            val = np.nan

    unit = ""
    # \b on both sides: without it "Unit" also matches inside "United",
    # "Units" or "Community" and the remainder of that line becomes the unit.
    m2 = re.search(r'\bUnit\b[:\s]*([A-Za-z0-9%/ ._-]+)', text, flags=re.IGNORECASE)
    if m2:
        unit = m2.group(1).strip()
    return val, unit

def basic_text_features(df, col='catalog_content'):
    """Add simple text statistics and the parsed Value/Unit fields to ``df``.

    The frame is modified in place (missing text becomes ``''`` and the
    columns ``char_len``, ``word_count``, ``num_digits``, ``value_extracted``
    and ``unit_extracted`` are added) and is also returned.

    Args:
        df: DataFrame holding the text column.
        col: Name of the text column.

    Returns:
        The same DataFrame object that was passed in.
    """
    df[col] = df[col].fillna('')
    text = df[col]

    df['char_len'] = text.str.len()
    df['word_count'] = text.str.split().apply(len)
    df['num_digits'] = text.str.count(r'\d').fillna(0).astype(int)

    # Parse Value/Unit row by row, with a tqdm progress bar
    parsed = [parse_value_unit(entry) for entry in tqdm(text.tolist(), desc=f'Parsing {col}')]
    vals, units = zip(*parsed)
    df['value_extracted'] = list(vals)
    df['unit_extracted'] = list(units)
    return df

# Add the engineered text columns to every split. basic_text_features
# modifies the frame it receives and returns that same frame.
print("Parsing train...")
train = basic_text_features(train)
print("Parsing test...")
test = basic_text_features(test)
print("Parsing sample_test...")
sample_test = basic_text_features(sample_test)

# Quick check of the parsed fields on the first rows
display(train[['sample_id','value_extracted','unit_extracted','char_len','word_count']].head(3))


Parsing train...


Parsing catalog_content: 100%|██████████| 75000/75000 [00:01<00:00, 46307.80it/s]


Parsing test...


Parsing catalog_content: 100%|██████████| 75000/75000 [00:01<00:00, 44616.08it/s]


Parsing sample_test...


Parsing catalog_content: 100%|██████████| 100/100 [00:00<00:00, 23949.66it/s]


Unnamed: 0,sample_id,value_extracted,unit_extracted,char_len,word_count
0,33127,72.0,Fl Oz,91,18
1,198967,32.0,Ounce,511,80
2,261251,11.4,Ounce,328,59


In [None]:
# Directory expected to hold the extracted product images. Each file is looked
# up by the last path component of its image_link (Path(image_link).name).
IMAGES_DIR = '/content/processed_images'  # <-- change if needed

# Confirm the directory exists; otherwise fall back to /content/images and
# fail fast when neither location is present.
print("IMAGES_DIR exists:", os.path.exists(IMAGES_DIR))
if not os.path.exists(IMAGES_DIR):
    # The archive may have been extracted into /content/images instead
    if os.path.exists('/content/images'):
        IMAGES_DIR = '/content/images'
        print("Using /content/images instead.")
    else:
        raise FileNotFoundError("Processed images not found. Please set IMAGES_DIR to the correct location.")

def image_path_from_link(link):
    """Return the expected local path under IMAGES_DIR for an image URL.

    Non-string links (e.g. NaN) map to an empty string.
    """
    if isinstance(link, str):
        return os.path.join(IMAGES_DIR, Path(link).name)
    return ''

# Attach the expected local image path to every row of each split
train['image_path'] = train['image_link'].map(image_path_from_link)
test['image_path']  = test['image_link'].map(image_path_from_link)
sample_test['image_path'] = sample_test['image_link'].map(image_path_from_link)

# Count rows whose image file is absent on disk; an empty path (produced for
# a non-string link) is counted as missing too.
for name, df in [('train',train), ('test', test), ('sample_test', sample_test)]:
    missing = df['image_path'].apply(lambda p: not os.path.exists(p) if p else True).sum()
    total = len(df)
    print(f"{name}: {missing}/{total} images missing (paths may be wrong).")


IMAGES_DIR exists: True
train: 5/75000 images missing (paths may be wrong).
test: 7/75000 images missing (paths may be wrong).
sample_test: 0/100 images missing (paths may be wrong).


In [None]:
# Insert this new cell after Cell 5

import re
import os
import pandas as pd
import multiprocessing
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib

# --- Your provided download functions ---
def download_image(image_link, savefolder):
    """Download one image into ``savefolder`` unless it is already present.

    The local file name is the last path component of the URL. The download is
    best-effort: failures are swallowed so that a single bad link does not
    abort a bulk download, and no file is left behind for a failed transfer.

    Args:
        image_link: URL of the image. Non-string values are ignored.
        savefolder: Existing directory that receives the file.

    Returns:
        None.
    """
    # A bare ``import urllib`` does not guarantee the ``request`` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(savefolder, filename)
    if os.path.exists(image_save_path):
        return

    # Download to a per-process temporary name and rename on success. Writing
    # straight to the final path could leave a truncated file after an
    # interrupted transfer, and later runs would skip it as already present.
    tmp_path = f"{image_save_path}.{os.getpid()}.part"
    try:
        urllib.request.urlretrieve(image_link, tmp_path)
        os.replace(tmp_path, image_save_path)
    except Exception:
        # Best-effort by design: stay silent, but remove any partial file.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
    return

def download_images(image_links, download_folder, num_workers=100):
    """Download many images in parallel into ``download_folder``.

    Args:
        image_links: Iterable of image URLs. Non-string and empty entries are
            skipped.
        download_folder: Target directory; created if it does not exist.
        num_workers: Upper bound on the number of worker processes.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(download_folder, exist_ok=True)

    # Filter out any non-string or empty links before starting
    links_to_process = [link for link in image_links if isinstance(link, str) and link]
    if not links_to_process:
        # Nothing to do; do not spawn a process pool for an empty job list
        return

    download_image_partial = partial(download_image, savefolder=download_folder)
    # Never start more workers than there are links to fetch
    pool_size = max(1, min(num_workers, len(links_to_process)))
    with multiprocessing.Pool(pool_size) as pool:
        # Drain the iterator so every download runs; tqdm shows the progress
        for _ in tqdm(pool.imap(download_image_partial, links_to_process),
                      total=len(links_to_process), desc="Downloading images"):
            pass
        pool.close()
        pool.join()

# --- Main download logic ---
IMAGES_DIR = '/content/processed_images' # This should be the same as in Cell 5

# Unique image links within each split
test_links = test['image_link'].unique().tolist()
sample_test_links = sample_test['image_link'].unique().tolist()

# Combine both splits and drop links they share, so the count printed below is
# really a unique count and no URL is scheduled twice. dict.fromkeys keeps the
# first-seen order.
all_links_to_download = list(dict.fromkeys(test_links + sample_test_links))

print(f"Found {len(all_links_to_download)} unique image links to download for test and sample_test sets.")

# Run the download (files that already exist are skipped by download_image)
download_images(all_links_to_download, IMAGES_DIR)

print("\nImage download process complete! ✅")

Found 72322 unique image links to download for test and sample_test sets.


Downloading images: 100%|██████████| 72322/72322 [06:28<00:00, 186.34it/s]



Image download process complete! ✅


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

TEXT_EMB_DIR = '/content/embeddings_text'
os.makedirs(TEXT_EMB_DIR, exist_ok=True)

# load model
print("Loading SBERT text model (all-MiniLM-L6-v2)...")
text_model = SentenceTransformer('all-MiniLM-L6-v2')

# Helper to compute and/or load
def compute_or_load_text_emb(name, texts, save_path):
    """Return sentence embeddings for ``texts``, cached on disk as ``.npy``.

    A cached file is reused only when its row count matches ``len(texts)``;
    otherwise the embeddings are recomputed with ``text_model`` and the cache
    is overwritten.

    Args:
        name: Label used in progress messages.
        texts: List of strings to embed.
        save_path: Path of the ``.npy`` cache file.

    Returns:
        numpy array with one embedding row per text.
    """
    if os.path.exists(save_path):
        cached = np.load(save_path)
        # Guard against a stale cache (e.g. built from a different dataset):
        # misaligned rows would silently corrupt the feature matrix later.
        if cached.ndim >= 1 and cached.shape[0] == len(texts):
            print(f"Loading saved text embeddings from {save_path}")
            return cached
        print(f"Cached text embeddings at {save_path} do not match {len(texts)} texts; recomputing.")
    print(f"Computing text embeddings for {name} (this can take a bit)...")
    # model.encode supports progress bar internally
    emb = text_model.encode(texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
    np.save(save_path, emb)
    return emb

# Cache locations for the text embeddings of each split
train_text_emb_path = os.path.join(TEXT_EMB_DIR, 'train_text_emb.npy')
test_text_emb_path  = os.path.join(TEXT_EMB_DIR, 'test_text_emb.npy')
sample_text_emb_path= os.path.join(TEXT_EMB_DIR, 'sample_text_emb.npy')

# Embed the raw catalog text of each split (or load it from the cache files above)
train_text_emb = compute_or_load_text_emb('train', train['catalog_content'].tolist(), train_text_emb_path)
test_text_emb = compute_or_load_text_emb('test', test['catalog_content'].tolist(), test_text_emb_path)
sample_text_emb = compute_or_load_text_emb('sample_test', sample_test['catalog_content'].tolist(), sample_text_emb_path)

print("Text emb shapes:", train_text_emb.shape, test_text_emb.shape, sample_text_emb.shape)


In [None]:
# Image embeddings using CLIP (transformers) with tqdm and save/load
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

IMAGE_EMB_DIR = '/content/embeddings_image'
os.makedirs(IMAGE_EMB_DIR, exist_ok=True)

clip_model_name = "openai/clip-vit-base-patch32"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device for CLIP:", device)

# Load CLIP
print("Loading CLIP model...")
clip_model = CLIPModel.from_pretrained(clip_model_name).to(device)
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
clip_model.eval()

def compute_or_load_image_emb(name, paths, save_path, batch_size=32):
    """Return CLIP image embeddings for ``paths``, cached on disk as ``.npy``.

    A cached file is reused only when its row count matches ``len(paths)``.
    Images that are missing or cannot be decoded are replaced by a blank white
    224x224 placeholder so the output keeps one row per input path.

    Args:
        name: Label used in progress messages.
        paths: List of local image paths (empty / non-string entries allowed).
        save_path: Path of the ``.npy`` cache file.
        batch_size: Number of images sent through CLIP per forward pass.

    Returns:
        numpy array with one embedding row per path.
    """
    n = len(paths)
    if os.path.exists(save_path):
        cached = np.load(save_path)
        # Guard against a stale cache whose rows no longer line up with paths
        if cached.ndim >= 1 and cached.shape[0] == n:
            print(f"Loading saved image embeddings from {save_path}")
            return cached
        print(f"Cached image embeddings at {save_path} do not match {n} paths; recomputing.")
    print(f"Computing CLIP image embeddings for {name} (batch_size={batch_size})...")
    embs = []
    for i in tqdm(range(0, n, batch_size), desc=f"CLIP image emb {name}", ncols=100):
        batch_paths = paths[i:i+batch_size]
        imgs = []
        for p in batch_paths:
            img = None
            if isinstance(p, str) and p and os.path.exists(p):
                try:
                    # convert() returns a new, fully loaded image, so the
                    # file handle can be closed right away.
                    with Image.open(p) as src:
                        img = src.convert('RGB')
                except Exception:
                    # Unreadable / corrupt file: fall back to the placeholder
                    img = None
            if img is None:
                img = Image.new('RGB', (224,224), (255,255,255))
            imgs.append(img)
        inputs = clip_processor(images=imgs, return_tensors='pt')
        pixel_values = inputs['pixel_values'].to(device)
        with torch.no_grad():
            img_feats = clip_model.get_image_features(pixel_values=pixel_values)  # (B, 512)
            # Optionally normalize:
            # img_feats = torch.nn.functional.normalize(img_feats, p=2, dim=1)
            embs.append(img_feats.cpu().numpy())
    if len(embs) == 0:
        # No input paths at all: return an empty (0, 512) matrix
        out = np.zeros((n, 512), dtype=np.float32)
    else:
        out = np.vstack(embs)
    np.save(save_path, out)
    return out

# Cache locations for the image embeddings of each split
train_img_emb_path = os.path.join(IMAGE_EMB_DIR, 'train_img_emb.npy')
test_img_emb_path  = os.path.join(IMAGE_EMB_DIR, 'test_img_emb.npy')
sample_img_emb_path= os.path.join(IMAGE_EMB_DIR, 'sample_img_emb.npy')

# Embed the images referenced by the image_path column of each split
# (or load the embeddings from the cache files above)
train_img_emb = compute_or_load_image_emb('train', train['image_path'].tolist(), train_img_emb_path, batch_size=32)
test_img_emb  = compute_or_load_image_emb('test', test['image_path'].tolist(), test_img_emb_path, batch_size=32)
sample_img_emb= compute_or_load_image_emb('sample_test', sample_test['image_path'].tolist(), sample_img_emb_path, batch_size=32)

print("Image emb shapes:", train_img_emb.shape, test_img_emb.shape, sample_img_emb.shape)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!mkdir -p /content/drive/MyDrive/Colab_Project_Backup

In [None]:
!cp -r /content/dataset \
       /content/Documentation_template.md \
       /content/embeddings_image \
       /content/embeddings_text \
       /content/processed_images \
       /content/README.md \
       /content/sample_code.py \
       /content/src \
       /content/drive/MyDrive/Colab_Project_Backup/

In [None]:
!cp -r /content/embeddings_image \
       /content/drive/MyDrive/Colab_Project_Backup/

In [None]:
!unzip -q /content/infiniper.zip -d /content

In [None]:
# Code to reload the Progress: Load all the necessary DataFrames and embeddings into variables.
# Imports needed to load data
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm
from pathlib import Path

print("Reloading all necessary data and variables...")

## 1. Re-run steps to create DataFrames (Cells 3 & 4)
# Re-reads the three CSV splits from DATA_DIR, replacing any train / test /
# sample_test frames already in the kernel.
DATA_DIR = '/content/dataset'
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test  = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATA_DIR, 'sample_test.csv'))

def parse_value_unit(text):
    """Extract the numeric ``Value`` and the ``Unit`` label from catalog text.

    The first ``Value <number>`` and the first ``Unit <label>`` occurrences are
    used (case-insensitive; the separator may be any mix of colons and
    whitespace). The keywords must stand alone as words, so substrings of
    words such as "United" or "Community" are not mistaken for the Unit field.

    Args:
        text: Raw catalog text. Non-string input (e.g. NaN) counts as missing.

    Returns:
        tuple: ``(value, unit)`` where ``value`` is a float (``np.nan`` when
        no value is found) and ``unit`` is the stripped label (``""`` when
        no unit is found).
    """
    if not isinstance(text, str):
        return np.nan, ""

    val = np.nan
    m = re.search(r'\bValue[:\s]*([0-9]+(?:\.[0-9]+)?)', text, flags=re.IGNORECASE)
    if m:
        try:
            val = float(m.group(1))
        except ValueError:
            val = np.nan

    unit = ""
    # \b on both sides: without it "Unit" also matches inside "United",
    # "Units" or "Community" and the remainder of that line becomes the unit.
    m2 = re.search(r'\bUnit\b[:\s]*([A-Za-z0-9%/ ._-]+)', text, flags=re.IGNORECASE)
    if m2:
        unit = m2.group(1).strip()
    return val, unit

def basic_text_features(df, col='catalog_content'):
    """Add simple text statistics and the parsed Value/Unit fields to ``df``.

    The frame is modified in place (missing text becomes ``''`` and the
    columns ``char_len``, ``word_count``, ``num_digits``, ``value_extracted``
    and ``unit_extracted`` are added) and is also returned.

    Args:
        df: DataFrame holding the text column.
        col: Name of the text column.

    Returns:
        The same DataFrame object that was passed in.
    """
    df[col] = df[col].fillna('')
    text = df[col]

    df['char_len'] = text.str.len()
    df['word_count'] = text.str.split().apply(len)
    df['num_digits'] = text.str.count(r'\d').fillna(0).astype(int)

    # Parse every row into a (value, unit) pair, then split into two columns
    parsed = list(map(parse_value_unit, text.tolist()))
    vals, units = zip(*parsed)
    df['value_extracted'] = list(vals)
    df['unit_extracted'] = list(units)
    return df

# Rebuild the engineered text columns on the freshly loaded frames.
# NOTE: this cell does not recreate the image_path column, and it does not
# define smape(), which the later training cells call; run the cell that
# defines smape() before training.
print("Processing text features for dataframes...")
train = basic_text_features(train)
test = basic_text_features(test)
sample_test = basic_text_features(sample_test)
print("DataFrames are ready.")

## 2. Load the saved embeddings from your backup
# The .npy files are loaded as-is; nothing here checks that their row counts
# match the frames loaded above.
TEXT_EMB_DIR = '/content/embeddings_text'
IMAGE_EMB_DIR = '/content/embeddings_image'

print("\nLoading text and image embeddings from .npy files...")
train_text_emb = np.load(os.path.join(TEXT_EMB_DIR, 'train_text_emb.npy'))
test_text_emb = np.load(os.path.join(TEXT_EMB_DIR, 'test_text_emb.npy'))
sample_text_emb = np.load(os.path.join(TEXT_EMB_DIR, 'sample_text_emb.npy'))

train_img_emb = np.load(os.path.join(IMAGE_EMB_DIR, 'train_img_emb.npy'))
test_img_emb = np.load(os.path.join(IMAGE_EMB_DIR, 'test_img_emb.npy'))
sample_img_emb = np.load(os.path.join(IMAGE_EMB_DIR, 'sample_img_emb.npy'))

print("All embeddings loaded successfully!")
print("\n✅ You are now ready to proceed from Cell 8.")

Reloading all necessary data and variables...
Processing text features for dataframes...
DataFrames are ready.

Loading text and image embeddings from .npy files...
All embeddings loaded successfully!

✅ You are now ready to proceed from Cell 8.


In [None]:
# Stack features (text + image + numeric) with progress info
def stack_features(df, text_emb, img_emb):
    """Build one feature matrix from text embeddings, image embeddings and numeric columns.

    Args:
        df: DataFrame with the columns ``char_len``, ``word_count``,
            ``num_digits`` and ``value_extracted``.
        text_emb: 2-D array of text embeddings, one row per row of ``df``.
        img_emb: 2-D array of image embeddings, one row per row of ``df``.

    Returns:
        2-D float array laid out column-wise as ``[text_emb, img_emb, numeric]``.
    """
    numeric_cols = ['char_len', 'word_count', 'num_digits', 'value_extracted']
    # Missing numeric values (e.g. no Value parsed) are encoded as 0
    numeric_block = df[numeric_cols].fillna(0).to_numpy(dtype=float)
    return np.hstack((text_emb, img_emb, numeric_block))

# Assemble the model inputs for each split; y is the raw (untransformed) price.
print("Building feature matrices (this will show shapes):")
X_train = stack_features(train, train_text_emb, train_img_emb)
X_test  = stack_features(test, test_text_emb, test_img_emb)
X_sample= stack_features(sample_test, sample_text_emb, sample_img_emb)
y = train['price'].values

print("Shapes -> X_train:", X_train.shape, "X_test:", X_test.shape, "X_sample:", X_sample.shape, "y:", y.shape)


Building feature matrices (this will show shapes):
Shapes -> X_train: (75000, 900) X_test: (75000, 900) X_sample: (100, 900) y: (75000,)


In [None]:
# Train LightGBM with 5-fold CV; tqdm over folds and track per fold SMAPE
import lightgbm as lgb
from sklearn.model_selection import KFold
import gc
from tqdm import tqdm

# The model is trained on log1p(price); predictions are mapped back with expm1
# after the loop.
y_log = np.log1p(y)  # log-transform target
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Accumulators, all in log space: out-of-fold predictions for train, and the
# running average over folds for test / sample_test.
preds_oof_log = np.zeros(len(train))
preds_test_log = np.zeros(len(test))
preds_sample_log = np.zeros(len(sample_test))

fold = 0
for tr_idx, val_idx in tqdm(kf.split(X_train), total=n_splits, desc='CV folds', ncols=100):
    fold += 1
    print(f"\n--- Fold {fold} ---")
    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y_log[tr_idx], y_log[val_idx]

    # A different seed per fold (42 + fold number)
    model = lgb.LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.05,
        # num_leaves=512 was tried before settling on 100
        num_leaves=100,
        n_jobs=-1,
        verbosity=-1,
        random_state=42+fold
    )

    # Early stopping and periodic logging are configured through callbacks.
    # The held-out fold is the early-stopping set and is also the fold that
    # is scored below.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=100),
                   lgb.log_evaluation(period=100)]
    )

    preds_oof_log[val_idx] = model.predict(X_val)
    # Each fold contributes 1/n_splits of the final test / sample prediction
    preds_test_log += model.predict(X_test) / n_splits
    preds_sample_log += model.predict(X_sample) / n_splits

    # free mem
    del model
    gc.collect()

# Inverse transform predictions (the fold average was taken in log space)
preds_val = np.expm1(preds_oof_log)
preds_test = np.expm1(preds_test_log)
preds_sample = np.expm1(preds_sample_log)

# smape() is defined in the imports cell near the top of the notebook
cv_smape = smape(y, preds_val)
print(f"\nCV SMAPE (on train via OOF): {cv_smape:.4f}%")

CV folds:   0%|                                                               | 0/5 [00:00<?, ?it/s]


--- Fold 1 ---
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.560758
[200]	valid_0's l2: 0.530008
[300]	valid_0's l2: 0.519205
[400]	valid_0's l2: 0.514125
[500]	valid_0's l2: 0.509988
[600]	valid_0's l2: 0.507016
[700]	valid_0's l2: 0.505097
[800]	valid_0's l2: 0.50367
[900]	valid_0's l2: 0.502786
[1000]	valid_0's l2: 0.501482
[1100]	valid_0's l2: 0.500571
[1200]	valid_0's l2: 0.499651
[1300]	valid_0's l2: 0.499023
[1400]	valid_0's l2: 0.498523
[1500]	valid_0's l2: 0.498035
[1600]	valid_0's l2: 0.49785
[1700]	valid_0's l2: 0.497588
[1800]	valid_0's l2: 0.497211
[1900]	valid_0's l2: 0.496892
[2000]	valid_0's l2: 0.496704
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l2: 0.496704


CV folds:  20%|██████████▏                                        | 1/5 [20:44<1:22:59, 1244.92s/it]


--- Fold 2 ---
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.53994
[200]	valid_0's l2: 0.509144
[300]	valid_0's l2: 0.49762
[400]	valid_0's l2: 0.491689
[500]	valid_0's l2: 0.488266
[600]	valid_0's l2: 0.485404
[700]	valid_0's l2: 0.483648
[800]	valid_0's l2: 0.482102
[900]	valid_0's l2: 0.480892
[1000]	valid_0's l2: 0.479683
[1100]	valid_0's l2: 0.478686
[1200]	valid_0's l2: 0.477945
[1300]	valid_0's l2: 0.47746
[1400]	valid_0's l2: 0.477049
[1500]	valid_0's l2: 0.476654
[1600]	valid_0's l2: 0.476319
[1700]	valid_0's l2: 0.475779
[1800]	valid_0's l2: 0.475405
[1900]	valid_0's l2: 0.475129
[2000]	valid_0's l2: 0.474883
Did not meet early stopping. Best iteration is:
[1999]	valid_0's l2: 0.474877


CV folds:  40%|████████████████████▍                              | 2/5 [41:42<1:02:37, 1252.57s/it]


--- Fold 3 ---
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.53344
[200]	valid_0's l2: 0.502659
[300]	valid_0's l2: 0.491165
[400]	valid_0's l2: 0.485327
[500]	valid_0's l2: 0.481912
[600]	valid_0's l2: 0.479391
[700]	valid_0's l2: 0.477332
[800]	valid_0's l2: 0.475523
[900]	valid_0's l2: 0.474194
[1000]	valid_0's l2: 0.472805
[1100]	valid_0's l2: 0.47166
[1200]	valid_0's l2: 0.470934
[1300]	valid_0's l2: 0.470247
[1400]	valid_0's l2: 0.46955
[1500]	valid_0's l2: 0.468873
[1600]	valid_0's l2: 0.468443
[1700]	valid_0's l2: 0.468104
[1800]	valid_0's l2: 0.467885
[1900]	valid_0's l2: 0.467538
[2000]	valid_0's l2: 0.467347
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l2: 0.467347


CV folds:  60%|██████████████████████████████▌                    | 3/5 [1:02:26<41:36, 1248.35s/it]


--- Fold 4 ---
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.523279
[200]	valid_0's l2: 0.494715
[300]	valid_0's l2: 0.485207
[400]	valid_0's l2: 0.47942
[500]	valid_0's l2: 0.47559
[600]	valid_0's l2: 0.472755
[700]	valid_0's l2: 0.470892
[800]	valid_0's l2: 0.469141
[900]	valid_0's l2: 0.46796
[1000]	valid_0's l2: 0.466982
[1100]	valid_0's l2: 0.466067
[1200]	valid_0's l2: 0.46543
[1300]	valid_0's l2: 0.465051
[1400]	valid_0's l2: 0.464588
[1500]	valid_0's l2: 0.464266
[1600]	valid_0's l2: 0.463878
[1700]	valid_0's l2: 0.463557
[1800]	valid_0's l2: 0.463316
[1900]	valid_0's l2: 0.463082
[2000]	valid_0's l2: 0.462919
Did not meet early stopping. Best iteration is:
[1999]	valid_0's l2: 0.462908


CV folds:  80%|████████████████████████████████████████▊          | 4/5 [1:23:24<20:52, 1252.34s/it]


--- Fold 5 ---
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.540513
[200]	valid_0's l2: 0.50954
[300]	valid_0's l2: 0.497973
[400]	valid_0's l2: 0.492425
[500]	valid_0's l2: 0.487967
[600]	valid_0's l2: 0.485005
[700]	valid_0's l2: 0.482908
[800]	valid_0's l2: 0.481484
[900]	valid_0's l2: 0.480183
[1000]	valid_0's l2: 0.478996
[1100]	valid_0's l2: 0.478227
[1200]	valid_0's l2: 0.477218
[1300]	valid_0's l2: 0.476609
[1400]	valid_0's l2: 0.475954
[1500]	valid_0's l2: 0.475529
[1600]	valid_0's l2: 0.475175
[1700]	valid_0's l2: 0.474837
[1800]	valid_0's l2: 0.474561
[1900]	valid_0's l2: 0.474339
[2000]	valid_0's l2: 0.474077
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l2: 0.474077


CV folds: 100%|███████████████████████████████████████████████████| 5/5 [1:44:09<00:00, 1249.92s/it]


CV SMAPE (on train via OOF): 52.8090%





In [None]:
# Save outputs for test.csv and sample_test.csv
# Clip predictions to a floor of 0.01 so every written price is strictly positive
preds_test_clipped = np.clip(preds_test, 0.01, None)
preds_sample_clipped = np.clip(preds_sample, 0.01, None)

# Output frames hold two columns: sample_id and the predicted price
out_test = pd.DataFrame({'sample_id': test['sample_id'], 'price': preds_test_clipped})
out_sample = pd.DataFrame({'sample_id': sample_test['sample_id'], 'price': preds_sample_clipped})

out_test.to_csv('/content/test_out.csv', index=False)
out_sample.to_csv('/content/sample_test_out.csv', index=False)

print("Saved /content/test_out.csv and /content/sample_test_out.csv")
display(out_test.head())
display(out_sample.head())


Saved /content/test_out.csv and /content/sample_test_out.csv


Unnamed: 0,sample_id,price
0,100179,14.161912
1,245611,16.817086
2,146263,20.290917
3,95658,10.190421
4,36806,26.269453


Unnamed: 0,sample_id,price
0,217392,38.641364
1,209156,16.640629
2,262333,11.27301
3,295979,8.413365
4,50604,13.398156


In [None]:
# Save artifacts (models are not saved here, only embeddings).
# These calls always write a copy of each embedding array directly under
# /content/, overwriting any file of the same name.
np.save('/content/train_text_emb.npy', train_text_emb)
np.save('/content/test_text_emb.npy', test_text_emb)
np.save('/content/sample_text_emb.npy', sample_text_emb)

np.save('/content/train_img_emb.npy', train_img_emb)
np.save('/content/test_img_emb.npy', test_img_emb)
np.save('/content/sample_img_emb.npy', sample_img_emb)

print("Saved embedding files in /content/")


Saved embedding files in /content/


In [None]:
# Run this to mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Create a new directory in your Google Drive
!mkdir -p /content/drive/MyDrive/Amazon_ML_Results

# Copy your final prediction files
!cp /content/test_out.csv /content/drive/MyDrive/Amazon_ML_Results/
!cp /content/sample_test_out.csv /content/drive/MyDrive/Amazon_ML_Results/

print("✅ Your result files have been successfully saved to Google Drive!")

Mounted at /content/drive
✅ Your result files have been successfully saved to Google Drive!


In [None]:
# Version2:
# Improving LightGBM training loop with:
# - Stratified KFold on binned log(price)
# - Custom SMAPE eval for LightGBM
# - Bagging across seeds
# - Parameter improvements (regularization, feature/bagging fractions)
# - Blend with group median baseline (weight tuned by CV) -- NOTE: planned; not implemented in this cell
# - Tqdm progress bars and per-fold SMAPE

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import gc
from tqdm import tqdm

# 1) Create stratification bins on log1p(price): quantile bins first
#    (duplicate edges dropped), falling back to equal-width bins if qcut fails.
y_log = np.log1p(y)
n_bins = 10
try:
    bins = pd.qcut(y_log, q=n_bins, labels=False, duplicates='drop')
except Exception:
    bins = pd.cut(y_log, bins=n_bins, labels=False)
# Integer bin labels, passed to StratifiedKFold.split as the stratification target
bins = np.array(bins, dtype=int)

# 2) --- THIS IS THE CORRECTED FUNCTION ---
def lgb_smape_eval(y_pred_log, data):
    """Custom LightGBM eval function: SMAPE on the original price scale.

    Both labels and predictions are in log1p space and are mapped back with
    expm1 before scoring.

    Args:
        y_pred_log: Predictions in log1p space.
        data: Dataset-like object whose ``get_label()`` returns the log1p labels.

    Returns:
        tuple: ``('SMAPE', value, False)``. ``value`` is a fraction (it is not
        multiplied by 100) and ``False`` marks the metric as lower-is-better.
    """
    actual = np.expm1(data.get_label())
    predicted = np.expm1(y_pred_log)
    half_sum = (np.abs(actual) + np.abs(predicted)) / 2.0
    # Avoid dividing by zero when both the label and the prediction are zero
    safe_half_sum = np.where(half_sum == 0, 1e-8, half_sum)
    score = np.mean(np.abs(predicted - actual) / safe_half_sum)
    return 'SMAPE', score, False

# 3) LightGBM param template. It is copied for every seed in the CV loop,
#    where only 'random_state' is overridden.
lgb_params_template = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    # No built-in metric is requested; the training log for this cell shows
    # only the custom SMAPE passed via feval.
    'metric': 'None',
    'learning_rate': 0.02,
    # Upper bound on boosting rounds (the training log runs up to [5000])
    'n_estimators': 5000,
    'num_leaves': 100,
    'min_data_in_leaf': 30,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.5,
    'lambda_l2': 0.5,
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': -1
}

# 4) Bagging seeds: one model per seed is trained in every fold and their
#    log-space predictions are averaged.
seeds = [42, 2023]
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Accumulators, all in log space: out-of-fold predictions for train, and the
# running average over folds for test / sample_test.
preds_oof_log = np.zeros(len(train))
preds_test_log = np.zeros(len(test))
preds_sample_log = np.zeros(len(sample_test))

fold_idx = 0
fold_smape_list = []

# main CV loop (folds are stratified on the binned log price)
for train_idx, val_idx in tqdm(skf.split(X_train, bins), total=n_splits, desc='CV folds', ncols=120):
    fold_idx += 1
    print(f"\n--- Fold {fold_idx} ---")
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr_log, y_val_log = y_log[train_idx], y_log[val_idx]
    # Validation targets back on the price scale, for the per-fold SMAPE
    y_val_orig = np.expm1(y_val_log)

    # The same Dataset objects are reused by every seed within this fold
    lgb_train = lgb.Dataset(X_tr, y_tr_log)
    lgb_val = lgb.Dataset(X_val, y_val_log, reference=lgb_train)

    # Per-fold sums over seeds (divided by len(seeds) after the seed loop)
    fold_val_preds_log = np.zeros(len(val_idx))
    fold_test_preds_log = np.zeros(X_test.shape[0])
    fold_sample_preds_log = np.zeros(X_sample.shape[0])

    for seed in seeds:
        print(f" Training seed {seed} ...")
        params = lgb_params_template.copy()
        params['random_state'] = seed

        # Early stopping watches the custom SMAPE computed on the held-out fold
        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_val],
            feval=lgb_smape_eval,
            callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False),
                       lgb.log_evaluation(period=200)]
        )

        # Predict with the iteration recorded as best for this model
        val_pred_log = model.predict(X_val, num_iteration=model.best_iteration)
        test_pred_log = model.predict(X_test, num_iteration=model.best_iteration)
        sample_pred_log = model.predict(X_sample, num_iteration=model.best_iteration)

        fold_val_preds_log += val_pred_log
        fold_test_preds_log += test_pred_log
        fold_sample_preds_log += sample_pred_log

        del model
        gc.collect()

    # Average over seeds
    fold_val_preds_log /= len(seeds)
    fold_test_preds_log /= len(seeds)
    fold_sample_preds_log /= len(seeds)

    # Store OOF predictions; each fold adds 1/n_splits of the test / sample prediction
    preds_oof_log[val_idx] = fold_val_preds_log
    preds_test_log += fold_test_preds_log / n_splits
    preds_sample_log += fold_sample_preds_log / n_splits

    # Per-fold SMAPE on the price scale (smape() returns percent)
    fold_val_preds = np.expm1(fold_val_preds_log)
    fold_smape = smape(y_val_orig, fold_val_preds)
    fold_smape_list.append(fold_smape)
    print(f" Fold {fold_idx} SMAPE: {fold_smape:.4f}%")

# Final inverse transform (averaging was done in log space)
preds_val = np.expm1(preds_oof_log)
preds_test = np.expm1(preds_test_log)
preds_sample = np.expm1(preds_sample_log)

cv_smape = smape(y, preds_val)
print(f"\nOverall CV OOF SMAPE: {cv_smape:.4f}%")
print("Per-fold SMAPEs:", [f"{s:.4f}%" for s in fold_smape_list])

CV folds:   0%|                                                                                   | 0/5 [00:00<?, ?it/s]


--- Fold 1 ---
 Training seed 42 ...
[200]	valid_0's SMAPE: 0.582709
[400]	valid_0's SMAPE: 0.561356
[600]	valid_0's SMAPE: 0.550942
[800]	valid_0's SMAPE: 0.54437
[1000]	valid_0's SMAPE: 0.540373
[1200]	valid_0's SMAPE: 0.537475
[1400]	valid_0's SMAPE: 0.535405
[1600]	valid_0's SMAPE: 0.533521
[1800]	valid_0's SMAPE: 0.53208
[2000]	valid_0's SMAPE: 0.530975
[2200]	valid_0's SMAPE: 0.529911
[2400]	valid_0's SMAPE: 0.529154
[2600]	valid_0's SMAPE: 0.528311
[2800]	valid_0's SMAPE: 0.527675
[3000]	valid_0's SMAPE: 0.527077
[3200]	valid_0's SMAPE: 0.526567
[3400]	valid_0's SMAPE: 0.526049
[3600]	valid_0's SMAPE: 0.525536
[3800]	valid_0's SMAPE: 0.525131
[4000]	valid_0's SMAPE: 0.524809
[4200]	valid_0's SMAPE: 0.52457
[4400]	valid_0's SMAPE: 0.52429
[4600]	valid_0's SMAPE: 0.524098
[4800]	valid_0's SMAPE: 0.523845
[5000]	valid_0's SMAPE: 0.523685
 Training seed 2023 ...
[200]	valid_0's SMAPE: 0.582564
[400]	valid_0's SMAPE: 0.560726
[600]	valid_0's SMAPE: 0.551074
[800]	valid_0's SMAPE: 0.