In [2]:
import pandas as pd
import numpy as np
import re
import os
import string
from tqdm import tqdm
tqdm.pandas()

# Absolute locations of the training and test CSVs on the local machine.
train_path = r"D:\Amazon ML\student_resource\dataset\large_train.csv"
test_path = r"D:\Amazon ML\student_resource\dataset\test.csv"

# Read both files into DataFrames (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [3]:
import torch
# Report what PyTorch can see so the GPU-dependent cells below are easier to debug.
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    # Only query the device name when a CUDA device actually exists.
    print("CUDA device name:", torch.cuda.get_device_name(0))
else:
    print("CUDA device name:", "None")


CUDA available: True
CUDA device count: 1
CUDA device name: NVIDIA GeForce RTX 3050 Laptop GPU


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
tqdm.pandas()

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


Using device: cuda


In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_data, val_data = train_test_split(
    train,
    random_state=42,
    test_size=0.2,
)
print(train_data.shape, val_data.shape)

(119999, 6) (30000, 6)


In [6]:
# Regression targets on the log(price + 1) scale, one array per split.
y_train, y_val = (
    np.log1p(split["price"].values) for split in (train_data, val_data)
)


In [7]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
tqdm.pandas()

In [8]:
# Initialize the sentence encoder on the device selected earlier in the notebook.
# Using `device` (instead of a hard-coded 'cuda') lets this cell run on
# CPU-only machines, matching the fallback the notebook already computes.
# NOTE: the name `model` is rebound to the LightGBM regressor in a later cell.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

# Texts to embed, in the same row order as the split DataFrames
train_texts = train_data["clean_text"].tolist()
val_texts = val_data["clean_text"].tolist()

# Generate embeddings
print("Generating train embeddings...")
train_embeddings = model.encode(train_texts, batch_size=64, show_progress_bar=True, device=device)

print("Generating validation embeddings...")
val_embeddings = model.encode(val_texts, batch_size=64, show_progress_bar=True, device=device)

# Convert to numpy arrays (one row per text)
X_train_bert = np.array(train_embeddings)
X_val_bert = np.array(val_embeddings)

print("Train embeddings shape:", X_train_bert.shape)
print("Validation embeddings shape:", X_val_bert.shape)

Generating train embeddings...


Batches:   0%|          | 0/1875 [00:00<?, ?it/s]

Generating validation embeddings...


Batches:   0%|          | 0/469 [00:00<?, ?it/s]

Train embeddings shape: (119999, 768)
Validation embeddings shape: (30000, 768)


In [9]:
import torch
from torchvision import models, transforms

# Run the CNN on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained EfficientNet-B3 (ImageNet weights), moved to the device and put in
# evaluation mode; both .to() and .eval() return the module itself.
efficientnet = models.efficientnet_b3(weights='IMAGENET1K_V1').to(device).eval()

# Embedding extractor: the convolutional trunk followed by the average-pooling
# layer, i.e. the network without its classifier head.
# NOTE(review): no cell visible in this notebook uses `feature_extractor` or
# `transform` afterwards (CLIP is used for images) — confirm whether this cell is still needed.
feature_extractor = torch.nn.Sequential(efficientnet.features, efficientnet.avgpool)

# Image preprocessing: fixed 224x224 resize, tensor conversion, per-channel normalisation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        std=[0.229, 0.224, 0.225],
        mean=[0.485, 0.456, 0.406],
    ),
])


In [12]:
import torch
import clip
# Plain device string used by the CLIP cells below.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# ViT-B/32 CLIP weights together with the matching image preprocessing callable.
clip_model, preprocess = clip.load("ViT-B/32", device=device)


100%|███████████████████████████████████████| 338M/338M [02:54<00:00, 2.03MiB/s]


In [15]:
from PIL import Image
from tqdm import tqdm
import numpy as np
import torch

def get_clip_image_embeddings(img_paths, batch_size=16):
    """Embed images with CLIP and return L2-normalised feature vectors.

    Relies on the notebook-level ``clip_model``, ``preprocess`` and ``device``.

    Args:
        img_paths: Sequence of image file paths (must support ``len`` and slicing).
        batch_size: Number of images sent through the model per forward pass.

    Returns:
        ``np.ndarray`` with one row per entry of ``img_paths``, in input order.
        Images that cannot be opened or preprocessed are replaced by an
        all-zero image so that row alignment with the input is preserved.

    Raises:
        ValueError: If ``img_paths`` is empty.
    """
    if len(img_paths) == 0:
        # Fail early with a clear message instead of the opaque
        # "need at least one array to concatenate" from np.concatenate.
        raise ValueError("img_paths is empty; there are no images to embed")

    all_embeds = []
    failed_paths = []

    for i in tqdm(range(0, len(img_paths), batch_size), desc="Extracting CLIP embeddings"):
        batch_imgs = []

        for p in img_paths[i:i+batch_size]:
            try:
                # Manage the *opened* image with the context manager so its
                # file handle is released; convert() returns a separate copy.
                with Image.open(p) as img:
                    batch_imgs.append(preprocess(img.convert('RGB')))
            except Exception:
                # Best effort: keep the batch aligned with a zero image, but
                # remember the path so the failure is reported below.
                failed_paths.append(p)
                batch_imgs.append(torch.zeros(3, 224, 224))  # fallback

        batch_tensor = torch.stack(batch_imgs).to(device)

        with torch.no_grad():
            feats = clip_model.encode_image(batch_tensor)
            feats = feats / feats.norm(dim=-1, keepdim=True)  # normalize
            all_embeds.append(feats.cpu().numpy())

        torch.cuda.empty_cache()

    if failed_paths:
        # Surface silent fallbacks: many zero-image rows degrade the features.
        print(
            f"WARNING: {len(failed_paths)} of {len(img_paths)} images could not be "
            f"loaded and were replaced by a zero image (first: {failed_paths[0]})"
        )

    return np.concatenate(all_embeds, axis=0)


In [None]:
import os
import numpy as np

# Train and validation rows are both split from the same training CSV,
# so their images are read from a single folder.
train_img_folder = r"D:\Amazon ML\student_resource\dataset\images\train"
val_img_folder = r"D:\Amazon ML\student_resource\dataset\images\train"  # same folder

# One "<sample_id>.jpg" path per row, in DataFrame order.
train_img_paths = [os.path.join(train_img_folder, f"{sid}.jpg") for sid in train_data["sample_id"]]
val_img_paths = [os.path.join(val_img_folder, f"{sid}.jpg") for sid in val_data["sample_id"]]

# --- Extract embeddings ---
# Only the train split is embedded here; the validation split is handled in the next cell.
print("Extracting train image embeddings...")
X_train_img = get_clip_image_embeddings(train_img_paths, batch_size=48)
 



Extracting train image embeddings...


Extracting CLIP embeddings: 100%|██████████| 2500/2500 [1:58:35<00:00,  2.85s/it]


Extracting validation image embeddings...


Extracting CLIP embeddings:  33%|███▎      | 206/625 [10:06<20:33,  2.94s/it]


KeyboardInterrupt: 

In [18]:
# Embed the validation images with the same batch size used for the train split.
print("Extracting validation image embeddings...")
X_val_img = get_clip_image_embeddings(val_img_paths, batch_size=48)

# --- Verify shapes ---
# Each array is expected to hold one row per sample of its split.
print(f"Train image embeddings shape: {X_train_img.shape}")
print(f"Validation image embeddings shape: {X_val_img.shape}")

# --- Save embeddings ---
# Persist both arrays as .npy files; makedirs is a no-op when the folder already exists.
os.makedirs(r"D:\Amazon ML\student_resource\dataset", exist_ok=True)
np.save(r"D:\Amazon ML\student_resource\dataset\train_image_embeds.npy", X_train_img)
np.save(r"D:\Amazon ML\student_resource\dataset\val_image_embeds.npy", X_val_img)

print("✅ Image embeddings saved successfully!")

Extracting validation image embeddings...


Extracting CLIP embeddings: 100%|██████████| 625/625 [29:32<00:00,  2.84s/it]


Train image embeddings shape: (119999, 512)
Validation image embeddings shape: (30000, 512)
✅ Image embeddings saved successfully!


In [19]:
# Final design matrices: text embedding | image embedding | item quantity (single column).
X_train_final = np.concatenate(
    [X_train_bert, X_train_img, train_data["item_quantity"].values[:, None]],
    axis=1,
)
X_val_final = np.concatenate(
    [X_val_bert, X_val_img, val_data["item_quantity"].values[:, None]],
    axis=1,
)
# Targets on the log(price + 1) scale
y_train_log, y_val_log = (
    np.log1p(split["price"].values) for split in (train_data, val_data)
)

In [25]:
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# Gradient-boosted regressor trained on the stacked text/image/quantity features.
# NOTE: this rebinds `model`, which previously held the SentenceTransformer encoder.
# NOTE(review): the captured training log reports "Using GPU Device: Intel(R) UHD
# Graphics", so device='gpu' did not select the NVIDIA card on this machine —
# consider setting gpu_platform_id / gpu_device_id explicitly (values to be confirmed).
model = LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    learning_rate=0.03,
    num_leaves=64,
    max_depth=12,
    n_estimators=4000,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    device='gpu'
)

# Fit on the log-price targets; the validation split is used for monitoring
# (MAE requested explicitly) and for early stopping after 50 stale rounds.
model.fit(
    X_train_final, y_train_log,
    eval_set=[(X_val_final, y_val_log)],
    eval_metric='mae',
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=50)  # prints progress every 50 rounds
    ]
)


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 326565
[LightGBM] [Info] Number of data points in the train set: 119999, number of used features: 1281
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 1281 dense feature groups (146.94 MB) transferred to GPU in 0.064612 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 2.739347
Training until validation scores don't improve for 50 rounds
[50]	valid_0's l1: 0.476291	valid_0's l2: 0.412161
[100]	valid_0's l1: 0.443534	valid_0's l2: 0.369023
[150]	valid_0's l1: 0.42788	valid_0's l2: 0.348667
[200]	valid_0's l1: 0.418607	valid_0's l2: 0.336315
[250]	valid_0's l1: 0.412054	valid_0's l2: 0.327674
[300]	valid_0's l1: 0.407085	valid_0's l2: 0.321022
[350]	valid_0's l1: 0.403013	valid_0's l

In [26]:
# Predict on the validation split and undo the log1p transform applied to the target
val_true = val_data["price"].values
val_preds_log = model.predict(X_val_final)
val_preds = np.expm1(val_preds_log)  # back to original price scale

# SMAPE: mean of |pred - true| divided by the average of |pred| and |true|, in percent
abs_error = np.abs(val_preds - val_true)
mean_magnitude = (np.abs(val_preds) + np.abs(val_true))/2
smape = np.mean(abs_error / mean_magnitude*100)
print(f"Multimodal LightGBM SMAPE on validation: {smape:.2f}")


Multimodal LightGBM SMAPE on validation: 37.85


In [27]:
import joblib
# Serialize the fitted regressor to disk; a later cell reloads it from this path.
joblib.dump(model, r"D:\Amazon ML\student_resource\dataset\lgbm_multimodal_model_1.pkl")
print("✅ Multimodal LightGBM model saved")


✅ Multimodal LightGBM model saved


In [28]:
import re
def clean_text(text):
    """Normalise a catalog text string for embedding.

    Missing values (anything ``pd.isna`` reports as NA) become ``""``.
    Otherwise the text is lower-cased and passed through these steps in order:
    URLs removed, ``<...>`` tags removed, every character other than
    ``a-z``, ``0-9``, whitespace, ``.``, ``,`` and ``-`` replaced by a space,
    and runs of whitespace collapsed. The result is stripped at both ends.
    """
    if pd.isna(text):
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"http\S+|www\S+", ""),
        (r"<.*?>", ""),
        (r"[^a-z0-9\s.,-]", " "),
        (r"\s+", " "),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def extract_quantity(text):
    """Return the item quantity mentioned in ``text``, or 1 when none is found.

    Non-string input yields 1. The patterns are tried in priority order and
    the first pattern that matches anywhere in the text wins, regardless of
    where in the text it matches. Matching is case-sensitive, so lower-cased
    input is expected for the literal words to match.

    Note: the patterns are unanchored, so e.g. ``(\\d+)\\s?x`` matches any
    number directly followed by an "x" (such as "3 xl").
    """
    if not isinstance(text, str):
        return 1

    patterns = (
        r"pack of (\d+)",
        r"set of (\d+)",
        r"(\d+)\s?pcs?",
        r"(\d+)\s?x",
        r"(\d+)\s?count",
        r"(\d+)\s?piece",
    )

    # First successful search in priority order, if any.
    attempts = (re.search(pattern, text) for pattern in patterns)
    hit = next((found for found in attempts if found), None)
    return int(hit.group(1)) if hit else 1


In [29]:
# Load the sample test inputs and their reference prices
sample_test = pd.read_csv(r"D:\Amazon ML\student_resource\dataset\sample_test.csv")
sample_test_out = pd.read_csv(r"D:\Amazon ML\student_resource\dataset\sample_test_out.csv")


# Derive the text features used by the model: cleaned text and item quantity
sample_test["clean_text"] = sample_test["catalog_content"].progress_apply(clean_text)
sample_test["item_quantity"] = sample_test["clean_text"].progress_apply(extract_quantity)

# Extract features
# Text embeddings
sample_test_texts = sample_test["clean_text"].tolist()
from sentence_transformers import SentenceTransformer

# Load a sentence transformer model on the notebook's selected device.
# Using `device` (instead of a hard-coded 'cuda') keeps the CPU fallback working.
# A separate name is needed because `model` now holds the LightGBM regressor.
embedding_model = SentenceTransformer('all-mpnet-base-v2', device=device)

# Encode text
X_sample_text = embedding_model.encode(sample_test_texts, batch_size=64, show_progress_bar=True, device=device)


# Image embeddings: one "<sample_id>.jpg" per row, in DataFrame order
sample_img_folder = r"D:\Amazon ML\student_resource\dataset\images\sample_test"
sample_img_paths = [os.path.join(sample_img_folder, f"{sid}.jpg") for sid in sample_test["sample_id"]]
X_sample_img = get_clip_image_embeddings(sample_img_paths, batch_size=32)

# Combine embeddings + numeric, in the same column order used for training
X_sample_final = np.hstack([
    X_sample_text,
    X_sample_img,
    sample_test["item_quantity"].values.reshape(-1,1)
])


100%|██████████| 100/100 [00:00<00:00, 3093.26it/s]
100%|██████████| 100/100 [00:00<00:00, 11398.49it/s]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting CLIP embeddings: 100%|██████████| 4/4 [00:07<00:00,  1.76s/it]


In [30]:
import joblib

# Load trained multimodal LightGBM
# NOTE: unpickling executes code stored in the file; only load pickles you created yourself.
model = joblib.load(r"D:\Amazon ML\student_resource\dataset\lgbm_multimodal_model_1.pkl")

# Predict (log scale → back to original)
# The model was fit on log1p(price), so expm1 inverts that transform.
sample_preds_log = model.predict(X_sample_final)
sample_preds = np.expm1(sample_preds_log)  # this is your actual predicted price array


In [31]:
# Reference prices and model predictions for the sample split.
# NOTE(review): the two arrays are paired purely by row order; this assumes
# sample_test_out.csv lists samples in the same order as sample_test.csv — TODO confirm.
y_true = sample_test_out["price"].values
y_pred = sample_preds  # from previous step

# SMAPE: mean of |pred - true| divided by the average of |pred| and |true|, in percent
abs_error = np.abs(y_pred - y_true)
mean_magnitude = (np.abs(y_pred) + np.abs(y_true))/2
smape = np.mean(abs_error / mean_magnitude*100)
print(f"Multimodal LightGBM SMAPE on sample_test: {smape:.2f}%")


Multimodal LightGBM SMAPE on sample_test: 108.01%


In [32]:
# Load the full test set
test_clean = pd.read_csv(r"D:\Amazon ML\student_resource\dataset\test_clean.csv")


# Derive the text features used by the model: cleaned text and item quantity
test_clean["clean_text"] = test_clean["catalog_content"].progress_apply(clean_text)
test_clean["item_quantity"] = test_clean["clean_text"].progress_apply(extract_quantity)

# Extract features
# Text embeddings
test_clean_texts = test_clean["clean_text"].tolist()
from sentence_transformers import SentenceTransformer

# Load a sentence transformer model on the notebook's selected device.
# Using `device` (instead of a hard-coded 'cuda') keeps the CPU fallback working.
embedding_model = SentenceTransformer('all-mpnet-base-v2', device=device)

# Encode text
# NOTE: the X_sample_* names are reused for the full test set because the
# following prediction cell reads `X_sample_final`.
X_sample_text = embedding_model.encode(test_clean_texts, batch_size=64, show_progress_bar=True, device=device)


# Image embeddings: one "<sample_id>.jpg" per row, in DataFrame order
test_img_folder = r"D:\Amazon ML\student_resource\dataset\images\test"
test_img_paths = [os.path.join(test_img_folder, f"{sid}.jpg") for sid in test_clean["sample_id"]]
X_sample_img = get_clip_image_embeddings(test_img_paths, batch_size=32)

# Combine embeddings + numeric, in the same column order used for training
X_sample_final = np.hstack([
    X_sample_text,
    X_sample_img,
    test_clean["item_quantity"].values.reshape(-1,1)
])


100%|██████████| 75000/75000 [00:03<00:00, 20066.88it/s]
100%|██████████| 75000/75000 [00:02<00:00, 26595.57it/s]


Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Extracting CLIP embeddings: 100%|██████████| 2344/2344 [1:16:52<00:00,  1.97s/it]


In [33]:
# Predict on the log scale, then invert the log1p transform to get prices
sample_preds_log = model.predict(X_sample_final)
sample_preds = np.expm1(sample_preds_log)  # this is your actual predicted price array

# Submission table: each test sample id paired with its predicted price
submission = test_clean[["sample_id"]].assign(price=sample_preds)

# Write the table as CSV without the index column
output_path = r"D:\Amazon ML\student_resource\dataset\test_out_2.csv"
submission.to_csv(output_path, index=False)

print(f"✅ Saved predictions to: {output_path}")


✅ Saved predictions to: D:\Amazon ML\student_resource\dataset\test_out_2.csv
