In [2]:
import pandas as pd
import numpy as np
import re
import os
import string
from tqdm import tqdm
tqdm.pandas()

train_path = r"D:\Amazon ML\student_resource\dataset\train.csv"
test_path = r"D:\Amazon ML\student_resource\dataset\test.csv"
sample_test_path = r"D:\Amazon ML\student_resource\dataset\sample_test.csv"
sample_out_path = r"D:\Amazon ML\student_resource\dataset\sample_test_out.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)


In [3]:
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-z0-9\s.,-]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

train["clean_text"] = train["catalog_content"].progress_apply(clean_text)
test["clean_text"] = test["catalog_content"].progress_apply(clean_text)


100%|██████████| 74999/74999 [00:04<00:00, 18442.90it/s]
100%|██████████| 75000/75000 [00:03<00:00, 18917.10it/s]


In [4]:
def extract_quantity(text):
    if not isinstance(text, str):
        return 1
    patterns = [
        r"pack of (\d+)",
        r"set of (\d+)",
        r"(\d+)\s?pcs?",
        r"(\d+)\s?x",
        r"(\d+)\s?count",
        r"(\d+)\s?piece"
    ]
    for p in patterns:
        m = re.search(p, text)
        if m:
            return int(m.group(1))
    return 1

train["item_quantity"] = train["clean_text"].progress_apply(extract_quantity)
test["item_quantity"] = test["clean_text"].progress_apply(extract_quantity)


100%|██████████| 74999/74999 [00:02<00:00, 25413.12it/s]
100%|██████████| 75000/75000 [00:02<00:00, 26039.97it/s]


In [6]:
print("Train price summary:")
print(train["price"].describe())

print("\nTop quantities:")
print(train["item_quantity"].value_counts().head(50))


Train price summary:
count    74999.000000
mean        23.647953
std         33.377054
min          0.130000
25%          6.797500
50%         14.000000
75%         28.625000
max       2796.000000
Name: price, dtype: float64

Top quantities:
item_quantity
1       50121
6        4871
12       4348
2        2758
3        2368
4        2174
24       1225
8        1006
10        659
5         582
20        399
18        319
16        246
100       243
40        227
48        209
15        205
36        197
50        175
30        173
60        157
9         154
7         137
72        124
96        124
25        112
14        110
80        102
32         78
200        71
22         68
120        61
42         44
130        41
65         40
90         38
150        38
28         31
84         31
1000       30
500        27
64         27
75         27
240        26
180        26
70         26
11         26
108        25
35         24
13         23
Name: count, dtype: int64


In [7]:
clean_train_path = r"D:\Amazon ML\student_resource\dataset\train_clean.csv"
clean_test_path = r"D:\Amazon ML\student_resource\dataset\test_clean.csv"

train.to_csv(clean_train_path, index=False)
test.to_csv(clean_test_path, index=False)
print("✅ Cleaned data saved.")


✅ Cleaned data saved.


In [1]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("CUDA device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")


CUDA available: True
CUDA device count: 1
CUDA device name: NVIDIA GeForce RTX 3050 Laptop GPU


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
tqdm.pandas()

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

train = pd.read_csv(r"D:\Amazon ML\student_resource\dataset\train_clean.csv")


Using device: cuda


In [3]:
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)
print(train_data.shape, val_data.shape)


(59999, 6) (15000, 6)


In [4]:
import numpy as np

y_train = np.log1p(train_data["price"].values)  # log(price + 1)
y_val = np.log1p(val_data["price"].values)


In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
tqdm.pandas()

# Initialize model on GPU
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device='cuda')

# Encode train and validation texts
train_texts = train_data["clean_text"].tolist()
val_texts = val_data["clean_text"].tolist()

# Generate embeddings
print("Generating train embeddings...")
train_embeddings = model.encode(train_texts, batch_size=64, show_progress_bar=True, device='cuda')

print("Generating validation embeddings...")
val_embeddings = model.encode(val_texts, batch_size=64, show_progress_bar=True, device='cuda')

# Convert to numpy arrays
X_train_bert = np.array(train_embeddings)
X_val_bert = np.array(val_embeddings)

print("Train embeddings shape:", X_train_bert.shape)
print("Validation embeddings shape:", X_val_bert.shape)


Generating train embeddings...


Batches:   0%|          | 0/938 [00:00<?, ?it/s]

Generating validation embeddings...


Batches:   0%|          | 0/235 [00:00<?, ?it/s]

Train embeddings shape: (59999, 768)
Validation embeddings shape: (15000, 768)


In [6]:
np.save(r"D:\Amazon ML\student_resource\dataset\train_text_embeds.npy", X_train_bert)
np.save(r"D:\Amazon ML\student_resource\dataset\val_text_embeds.npy", X_val_bert)

print("✅ Text embeddings saved.")


✅ Text embeddings saved.


In [8]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
X_train = np.array(train_embeddings)
X_val = np.array(val_embeddings)

# Initialize model
model = LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=64,
    max_depth=10,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train with early stopping
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='mae',
    
)

# Predict
val_preds_log = model.predict(X_val)
val_preds = np.expm1(val_preds_log)
val_true = np.expm1(y_val)  # make sure targets are back to original scale

# SMAPE
smape = np.mean(np.abs(val_preds - val_true) / ((np.abs(val_preds) + np.abs(val_true))/2)*100) 
print(f"LGBMRegressor SMAPE on validation: {smape:.2f}%")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.229071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 59999, number of used features: 768
[LightGBM] [Info] Start training from score 2.738616
LGBMRegressor SMAPE on validation: 58.56%


In [9]:
from PIL import Image
import torch
from torchvision import transforms, models
import os
import numpy as np
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Using device: cuda


In [10]:
import torch
from torchvision import models, transforms

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pretrained EfficientNet-B3
efficientnet = models.efficientnet_b3(weights='IMAGENET1K_V1')  # new API
efficientnet = efficientnet.to(device)
efficientnet.eval()  # set to evaluation mode

# Remove classifier to get embeddings (keep avgpool)
feature_extractor = torch.nn.Sequential(
    efficientnet.features,
    efficientnet.avgpool
)

# Image preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


Downloading: "https://download.pytorch.org/models/efficientnet_b3_rwightman-b3899882.pth" to C:\Users\rames/.cache\torch\hub\checkpoints\efficientnet_b3_rwightman-b3899882.pth
100%|██████████| 47.2M/47.2M [00:02<00:00, 16.7MB/s]


In [11]:
from PIL import Image
import torch
import numpy as np
from tqdm import tqdm

def get_image_embeddings(img_paths, batch_size=32):
    embeddings = []

    for i in tqdm(range(0, len(img_paths), batch_size), desc="Extracting embeddings"):
        batch_imgs = []
        # ---- Load and preprocess images ----
        for img_path in img_paths[i:i+batch_size]:
            try:
                img = Image.open(img_path).convert('RGB')
                img = transform(img)
            except Exception as e:
                # Handle missing or corrupted files safely
                print(f"Warning: could not process {img_path}: {e}")
                img = torch.zeros(3, 224, 224)
            batch_imgs.append(img)

        # ---- Batch forward pass ----
        batch_tensor = torch.stack(batch_imgs).to(device)
        with torch.no_grad():
            batch_features = feature_extractor(batch_tensor)
            batch_features = torch.flatten(batch_features, 1)  # cleaner flatten
            embeddings.append(batch_features.cpu().numpy())

        # (Optional) free GPU memory
        del batch_tensor, batch_features
        torch.cuda.empty_cache()

    return np.vstack(embeddings)


In [12]:
import os
import numpy as np

train_img_folder = r"D:\Amazon ML\student_resource\dataset\images\train"
val_img_folder = r"D:\Amazon ML\student_resource\dataset\images\train"  # same folder

# Build image paths
train_img_paths = [
    os.path.join(train_img_folder, f"{sid}.jpg") for sid in train_data["sample_id"]
]
val_img_paths = [
    os.path.join(val_img_folder, f"{sid}.jpg") for sid in val_data["sample_id"]
]

# --- Extract embeddings ---
print("Extracting train image embeddings...")
X_train_img = get_image_embeddings(train_img_paths, batch_size=32)

print("Extracting validation image embeddings...")
X_val_img = get_image_embeddings(val_img_paths, batch_size=32)

# --- Verify shapes ---
print(f"Train image embeddings shape: {X_train_img.shape}")
print(f"Validation image embeddings shape: {X_val_img.shape}")

# --- Save embeddings ---
os.makedirs(r"D:\Amazon ML\student_resource\dataset", exist_ok=True)
np.save(r"D:\Amazon ML\student_resource\dataset\train_image_embeds.npy", X_train_img)
np.save(r"D:\Amazon ML\student_resource\dataset\val_image_embeds.npy", X_val_img)

print("✅ Image embeddings saved successfully!")


Extracting train image embeddings...


Extracting embeddings: 100%|██████████| 1875/1875 [48:49<00:00,  1.56s/it]


Extracting validation image embeddings...


Extracting embeddings: 100%|██████████| 469/469 [12:08<00:00,  1.55s/it]


Train image embeddings shape: (59999, 1536)
Validation image embeddings shape: (15000, 1536)
✅ Image embeddings saved successfully!


In [13]:
X_train_final = np.hstack([X_train_bert, X_train_img, train_data["item_quantity"].values.reshape(-1,1)])
X_val_final = np.hstack([X_val_bert, X_val_img, val_data["item_quantity"].values.reshape(-1,1)])


In [14]:
# Targets (log-transform)
y_train_log = np.log1p(train_data["price"].values)
y_val_log = np.log1p(val_data["price"].values)

In [15]:
from lightgbm import LGBMRegressor
import numpy as np

# Initialize model (GPU-enabled)
model = LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    learning_rate=0.01,
    num_leaves=64,
    max_depth=12,
    n_estimators=2500,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    device='gpu'
)

# Train with early stopping
model.fit(
    X_train_final, y_train_log,
    eval_set=[(X_val_final, y_val_log)],
    eval_metric='mae',
   
)


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 587648
[LightGBM] [Info] Number of data points in the train set: 59999, number of used features: 2305
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2305 dense feature groups (132.06 MB) transferred to GPU in 0.050661 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 2.738616


In [16]:
# Predict
val_preds_log = model.predict(X_val_final)
val_preds = np.expm1(val_preds_log)  # back to original price scale
val_true = val_data["price"].values

# Compute SMAPE
smape = np.mean(np.abs(val_preds - val_true) / ((np.abs(val_preds) + np.abs(val_true))/2)) 
print(f"Multimodal LightGBM SMAPE on validation: {smape:.2f}")


Multimodal LightGBM SMAPE on validation: 0.57


In [17]:
import joblib
joblib.dump(model, r"D:\Amazon ML\student_resource\dataset\lgbm_multimodal_model.pkl")
print("✅ Multimodal LightGBM model saved")


✅ Multimodal LightGBM model saved


In [18]:
import re
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-z0-9\s.,-]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
def extract_quantity(text):
    if not isinstance(text, str):
        return 1
    patterns = [
        r"pack of (\d+)",
        r"set of (\d+)",
        r"(\d+)\s?pcs?",
        r"(\d+)\s?x",
        r"(\d+)\s?count",
        r"(\d+)\s?piece"
    ]
    for p in patterns:
        m = re.search(p, text)
        if m:
            return int(m.group(1))
    return 1


In [19]:
# Load sample test and output
sample_test = pd.read_csv(r"D:\Amazon ML\student_resource\dataset\sample_test.csv")
sample_test_out = pd.read_csv(r"D:\Amazon ML\student_resource\dataset\sample_test_out.csv")


# Apply to sample_test
sample_test["clean_text"] = sample_test["catalog_content"].progress_apply(clean_text)
sample_test["item_quantity"] = sample_test["clean_text"].progress_apply(extract_quantity)

# Extract features
# Text embeddings
sample_test_texts = sample_test["clean_text"].tolist()
from sentence_transformers import SentenceTransformer

# Load a sentence transformer model
embedding_model = SentenceTransformer('all-mpnet-base-v2', device='cuda')

# Encode text
X_sample_text = embedding_model.encode(sample_test_texts, batch_size=64, show_progress_bar=True, device='cuda')


# Image embeddings
sample_img_folder = r"D:\Amazon ML\student_resource\dataset\images\sample_test"
sample_img_paths = [os.path.join(sample_img_folder, f"{sid}.jpg") for sid in sample_test["sample_id"]]
X_sample_img = get_image_embeddings(sample_img_paths, batch_size=32)

# Combine embeddings + numeric
X_sample_final = np.hstack([
    X_sample_text,
    X_sample_img,
    sample_test["item_quantity"].values.reshape(-1,1)
])


100%|██████████| 100/100 [00:00<00:00, 10881.30it/s]
100%|██████████| 100/100 [00:00<00:00, 14952.42it/s]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting embeddings: 100%|██████████| 4/4 [00:05<00:00,  1.36s/it]


In [20]:
import joblib

# Load trained multimodal LightGBM
model = joblib.load(r"D:\Amazon ML\student_resource\dataset\lgbm_multimodal_model.pkl")

# Predict (log scale → back to original)
sample_preds_log = model.predict(X_sample_final)
sample_preds = np.expm1(sample_preds_log)  # this is your actual predicted price array


In [21]:
# True prices
y_true = sample_test_out["price"].values

# Predicted prices
y_pred = sample_preds  # from previous step

# Compute SMAPE
smape = np.mean(np.abs(y_pred - y_true) / ((np.abs(y_pred) + np.abs(y_true))/2)*100) 
print(f"Multimodal LightGBM SMAPE on sample_test: {smape:.2f}%")


Multimodal LightGBM SMAPE on sample_test: 104.81%


In [22]:
# Load sample test and output
test_clean = pd.read_csv(r"D:\Amazon ML\student_resource\dataset\test_clean.csv")


# Apply to sample_test
test_clean["clean_text"] = test_clean["catalog_content"].progress_apply(clean_text)
test_clean["item_quantity"] = test_clean["clean_text"].progress_apply(extract_quantity)

# Extract features
# Text embeddings
test_clean_texts = test_clean["clean_text"].tolist()
from sentence_transformers import SentenceTransformer

# Load a sentence transformer model
embedding_model = SentenceTransformer('all-mpnet-base-v2', device='cuda')

# Encode text
X_sample_text = embedding_model.encode(test_clean_texts, batch_size=64, show_progress_bar=True, device='cuda')


# Image embeddings
test_img_folder = r"D:\Amazon ML\student_resource\dataset\images\test"
test_img_paths = [os.path.join(test_img_folder, f"{sid}.jpg") for sid in test_clean["sample_id"]]
X_sample_img = get_image_embeddings(test_img_paths, batch_size=32)

# Combine embeddings + numeric
X_sample_final = np.hstack([
    X_sample_text,
    X_sample_img,
    test_clean["item_quantity"].values.reshape(-1,1)
])


100%|██████████| 75000/75000 [00:03<00:00, 20644.49it/s]
100%|██████████| 75000/75000 [00:02<00:00, 26415.61it/s]


Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Extracting embeddings:  56%|█████▌    | 1313/2344 [34:51<27:57,  1.63s/it] 



Extracting embeddings: 100%|██████████| 2344/2344 [1:01:50<00:00,  1.58s/it]


In [23]:
import joblib

# Load trained multimodal LightGBM
model = joblib.load(r"D:\Amazon ML\student_resource\dataset\lgbm_multimodal_model.pkl")

# Predict (log scale → back to original)
sample_preds_log = model.predict(X_sample_final)
sample_preds = np.expm1(sample_preds_log)  # this is your actual predicted price array

In [24]:
# Create DataFrame for submission
submission = pd.DataFrame({
    "sample_id": test_clean["sample_id"],
    "price": sample_preds
})

# Save to CSV
output_path = r"D:\Amazon ML\student_resource\dataset\test_out_1.csv"
submission.to_csv(output_path, index=False)

print(f"✅ Saved predictions to: {output_path}")


✅ Saved predictions to: D:\Amazon ML\student_resource\dataset\test_out_1.csv


In [25]:
import pandas as pd

train = pd.read_csv("train_clean.csv")
test = pd.read_csv("test_clean.csv")
test_out = pd.read_csv("test_out.csv")

test = test.merge(test_out, on="sample_id", how="left")

combined = pd.concat([train, test], ignore_index=True)
combined.to_csv("large_train.csv", index=False)

print("✅ Combined dataset saved as large_train.csv")
print("Total rows:", combined.shape[0])


✅ Combined dataset saved as large_train.csv
Total rows: 149999
