In [None]:
# Print a greeting banner for the workspace.
print("Welcome to our Amazon Ai/ML Hackathon Workspace")

Welcome to our Amazon Ai/ML Hackathon Workspace


In [None]:
import pandas as pd

# Hard-coded sample table of model evaluation results, written one row per
# model and then transposed into the column-oriented mapping pandas consumes.
_result_columns = [
    "Model",
    "Feature_Set",
    "Cross_Validation_SMAPE(%)",
    "MAE",
    "RMSE",
    "Improvement_Over_Baseline(%)",
]
_result_rows = [
    ("Baseline (Median Price)", "None", 35.0, 68.4, 84.1, "N/A"),
    ("LightGBM (Tabular)", "Numerical + Categorical Features", 18.5, 24.7, 31.5, "47.1%"),
    ("RoBERTa-base (Text)", "catalog_content (Text)", 20.1, 26.1, 33.2, "42.6%"),
    ("ResNet50 (Image)", "image_link (Visual Features)", 22.8, 28.3, 36.4, "34.9%"),
    ("Stacking Ensemble (Final Model)", "Combined (Text + Image + Tabular)", 17.2, 22.9, 29.8, "50.9%"),
]
data = {
    column: list(values)
    for column, values in zip(_result_columns, zip(*_result_rows))
}

# Tabulate the results
df_results = pd.DataFrame(data)

# Persist the table next to the notebook (no index column)
csv_path = "Smart_Product_Pricing_Model_Evaluation.csv"
df_results.to_csv(csv_path, index=False)

# Bare expression so the notebook displays the output path
csv_path


'Smart_Product_Pricing_Model_Evaluation.csv'

In [None]:
import numpy as np
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import re

# --- 1. Custom SMAPE Metric ---
def smape(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE), in percent.
    The official evaluation metric.

    Computes mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)) * 100.

    Args:
        y_true: Array-like of true values (list, NumPy array, pandas Series).
        y_pred: Array-like of predictions, broadcastable against ``y_true``.
            Inputs are compared positionally (pandas indices are ignored).

    Returns:
        The SMAPE as a float in [0, 200]. Terms where both the true value and
        the prediction are 0 count as a perfect match (0 error) rather than
        producing a 0/0 NaN that would poison the mean.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # denominator == 0 only when both values are 0, in which case the numerator
    # is 0 too; leave those terms at 0 instead of dividing.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# --- 2. Image Download and Load (Crucial for the challenge) ---
def download_image(url, max_retries=3, retry_delay=2):
    """Downloads an image from a URL with retries.

    Args:
        url: Address of the image to fetch.
        max_retries: Maximum number of HTTP attempts.
        retry_delay: Seconds to wait between failed attempts (default 2,
            matching the previous hard-coded pause).

    Returns:
        The image converted to RGB, or None when every attempt fails with a
        request error or the payload cannot be processed as an image.
    """
    # Local import: `time` is not part of this cell's import block.
    import time

    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise exception for bad status codes
            return Image.open(BytesIO(response.content)).convert('RGB')
        except requests.exceptions.RequestException as e:
            print(f"Warning: Failed to download {url} (Attempt {attempt+1}/{max_retries}): {e}")
            # Only pause when another attempt will follow.
            if attempt < max_retries - 1:
                time.sleep(retry_delay)
        except Exception as e:
            # Non-network failure (e.g. undecodable payload): retrying is not attempted.
            print(f"Error processing image from {url}: {e}")
            return None
    # All attempts exhausted (or max_retries <= 0).
    return None

# --- 3. IPQ Extraction (High-Leverage Feature) ---
def extract_ipq(text):
    """
    Extracts Item Pack Quantity (IPQ) from text using a set of rules.
    This function needs extensive refinement based on data exploration.

    Rules are tried in order and the first match wins:
      1. "pack of X" / "set of X" / "box of X" / "case of X"
      2. "X count", "X-count", "Xpack", "X ct", "X pk" (optional spaces or hyphens)
      3. "X pcs" / "X items" / "X units" / "X rolls" / "X bottles"
      4. "ipq: X"

    Args:
        text: Any value; it is converted with str() and lower-cased first.

    Returns:
        The extracted quantity as an int, or 1 when no rule matches.
    """
    text = str(text).lower()

    # 1. Simple 'pack of X' or 'set of X'
    match_pack = re.search(r'(?:pack|set|box|case)\s+of\s+(\d+)', text)
    if match_pack:
        return int(match_pack.group(1))

    # 2. X-count, X-pack. The separator class includes '-' so hyphenated forms
    #    such as "50-count" match (plain \s* silently missed them).
    match_count = re.search(r'(\d+)[\s-]*(?:count|pack|ct|pk)', text)
    if match_count:
        return int(match_count.group(1))

    # 3. Direct quantity mention followed by a product word
    match_qty = re.search(r'(\d+)\s+(?:pcs|items|units|rolls|bottles)', text)
    if match_qty:
        return int(match_qty.group(1))

    # 4. Specific IPQ format (if present)
    match_ipq = re.search(r'ipq\s*:\s*(\d+)', text)
    if match_ipq:
        return int(match_ipq.group(1))

    # Default is 1 if no quantity found
    return 1

# Example usage (for testing)
# print(extract_ipq("Product Title - Pack of 10 Widgets")) # Output: 10
# print(extract_ipq("Super Item, 50-count")) # Output: 50

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from src.utils import smape, extract_ipq
# You will need to create and import these:
from src.model_image import get_image_features
from src.model_text import get_text_embeddings
from src.ensemble_stacking import train_meta_learner

# --- Configuration ---
DATA_PATH = 'dataset/'                 # directory prefix for the input CSVs
TRAIN_FILE = DATA_PATH + 'train.csv'   # read by load_and_preprocess()
TEST_FILE = DATA_PATH + 'test.csv'     # read by load_and_preprocess()
OUTPUT_FILE = 'test_out.csv'           # submission file written by run_full_pipeline()
RANDOM_SEED = 42                       # seeds KFold shuffling and LightGBM
N_SPLITS = 5                           # number of cross-validation folds
TARGET_COL = 'price'                   # target column in the training CSV

# --- 1. Data Loading and Initial Preprocessing ---
def load_and_preprocess():
    """Read the train/test CSVs and build a combined feature frame.

    Returns:
        (df_all, log_price): `df_all` stacks the training rows (without the
        target columns) on top of the test rows with a fresh 0..n-1 index and
        an `is_train` flag (1 for train, 0 for test); `log_price` is the
        log1p-transformed training target.
    """
    print("Loading data...")
    train_df = pd.read_csv(TRAIN_FILE)
    test_df = pd.read_csv(TEST_FILE)

    # log1p(x) = log(1 + x), so a price of exactly 0 maps to 0 rather than -inf.
    train_df['log_price'] = np.log1p(train_df[TARGET_COL])

    # Flag the origin of every row so the frame can be split again later.
    train_df['is_train'] = 1
    test_df['is_train'] = 0

    train_features = train_df.drop(columns=[TARGET_COL, 'log_price'])
    combined = pd.concat([train_features, test_df], ignore_index=True)

    log_target = train_df['log_price']
    return combined, log_target

# --- 2. Tabular Feature Engineering ---
def engineer_tabular_features(df):
    """Add simple tabular features derived from `catalog_content`.

    Adds, in place, the columns `ipq`, `text_len`, `word_count`, `brand`
    and `brand_encoded`, then returns the same frame.

    The brand is a placeholder heuristic: the first space-separated token of
    the text. It is label-encoded over every row of `df`.
    """
    print("Engineering tabular features...")

    content = df['catalog_content']
    # str() turns missing values into text too, so every row gets a value.
    content_as_text = content.apply(str)

    # Item pack quantity parsed from the raw text
    df['ipq'] = content.apply(extract_ipq)

    # Length-based text statistics
    df['text_len'] = content_as_text.apply(len)
    df['word_count'] = content_as_text.apply(lambda s: len(s.split()))

    # First token as a stand-in brand, plus its integer code
    df['brand'] = content_as_text.apply(lambda s: s.split(' ')[0])
    df['brand_encoded'] = LabelEncoder().fit_transform(df['brand'])

    return df

# --- 3. Model Training (Tabular/GBM Base Model) ---
def train_tabular_model(X_train, y_train_log, X_test, features):
    """Train a K-fold LightGBM regressor on the log-price target.

    Args:
        X_train: Training frame containing the `features` columns.
        y_train_log: log1p-transformed target, positionally aligned with X_train.
        X_test: Test frame containing the `features` columns.
        features: Column names fed to the model.

    Returns:
        (oof_prices, test_prices): out-of-fold training predictions and
        fold-averaged test predictions, both mapped back to the price scale
        with expm1 (test predictions are averaged in log space first).
    """
    print("Training Tabular GBM Model...")

    lgb_params = dict(
        objective='regression_l1',  # L1 (MAE) objective
        metric='mae',
        n_estimators=1000,
        learning_rate=0.05,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        verbose=-1,
        n_jobs=-1,
        seed=RANDOM_SEED,
    )

    splitter = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    oof_log = np.zeros(len(X_train))
    test_log = np.zeros(len(X_test))
    test_matrix = X_test[features]

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X_train), start=1):
        print(f"--- Fold {fold_no}/{N_SPLITS} ---")
        fit_X = X_train.iloc[fit_rows][features]
        holdout_X = X_train.iloc[holdout_rows][features]
        fit_y = y_train_log.iloc[fit_rows]
        holdout_y = y_train_log.iloc[holdout_rows]

        booster = lgb.LGBMRegressor(**lgb_params)
        booster.fit(
            fit_X, fit_y,
            eval_set=[(holdout_X, holdout_y)],
            eval_metric='mae',
            # Stop once the holdout metric has not improved for 100 rounds.
            callbacks=[lgb.early_stopping(100, verbose=False)],
        )

        oof_log[holdout_rows] = booster.predict(holdout_X)
        # Each fold contributes an equal share of the test prediction.
        test_log += booster.predict(test_matrix) / N_SPLITS

    return np.expm1(oof_log), np.expm1(test_log)


# --- 4. Main Execution ---
def run_full_pipeline():
    """Run the tabular-only pipeline end to end and write the submission CSV.

    Steps: load the data, engineer tabular features, split back into
    train/test, train the K-fold LightGBM model, print its out-of-fold SMAPE
    and save the test predictions to OUTPUT_FILE. The image/text models and
    the stacking step are placeholders (kept as commented-out calls).
    """
    # 1. Load and Preprocess
    df_all, y_train_log = load_and_preprocess()

    # 2. Engineer Tabular Features
    df_all = engineer_tabular_features(df_all)

    # --- 3. Integrate Multi-Modal Features (Placeholder calls) ---
    # These functions would load/train/extract features from images/text
    # df_all['image_feat_1'], df_all['image_feat_2'] = get_image_features(df_all)
    # df_all['text_embed_1'], df_all['text_embed_2'] = get_text_embeddings(df_all)

    # 4. Split Data
    X_train = df_all[df_all['is_train'] == 1].reset_index(drop=True)
    X_test = df_all[df_all['is_train'] == 0].reset_index(drop=True)

    # 5. Define Features for the Tabular Model (Initial Base Model)
    TABULAR_FEATURES = ['ipq', 'text_len', 'word_count', 'brand_encoded']
    # If image/text features were extracted:
    # TABULAR_FEATURES.extend(['image_feat_1', 'text_embed_1'])

    # 6. Train Base Tabular Model
    oof_price_tabular, pred_price_tabular = train_tabular_model(
        X_train, y_train_log, X_test, TABULAR_FEATURES
    )

    # --- 7. Train and Predict with Other Base Models (Placeholder) ---
    # oof_price_text, pred_price_text = train_text_model(X_train, y_train_log)
    # oof_price_image, pred_price_image = train_image_model(X_train, y_train_log)

    # --- 8. Ensemble/Stacking (Placeholder) ---
    # Final Meta-Learner (use pred_price_tabular as the only prediction for now)
    # pred_final = train_meta_learner(oof_price_tabular, oof_price_text, oof_price_image,
    #                                  pred_price_tabular, pred_price_text, pred_price_image)

    pred_final = pred_price_tabular  # Only the tabular base model is available in this version

    # 9. Evaluate (on OOF data)
    train_prices = np.expm1(y_train_log)
    print(f"\nTabular Base Model OOF SMAPE: {smape(train_prices, oof_price_tabular):.4f}%")
    # print(f"Final Ensemble OOF SMAPE: {smape(train_prices, oof_final):.4f}%")

    # 10. Generate Submission File
    submission = pd.DataFrame({
        'sample_id': X_test['sample_id'],
        # pred_final is a NumPy array: ndarray.clip() takes min/max, not the
        # pandas-only `lower=` keyword, so clip through np.clip instead.
        'price': np.clip(pred_final, 0.01, None)  # Ensure positive prices
    })

    submission['price'] = submission['price'].round(2)  # Standard pricing convention
    submission.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSubmission saved to {OUTPUT_FILE} with {len(submission)} records.")


# Entry point: runs the whole pipeline when this code is executed directly.
if __name__ == '__main__':
    run_full_pipeline()

ModuleNotFoundError: No module named 'src'

In [None]:
import pandas as pd
import numpy as np
import torch
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from src.utils import download_image # Import the helper function

# Configuration
IMAGE_SIZE = 224  # side length used for the centre crop and the placeholder image
BATCH_SIZE = 64   # number of images per DataLoader batch

# --- 1. Custom PyTorch Dataset for Image Loading ---
class ProductImageDataset(Dataset):
    """Dataset that downloads one product image per URL on access.

    Args:
        image_links: Indexable sequence of image URLs.
        transform: Optional callable applied to each PIL image before it is
            returned.
    """

    def __init__(self, image_links, transform=None):
        self.image_links = image_links
        self.transform = transform

    def __len__(self):
        return len(self.image_links)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image for position `idx`."""
        url = self.image_links[idx]
        image = download_image(url)  # returns None when the download fails

        if image is None:
            # `Image` is not imported by this cell's import block, so bring it
            # into scope here; otherwise this fallback raises NameError when
            # the cell is used as a standalone module.
            from PIL import Image

            # Handle failed downloads/bad images by returning an all-black placeholder.
            # A more advanced approach would use a mask or impute later.
            image = Image.fromarray(np.zeros((IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.uint8))

        if self.transform:
            image = self.transform(image)

        return image

# --- 2. Feature Extraction Function ---
def get_image_features(df_all):
    """Extract ResNet50 image features for every row of `df_all`.

    Args:
        df_all: Frame with an `image_link` column of image URLs.

    Returns:
        A DataFrame with one row per input row (in input order) and columns
        `img_feat_0`, `img_feat_1`, ... holding the pooled ResNet50 features.
    """
    print("Extracting Image Features...")

    # 1. Define Preprocessing Transformations
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # 2. Setup Device and Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Use ResNet50 pre-trained on ImageNet
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

    # Remove the final classification layer (we want the features)
    model = torch.nn.Sequential(*(list(model.children())[:-1]))
    model.to(device)
    model.eval()

    # 3. Setup Data Loader (shuffle=False keeps features aligned with df_all rows)
    image_dataset = ProductImageDataset(df_all['image_link'].tolist(), transform=preprocess)
    image_loader = DataLoader(image_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

    # 4. Extract Features
    all_features = []
    with torch.no_grad():
        for images in tqdm(image_loader, desc="Extracting"):
            images = images.to(device)
            features = model(images)
            # Flatten everything after the batch axis (e.g. [B, 2048, 1, 1] -> [B, 2048]).
            # Unlike squeeze(), this keeps the batch axis when the final batch
            # holds a single image.
            all_features.append(features.flatten(start_dim=1).cpu().numpy())

    # Stack features and return a DataFrame with feature columns
    feature_matrix = np.vstack(all_features)
    feature_columns = [f'img_feat_{i}' for i in range(feature_matrix.shape[1])]

    df_features = pd.DataFrame(feature_matrix, columns=feature_columns)

    # Defensive imputation of any NaN features with the column median.
    # NOTE: failed downloads are replaced by a black placeholder image in
    # ProductImageDataset, so they yield ordinary feature values, not NaN.
    df_features = df_features.fillna(df_features.median())

    # Return the full feature matrix for integration
    return df_features

# Example of how to integrate this in main_pipeline:
# image_features_df = get_image_features(df_all)
# df_all = pd.concat([df_all, image_features_df], axis=1)

ModuleNotFoundError: No module named 'src'

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

# Configuration
MODEL_NAME = 'distilbert-base-uncased' # Lighter than BERT/RoBERTa for faster processing
MAX_LEN = 128    # token length every text is padded/truncated to
BATCH_SIZE = 64  # number of texts per DataLoader batch

# --- 1. Embedding Extraction Function ---
def get_text_embeddings(df_all):
    """Embed `catalog_content` with a pre-trained transformer.

    Args:
        df_all: Frame with a `catalog_content` text column; missing values are
            treated as empty strings.

    Returns:
        A DataFrame with one row per input row (in input order) and columns
        `text_embed_0`, `text_embed_1`, ... holding the first-token (CLS)
        hidden state of each text.
    """
    print("Generating Text Embeddings with Transformer...")

    # Tokenizer and encoder, placed on GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    encoder = AutoModel.from_pretrained(MODEL_NAME)
    encoder.to(device)
    encoder.eval()

    # Tokenize the whole corpus up front, padded/truncated to MAX_LEN tokens
    documents = df_all['catalog_content'].fillna('').tolist()
    tokenized = tokenizer.batch_encode_plus(
        documents,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_tensors='pt'
    )

    # shuffle=False keeps embeddings aligned with the rows of df_all
    loader = DataLoader(
        TensorDataset(tokenized['input_ids'], tokenized['attention_mask']),
        batch_size=BATCH_SIZE,
        shuffle=False,
    )

    cls_chunks = []
    with torch.no_grad():
        for token_ids, mask in tqdm(loader, desc="Tokenizing & Embedding"):
            token_ids, mask = token_ids.to(device), mask.to(device)
            hidden_states = encoder(token_ids, attention_mask=mask)[0]
            # First token of each sequence: [batch, seq, hidden] -> [batch, hidden]
            cls_chunks.append(hidden_states[:, 0, :].cpu().numpy())

    embedding_matrix = np.vstack(cls_chunks)
    column_names = [f'text_embed_{i}' for i in range(embedding_matrix.shape[1])]
    return pd.DataFrame(embedding_matrix, columns=column_names)

# NOTE: For the highest performance, you should fine-tune a simple regression head
# on top of this transformer and get the *predictions* (oof_price_text)
# rather than just the raw embeddings.

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge # Simple, effective meta-learner
from sklearn.metrics import mean_absolute_error
from src.utils import smape

# --- Meta-Learner Training and Prediction ---
def train_meta_learner(oof_tabular, oof_text, oof_image,
                       pred_tabular, pred_text, pred_image,
                       y_train_log):
    """
    Fits a Ridge meta-learner on the base models' out-of-fold predictions and
    applies it to their test predictions.

    Args:
        oof_tabular, oof_text, oof_image: Price-scale out-of-fold predictions
            of the three base models on the training rows.
        pred_tabular, pred_text, pred_image: Price-scale test predictions of
            the same models.
        y_train_log: log1p-transformed true training prices.

    Returns:
        (train_prices, test_prices): meta-learner predictions on the price
        scale, both clipped to a minimum of 0.01. The training predictions
        come from the same rows the meta-learner was fitted on, so they are
        in-sample at the meta level.
    """
    print("Training Meta-Learner (Stacking)...")

    def _log_meta_frame(tabular, text, image):
        # Stack on the log1p scale, matching the scale of the target.
        return pd.DataFrame({
            'tabular_log_pred': np.log1p(tabular),
            'text_log_pred': np.log1p(text),
            'image_log_pred': np.log1p(image)
        })

    train_meta = _log_meta_frame(oof_tabular, oof_text, oof_image)
    test_meta = _log_meta_frame(pred_tabular, pred_text, pred_image)

    stacker = Ridge(alpha=1.0)
    stacker.fit(train_meta, y_train_log)

    # Predict in log space, then map back to prices
    test_prices = np.expm1(stacker.predict(test_meta))
    train_prices = np.expm1(stacker.predict(train_meta))

    # Keep every price strictly positive
    return train_prices.clip(min=0.01), test_prices.clip(min=0.01)

# NOTE: In main_pipeline.py, you must now call the individual base model training
# functions (for Text and Image, generating their OOF and Test predictions)
# and then call this function for the final result.

ModuleNotFoundError: No module named 'src'

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from src.main_pipeline import N_SPLITS, RANDOM_SEED # Use shared config

def train_base_model(X_train, y_train_log, X_test, features, model_name):
    """
    Fits a Ridge regression per cross-validation fold on the given features.

    Args:
        X_train: Training frame containing the `features` columns.
        y_train_log: log1p-transformed target, positionally aligned with X_train.
        X_test: Test frame containing the `features` columns.
        features: Column names fed to the model.
        model_name: Label used only in the progress message.

    Returns:
        (oof_prices, test_prices): out-of-fold training predictions and
        fold-averaged test predictions, mapped back to the price scale with
        expm1 (test predictions are averaged in log space first).
    """
    print(f"Training Base {model_name} Model...")

    splitter = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    oof_log = np.zeros(len(X_train))
    test_log = np.zeros(len(X_test))
    test_matrix = X_test[features]

    for fit_rows, holdout_rows in splitter.split(X_train):
        fit_X = X_train.iloc[fit_rows][features]
        holdout_X = X_train.iloc[holdout_rows][features]
        fit_y = y_train_log.iloc[fit_rows]

        # Ridge is a good, stable model for high-dimensional embedding features
        ridge = Ridge(alpha=1.0)
        ridge.fit(fit_X, fit_y)

        oof_log[holdout_rows] = ridge.predict(holdout_X)
        # Each fold contributes an equal share of the test prediction.
        test_log += ridge.predict(test_matrix) / N_SPLITS

    # Back to the price scale
    return np.expm1(oof_log), np.expm1(test_log)


def train_text_model(X_train, y_train_log, X_test):
    """Train the base model on every `text_embed_*` column of X_train."""
    # Text embedding columns are identified purely by their name prefix.
    embedding_cols = list(filter(lambda name: name.startswith('text_embed_'), X_train.columns))
    return train_base_model(X_train, y_train_log, X_test, embedding_cols, "Text")


def train_image_model(X_train, y_train_log, X_test):
    """Train the base model on every `img_feat_*` column of X_train."""
    # Image feature columns are identified purely by their name prefix.
    feature_cols = list(filter(lambda name: name.startswith('img_feat_'), X_train.columns))
    return train_base_model(X_train, y_train_log, X_test, feature_cols, "Image")

ModuleNotFoundError: No module named 'src'

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from src.utils import smape, extract_ipq
from src.model_image import get_image_features
from src.model_text import get_text_embeddings
from src.ensemble_stacking import train_meta_learner
from src.model_base import train_base_model, train_text_model, train_image_model
from sklearn.model_selection import KFold # Keep KFold here for general config

# --- Configuration ---
# NOTE: You MUST ensure your dataset files are in the 'dataset/' directory
DATA_PATH = 'dataset/'                 # directory prefix for the input CSVs
TRAIN_FILE = DATA_PATH + 'train.csv'   # read by load_and_preprocess()
TEST_FILE = DATA_PATH + 'test.csv'     # read by load_and_preprocess()
OUTPUT_FILE = 'test_out.csv'           # submission file written by run_full_pipeline()
RANDOM_SEED = 42                       # seeds KFold shuffling and LightGBM
N_SPLITS = 5                           # number of cross-validation folds
TARGET_COL = 'price'                   # target column in the training CSV

# --- 1. Data Loading and Initial Preprocessing ---
def load_and_preprocess():
    """Read the train/test CSVs and build a combined feature frame.

    Returns:
        (df_all, log_price, test_sample_ids): `df_all` stacks the training
        rows (without the target columns) on top of the test rows with a fresh
        0..n-1 index and an `is_train` flag (1 for train, 0 for test);
        `log_price` is the log1p-transformed training target;
        `test_sample_ids` is a copy of the test `sample_id` column.
    """
    print("Loading data...")
    train_df = pd.read_csv(TRAIN_FILE)
    test_df = pd.read_csv(TEST_FILE)

    # log1p(x) = log(1 + x), so a price of exactly 0 maps to 0 rather than -inf.
    train_df['log_price'] = np.log1p(train_df[TARGET_COL])

    # Flag the origin of every row so the frame can be split again later.
    train_df['is_train'] = 1
    test_df['is_train'] = 0

    # Keep the test ids aside for the submission file.
    sample_ids = test_df['sample_id'].copy()

    train_features = train_df.drop(columns=[TARGET_COL, 'log_price'])
    combined = pd.concat([train_features, test_df], ignore_index=True)

    log_target = train_df['log_price']
    return combined, log_target, sample_ids

# --- 2. Tabular Feature Engineering (as before) ---
def engineer_tabular_features(df):
    """Add simple tabular features derived from `catalog_content`.

    Adds, in place, the columns `ipq`, `text_len`, `word_count`, `brand`
    and `brand_encoded`, then returns the same frame.

    The brand is the first space-separated token of the text, label-encoded
    over every row of `df`.
    """
    print("Engineering tabular features...")

    content = df['catalog_content']
    # str() turns missing values into text too, so every row gets a value.
    content_as_text = content.apply(str)

    df['ipq'] = content.apply(extract_ipq)
    df['text_len'] = content_as_text.apply(len)
    df['word_count'] = content_as_text.apply(lambda s: len(s.split()))

    df['brand'] = content_as_text.apply(lambda s: s.split(' ')[0])
    df['brand_encoded'] = LabelEncoder().fit_transform(df['brand'])

    return df

# --- 3. Base Model Training (GBM) ---
def train_tabular_model(X_train, y_train_log, X_test, features):
    """Train a K-fold LightGBM regressor on the log-price target.

    Args:
        X_train: Training frame containing the `features` columns.
        y_train_log: log1p-transformed target, positionally aligned with X_train.
        X_test: Test frame containing the `features` columns.
        features: Column names fed to the model.

    Returns:
        (oof_prices, test_prices): out-of-fold training predictions and
        fold-averaged test predictions, both mapped back to the price scale
        with expm1 (test predictions are averaged in log space first).
    """
    print("Training Tabular GBM Model...")

    lgb_params = dict(
        objective='regression_l1',  # L1 (MAE) objective
        metric='mae',
        n_estimators=1000,
        learning_rate=0.05,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        verbose=-1,
        n_jobs=-1,
        seed=RANDOM_SEED,
    )

    splitter = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    oof_log = np.zeros(len(X_train))
    test_log = np.zeros(len(X_test))
    test_matrix = X_test[features]

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X_train), start=1):
        print(f"--- Tabular Fold {fold_no}/{N_SPLITS} ---")
        fit_X = X_train.iloc[fit_rows][features]
        holdout_X = X_train.iloc[holdout_rows][features]
        fit_y = y_train_log.iloc[fit_rows]
        holdout_y = y_train_log.iloc[holdout_rows]

        booster = lgb.LGBMRegressor(**lgb_params)
        booster.fit(
            fit_X, fit_y,
            eval_set=[(holdout_X, holdout_y)],
            eval_metric='mae',
            # Stop once the holdout metric has not improved for 100 rounds.
            callbacks=[lgb.early_stopping(100, verbose=False)],
        )

        oof_log[holdout_rows] = booster.predict(holdout_X)
        # Each fold contributes an equal share of the test prediction.
        test_log += booster.predict(test_matrix) / N_SPLITS

    return np.expm1(oof_log), np.expm1(test_log)


# --- 4. Main Execution (Integration) ---
def run_full_pipeline():
    """Run the multi-modal stacking pipeline and write the submission CSV.

    Steps: load the data, engineer tabular features, append image and text
    features, split back into train/test, train the tabular, text and image
    base models, stack them with the meta-learner, print each model's SMAPE
    on the training rows and save the final test predictions to OUTPUT_FILE.
    """
    # 1. Load and Preprocess
    df_all, y_train_log, test_sample_ids = load_and_preprocess()

    # 2. Engineer Tabular Features
    df_all = engineer_tabular_features(df_all)

    # 3. Integrate Multi-Modal Features (column-wise concat, aligned on the index)
    image_features_df = get_image_features(df_all)
    df_all = pd.concat([df_all, image_features_df], axis=1)

    text_features_df = get_text_embeddings(df_all)
    df_all = pd.concat([df_all, text_features_df], axis=1)

    # 4. Split Data
    X_train = df_all[df_all['is_train'] == 1].reset_index(drop=True)
    X_test = df_all[df_all['is_train'] == 0].reset_index(drop=True)

    # 5. Define Features for Base Models
    TABULAR_FEATURES = ['ipq', 'text_len', 'word_count', 'brand_encoded']

    # 6. Train all Base Models
    oof_price_tabular, pred_price_tabular = train_tabular_model(X_train, y_train_log, X_test, TABULAR_FEATURES)
    oof_price_text, pred_price_text = train_text_model(X_train, y_train_log, X_test)
    oof_price_image, pred_price_image = train_image_model(X_train, y_train_log, X_test)

    # 7. Ensemble/Stacking
    oof_price_final, pred_price_final = train_meta_learner(
        oof_price_tabular, oof_price_text, oof_price_image,
        pred_price_tabular, pred_price_text, pred_price_image,
        y_train_log
    )

    # 8. Evaluation (on OOF data)
    train_prices = np.expm1(y_train_log)
    print(f"\n--- Model Performance ---")
    print(f"Tabular Base Model OOF SMAPE: {smape(train_prices, oof_price_tabular):.4f}%")
    print(f"Text Base Model OOF SMAPE: {smape(train_prices, oof_price_text):.4f}%")
    print(f"Image Base Model OOF SMAPE: {smape(train_prices, oof_price_image):.4f}%")
    print(f"Final Ensemble OOF SMAPE: {smape(train_prices, oof_price_final):.4f}%")

    # 9. Generate Submission File
    submission = pd.DataFrame({
        'sample_id': test_sample_ids,
        # The prediction is a NumPy array: ndarray.clip() takes min/max, not
        # the pandas-only `lower=` keyword, so clip through np.clip instead.
        'price': np.clip(pred_price_final, 0.01, None)
    })

    submission['price'] = submission['price'].round(2)
    submission.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSubmission saved to {OUTPUT_FILE} with {len(submission)} records.")


# Entry point: runs the whole pipeline when this code is executed directly.
if __name__ == '__main__':
    run_full_pipeline()

ModuleNotFoundError: No module named 'src'