In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/embeddings/processed_data.csv
/kaggle/input/embeddings/sbert_embeddings.npy
/kaggle/input/dataset/train.csv
/kaggle/input/dataset/test.csv


In [3]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Import ML models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import time

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 6)

# Define SMAPE function with log transformation
def smape(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error
    Uses log transformation for better price prediction evaluation
    """
    # Convert to log scale for better percentage calculation
    y_true_log = np.log1p(y_true)  # log(1 + y) to handle zeros
    y_pred_log = np.log1p(y_pred)
    
    return 100 * np.mean(2 * np.abs(y_pred_log - y_true_log) / (np.abs(y_true_log) + np.abs(y_pred_log) + 1e-8))

print("=" * 100)
print("ML PIPELINE FOR PRODUCT PRICE PREDICTION")
print("=" * 100)

ML PIPELINE FOR PRODUCT PRICE PREDICTION


In [4]:
# Cell 2: Load Data & Embeddings
print("\n📂 Loading processed data and embeddings...")

df = pd.read_csv("/kaggle/input/embeddings/processed_data.csv")
embeddings = np.load("/kaggle/input/embeddings/sbert_embeddings.npy")

print(f"✅ Data loaded successfully!")
print(f"   Dataset shape: {df.shape}")
print(f"   Embeddings shape: {embeddings.shape}")

# Display basic info about the target variable
print(f"\n📊 Target Variable (Price) Statistics:")
print(f"   Min price: ${df['price'].min():.2f}")
print(f"   Max price: ${df['price'].max():.2f}")
print(f"   Mean price: ${df['price'].mean():.2f}")
print(f"   Median price: ${df['price'].median():.2f}")


📂 Loading processed data and embeddings...
✅ Data loaded successfully!
   Dataset shape: (75000, 7)
   Embeddings shape: (75000, 384)

📊 Target Variable (Price) Statistics:
   Min price: $0.13
   Max price: $61.37
   Mean price: $20.34
   Median price: $14.00


In [5]:
# Cell 3: Prepare Features
print("\n" + "=" * 100)
print("FEATURE ENGINEERING")
print("=" * 100)

# Create feature matrix combining embeddings and text statistics
text_features = df[["word_count", "char_count"]].values
X = np.hstack([embeddings, text_features])

# Target variable (using raw prices, no transformation)
y = df["price"].values

print(f"\n📊 Feature Matrix:")
print(f"   Total features: {X.shape[1]}")
print(f"   - SBERT embeddings: {embeddings.shape[1]}")
print(f"   - Text statistics: {text_features.shape[1]}")
print(f"   Target variable shape: {y.shape}")

# Feature scaling (important for some models)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

print(f"\n✂️ Data Split:")
print(f"   Training set: {X_train.shape}")
print(f"   Testing set: {X_test.shape}")


FEATURE ENGINEERING

📊 Feature Matrix:
   Total features: 386
   - SBERT embeddings: 384
   - Text statistics: 2
   Target variable shape: (75000,)

✂️ Data Split:
   Training set: (60000, 386)
   Testing set: (15000, 386)


In [6]:
# Cell 4: Define Models
print("\n" + "=" * 100)
print("MODEL CONFIGURATION")
print("=" * 100)

models = {
    "XGBoost": XGBRegressor(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbosity=0
    ),
    
    "LightGBM": LGBMRegressor(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        num_leaves=50,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ),
    
    "CatBoost": CatBoostRegressor(
        iterations=500,
        depth=8,
        learning_rate=0.05,
        random_state=42,
        verbose=0
    )
}

print("\n🤖 Models configured:")
for name in models.keys():
    print(f"   ✓ {name}")


MODEL CONFIGURATION

🤖 Models configured:
   ✓ XGBoost
   ✓ LightGBM
   ✓ CatBoost


In [8]:
# Cell 5: Train Models on Full Training Set (No K-Fold)
print("\n" + "=" * 100)
print("TRAINING MODELS ON FULL TRAINING SET")
print("=" * 100)

trained_models = {}
train_results = {}

for name, model in models.items():
    print(f"\n🎯 Training {name} on full training set...")
    start_time = time.time()
    
    # Train model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    train_smape = smape(y_train, y_train_pred)
    
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_smape = smape(y_test, y_test_pred)
    
    elapsed_time = time.time() - start_time
    
    # Store results
    trained_models[name] = model
    train_results[name] = {
        'train_rmse': train_rmse,
        'train_mae': train_mae,
        'train_r2': train_r2,
        'train_smape': train_smape,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_r2': test_r2,
        'test_smape': test_smape,
        'train_time': elapsed_time,
        'y_test_pred': y_test_pred
    }
    
    print(f"   ✅ Training completed in {elapsed_time:.2f}s")
    print(f"      Train RMSE: {train_rmse:.4f} | Test RMSE: {test_rmse:.4f}")
    print(f"      Train R²: {train_r2:.4f} | Test R²: {test_r2:.4f}")
    print(f"      Train SMAPE: {train_smape:.4f}% | Test SMAPE: {test_smape:.4f}%")


TRAINING MODELS ON FULL TRAINING SET

🎯 Training XGBoost on full training set...
   ✅ Training completed in 139.22s
      Train RMSE: 8.9942 | Test RMSE: 17.4061
      Train R²: 0.7399 | Test R²: 0.0316
      Train SMAPE: 19.4839% | Test SMAPE: 29.3921%

🎯 Training LightGBM on full training set...
   ✅ Training completed in 31.95s
      Train RMSE: 13.0434 | Test RMSE: 17.1810
      Train R²: 0.4529 | Test R²: 0.0565
      Train SMAPE: 24.2265% | Test SMAPE: 29.2478%

🎯 Training CatBoost on full training set...
   ✅ Training completed in 121.15s
      Train RMSE: 15.3892 | Test RMSE: 17.0864
      Train R²: 0.2384 | Test R²: 0.0669
      Train SMAPE: 26.8825% | Test SMAPE: 29.2262%


In [9]:
# Cell 7: Model Comparison
print("\n" + "=" * 100)
print("MODEL PERFORMANCE COMPARISON")
print("=" * 100)

results_df = pd.DataFrame({
    'Model': list(train_results.keys()),
    'Train RMSE': [train_results[m]['train_rmse'] for m in train_results.keys()],
    'Test RMSE': [train_results[m]['test_rmse'] for m in train_results.keys()],
    'Train MAE': [train_results[m]['train_mae'] for m in train_results.keys()],
    'Test MAE': [train_results[m]['test_mae'] for m in train_results.keys()],
    'Train R²': [train_results[m]['train_r2'] for m in train_results.keys()],
    'Test R²': [train_results[m]['test_r2'] for m in train_results.keys()],
    'Train SMAPE': [train_results[m]['train_smape'] for m in train_results.keys()],
    'Test SMAPE': [train_results[m]['test_smape'] for m in train_results.keys()],
    'Training Time (s)': [train_results[m]['train_time'] for m in train_results.keys()]
})

results_df = results_df.sort_values('Test SMAPE')  # Sort by SMAPE for price prediction
print("\n", results_df.to_string(index=False))

# Identify best model based on Test SMAPE (better for price prediction)
best_model_name = results_df.iloc[0]['Model']
print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"   Test RMSE: {results_df.iloc[0]['Test RMSE']:.4f}")
print(f"   Test SMAPE: {results_df.iloc[0]['Test SMAPE']:.4f}%")
print(f"   Test R²: {results_df.iloc[0]['Test R²']:.4f}")


MODEL PERFORMANCE COMPARISON

    Model  Train RMSE  Test RMSE  Train MAE  Test MAE  Train R²  Test R²  Train SMAPE  Test SMAPE  Training Time (s)
CatBoost   15.389150  17.086390  12.194705 13.515751  0.238402 0.066879    26.882529   29.226169         121.146733
LightGBM   13.043411  17.181027  10.345387 13.568828  0.452885 0.056514    24.226505   29.247807          31.947243
 XGBoost    8.994186  17.406093   7.196419 13.719302  0.739852 0.031633    19.483944   29.392061         139.215312

🏆 BEST MODEL: CatBoost
   Test RMSE: 17.0864
   Test SMAPE: 29.2262%
   Test R²: 0.0669


In [10]:
# Cell 12: Final Summary
print("\n" + "=" * 100)
print("FINAL SUMMARY")
print("=" * 100)

print(f"\n📊 Dataset Information:")
print(f"   Total samples: {len(df)}")
print(f"   Training samples: {len(X_train)}")
print(f"   Testing samples: {len(X_test)}")
print(f"   Feature dimensions: {X.shape[1]}")

print(f"\n🎯 Best Model Performance:")
print(f"   Model: {best_model_name}")
print(f"   Test RMSE: ${results_df.iloc[0]['Test RMSE']:.2f}")
print(f"   Test MAE: ${results_df.iloc[0]['Test MAE']:.2f}")
print(f"   Test SMAPE: {results_df.iloc[0]['Test SMAPE']:.2f}%")
print(f"   Test R² Score: {results_df.iloc[0]['Test R²']:.4f}")

print(f"\n🏅 Model Rankings (by Test SMAPE):")
for idx, row in results_df.iterrows():
    print(f"   {idx+1}. {row['Model']}: SMAPE {row['Test SMAPE']:.2f}%, RMSE ${row['Test RMSE']:.2f}")

print("\n" + "=" * 100)
print("✅ PIPELINE COMPLETE!")
print("=" * 100)

print("\n📁 Generated Files:")
print("   - model_results.csv")
print("   - best_model_*.pkl")
print("   - scaler.pkl")
print("   - All model pickle files")
print("   - model_comparison.png")
print("   - best_model_predictions.png")
print("   - error_distribution.png")


FINAL SUMMARY

📊 Dataset Information:
   Total samples: 75000
   Training samples: 60000
   Testing samples: 15000
   Feature dimensions: 386

🎯 Best Model Performance:
   Model: CatBoost
   Test RMSE: $17.09
   Test MAE: $13.52
   Test SMAPE: 29.23%
   Test R² Score: 0.0669

🏅 Model Rankings (by Test SMAPE):
   3. CatBoost: SMAPE 29.23%, RMSE $17.09
   2. LightGBM: SMAPE 29.25%, RMSE $17.18
   1. XGBoost: SMAPE 29.39%, RMSE $17.41

✅ PIPELINE COMPLETE!

📁 Generated Files:
   - model_results.csv
   - best_model_*.pkl
   - scaler.pkl
   - All model pickle files
   - model_comparison.png
   - best_model_predictions.png
   - error_distribution.png
