In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings("ignore")

# --- 1. Load Data ---
# Read the raw dataset; the path is relative to the notebook's working directory.
raw_df = pd.read_csv("retail_price.csv")

# --- 2. Preprocessing & Feature Engineering (as implemented before) ---
# Remove the columns that are not used as model inputs. A non-mutating drop
# keeps `raw_df` intact, so this cell is safe to re-run and the untouched
# source data stays available for inspection.
df = raw_df.drop(columns=["product_id", "month_year", "total_price"])

# Ratio of our unit price to each of the three competitor prices. The small
# constant added to the denominator prevents division by zero when a
# competitor price is 0.
for comp_idx in (1, 2, 3):
    df[f'price_ratio_comp{comp_idx}'] = df['unit_price'] / (df[f'comp_{comp_idx}'] + 1e-6)

# One-hot encode the product category; drop_first=True omits the first level
# so the dummy columns are not perfectly collinear.
df = pd.get_dummies(df, columns=["product_category_name"], drop_first=True)

# --- 3. Define Target and Features ---
# `qty` is the quantity to predict; every remaining column is a feature.
y_orig = df['qty']
X = df.drop(columns=['qty'])

# The model is trained on log(1 + qty) rather than on the raw quantity.
y_log = np.log1p(y_orig)

# --- 4. Split Data ---
# 80/20 split with a fixed seed for reproducibility.
# NOTE: both target outputs are on the log1p scale because the transformed
# target is what gets split. In particular `y_test_orig` holds log-scale
# values despite its name and must be passed through np.expm1 before being
# compared with predictions in original units.
split_parts = train_test_split(X, y_log, test_size=0.2, random_state=42)
X_train, X_test, y_train_trans, y_test_orig = split_parts

# --- 5. Hyperparameter Tuning using Randomized Search ---

# Distributions sampled for each hyperparameter.
# scipy's uniform(loc, scale) covers [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
param_dist = dict(
    n_estimators=randint(100, 500),      # boosting stages: 100..499
    learning_rate=uniform(0.01, 0.2),    # shrinkage: 0.01..0.21
    max_depth=randint(3, 7),             # depth of each tree: 3..6
    min_samples_split=randint(2, 10),    # samples needed to split a node: 2..9
    subsample=uniform(0.6, 0.4),         # row fraction per base learner: 0.6..1.0
)

# Base estimator; the seed fixes the row subsampling inside the booster.
gbr = GradientBoostingRegressor(random_state=42)

# Search configuration:
#   n_iter=50  -> 50 sampled parameter combinations
#   cv=5       -> each combination scored with 5-fold cross-validation
#   scoring    -> negated MSE (on the log-transformed target), higher is better
#   n_jobs=-1  -> use all available CPU cores
search_settings = dict(
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=0,
    random_state=42,
    n_jobs=-1,
)
random_search = RandomizedSearchCV(
    estimator=gbr,
    param_distributions=param_dist,
    **search_settings,
)

print("Starting Randomized Hyperparameter Search (50 iterations)...")
# Run the search on the training features and the log-scale training target.
random_search.fit(X_train, y_train_trans)

# Estimator refit on the full training split with the best-scoring parameters.
best_model = random_search.best_estimator_

print(f"✅ Best Parameters Found: {random_search.best_params_}")

# --- 6. Predict, Inverse Transform, and Evaluate with Best Model ---

# The model was fit on log1p(qty), so its raw predictions are on the log scale.
y_pred_trans = best_model.predict(X_test)

# Undo the log1p transform on both the held-out targets (which were split
# after the transform and are therefore still log-scale) and the predictions.
y_test_orig_untrans = np.expm1(y_test_orig)
y_pred_orig = np.expm1(y_pred_trans)

# expm1 of a negative log-scale prediction is negative; a quantity cannot be
# below zero, so floor those predictions at 0.
negative_mask = y_pred_orig < 0
y_pred_orig[negative_mask] = 0

# Score in original quantity units: root-mean-squared error and R².
mse = mean_squared_error(y_test_orig_untrans, y_pred_orig)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_orig_untrans, y_pred_orig)

print("\n--- Tuned Demand Model Performance (on Original Quantity Units) ---")
print(f"✅ Improved RMSE (in Units): {rmse:.2f}")
print(f"✅ Improved R² Score: {r2:.2f}")

Starting Randomized Hyperparameter Search (50 iterations)...
✅ Best Parameters Found: {'learning_rate': np.float64(0.09503117489824894), 'max_depth': 4, 'min_samples_split': 3, 'n_estimators': 359, 'subsample': np.float64(0.6739417822102108)}

--- Tuned Demand Model Performance (on Original Quantity Units) ---
✅ Improved RMSE (in Units): 8.60
✅ Improved R² Score: 0.73
