In [8]:
import os
import pickle
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Input CSV and output artifact locations. Both are relative paths, so they
# are resolved against the current working directory at run time.
TRAINING_DATA_PATH = "../../data/training_data.csv"
MODEL_OUTPUT_PATH = "../models/pricing_model.pkl"

# Read the whole training set into a DataFrame.
df = pd.read_csv(filepath_or_buffer=TRAINING_DATA_PATH)

# Column to predict, and the input columns fed to the model.
# The order of `features` is the column order of X, so keep it stable.
target = "final_price"
features = [
    "amp",
    "inventory_level",
    "mrp",
    "holiday_score",
    "brand_reputation_score",
    "expiry_score",
    "location_zone",
]

X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing: one-hot encode the zone column and pass every other feature
# column through unchanged. handle_unknown='ignore' encodes a category that was
# not seen during fit as an all-zero row instead of raising.
categorical_features = ['location_zone']
preprocessor = ColumnTransformer(
    transformers=[
        ('zone', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough',
    # Always return a dense array. With the default threshold (0.3) the output
    # switches between sparse and dense depending on the density of the batch
    # being transformed, and XGBoost's tree booster treats unstored sparse
    # entries as *missing* while dense zeros are real values. That would let
    # the same row be scored differently depending on the batch it arrives in
    # (e.g. training set vs. a single-row prediction request).
    sparse_threshold=0
)

# Full model: preprocessing followed by a gradient-boosted regressor.
# random_state is fixed so repeated runs build the same trees.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(
        random_state=42,
        n_estimators=100,
        learning_rate=0.1,
        reg_alpha=0.5,   # L1 regularization on leaf weights
        reg_lambda=1.0,  # L2 regularization on leaf weights
        max_depth=5
    ))
])

# Fit on the training split, then predict the held-out split.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Root-mean-squared error on the held-out rows, in the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"✅ RMSE: {rmse:.2f}")

# Persist the fitted pipeline (preprocessor + model) as a single pickle.
# Only create the parent directory when the path actually has one:
# os.path.dirname() returns '' for a bare filename, and os.makedirs('') raises
# FileNotFoundError.
output_dir = os.path.dirname(MODEL_OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# NOTE: pickle files can execute arbitrary code when loaded, so this artifact
# must only ever be unpickled from a trusted location.
with open(MODEL_OUTPUT_PATH, 'wb') as f:
    pickle.dump(pipeline, f)

print("✅ Saved pipeline at:", MODEL_OUTPUT_PATH)


✅ RMSE: 2.98
✅ Saved pipeline at: ../models/pricing_model.pkl
