In [1]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import joblib


In [2]:
# 2. Load Dataset
df = pd.read_csv("retail_store_inventory.csv")
df['Date'] = pd.to_datetime(df['Date'])
df.sort_values(['Store ID', 'Product ID', 'Date'], inplace=True)

In [3]:
# 3. Encode Categorical Columns
cat_cols = ['Store ID', 'Product ID', 'Category', 'Region', 'Weather Condition', 'Seasonality']
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le


In [4]:
# 4. Feature Engineering
df['day_of_week'] = df['Date'].dt.dayofweek
df['month'] = df['Date'].dt.month
df['year'] = df['Date'].dt.year

def create_lag_features(group):
    group['lag_7'] = group['Units Sold'].shift(7)
    group['lag_14'] = group['Units Sold'].shift(14)
    group['rolling_mean_7'] = group['Units Sold'].shift(1).rolling(window=7).mean()
    group['rolling_std_7'] = group['Units Sold'].shift(1).rolling(window=7).std()
    return group

df = df.groupby(['Store ID', 'Product ID']).apply(create_lag_features)
df.dropna(inplace=True)



  df = df.groupby(['Store ID', 'Product ID']).apply(create_lag_features)


In [7]:
# 5. Feature Selection (Dynamic: Keep top 5 features based on importance)
all_features = [
    'Store ID', 'Product ID', 'Category', 'Region', 'Inventory Level',
    'Units Ordered', 'Demand Forecast', 'Price', 'Discount',
    'Weather Condition', 'Holiday/Promotion', 'Competitor Pricing',
    'Seasonality', 'day_of_week', 'month', 'year',
    'lag_7', 'lag_14', 'rolling_mean_7', 'rolling_std_7'
]
X_full = df[all_features]
y = df['Units Sold']

# Pre-split to prevent leakage
X_train_full, X_val_full, y_train_full, y_val_full = train_test_split(X_full, y, test_size=0.2, shuffle=False)

# Fit temporary model for importance
selector_model = xgb.XGBRegressor(random_state=42)
selector_model.fit(X_train_full, y_train_full)

# Get top 5 important features
importances = selector_model.feature_importances_
importance_df = pd.DataFrame({'feature': X_full.columns, 'importance': importances})
importance_df.sort_values(by='importance', ascending=False, inplace=True)

top_5_features = importance_df['feature'].iloc[:5].tolist()
print(f"Selected Top 5 Features: {top_5_features}")

# Final feature set
X = X_full[top_5_features]


Selected Top 5 Features: ['Demand Forecast', 'Inventory Level', 'Competitor Pricing', 'rolling_mean_7', 'rolling_std_7']


In [9]:
# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [11]:
# 7. Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# 8. Model Training
params = {
    'n_estimators': [100, 150],
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.05]
}
xgb_model = xgb.XGBRegressor(random_state=42)
gs = GridSearchCV(xgb_model, params, cv=3, scoring='neg_root_mean_squared_error')
gs.fit(X_train_scaled, y_train)
best_model = gs.best_estimator_

In [17]:
# 9. Evaluation
y_pred = best_model.predict(X_test_scaled)
print("Train RMSE:", mean_squared_error(y_train, best_model.predict(X_train_scaled), squared=False))
print("Test RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("Test MAE:", mean_absolute_error(y_test, y_pred))
print("Test R2:", r2_score(y_test, y_pred))


Train RMSE: 8.302586038357672
Test RMSE: 8.37406106665489
Test MAE: 7.158233215951853
Test R2: 0.9941690564155579




In [19]:
# 10. Save Models
joblib.dump(best_model, "xgb_inventory_model.pkl")
joblib.dump(scaler, "inventory_scaler.pkl")
joblib.dump(le_dict, "le_dict.pkl")
joblib.dump(top_5_features, "selected_features.pkl")
df.to_csv("updated_inventory_dataset.csv", index=False)

In [21]:
# 11. Prediction Function
def predict_inventory(user_input):
    # Load assets
    model = joblib.load("xgb_inventory_model.pkl")
    scaler = joblib.load("inventory_scaler.pkl")
    le_dict = joblib.load("le_dict.pkl")
    selected_features = joblib.load("selected_features.pkl")
    df_hist = pd.read_csv("updated_inventory_dataset.csv")
    df_hist['Date'] = pd.to_datetime(df_hist['Date'])

    # Encode categorical columns
    input_df = pd.DataFrame([user_input])
    for col in cat_cols:
        value=input_df[col].iloc[0]
        encoder=le_dict[col]
        if value not in encoder.classes_:
            encoder.classes_=np.append(encoder.classes_,value)
        input_df[col]=encoder.transform([value])

    # Add time features
    input_df['Date'] = pd.to_datetime(input_df['Date'])
    input_df['day_of_week'] = input_df['Date'].dt.dayofweek
    input_df['month'] = input_df['Date'].dt.month
    input_df['year'] = input_df['Date'].dt.year

    # Merge with history to compute rolling features
    match_df = df_hist[
        (df_hist['Store ID'] == input_df['Store ID'].iloc[0]) &
        (df_hist['Product ID'] == input_df['Product ID'].iloc[0])
    ].copy()
    combined = pd.concat([match_df, input_df], ignore_index=True)
    combined.sort_values('Date', inplace=True)
    combined = create_lag_features(combined)
    input_row = combined.iloc[-1:][selected_features]

     # Scale and Predict
    input_scaled = scaler.transform(input_row)
    predicted_sales = model.predict(input_scaled)[0]

    # Inventory Recommendation
    safety_stock = predicted_sales * 0.15
    reorder_point = predicted_sales * 7 + safety_stock
    recommended_stock = predicted_sales + safety_stock

    # Dynamic Pricing
    demand_gap = predicted_sales - user_input['Inventory Level']
    price_adjustment = 0.1 * (demand_gap / predicted_sales) if predicted_sales != 0 else 0
    competitor_adjustment = (user_input['Competitor Pricing'] - user_input['Price']) * 0.2
    new_price = user_input['Price'] + user_input['Price'] * price_adjustment + competitor_adjustment
    new_price = round(max(new_price, 0.01), 2)

    print("\n--- Inventory & Pricing Recommendation ---")
    print("Predicted Sales:", round(predicted_sales, 2))
    print("Safety Stock:", round(safety_stock, 2))
    print("Recommended Stock Level:", round(recommended_stock, 2))
    print("Reorder Point:", round(reorder_point, 2))
    print("Suggested Price:", round(new_price, 2))

In [23]:
# 12. Sample Real-Time Prediction
sample_input = {
    'Store ID': 'S001',
    'Product ID': 'P0001',
    'Category': 'Groceries',
    'Region': 'North',
    'Inventory Level': 231,
    'Units Ordered': 55,
    'Demand Forecast': 135.47,
    'Price': 33.5,
    'Discount': 20,
    'Weather Condition': 'Rainy',
    'Holiday/Promotion': 0,
    'Competitor Pricing': 26.69,
    'Seasonality': 'Autumn',
    'Date': '2022-01-01'
}
result = predict_inventory(sample_input)
print(result)


--- Inventory & Pricing Recommendation ---
Predicted Sales: 47.59
Safety Stock: 7.14
Recommended Stock Level: 54.72
Reorder Point: 340.24
Suggested Price: 19.23
None
