In [1]:
from pydantic import BaseModel
import pickle
import pandas as pd
from datetime import datetime

# Pydantic model for structured input
class InputData(BaseModel):
    customerID: str
    category: str
    product: str
    date: str

# Load category rules
with open("category_rules.pkl", "rb") as f:
    category_rules = pickle.load(f)

# Load customer-country mapping
customer_country_df = pd.read_csv("customer_country_mapping.csv")
customer_country_df['Customer ID'] = customer_country_df['Customer ID'].astype(str)
customer_to_country = dict(zip(customer_country_df['Customer ID'], customer_country_df['Country']))
country_list = customer_country_df['Country'].dropna().unique().tolist()

# Load product prices
product_price_df = pd.read_csv("product_price_map.csv")
product_price_df['Description'] = product_price_df['Description'].str.strip().str.lower()
product_to_price = dict(zip(product_price_df['Description'], product_price_df['Price']))

# Determine season from month
def get_season(month):
    if month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [12, 1, 2]:
        return 'Winter'
    else:
        return 'Autumn'

# Prepare input row
def prepare_input_row(data: InputData, rules) -> pd.DataFrame:
    try:
        month = datetime.strptime(data.date, "%Y-%m-%d").month
    except ValueError:
        raise ValueError(f"Invalid date format: {data.date}. Expected YYYY-MM-DD.")
    
    season = get_season(month)
    basket = {data.category}

    default_country = customer_to_country.get(data.customerID, 'Unspecified')
    if default_country is None:
        raise ValueError(f"Country not found for customer ID: {data.customerID}")

    price = product_to_price.get(data.product.lower())
    if price is None:
        raise ValueError(f"Price not found for product: {data.product}")

    row = {
        'Price': price,
        'Customer ID': data.customerID,
        'Month': month,
        'category': data.category
    }

    for s in ['Spring', 'Summer', 'Winter']:
        row[f'Season_{s}'] = int(season == s)

    for c in country_list:
        row[f'Country_{c}'] = int(c == default_country)

    for idx, rule in enumerate(rules):
        lhs_items = set(rule.lhs)
        row[f'cat_rule_{idx:03d}'] = int(lhs_items.issubset(basket))

    return pd.DataFrame([row]), price

# -----------------------
# ✅ Test the function
# -----------------------
test_data = {
    'customerID': '13085',
    'category': 'Accessories',
    'product': 'PINK SWEETHEART BRACELET',
    'date': '2020-12-01'
}

df, price = prepare_input_row(InputData(**test_data), category_rules)
print(df)
print(f"Product price: {price}")


   Price Customer ID  Month     category  Season_Spring  Season_Summer  \
0   4.25       13085     12  Accessories              0              0   

   Season_Winter  Country_United Kingdom  Country_Iceland  Country_Finland  \
0              1                       1                0                0   

   ...  cat_rule_050  cat_rule_051  cat_rule_052  cat_rule_053  cat_rule_054  \
0  ...             0             0             0             0             0   

   cat_rule_055  cat_rule_056  cat_rule_057  cat_rule_058  cat_rule_059  
0             0             0             0             0             0  

[1 rows x 104 columns]
Product price: 4.25


# Stacking

In [2]:
import pickle
with open("Stacking_model.pkl", "rb") as f:
    stack_model = pickle.load(f)

# prob = xgb_model.predict_proba(df)
prob = stack_model.predict_proba(df)
prob

array([[0.97326145, 0.02673855]])

In [6]:
import pandas as pd

def clean_feature_name(name):
    # Remove prefix before double underscores "__"
    # e.g. "num__Price" -> "Price"
    #       "cat__category_Interior Finishes" -> "category_Interior Finishes"
    #       "remainder__Country_Germany" -> "Country_Germany"
    if "__" in name:
        return name.split("__", 1)[1]
    return name

def get_feature_importance(stack_model, feature_names):
    # Extract base learners and meta-model
    xgb = stack_model.named_estimators_['xgb'].named_steps['xgbclassifier']
    rf = stack_model.named_estimators_['rf'].named_steps['randomforestclassifier']
    meta_model = stack_model.final_estimator_

    # Get feature importances from base learners
    xgb_importance = xgb.feature_importances_
    rf_importance = rf.feature_importances_

    # Normalize base importances
    xgb_norm = xgb_importance / xgb_importance.sum()
    rf_norm = rf_importance / rf_importance.sum()

    # # Weight base importances by logistic regression coefficients
    # # These are the meta-model weights for xgb and rf predictions
    # meta_weights = meta_model.coef_[0]
    # xgb_weight = meta_weights[0]
    # rf_weight = meta_weights[1]
    
    # # Zip with feature names
    # importance_dict = dict(zip(feature_names, final_importance))

    # # Combined weighted feature importance
    # final_importance = xgb_weight * xgb_norm + rf_weight * rf_norm
    
    # Average the normalized importances
    avg_importance = (xgb_norm + rf_norm) / 2
    
    # Clean feature names
    cleaned_feature_names = [clean_feature_name(f) for f in feature_names]
    
    importance_dict = dict(zip(cleaned_feature_names, avg_importance))

    # Sort by importance
    sorted_importance = dict(sorted(importance_dict.items(), key=lambda x: x[1], reverse=True))

    return sorted_importance


In [7]:
import numpy as np
meta_weights = stack_model.final_estimator_.coef_[0]
exp_weights = np.exp(meta_weights)           # exponentiate all values
meta_weights = exp_weights / exp_weights.sum()  # normalize to sum to 1

meta_weights
stack_model.final_estimator_.coef_

array([[ 0.27667269, -0.1578872 ]])

In [8]:
import pickle
with open("Stacking_model.pkl", "rb") as f:
    stack_model = pickle.load(f)

xgb_pipeline = stack_model.named_estimators_['xgb']
# Extract the preprocessor from any pipeline (they share the same)
column_transformer  = xgb_pipeline.named_steps['columntransformer']

# Get transformed feature names (after one-hot encoding etc)
transformed_feature_names = column_transformer .get_feature_names_out()

importance_df = get_feature_importance(stack_model, transformed_feature_names)
print(importance_df)

{'Price': 0.1946110714663411, 'Customer ID': 0.1446904731711019, 'Month': 0.055196270296343115, 'category_Interior Finishes': 0.021499019036328358, 'category_Lighting': 0.020848446179961053, 'category_Heat Therapy Products': 0.016670744016039366, 'category_Door Mats': 0.01609968454531859, 'category_Housewares': 0.015076461126743292, 'category_Home & Living': 0.01320741673760218, 'category_Hospitality Supplies': 0.011637516139132761, 'category_Home Goods': 0.010929901457243771, 'category_Consumer Goods': 0.010741811152617587, 'category_Tableware': 0.009907505985412556, 'Country_Germany': 0.009717745606760786, 'Country_Japan': 0.009236219119805239, 'Season_Winter': 0.00896444644179394, 'category_Consumables': 0.008856233153571686, 'category_Bags': 0.008814298009309874, 'category_Toiletry Bags': 0.008677693362780157, 'category_Hardware': 0.008632737225450243, 'category_others': 0.008526913151048258, 'category_Kitchen & Dining': 0.008177039661734723, 'Country_France': 0.00767832235298997, 

# XGB Only

In [24]:
with open("XGBoost_model.pkl", "rb") as f:
    xgb_model = pickle.load(f)

In [25]:
xgb_model

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [32]:
def get_feature_importance(xgb_model, feature_names):
    # For XGBoost model, feature_importances_ attribute exists
    importance_values = xgb_model.named_steps['xgbclassifier'].feature_importances_

    importance_dict = dict(zip(feature_names, importance_values))

    sorted_importance = dict(sorted(importance_dict.items(), key=lambda x: x[1], reverse=True))
    return sorted_importance

In [None]:
test_data = {
    'customerID': '13085',
    'category': 'Accessories',
    'product': 'PINK SWEETHEART BRACELET',
    'date': '2020-12-01'
}

df, price = prepare_input_row(InputData(**test_data), category_rules)

# prob = xgb_model.predict_proba(df)[0][1]
prob = xgb_model.predict_proba(df)
prob
list(df.columns)

['Price',
 'Customer ID',
 'Month',
 'category',
 'Season_Spring',
 'Season_Summer',
 'Season_Winter',
 'Country_United Kingdom',
 'Country_Iceland',
 'Country_Finland',
 'Country_Italy',
 'Country_Unspecified',
 'Country_Norway',
 'Country_Bahrain',
 'Country_Portugal',
 'Country_Switzerland',
 'Country_Austria',
 'Country_Cyprus',
 'Country_Belgium',
 'Country_Netherlands',
 'Country_Australia',
 'Country_RSA',
 'Country_Denmark',
 'Country_Spain',
 'Country_Germany',
 'Country_France',
 'Country_USA',
 'Country_Thailand',
 'Country_Sweden',
 'Country_Greece',
 'Country_Poland',
 'Country_Israel',
 'Country_United Arab Emirates',
 'Country_Singapore',
 'Country_Brazil',
 'Country_Japan',
 'Country_Korea',
 'Country_EIRE',
 'Country_Channel Islands',
 'Country_Lithuania',
 'Country_Canada',
 'Country_Malta',
 'Country_Nigeria',
 'Country_West Indies',
 'cat_rule_000',
 'cat_rule_001',
 'cat_rule_002',
 'cat_rule_003',
 'cat_rule_004',
 'cat_rule_005',
 'cat_rule_006',
 'cat_rule_007',

In [34]:
importances = get_feature_importance(xgb_model, list(df.columns))
importances

{'cat_rule_025': 0.037504274,
 'Price': 0.031326376,
 'cat_rule_008': 0.029216105,
 'Country_Singapore': 0.029101804,
 'cat_rule_035': 0.026406566,
 'cat_rule_023': 0.02405431,
 'cat_rule_019': 0.019247321,
 'Country_France': 0.016508643,
 'cat_rule_007': 0.015263978,
 'cat_rule_012': 0.01522123,
 'cat_rule_015': 0.014974682,
 'Country_Israel': 0.013336606,
 'Country_Austria': 0.012803646,
 'Country_USA': 0.012501656,
 'cat_rule_028': 0.011613793,
 'Country_EIRE': 0.011609142,
 'Country_Nigeria': 0.011151391,
 'cat_rule_034': 0.0108948825,
 'Country_Canada': 0.010615516,
 'cat_rule_014': 0.010269109,
 'cat_rule_046': 0.01018937,
 'Country_Japan': 0.009665984,
 'Country_Brazil': 0.009575503,
 'Country_Finland': 0.008946634,
 'cat_rule_043': 0.008771695,
 'cat_rule_038': 0.008652881,
 'cat_rule_039': 0.008532917,
 'Month': 0.007865472,
 'Country_Korea': 0.0076335203,
 'cat_rule_016': 0.0074468474,
 'cat_rule_002': 0.0074306815,
 'category': 0.0074133803,
 'cat_rule_005': 0.0073545235,
 '