In [1]:
#Libraries  
import pandas as pd

In [2]:
df_structures = pd.read_csv('product_structures_sales.csv')
df_structures.head()

Unnamed: 0,structure_level_4,structure_level_3,structure_level_2,structure_level_1,sku,time_key,quantity
0,3020206,30202,302,3,3111,20230618,18.684
1,3020608,30206,302,3,3278,20240731,396.1008
2,3020809,30208,302,3,3603,20230807,6.228
3,3020608,30206,302,3,4604,20230131,27.4032
4,3040808,30408,304,3,3041,20230906,6.228


In [None]:
df_structures_dedup = df_structures.drop_duplicates(subset='sku')

In [4]:
df_final = pd.read_csv('df_final.csv')

In [5]:
df = df_final.merge(df_structures_dedup[['sku', 'structure_level_2']], on='sku', how='left')

In [6]:
df.head()

Unnamed: 0,time_key,sku,competitor,pvp_final,structure_level_4,flag_promo,chain_price,structure_level_2
0,2024-10-25,1128,chain,57.1,3020407.0,1.0,57.1,302.0
1,2024-10-25,1128,competitorA,76.2,3020407.0,0.0,57.1,302.0
2,2024-10-25,1128,competitorB,75.7261,,,57.1,302.0
3,2024-10-26,1128,chain,57.1,3020407.0,1.0,57.1,302.0
4,2024-10-26,1128,competitorA,76.2,3020407.0,0.0,57.1,302.0


In [7]:
df.dtypes

time_key              object
sku                    int64
competitor            object
pvp_final            float64
structure_level_4    float64
flag_promo           float64
chain_price          float64
structure_level_2    float64
dtype: object

In [11]:
df.isnull().sum()

time_key                   0
sku                        0
competitor                 0
pvp_final                  0
structure_level_4          6
flag_promo           2237087
chain_price                0
structure_level_2          6
dtype: int64

In [9]:
# Fill Nan struture_level_4
map_structure = (
    df.dropna(subset=['structure_level_4'])
      .drop_duplicates(subset=['sku'])[['sku', 'structure_level_4']]
      .set_index('sku')['structure_level_4']
      .to_dict()
)

df['structure_level_4'] = df['structure_level_4'].fillna(
    df['sku'].map(map_structure)
)

In [10]:
# Fill Nan struture_level_2
map_structure = (
    df.dropna(subset=['structure_level_2'])
      .drop_duplicates(subset=['sku'])[['sku', 'structure_level_2']]
      .set_index('sku')['structure_level_2']
      .to_dict()
)

df['structure_level_2'] = df['structure_level_2'].fillna(
    df['sku'].map(map_structure)
)

In [12]:
print(df.columns)


Index(['time_key', 'sku', 'competitor', 'pvp_final', 'structure_level_4',
       'flag_promo', 'chain_price', 'structure_level_2'],
      dtype='object')


In [13]:
df['time_key'] = pd.to_datetime(df['time_key'])

## Model for competitorA

In [14]:
# Filtering
df_a = df[df['competitor']=='competitorA']

df_a.head()

Unnamed: 0,time_key,sku,competitor,pvp_final,structure_level_4,flag_promo,chain_price,structure_level_2
1,2024-10-25,1128,competitorA,76.2,3020407.0,0.0,57.1,302.0
4,2024-10-26,1128,competitorA,76.2,3020407.0,0.0,57.1,302.0
7,2024-10-27,1128,competitorA,76.2,3020407.0,0.0,57.1,302.0
10,2024-10-28,1128,competitorA,76.2,3020407.0,0.0,57.1,302.0
13,2024-10-27,1130,competitorA,39.368,3010811.0,,34.54,301.0


In [15]:
df_a.isnull().sum()

time_key                  0
sku                       0
competitor                0
pvp_final                 0
structure_level_4         2
flag_promo           652000
chain_price               0
structure_level_2         2
dtype: int64

In [16]:
#In the feature flag_promo, the null values correspond to values that were previously imputed. Therefore, I choose to fill these null values with 0, since I cannot confirm that a promotion exists.

df_a['flag_promo'] = df_a['flag_promo'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_a['flag_promo'] = df_a['flag_promo'].fillna(0)


In [17]:
df_a[df_a['structure_level_4'].isnull()]

Unnamed: 0,time_key,sku,competitor,pvp_final,structure_level_4,flag_promo,chain_price,structure_level_2
6370647,2024-06-10,4607,competitorA,22.1034,,0.0,16.84,
6370650,2024-06-11,4607,competitorA,22.1034,,0.0,16.84,


In [18]:
df_a['structure_level_4'] = df_a['structure_level_4'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_a['structure_level_4'] = df_a['structure_level_4'].fillna(0)


In [19]:
df_a['time_key'] = pd.to_datetime(df_a['time_key'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_a['time_key'] = pd.to_datetime(df_a['time_key'])


In [20]:
# diff_A column
df_a['diff_A'] = df_a.apply(
    lambda row: row['pvp_final'] - row['chain_price'] if row['competitor'] == 'competitorA' else None,
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_a['diff_A'] = df_a.apply(


In [21]:
df_a = df_a.sort_values(by=['sku', 'time_key'])

df_a['lag_diffA'] = df_a.groupby('sku')['diff_A'].shift(1)


In [22]:
df_a = df_a.sort_values(by=['structure_level_4', 'time_key'])

df_a['lag_diffA_sl4'] = df_a.groupby('structure_level_4')['diff_A'].shift(1)

In [23]:
df_a = df_a.sort_values(by=['sku', 'time_key'])

def safe_zscore(x):
    if x.std() == 0 or len(x) == 1:
        return pd.Series([0] * len(x), index=x.index)
    return (x - x.mean()) / x.std()

df_a['diffA_std_sku'] = df_a.groupby('sku')['diff_A'].transform(safe_zscore)



In [24]:
df_a.isnull().sum()

time_key                0
sku                     0
competitor              0
pvp_final               0
structure_level_4       0
flag_promo              0
chain_price             0
structure_level_2       2
diff_A                  0
lag_diffA            3561
lag_diffA_sl4         366
diffA_std_sku           0
dtype: int64

In [25]:
df_a.head()

Unnamed: 0,time_key,sku,competitor,pvp_final,structure_level_4,flag_promo,chain_price,structure_level_2,diff_A,lag_diffA,lag_diffA_sl4,diffA_std_sku
1,2024-10-25,1128,competitorA,76.2,3020407.0,0.0,57.1,302.0,19.1,,0.0,0.0
4,2024-10-26,1128,competitorA,76.2,3020407.0,0.0,57.1,302.0,19.1,19.1,0.0,0.0
7,2024-10-27,1128,competitorA,76.2,3020407.0,0.0,57.1,302.0,19.1,19.1,0.0,0.0
10,2024-10-28,1128,competitorA,76.2,3020407.0,0.0,57.1,302.0,19.1,19.1,0.0,0.0
13,2024-10-27,1130,competitorA,39.368,3010811.0,0.0,34.54,301.0,4.828,,7.3047,0.0


In [26]:
## Model

In [None]:
#Libraries
import pandas as pd
import pickle
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import randint
from sklearn.model_selection import train_test_split, RandomizedSearchCV


In [28]:
df_a['time_key'] = pd.to_datetime(df_a['time_key'])

class TimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, time_column='time_key'):
        self.time_column = time_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.time_column] = pd.to_datetime(X[self.time_column])

        X['month'] = X[self.time_column].dt.month
        X['day'] = X[self.time_column].dt.day
        X['dayofweek'] = X[self.time_column].dt.dayofweek

        X['is_christmas_season'] = X['month'].isin([12]).astype(int)
        X['is_new_year'] = X[self.time_column].apply(
            lambda x: (x.month == 12 and x.day >= 26) or (x.month == 1 and x.day <= 5)
        ).astype(int)
        X['is_summer'] = X['month'].isin([6, 7, 8]).astype(int)
        X['is_back_to_school'] = (X['month'] == 9).astype(int)
        X['is_black_friday'] = X[self.time_column].apply(
            lambda x: x.month == 11 and x.weekday() == 4 and 23 <= x.day <= 29
        ).astype(int)

        return X


In [31]:
# Function to calculate symmetric Mean Absolute Percentage Error (sMAPE)
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

# Feature sets
base_features = ['lag_diffA', 'lag_diffA_sl4', 'diffA_std_sku', 'chain_price']
time_features = ['month', 'day', 'dayofweek', 'is_christmas_season', 'is_new_year',
                 'is_summer', 'is_back_to_school', 'is_black_friday']
all_features = base_features + time_features

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='mean'), all_features)
])

# Clean data
df_model = df_a.dropna(subset=base_features + ['pvp_final']).copy()
X = df_model[['time_key'] + base_features]
y = df_model['pvp_final']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Complete pipeline
pipeline = Pipeline(steps=[
    ('time_feats', TimeFeaturesExtractor(time_column='time_key')),
    ('preproc', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Hyperparameter tuning using RandomizedSearchCV
param_distributions = {
    'model__n_estimators': randint(50, 200),
    'model__max_depth': randint(5, 30),
    'model__min_samples_split': randint(2, 10),
    'model__min_samples_leaf': randint(1, 10)
}

search = RandomizedSearchCV(pipeline, param_distributions=param_distributions,
                            n_iter=20, cv=3, scoring='neg_mean_absolute_error',
                            random_state=42, n_jobs=-1, verbose=1)

# Train with hyperparameter tuning
search.fit(X_train, y_train)

# Best model
best_pipeline = search.best_estimator_

# Predict on test set
y_pred = best_pipeline.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
smape_score = smape(y_test.values, y_pred)

print("=== Evaluation Metrics on Test Set ===")
print(f"MAE   : {mae:.4f}")
print(f"sMAPE : {smape_score:.2f}%")

# Save trained pipeline
with open('price_pipeline_A.pkl', 'wb') as f:
    pickle.dump(best_pipeline, f)

# Save historical features grouped by SKU
hist_data = {sku: df.sort_values('time_key') for sku, df in df_model.groupby('sku')}
with open('historical_features_A.pkl', 'wb') as f:
    pickle.dump(hist_data, f)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
=== Evaluation Metrics on Test Set ===
MAE   : 0.1396
sMAPE : 0.31%


In [None]:
# This script loads a pre-trained pricing prediction pipeline using pickle
# and evaluates the model's performance per structure_level_2 group (product category),
# using MAE and sMAPE metrics on the test set. The model is NOT retrained.


# sMAPE function
def smape_vec(y_true, y_pred):
    denom = (np.abs(y_true) + np.abs(y_pred))
    denom[denom == 0] = 1e-8  # avoid division by zero
    return 100 / len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / denom)

# Load trained model pipeline
with open('price_pipeline_A.pkl', 'rb') as f:
    best_pipeline = pickle.load(f)

# Prepare test data
# Clean data
base_features = ['lag_diffA', 'lag_diffA_sl4', 'diffA_std_sku', 'chain_price']
df_model = df_a.dropna(subset=base_features + ['pvp_final']).copy()
X = df_model[['time_key'] + base_features]
y = df_model['pvp_final']

# Same split as during training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Predict
y_pred = best_pipeline.predict(X_test)

# Build evaluation DataFrame with structure_level_2 info
test_indices = X_test.index
df_eval = df_model.loc[test_indices, ['sku', 'pvp_final']].copy()
df_eval['y_pred'] = y_pred
df_eval.rename(columns={'pvp_final': 'y_true'}, inplace=True)

# Merge with structure_level_2 info
df_eval = df_eval.merge(
    df_a[['sku', 'structure_level_2']].drop_duplicates(subset='sku'),
    on='sku', how='left'
)

# Compute MAE and sMAPE grouped by structure_level_2
agg_metrics = (
    df_eval.groupby('structure_level_2')
           .apply(lambda df: pd.Series({
               'MAE': mean_absolute_error(df['y_true'], df['y_pred']),
               'sMAPE': smape_vec(df['y_true'].values, df['y_pred'].values)
           }))
           .reset_index()
)

print("\n=== Performance by structure_level_2 ===")
agg_metrics



=== Performance by structure_level_2 ===


  .apply(lambda df: pd.Series({


Unnamed: 0,structure_level_2,MAE,sMAPE
0,101.0,0.40361,0.837079
1,102.0,0.188885,0.502936
2,103.0,0.104572,0.265327
3,104.0,0.237328,0.514641
4,105.0,0.154859,0.345744
5,106.0,0.0723,0.201194
6,201.0,0.218063,0.154137
7,202.0,0.115612,0.362712
8,301.0,0.132328,0.383125
9,302.0,0.109856,0.257381


In [29]:
# Prediction
def predict_with_pipeline(sku, target_date_str):
    import pickle
    import pandas as pd

    # Load saved artifacts
    with open('price_pipeline_A.pkl', 'rb') as f:
        pipeline = pickle.load(f)
    with open('historical_features_A.pkl', 'rb') as f:
        hist_data = pickle.load(f)

    target_date = pd.to_datetime(target_date_str)

    if sku not in hist_data:
        raise ValueError(f'SKU {sku} not found.')

    df = hist_data[sku].copy()
    df['time_key'] = pd.to_datetime(df['time_key'])

    # Find a reference row with same day and month from past data
    same_day = df[
        (df['time_key'].dt.day == target_date.day) &
        (df['time_key'].dt.month == target_date.month) &
        (df['time_key'] < target_date)
    ]

    if not same_day.empty:
        ref_row = same_day.sort_values('time_key').iloc[-1]
    else:
        # Fallback to most recent available date before target
        df_filtered = df[df['time_key'] < target_date]
        if df_filtered.empty:
            raise ValueError(f'Not enough historical data for SKU {sku} before {target_date.date()}.')
        ref_row = df_filtered.sort_values('time_key').iloc[-1]

    # Assemble input features
    input_data = {
        'time_key': target_date,
        'lag_diffA': ref_row['lag_diffA'],
        'lag_diffA_sl4': ref_row['lag_diffA_sl4'],
        'diffA_std_sku': ref_row['diffA_std_sku'],
        'chain_price': ref_row['chain_price']
    }

    input_df = pd.DataFrame([input_data])
    prediction = pipeline.predict(input_df)[0]

    return prediction

# Example usage
sku_input = 1130
date_input = "2025-10-27"

try:
    result = predict_with_pipeline(sku_input, date_input)
    print(f"Predicted price for SKU {sku_input} on {date_input}: {result:.2f}")
except Exception as e:
    print(f"Error: {e}")


Predicted price for SKU 1130 on 2025-10-27: 39.23


In [33]:
## Model for competitor B

In [33]:
# Filtering competitor B
df_b = df[df['competitor']=='competitorB']

df_b.head()

Unnamed: 0,time_key,sku,competitor,pvp_final,structure_level_4,flag_promo,chain_price,structure_level_2
2,2024-10-25,1128,competitorB,75.7261,3020407.0,,57.1,302.0
5,2024-10-26,1128,competitorB,75.7261,3020407.0,,57.1,302.0
8,2024-10-27,1128,competitorB,75.7261,3020407.0,,57.1,302.0
11,2024-10-28,1128,competitorB,75.7261,3020407.0,,57.1,302.0
14,2024-10-27,1130,competitorB,39.368,3010811.0,,34.54,301.0


In [34]:
df_b.isnull().sum()

time_key                   0
sku                        0
competitor                 0
pvp_final                  0
structure_level_4          2
flag_promo           1585087
chain_price                0
structure_level_2          2
dtype: int64

In [35]:
df_b['time_key'] = pd.to_datetime(df_b['time_key'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_b['time_key'] = pd.to_datetime(df_b['time_key'])


In [36]:
#In the feature flag_promo, the null values correspond to values that were previously imputed. Therefore, I choose to fill these null values with 0, since I cannot confirm that a promotion exists.

df_b['flag_promo'] = df_b['flag_promo'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_b['flag_promo'] = df_b['flag_promo'].fillna(0)


In [37]:
# diff_B column
df_b['diff_B'] = df_b.apply(
    lambda row: row['pvp_final'] - row['chain_price'] if row['competitor'] == 'competitorB' else None,
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_b['diff_B'] = df_b.apply(


In [38]:
df_b = df_b.sort_values(by=['sku', 'time_key'])

df_b['lag_diffB'] = df_b.groupby('sku')['diff_B'].shift(1)


In [39]:
df_b = df_b.sort_values(by=['structure_level_4', 'time_key'])

df_b['lag_diffB_sl4'] = df_b.groupby('structure_level_4')['diff_B'].shift(1)

In [40]:
df_b = df_b.sort_values(by=['sku', 'time_key'])

def safe_zscore(x):
    if x.std() == 0 or len(x) == 1:
        return pd.Series([0] * len(x), index=x.index)
    return (x - x.mean()) / x.std()

df_b['diffB_std_sku'] = df_b.groupby('sku')['diff_B'].transform(safe_zscore)


In [41]:
df_b.isnull().sum()

time_key                0
sku                     0
competitor              0
pvp_final               0
structure_level_4       2
flag_promo              0
chain_price             0
structure_level_2       2
diff_B                  0
lag_diffB            3561
lag_diffB_sl4         367
diffB_std_sku           0
dtype: int64

In [42]:
df_b[df_b['structure_level_4'].isnull()]

Unnamed: 0,time_key,sku,competitor,pvp_final,structure_level_4,flag_promo,chain_price,structure_level_2,diff_B,lag_diffB,lag_diffB_sl4,diffB_std_sku
6370648,2024-06-10,4607,competitorB,25.6077,,0.0,16.84,,8.7677,,,0.0
6370651,2024-06-11,4607,competitorB,25.6077,,0.0,16.84,,8.7677,8.7677,,0.0


In [43]:
df_b['structure_level_4'] = df_b['structure_level_4'].fillna(0)

In [44]:
df_b['time_key'] = pd.to_datetime(df_b['time_key'])

class TimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, time_column='time_key'):
        self.time_column = time_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.time_column] = pd.to_datetime(X[self.time_column])

        X['month'] = X[self.time_column].dt.month
        X['day'] = X[self.time_column].dt.day
        X['dayofweek'] = X[self.time_column].dt.dayofweek

        X['is_christmas_season'] = X['month'].isin([12]).astype(int)
        X['is_new_year'] = X[self.time_column].apply(
            lambda x: (x.month == 12 and x.day >= 26) or (x.month == 1 and x.day <= 5)
        ).astype(int)
        X['is_summer'] = X['month'].isin([6, 7, 8]).astype(int)
        X['is_back_to_school'] = (X['month'] == 9).astype(int)
        X['is_black_friday'] = X[self.time_column].apply(
            lambda x: x.month == 11 and x.weekday() == 4 and 23 <= x.day <= 29
        ).astype(int)

        return X

In [47]:
#Pipeline 

# Function to calculate symmetric Mean Absolute Percentage Error (sMAPE)
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

# Feature sets
base_features = ['lag_diffB', 'lag_diffB_sl4', 'diffB_std_sku', 'chain_price']
time_features = ['month', 'day', 'dayofweek', 'is_christmas_season', 'is_new_year',
                 'is_summer', 'is_back_to_school', 'is_black_friday']
all_features = base_features + time_features

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='mean'), all_features)
])

# Clean data
df_model = df_b.dropna(subset=base_features + ['pvp_final']).copy()
X = df_model[['time_key'] + base_features]
y = df_model['pvp_final']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Complete pipeline
pipeline = Pipeline(steps=[
    ('time_feats', TimeFeaturesExtractor(time_column='time_key')),
    ('preproc', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Hyperparameter tuning using RandomizedSearchCV
param_distributions = {
    'model__n_estimators': randint(50, 200),
    'model__max_depth': randint(5, 30),
    'model__min_samples_split': randint(2, 10),
    'model__min_samples_leaf': randint(1, 10)
}

search = RandomizedSearchCV(pipeline, param_distributions=param_distributions,
                            n_iter=20, cv=3, scoring='neg_mean_absolute_error',
                            random_state=42, n_jobs=-1, verbose=1)

# Train with hyperparameter tuning
search.fit(X_train, y_train)

# Best model
best_pipeline = search.best_estimator_

# Predict on test set
y_pred = best_pipeline.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
smape_score = smape(y_test.values, y_pred)

print("=== Evaluation Metrics on Test Set ===")
print(f"MAE   : {mae:.4f}")
print(f"sMAPE : {smape_score:.2f}%")

# Save trained pipeline
with open('price_pipeline_B.pkl', 'wb') as f:
    pickle.dump(best_pipeline, f)

# Save historical features grouped by SKU
hist_data = {sku: df.sort_values('time_key') for sku, df in df_model.groupby('sku')}
with open('historical_features_B.pkl', 'wb') as f:
    pickle.dump(hist_data, f)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
=== Evaluation Metrics on Test Set ===
MAE   : 0.1039
sMAPE : 0.24%


In [None]:
# This script loads a pre-trained pricing prediction pipeline using pickle
# and evaluates the model's performance per structure_level_2 group (product category),
# using MAE and sMAPE metrics on the test set. The model is NOT retrained.


# sMAPE function
def smape_vec(y_true, y_pred):
    denom = (np.abs(y_true) + np.abs(y_pred))
    denom[denom == 0] = 1e-8  # avoid division by zero
    return 100 / len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / denom)

# Load trained model pipeline
with open('price_pipeline_B.pkl', 'rb') as f:
    best_pipeline = pickle.load(f)

# Prepare test data
# Clean data
base_features = ['lag_diffB', 'lag_diffB_sl4', 'diffB_std_sku', 'chain_price']
df_model = df_b.dropna(subset=base_features + ['pvp_final']).copy()
X = df_model[['time_key'] + base_features]
y = df_model['pvp_final']

# Same split as during training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Predict
y_pred = best_pipeline.predict(X_test)

# Build evaluation DataFrame with structure_level_2 info
test_indices = X_test.index
df_eval = df_model.loc[test_indices, ['sku', 'pvp_final']].copy()
df_eval['y_pred'] = y_pred
df_eval.rename(columns={'pvp_final': 'y_true'}, inplace=True)

# Merge with structure_level_2 info
df_eval = df_eval.merge(
    df_a[['sku', 'structure_level_2']].drop_duplicates(subset='sku'),
    on='sku', how='left'
)

# Compute MAE and sMAPE grouped by structure_level_2
agg_metrics = (
    df_eval.groupby('structure_level_2')
           .apply(lambda df: pd.Series({
               'MAE': mean_absolute_error(df['y_true'], df['y_pred']),
               'sMAPE': smape_vec(df['y_true'].values, df['y_pred'].values)
           }))
           .reset_index()
)

print("\n=== Performance by structure_level_2 ===")
agg_metrics


=== Performance by structure_level_2 ===


  .apply(lambda df: pd.Series({


Unnamed: 0,structure_level_2,MAE,sMAPE
0,101.0,0.355255,0.804147
1,102.0,0.238346,0.667546
2,103.0,0.068878,0.202307
3,104.0,0.256241,0.6675
4,105.0,0.149516,0.415602
5,106.0,0.05384,0.161512
6,201.0,0.226665,0.174871
7,202.0,0.087208,0.260618
8,301.0,0.078692,0.220556
9,302.0,0.072088,0.167356


In [25]:
# Objective: Load historical features and a trained model pipeline to predict the price for a given SKU on a target date.

# Prediction function
def predict_with_pipeline(sku, target_date_str):
    # Load artifacts
    with open('price_pipeline_B.pkl', 'rb') as f:
        pipeline = pickle.load(f)
    with open('historical_features_B.pkl', 'rb') as f:
        hist_data = pickle.load(f)

    target_date = pd.to_datetime(target_date_str)

    if sku not in hist_data:
        raise ValueError(f'SKU {sku} not found.')

    df = hist_data[sku].copy()
    df['time_key'] = pd.to_datetime(df['time_key'])

    # Look for the same day in previous months/years
    same_day = df[
        (df['time_key'].dt.day == target_date.day) &
        (df['time_key'].dt.month == target_date.month) &
        (df['time_key'] < target_date)
    ]

    if not same_day.empty:
        ref_row = same_day.sort_values('time_key').iloc[-1]
    else:
        df_filtered = df[df['time_key'] < target_date]
        if df_filtered.empty:
            raise ValueError(f'Not enough historical data for SKU {sku} before {target_date.date()}.')
        ref_row = df_filtered.sort_values('time_key').iloc[-1]

    # Build input
    input_data = {
        'time_key': target_date,
        'lag_diffB': ref_row['lag_diffB'],
        'lag_diffB_sl4': ref_row['lag_diffB_sl4'],
        'diffB_std_sku': ref_row['diffB_std_sku'],
        'chain_price': ref_row['chain_price']
    }

    input_df = pd.DataFrame([input_data])
    prediction = pipeline.predict(input_df)[0]

    return prediction

# Example usage
sku_input = 1130
data_input = "2025-10-27"

try:
    result = predict_with_pipeline(sku_input, data_input)
    print(f"Predicted price for SKU {sku_input} on {data_input}: {result:.2f}")
except Exception as e:
    print(f"Error: {e}")


Predicted price for SKU 1130 on 2025-10-27: 39.34


# Retraining model

# Retrain Model for competitorA

In [49]:
df_clean_a= pd.read_csv('df_clean_a.csv')
df_clean_a.head()

Unnamed: 0,sku,time_key,pvp_is_competitorA_actual
0,4555,2024-11-07,34.54
1,2506,2024-11-07,34.54
2,1913,2024-11-13,29.85
3,3554,2024-12-15,43.22
4,4178,2024-12-16,24.13


In [46]:
df_clean_a = df_clean_a.rename(columns={'pvp_is_competitorA_actual': 'pvp_final'})


In [47]:
df_retrain_a = pd.concat([df_a, df_clean_a], ignore_index=True)


In [48]:
df_retrain_a.isnull().sum()

time_key                0
sku                     0
competitor            498
pvp_final               0
structure_level_4     498
flag_promo            498
chain_price           498
diff_A                498
lag_diffA            4059
lag_diffA_sl4         864
diffA_std_sku         498
dtype: int64

In [49]:
# Fill Nan struture_level_4
map_structure = (
    df.dropna(subset=['structure_level_4'])
      .drop_duplicates(subset=['sku'])[['sku', 'structure_level_4']]
      .set_index('sku')['structure_level_4']
      .to_dict()
)

df_retrain_a['structure_level_4'] = df_retrain_a['structure_level_4'].fillna(
    df_retrain_a['sku'].map(map_structure)
)

In [50]:
# Objective: Calculate monthly average values per SKU to impute missing values in specific features.

# List of features with missing values
missing_features = ['lag_diffA', 'lag_diffA_sl4', 'diffA_std_sku', 'chain_price']

# Function to compute monthly averages per SKU
def calculate_monthly_averages(hist_data, features):
    averages = {}
    for feat in features:
        for sku, df_sku in hist_data.items():
            df = df_sku.copy()
            df['month'] = df['time_key'].dt.month
            monthly_avg = df.groupby('month')[feat].mean().reset_index()
            for _, row in monthly_avg.iterrows():
                key = (sku, row['month'])
                if key not in averages:
                    averages[key] = {}
                averages[key][feat] = row[feat]
    return averages

# Function to impute NaNs using the monthly averages
def impute_with_monthly_average(new_df, monthly_averages, features):
    new_df['time_key'] = pd.to_datetime(new_df['time_key'], errors='coerce')
    new_df['month'] = new_df['time_key'].dt.month

    for feat in features:
        if feat not in new_df.columns:
            new_df[feat] = np.nan

        for idx, row in new_df.iterrows():
            sku = row['sku']
            month = row['month']
            key = (sku, month)

            if pd.isna(new_df.at[idx, feat]):
                value = monthly_averages.get(key, {}).get(feat, np.nan)
                new_df.at[idx, feat] = value

    return new_df

# --- Execution ---

# Load historical data from pickle file
with open('historical_features_A.pkl', 'rb') as f:
    hist_data = pickle.load(f)

# Calculate monthly averages per SKU and feature
monthly_averages = calculate_monthly_averages(hist_data, missing_features)

# Example: Impute missing values in df_retrain_a
# df_retrain_a must contain at least 'sku' and 'time_key' columns
imputed_df = impute_with_monthly_average(df_retrain_a, monthly_averages, missing_features)

# Check how many values are still NaN
print(imputed_df[missing_features].isna().sum())



lag_diffA        59
lag_diffA_sl4    28
diffA_std_sku    24
chain_price      24
dtype: int64


In [51]:
# Drop rows with missing values in the specified features
imputed_df = imputed_df.dropna(subset=['lag_diffA', 'chain_price', 'lag_diffA_sl4', 'diffA_std_sku'])



In [52]:
imputed_df['time_key'] = pd.to_datetime(imputed_df['time_key'])

class TimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, time_column='time_key'):
        self.time_column = time_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.time_column] = pd.to_datetime(X[self.time_column])

        X['month'] = X[self.time_column].dt.month
        X['day'] = X[self.time_column].dt.day
        X['dayofweek'] = X[self.time_column].dt.dayofweek

        X['is_christmas_season'] = X['month'].isin([12]).astype(int)
        X['is_new_year'] = X[self.time_column].apply(
            lambda x: (x.month == 12 and x.day >= 26) or (x.month == 1 and x.day <= 5)
        ).astype(int)
        X['is_summer'] = X['month'].isin([6, 7, 8]).astype(int)
        X['is_back_to_school'] = (X['month'] == 9).astype(int)
        X['is_black_friday'] = X[self.time_column].apply(
            lambda x: x.month == 11 and x.weekday() == 4 and 23 <= x.day <= 29
        ).astype(int)

        return X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imputed_df['time_key'] = pd.to_datetime(imputed_df['time_key'])


In [None]:
# Function to calculate symmetric Mean Absolute Percentage Error (sMAPE)
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

# Feature sets
base_features = ['lag_diffA', 'lag_diffA_sl4', 'diffA_std_sku', 'chain_price']
time_features = ['month', 'day', 'dayofweek', 'is_christmas_season', 'is_new_year',
                 'is_summer', 'is_back_to_school', 'is_black_friday']
all_features = base_features + time_features

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='mean'), all_features)
])

# Clean data
df_model = imputed_df.dropna(subset=base_features + ['pvp_final']).copy()
X = df_model[['time_key'] + base_features]
y = df_model['pvp_final']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Complete pipeline
pipeline = Pipeline(steps=[
    ('time_feats', TimeFeaturesExtractor(time_column='time_key')),
    ('preproc', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Hyperparameter tuning using RandomizedSearchCV
param_distributions = {
    'model__n_estimators': randint(50, 200),
    'model__max_depth': randint(5, 30),
    'model__min_samples_split': randint(2, 10),
    'model__min_samples_leaf': randint(1, 10)
}

search = RandomizedSearchCV(pipeline, param_distributions=param_distributions,
                            n_iter=20, cv=3, scoring='neg_mean_absolute_error',
                            random_state=42, n_jobs=-1, verbose=1)

# Train with hyperparameter tuning
search.fit(X_train, y_train)

# Best model
best_pipeline = search.best_estimator_

# Predict on test set
y_pred = best_pipeline.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
smape_score = smape(y_test.values, y_pred)

print("=== Evaluation Metrics on Test Set ===")
print(f"MAE   : {mae:.4f}")
print(f"sMAPE : {smape_score:.2f}%")

# Save trained pipeline
with open('price_pipeline_A_retrain.pkl', 'wb') as f:
    pickle.dump(best_pipeline, f)

# Save historical features grouped by SKU
hist_data = {sku: df.sort_values('time_key') for sku, df in df_model.groupby('sku')}
with open('historical_features_A_retrain.pkl', 'wb') as f:
    pickle.dump(hist_data, f)


Fitting 3 folds for each of 20 candidates, totalling 60 fits




=== Evaluation Metrics on Test Set ===
MAE   : 0.1406
sMAPE : 0.31%


# Retrain model for competitorB

In [53]:
df_clean_b= pd.read_csv('df_clean_b.csv')
df_clean_b.head()

Unnamed: 0,sku,time_key,pvp_is_competitorB_actual
0,4555,2024-11-07,34.54
1,2506,2024-11-07,34.54
2,1913,2024-11-13,29.33
3,3554,2024-12-15,43.22
4,4178,2024-12-16,24.13


In [54]:
df_clean_b = df_clean_b.rename(columns={'pvp_is_competitorB_actual': 'pvp_final'})

In [55]:
df_retrain_b = pd.concat([df_b, df_clean_b], ignore_index=True)

In [56]:
df_retrain_b.isnull().sum()

time_key                0
sku                     0
competitor            498
pvp_final               0
structure_level_4     498
flag_promo            498
chain_price           498
diff_B                498
lag_diffB            4059
lag_diffB_sl4         865
diffB_std_sku         498
dtype: int64

In [57]:
# Fill Nan struture_level_4
map_structure = (
    df.dropna(subset=['structure_level_4'])
      .drop_duplicates(subset=['sku'])[['sku', 'structure_level_4']]
      .set_index('sku')['structure_level_4']
      .to_dict()
)

df_retrain_b['structure_level_4'] = df_retrain_b['structure_level_4'].fillna(
    df_retrain_b['sku'].map(map_structure)
)

In [58]:
# Objective: Impute missing values in specific features using monthly averages per SKU based on historical data (version B).

# List of features with missing values
missing_features = ['lag_diffB', 'lag_diffB_sl4', 'diffB_std_sku', 'chain_price']

# Function to calculate monthly averages per SKU
def calculate_monthly_averages(hist_data, features):
    averages = {}
    for feat in features:
        for sku, df_sku in hist_data.items():
            df = df_sku.copy()
            df['month'] = df['time_key'].dt.month
            monthly_avg = df.groupby('month')[feat].mean().reset_index()
            for _, row in monthly_avg.iterrows():
                key = (sku, row['month'])
                if key not in averages:
                    averages[key] = {}
                averages[key][feat] = row[feat]
    return averages

# Function to impute NaNs using the monthly averages
def impute_with_monthly_average(new_df, monthly_averages, features):
    new_df['time_key'] = pd.to_datetime(new_df['time_key'], errors='coerce')
    new_df['month'] = new_df['time_key'].dt.month

    for feat in features:
        if feat not in new_df.columns:
            new_df[feat] = np.nan

        for idx, row in new_df.iterrows():
            sku = row['sku']
            month = row['month']
            key = (sku, month)

            if pd.isna(new_df.at[idx, feat]):
                value = monthly_averages.get(key, {}).get(feat, np.nan)
                new_df.at[idx, feat] = value

    return new_df

# --- Execution ---

# Load historical data from pickle file
with open('historical_features_B.pkl', 'rb') as f:
    hist_data = pickle.load(f)

# Calculate monthly averages per SKU and feature
monthly_averages = calculate_monthly_averages(hist_data, missing_features)

# Example: Impute missing values in df_retrain_b
# df_retrain_b must contain at least 'sku' and 'time_key' columns
imputed_df_b = impute_with_monthly_average(df_retrain_b, monthly_averages, missing_features)

# Check how many missing values remain
print(imputed_df_b[missing_features].isna().sum())



lag_diffB        60
lag_diffB_sl4    30
diffB_std_sku    24
chain_price      24
dtype: int64


In [None]:
imputed_df_b = imputed_df_b.dropna(subset=['lag_diffB', 'chain_price','lag_diffB_sl4', 'diffB_std_sku'])

In [None]:
imputed_df_b['time_key'] = pd.to_datetime(imputed_df_b['time_key'])

class TimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, time_column='time_key'):
        self.time_column = time_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.time_column] = pd.to_datetime(X[self.time_column])

        X['month'] = X[self.time_column].dt.month
        X['day'] = X[self.time_column].dt.day
        X['dayofweek'] = X[self.time_column].dt.dayofweek

        X['is_christmas_season'] = X['month'].isin([12]).astype(int)
        X['is_new_year'] = X[self.time_column].apply(
            lambda x: (x.month == 12 and x.day >= 26) or (x.month == 1 and x.day <= 5)
        ).astype(int)
        X['is_summer'] = X['month'].isin([6, 7, 8]).astype(int)
        X['is_back_to_school'] = (X['month'] == 9).astype(int)
        X['is_black_friday'] = X[self.time_column].apply(
            lambda x: x.month == 11 and x.weekday() == 4 and 23 <= x.day <= 29
        ).astype(int)

        return X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_novo_imputado_b['time_key'] = pd.to_datetime(df_novo_imputado_b['time_key'])


In [None]:
# Function to calculate symmetric Mean Absolute Percentage Error (sMAPE)
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

# Feature sets
base_features = ['lag_diffB', 'lag_diffB_sl4', 'diffB_std_sku', 'chain_price']
time_features = ['month', 'day', 'dayofweek', 'is_christmas_season', 'is_new_year',
                 'is_summer', 'is_back_to_school', 'is_black_friday']
all_features = base_features + time_features

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='mean'), all_features)
])

# Clean data
df_model = imputed_df_b.dropna(subset=base_features + ['pvp_final']).copy()
X = df_model[['time_key'] + base_features]
y = df_model['pvp_final']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Complete pipeline
pipeline = Pipeline(steps=[
    ('time_feats', TimeFeaturesExtractor(time_column='time_key')),
    ('preproc', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Hyperparameter tuning using RandomizedSearchCV
param_distributions = {
    'model__n_estimators': randint(50, 200),
    'model__max_depth': randint(5, 30),
    'model__min_samples_split': randint(2, 10),
    'model__min_samples_leaf': randint(1, 10)
}

search = RandomizedSearchCV(pipeline, param_distributions=param_distributions,
                            n_iter=20, cv=3, scoring='neg_mean_absolute_error',
                            random_state=42, n_jobs=-1, verbose=1)

# Train with hyperparameter tuning
search.fit(X_train, y_train)

# Best model
best_pipeline = search.best_estimator_

# Predict on test set
y_pred = best_pipeline.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
smape_score = smape(y_test.values, y_pred)

print("=== Evaluation Metrics on Test Set ===")
print(f"MAE   : {mae:.4f}")
print(f"sMAPE : {smape_score:.2f}%")

# Save trained pipeline
with open('price_pipeline_B_retrain.pkl', 'wb') as f:
    pickle.dump(best_pipeline, f)

# Save historical features grouped by SKU
hist_data = {sku: df.sort_values('time_key') for sku, df in df_model.groupby('sku')}
with open('historical_features_B_retrain.pkl', 'wb') as f:
    pickle.dump(hist_data, f)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
=== Evaluation Metrics on Test Set ===
MAE   : 0.1071
sMAPE : 0.25%
