In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import timedelta
from sklearn.preprocessing import OrdinalEncoder
import random
with open('best_xgb_2.pkl', 'rb') as file:
    xgb = pickle.load(file)

In [3]:
product = pd.read_excel('Dataset/Product Details.xlsx')
store = pd.read_excel('Dataset/Store Details.xlsx')
upc_encoding = pd.read_excel('Dataset/StoreID UPC Encoding.xlsx', sheet_name='Product')
store_encoding = pd.read_excel('Dataset/StoreID UPC Encoding.xlsx', sheet_name='Store')
elasticity = pd.read_excel('Dataset/Price Elasticity.xlsx')
data = pd.read_excel('Dataset/FINAL SUBSET_preprocessed_data.xlsx')   

#### Functions to forecast unit sales

In [14]:
# Functions to forecast next week unit sales
def get_store_upc_data(store_id, upc):
    store_upc_data = data[(data['UPC'] == upc) & (data['STORE_ID'] == store_id)]
    return store_upc_data

# To pre-process user inputs for demand forecasting
def processing_pipeline(input_data):
  
    processed_input = input_data.copy()

    store_id = processed_input['STORE_ID'].iloc[0]
    upc = processed_input['UPC'].iloc[0]
    store_upc_data = get_store_upc_data(store_id, upc)
    store_upc_data = store_upc_data.sort_values(by='WEEK_END_DATE', ascending=False)

    # Get the next week date of last date using  timedelta(days=7) 
    processed_input['WEEK_END_DATE'] = store_upc_data['WEEK_END_DATE'].max() + timedelta(days=7) 

    # Extract YEAR and WEEK_NUM 
    processed_input['YEAR'] = processed_input['WEEK_END_DATE'].dt.year
    processed_input['WEEK_NUM'] = processed_input['WEEK_END_DATE'].dt.isocalendar().week

    # Frequecy encoding of STORE_ID
    processed_input = processed_input.merge(store_encoding, on='STORE_ID', how='left')
    # Frequecy encoding of UPC
    processed_input = processed_input.merge(upc_encoding, on='UPC', how='left')

    # Get SEG_VALUE_NAME based on STORE_ID
    processed_input = processed_input.merge(store[['STORE_ID', 'SEG_VALUE_NAME']], on='STORE_ID', how='left')

    # Ordinal encoding of SEG_VALUE_NAME
    encoder = OrdinalEncoder(categories=[['VALUE', 'MAINSTREAM', 'UPSCALE']]) 
    processed_input['SEG_VALUE_NAME_ORDINAL'] = encoder.fit_transform(processed_input[['SEG_VALUE_NAME']]).astype(int) 

    # Get SALES_AREA_SIZE_NUM based on STORE_ID
    processed_input = processed_input.merge(store[['STORE_ID', 'SALES_AREA_SIZE_NUM']], on='STORE_ID', how='left')

    # Get Category based on UPC 
    processed_input = processed_input.merge(product[['UPC', 'CATEGORY']], on='UPC', how='left')

    # One-hot encoding of CATEGORY
    product_categories = ['BAG SNACKS', 'COLD CEREAL', 'FROZEN PIZZA', 'ORAL HYGIENE PRODUCTS']
    for cat in product_categories:
        processed_input.loc[:,cat] = 0
    category = processed_input['CATEGORY']
    processed_input[category] = 1

    # Get UNIT_SALES_LOG_LAG1 and UNIT_SALES_LOG_LAG2
    last_data = store_upc_data.head(1)     # Get the last week of sales data (first row)
    last2_data = store_upc_data.iloc[[1]]  # Get the second last week of sales data (second row)
    processed_input['UNIT_SALES_LOG_LAG1'] = last_data['UNIT_SALES_LOG'].iloc[0]
    processed_input['UNIT_SALES_LOG_LAG2'] = last2_data['UNIT_SALES_LOG'].iloc[0]

    # Get the needed features for forecasting
    features = ['YEAR', 'WEEK_NUM', 'STORE_ID_COUNT', 'UPC_COUNT', 'UNIT_SALES_LOG_LAG1', 'UNIT_SALES_LOG_LAG2',
                'PRICE', 'FEATURE', 'DISPLAY', 'SEG_VALUE_NAME_ORDINAL', 'SALES_AREA_SIZE_NUM',
                'BAG SNACKS', 'COLD CEREAL', 'FROZEN PIZZA', 'ORAL HYGIENE PRODUCTS']

    return processed_input[features]

# To forecast demand based on user inputs
def predict(processed_input):
    y_pred = xgb.predict(processed_input)
    y_pred = np.exp(y_pred[0])
    pred_unit_sales = np.round(y_pred).astype(int)
    return pred_unit_sales


#### Data Preparation for Reinforcement Learning

In [40]:
data = pd.read_excel('Dataset/FINAL SUBSET_preprocessed_data.xlsx')  
elasticity = pd.read_excel('Dataset/Price Elasticity.xlsx')
 
elasticity_copy = elasticity.copy()
elasticity_copy.drop(columns=['PRICE_ELASTICITY'], inplace=True)
elasticity_copy.rename(columns={'ABS_PRICE_ELASTICITY': 'PRICE_ELASTICITY'}, inplace=True)
elasticity_copy = elasticity_copy.sort_values(by=['STORE_ID', 'UPC'])

# Extract subset of stores for training - 3 stores
df2_subset = elasticity_copy[elasticity_copy['STORE_ID'].isin([2277, 15755, 25253])]  # 2277: UPSCALE, 15755: VALUES, 25253: MAINSTREAM
df2_subset['PRICE'] = np.nan
df2_subset.reset_index(inplace=True)
df2_subset.drop(columns=['index'], inplace=True)

# Generate random price within min and max price range for each store-upc combination
for i in range(len(df2_subset)):
    store_id = df2_subset['STORE_ID'].iloc[i]
    upc = df2_subset['UPC'].iloc[i]
    store_upc_data = get_store_upc_data(store_id, upc)  # Extract historical data for specific store and product
    min = store_upc_data['PRICE'].min()
    max = store_upc_data['PRICE'].max()
    random_price = random.uniform(min, max)    
    df2_subset.at[i,'PRICE'] = round(random_price, 2)
df3 = df2_subset.copy()
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_subset['PRICE'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_subset.drop(columns=['index'], inplace=True)


Unnamed: 0,STORE_ID,UPC,PRICE_ELASTICITY,PRICE
0,2277,1111009477,0.030736,1.26
1,2277,1111009497,0.004954,1.43
2,2277,1111009507,0.017499,1.24
3,2277,1111038078,0.092314,1.32
4,2277,1111038080,0.110819,1.79
...,...,...,...,...
106,25253,31254742735,0.014843,3.15
107,25253,31254742835,0.366440,5.04
108,25253,88491201426,0.667977,3.37
109,25253,88491201427,0.694300,2.80


##### Forecast Unit Sales for Different Feature and Dispaly values

In [41]:
df3_00 = df3.copy()

df3_00['FEATURE'] = 0
df3_00['DISPLAY'] = 0
df3_00['FORECASTED_UNIT_SALES'] = np.nan

for i in range (len(df3_00)):
    store_upc_input = df3_00.iloc[[i]]
    processed_input = processing_pipeline(store_upc_input)
    pred_unit_sales = predict(processed_input)
    df3_00.at[i,'FORECASTED_UNIT_SALES'] = pred_unit_sales
df3_00

Unnamed: 0,STORE_ID,UPC,PRICE_ELASTICITY,PRICE,FEATURE,DISPLAY,FORECASTED_UNIT_SALES
0,2277,1111009477,0.030736,1.26,0,0,173.0
1,2277,1111009497,0.004954,1.43,0,0,139.0
2,2277,1111009507,0.017499,1.24,0,0,41.0
3,2277,1111038078,0.092314,1.32,0,0,11.0
4,2277,1111038080,0.110819,1.79,0,0,9.0
...,...,...,...,...,...,...,...
106,25253,31254742735,0.014843,3.15,0,0,5.0
107,25253,31254742835,0.366440,5.04,0,0,2.0
108,25253,88491201426,0.667977,3.37,0,0,24.0
109,25253,88491201427,0.694300,2.80,0,0,37.0


In [42]:
df3_10 = df3.copy()

df3_10['FEATURE'] = 1
df3_10['DISPLAY'] = 0
df3_10['FORECASTED_UNIT_SALES'] = np.nan

for i in range (len(df3_10)):
    store_upc_input = df3_10.iloc[[i]]
    processed_input = processing_pipeline(store_upc_input)
    pred_unit_sales = predict(processed_input)
    df3_10.at[i,'FORECASTED_UNIT_SALES'] = pred_unit_sales
df3_10

Unnamed: 0,STORE_ID,UPC,PRICE_ELASTICITY,PRICE,FEATURE,DISPLAY,FORECASTED_UNIT_SALES
0,2277,1111009477,0.030736,1.26,1,0,253.0
1,2277,1111009497,0.004954,1.43,1,0,163.0
2,2277,1111009507,0.017499,1.24,1,0,55.0
3,2277,1111038078,0.092314,1.32,1,0,20.0
4,2277,1111038080,0.110819,1.79,1,0,14.0
...,...,...,...,...,...,...,...
106,25253,31254742735,0.014843,3.15,1,0,9.0
107,25253,31254742835,0.366440,5.04,1,0,3.0
108,25253,88491201426,0.667977,3.37,1,0,33.0
109,25253,88491201427,0.694300,2.80,1,0,59.0


In [43]:
df3_01 = df3.copy()

df3_01['FEATURE'] = 0
df3_01['DISPLAY'] = 1
df3_01['FORECASTED_UNIT_SALES'] = np.nan

for i in range (len(df3_01)):
    store_upc_input = df3_01.iloc[[i]]
    processed_input = processing_pipeline(store_upc_input)
    pred_unit_sales = predict(processed_input)
    df3_01.at[i,'FORECASTED_UNIT_SALES'] = pred_unit_sales
df3_01

Unnamed: 0,STORE_ID,UPC,PRICE_ELASTICITY,PRICE,FEATURE,DISPLAY,FORECASTED_UNIT_SALES
0,2277,1111009477,0.030736,1.26,0,1,221.0
1,2277,1111009497,0.004954,1.43,0,1,168.0
2,2277,1111009507,0.017499,1.24,0,1,47.0
3,2277,1111038078,0.092314,1.32,0,1,13.0
4,2277,1111038080,0.110819,1.79,0,1,12.0
...,...,...,...,...,...,...,...
106,25253,31254742735,0.014843,3.15,0,1,6.0
107,25253,31254742835,0.366440,5.04,0,1,2.0
108,25253,88491201426,0.667977,3.37,0,1,19.0
109,25253,88491201427,0.694300,2.80,0,1,49.0


In [44]:
df3_11 = df3.copy()

df3_11['FEATURE'] = 1
df3_11['DISPLAY'] = 1
df3_11['FORECASTED_UNIT_SALES'] = np.nan

for i in range (len(df3_11)):
    store_upc_input = df3_11.iloc[[i]]
    processed_input = processing_pipeline(store_upc_input)
    pred_unit_sales = predict(processed_input)
    df3_11.at[i,'FORECASTED_UNIT_SALES'] = pred_unit_sales
df3_11

Unnamed: 0,STORE_ID,UPC,PRICE_ELASTICITY,PRICE,FEATURE,DISPLAY,FORECASTED_UNIT_SALES
0,2277,1111009477,0.030736,1.26,1,1,314.0
1,2277,1111009497,0.004954,1.43,1,1,193.0
2,2277,1111009507,0.017499,1.24,1,1,63.0
3,2277,1111038078,0.092314,1.32,1,1,23.0
4,2277,1111038080,0.110819,1.79,1,1,20.0
...,...,...,...,...,...,...,...
106,25253,31254742735,0.014843,3.15,1,1,9.0
107,25253,31254742835,0.366440,5.04,1,1,3.0
108,25253,88491201426,0.667977,3.37,1,1,25.0
109,25253,88491201427,0.694300,2.80,1,1,71.0


In [45]:
df3_expanded = pd.DataFrame()

for i in range (len(df3_00)):
    df3_expanded = pd.concat([df3_expanded, df3_00.iloc[i].to_frame().T], ignore_index=True)
    df3_expanded = pd.concat([df3_expanded, df3_10.iloc[i].to_frame().T], ignore_index=True)
    df3_expanded = pd.concat([df3_expanded, df3_01.iloc[i].to_frame().T], ignore_index=True)
    df3_expanded = pd.concat([df3_expanded, df3_11.iloc[i].to_frame().T], ignore_index=True)

cols = ['STORE_ID', 'UPC', 'FEATURE', 'DISPLAY', 'FORECASTED_UNIT_SALES']
df3_expanded[cols] = df3_expanded[cols].astype('int64')
df3_expanded.to_excel('Dataset/Subset Data Random Price Optimization.xlsx', index=False) 

### Reinforcement Learning

In [7]:
subset = pd.read_excel('Dataset/Subset Data Random Price Optimization.xlsx')
subset

Unnamed: 0,STORE_ID,UPC,PRICE_ELASTICITY,PRICE,FEATURE,DISPLAY,FORECASTED_UNIT_SALES
0,2277,1111009477,0.030736,1.26,0,0,173
1,2277,1111009477,0.030736,1.26,1,0,253
2,2277,1111009477,0.030736,1.26,0,1,221
3,2277,1111009477,0.030736,1.26,1,1,314
4,2277,1111009497,0.004954,1.43,0,0,139
...,...,...,...,...,...,...,...
439,25253,88491201427,0.694300,2.80,1,1,71
440,25253,88491212971,0.058936,2.45,0,0,18
441,25253,88491212971,0.058936,2.45,1,0,32
442,25253,88491212971,0.058936,2.45,0,1,30


In [8]:
# Create dictionary to map STORE_ID and UPC to index.
# To simplify representation of STORE_ID and UPC for TD3 learning bcs current values are random number

upc_to_idx = {upc: idx for idx, upc in enumerate(subset['UPC'].unique())}
store_id_to_idx = {store_id: idx for idx, store_id in enumerate(subset['STORE_ID'].unique())}

def get_idx_store_id(store_id):
    store_id_idx = store_id_to_idx.get(store_id, -1)
    return store_id_idx 

def get_idx_upc(upc):
    upc_idx = upc_to_idx.get(upc, -1)
    return upc_idx 

def get_store_id_by_idx(idx):
    for key, value in store_id_to_idx.items():
        if value == idx:
            return key
    return 'STORE ID not found'

def get_upc_by_idx(idx):
    for key, value in upc_to_idx.items():
        if value == idx:
            return key
    return 'UPC not found'

# print(get_idx_store_id(25253))
# print(get_idx_upc(7192100337))
print(upc_to_idx)
print(store_id_to_idx)

{1111009477: 0, 1111009497: 1, 1111009507: 2, 1111038078: 3, 1111038080: 4, 1111085319: 5, 1111085345: 6, 1111085350: 7, 1111087395: 8, 1111087396: 9, 1111087398: 10, 1600027527: 11, 1600027528: 12, 1600027564: 13, 2840004768: 14, 2840004770: 15, 3000006340: 16, 3000006560: 17, 3000006610: 18, 3700031613: 19, 3700044982: 20, 3800031829: 21, 3800031838: 22, 3800039118: 23, 7192100336: 24, 7192100337: 25, 7192100339: 26, 7218063052: 27, 7797502248: 28, 7797508004: 29, 7797508006: 30, 31254742725: 31, 31254742735: 32, 31254742835: 33, 88491201426: 34, 88491201427: 35, 88491212971: 36}
{2277: 0, 15755: 1, 25253: 2}


#### Custom Environment Setup

In [9]:
import gymnasium as gym
from gymnasium.spaces import Discrete, Box, Dict

In [18]:
class OptimalPricingEnv(gym.Env):    # pass gym.Env to OptimalPricingEnv class to inherit properties and methods from Env
    def __init__(self, dataset, n_stores, n_upcs, timesteps_per_episode):
        super(OptimalPricingEnv, self).__init__()
        
        # Define action space (price)
        self.action_space = Box(low=0.00, high=1.00, shape=(1,), dtype=np.float32)   
        # Define observation/state space 
        self.observation_space = Dict({
            'store_id': Discrete(n_stores),                                           
            'upc': Discrete(n_upcs),                                                  
            'feature': Discrete(2),                                                    
            'display': Discrete(2),                                                    
            'elasticity': Box(low=0.00, high=8.00, shape=(1,), dtype=np.float32),      
            'forecasted_unit_sales': Box(low=1, high=300, shape=(1,), dtype=np.int32) 
        })
        # print(self.action_space)
        # print(self.observation_space)

        # Initialize parameters
        self.dataset = dataset    
        self.dataset_idx = 0           # Index to keep track of the current episode (store-upc)
        self.timesteps_per_episode = timesteps_per_episode 
        self.episode_num = 0
        self.episode_info = []

    
    def reset(self, seed=None):
        self.episode_num += 1
        self.step_count = 0      # Counter to keep track number of steps within an episode
        self.terminated = False  # If True, reset() will be called
       
        # Extract current episode state from dataset
        self.current_eps = self.dataset.iloc[self.dataset_idx]

        # Format observation space 
        store_id_idx = get_idx_store_id(int(self.current_eps["STORE_ID"]))
        upc_idx =  get_idx_upc(int(self.current_eps["UPC"]))
        self.state = {
            'store_id': store_id_idx,
            'upc': upc_idx,
            'feature': int(self.current_eps["FEATURE"]),
            'display': int(self.current_eps['DISPLAY']),
            'elasticity': np.array([np.float32(self.current_eps["PRICE_ELASTICITY"])]),        
            'forecasted_unit_sales': np.array([np.int32(self.current_eps["FORECASTED_UNIT_SALES"])])  
        }

        self.dataset_idx = (self.dataset_idx + 1) % len(self.dataset)  # Loop over the dataset

        self.current_price = np.float32(self.current_eps["PRICE"])
        self.current_unit_sales = self.current_eps["FORECASTED_UNIT_SALES"]
        self.current_revenue = self.current_price * self.current_unit_sales
        self.price_min_bound = self.current_price * 0.8
        self.price_max_bound = self.current_price * 1.2
       
        info = {
            'current_price': self.current_price,
            'current_unit_sales': self.current_unit_sales,
            'current_revenue': self.current_revenue,
            'min_price_bound':self.price_min_bound,
            'max_price_bound': self.price_max_bound,
        }
        self.episode_info.append(info)
        return self.state, {}
    

    def step(self, action): 
        self.step_count += 1
        normalized_price = action[0]
        range = self.price_max_bound - self.price_min_bound
        new_price = (normalized_price * range) + self.price_min_bound
        new_price = np.round(new_price, 2)  
        
        # Apply action - Get the forecasted unit sales based on new price
        new_unit_sales = self.predict_unit_sales(new_price)
        # Get reward from environment in response to new price
        self.reward, new_revenue, price_changes_pct = self.get_reward(new_price, new_unit_sales)

        info = {
            'step': self.step_count,
            'state': self.state,
            'new_price': new_price,
            'price_changes':np.round(price_changes_pct, 2),
            'new_unit_sales': new_unit_sales,
            'new_revenue': np.round(new_revenue, 2),
            'reward': self.reward
        }
        self.episode_info.append(info)

        if self.step_count >= self.timesteps_per_episode:
            self.terminated = True   # when True, reset() will be called upon this function returns 
        else:
            self.state, _ = self.get_next_state(new_unit_sales)
 
        return self.state, self.reward, self.terminated, False, {}   # Return observation, reward, terminated, truncated, info
    

    def get_next_state(self, new_unit_sales): 
        self.next_state = {
            'store_id': self.state['store_id'],
            'upc': self.state['upc'],
            'feature': self.state['feature'],  
            'display': self.state['display'], 
            'elasticity': self.state['elasticity'],
            'forecasted_unit_sales': np.array([np.int32(new_unit_sales)])
        }
        return self.next_state, {}
    

    def predict_unit_sales(self, new_price):
        store_id = get_store_id_by_idx(self.state['store_id'])
        upc = get_upc_by_idx(self.state['upc'])

        state_input = []
        state_input.append({
                'STORE_ID': store_id,
                'UPC': upc,
                'PRICE': new_price,
                'FEATURE': self.state['feature'],
                'DISPLAY': self.state['display']})
        state_input = pd.DataFrame(state_input)
        processed_input = processing_pipeline(state_input)
        pred_unit_sales = predict(processed_input)
        return pred_unit_sales
    

    def get_reward(self, new_price, new_unit_sales):
        new_revenue = new_price * new_unit_sales
        relative_revenue_improvement = (new_revenue - self.current_revenue) / self.current_revenue
        price_changes_pct = (new_price - self.current_price) / self.current_price

        # Penalize price actions out of min and max bound
        penalty_value = 0
        if new_price < self.price_min_bound or new_price > self.price_max_bound:
            if price_changes_pct > 0.2 and price_changes_pct < 0.4:
                penalty_value = -0.1
            else: 
                penalty_value = -0.2
        reward = relative_revenue_improvement + penalty_value
        return reward, new_revenue, price_changes_pct

In [11]:
n_stores = subset['STORE_ID'].nunique()
n_upcs = subset['UPC'].nunique()
timesteps_per_episode = 150
n_episode = len(subset)
total_timesteps = timesteps_per_episode * n_episode

print('Num of unique STORE_ID: ', n_stores)
print('Num of unique UPC: ', n_upcs)
print('Timesteps per Episode: ', timesteps_per_episode)
print('Num of Episode (Len of Training Data): ', n_episode)
print('Total Timesteps: ', total_timesteps)

Num of unique STORE_ID:  3
Num of unique UPC:  37
Timesteps per Episode:  150
Num of Episode (Len of Training Data):  444
Total Timesteps:  66600


#### TD3 (Twin Delayed Deep Deterministic Policy Gradient)

In [6]:
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.evaluation import evaluate_policy

##### Check Environment

In [19]:
# Check environment setup to ensure it's compatible with the model API
from stable_baselines3.common.env_checker import check_env
env = OptimalPricingEnv(subset, n_stores, n_upcs, timesteps_per_episode)
check_env(env)  

Box(0.0, 1.0, (1,), float32)
Dict('display': Discrete(2), 'elasticity': Box(0.0, 8.0, (1,), float32), 'feature': Discrete(2), 'forecasted_unit_sales': Box(1, 300, (1,), int32), 'store_id': Discrete(3), 'upc': Discrete(37))




##### Training TD3

In [22]:
# Define directory path to save tensorboard
import os
log_path = os.path.join('RL_Training', 'Logs')
training_log_path = os.path.join(log_path, 'TD3_5')
os.makedirs(training_log_path, exist_ok=True)

In [24]:
# Initialize environment
env = OptimalPricingEnv(subset, n_stores, n_upcs, timesteps_per_episode)

# Define action noise for exploration
action_noise = NormalActionNoise(mean=np.zeros(env.action_space.shape), sigma=0.5)

# Initialize model/agent to learn
model = TD3('MultiInputPolicy', env, action_noise=action_noise, verbose=1, seed=42, tensorboard_log=training_log_path)

# Train model
model.learn(total_timesteps=total_timesteps) 

# Save model
model.save("TD3_5")

Box(0.0, 1.0, (1,), float32)
Dict('display': Discrete(2), 'elasticity': Box(0.0, 8.0, (1,), float32), 'feature': Discrete(2), 'forecasted_unit_sales': Box(1, 300, (1,), int32), 'store_id': Discrete(3), 'upc': Discrete(37))
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to RL_Training\Logs\TD3_6\TD3_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 150      |
|    ep_rew_mean     | 6.74     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 5        |
|    time_elapsed    | 101      |
|    total_timesteps | 600      |
| train/             |          |
|    actor_loss      | 0.453    |
|    critic_loss     | 0.00239  |
|    learning_rate   | 0.001    |
|    n_updates       | 499      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 150      |
|    ep_rew_mean     | 0.974   

In [25]:
# Save episode info into txt file
with open('TD3_5_Episode_Info.txt', 'w') as file:
    for entry in env.episode_info:
        file.write(str(entry) + '\n\n')

In [None]:
# Save the TD3_5 as the best price optimization model
# model.save("TD3_5")

### Policy Evaluation

In [20]:
td3_5 = TD3.load("TD3_5")

In [31]:
env = OptimalPricingEnv(subset, n_stores, n_upcs, timesteps_per_episode)
mean_reward, std_reward = evaluate_policy(td3_5, env, n_eval_episodes=100, deterministic=True)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Box(0.0, 1.0, (1,), float32)
Dict('display': Discrete(2), 'elasticity': Box(0.0, 8.0, (1,), float32), 'feature': Discrete(2), 'forecasted_unit_sales': Box(1, 300, (1,), int32), 'store_id': Discrete(3), 'upc': Discrete(37))




Mean reward: 17.21 +/- 36.67


In [33]:
env = OptimalPricingEnv(subset, n_stores, n_upcs, timesteps_per_episode)
mean_reward, std_reward = evaluate_policy(td3_5, env, n_eval_episodes=200, deterministic=True)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Box(0.0, 1.0, (1,), float32)
Dict('display': Discrete(2), 'elasticity': Box(0.0, 8.0, (1,), float32), 'feature': Discrete(2), 'forecasted_unit_sales': Box(1, 300, (1,), int32), 'store_id': Discrete(3), 'upc': Discrete(37))




Mean reward: 27.80 +/- 45.75


In [17]:
print("Learning rate: ", td3_5.learning_rate)    # Control size of steps the model takes while updating policy. Too high causes unstable training, too low cause slow learning
print("Action noise: ", td3_5.action_noise)      # Determine std deviation of noise added to help exploration
print("Batch size: ", td3_5.batch_size)          # Num of samples used for each training update. Larger batch size provides more stable updates
print("Replay buffer size: ", td3_5.buffer_size)

print("Policy delay: ", td3_5.policy_delay)
print("Gradient steps: ", td3_5.gradient_steps) 
print("Tau: ", td3_5.tau)  
print("Gamma: ", td3_5.gamma)  
print("Train freq: ", td3_5.train_freq)

Learning rate:  0.001
Action noise:  NormalActionNoise(mu=[0.], sigma=0.5)
Batch size:  256
Replay buffer size:  1000000
Policy delay:  2
Gradient steps:  1
Tau:  0.005
Gamma:  0.99
Train freq:  TrainFreq(frequency=1, unit=<TrainFrequencyUnit.STEP: 'step'>)


In [29]:
td3_5.get_parameters()

{'policy': OrderedDict([('actor.mu.0.weight',
               tensor([[ 0.1828,  0.0358,  0.0166,  ...,  0.1055,  1.0080,  0.3162],
                       [ 0.0450,  0.0264,  0.0333,  ...,  0.0450,  0.0312, -0.0376],
                       [ 0.1034, -0.0599, -0.1375,  ..., -0.0565, -0.1224, -0.1466],
                       ...,
                       [-0.1369, -0.1548, -0.1437,  ..., -0.0033,  0.1461, -0.1301],
                       [-0.2088,  0.0423, -0.2331,  ...,  0.6952, -0.7777, -0.8462],
                       [-0.0668, -0.0676, -0.1001,  ..., -0.0206, -0.1287,  0.0826]])),
              ('actor.mu.0.bias',
               tensor([-1.7674e-01, -4.8455e-02, -5.0287e-02, -7.7326e-02, -1.7325e-01,
                       -3.2889e-01,  8.0978e-02, -1.1028e-01,  2.5909e-02, -8.6514e-02,
                       -9.4157e-02, -1.0734e-02, -1.5403e-01, -6.4628e-02,  5.2658e-02,
                       -2.7716e-01, -1.7453e-01, -2.8183e-01, -1.3922e-01, -1.9502e-02,
                       -1.0

In [30]:
# Accessing model parameters
td3_5.policy.state_dict()   # or using td3_5.get_parameters(). Both return same results

OrderedDict([('actor.mu.0.weight',
              tensor([[ 0.1828,  0.0358,  0.0166,  ...,  0.1055,  1.0080,  0.3162],
                      [ 0.0450,  0.0264,  0.0333,  ...,  0.0450,  0.0312, -0.0376],
                      [ 0.1034, -0.0599, -0.1375,  ..., -0.0565, -0.1224, -0.1466],
                      ...,
                      [-0.1369, -0.1548, -0.1437,  ..., -0.0033,  0.1461, -0.1301],
                      [-0.2088,  0.0423, -0.2331,  ...,  0.6952, -0.7777, -0.8462],
                      [-0.0668, -0.0676, -0.1001,  ..., -0.0206, -0.1287,  0.0826]])),
             ('actor.mu.0.bias',
              tensor([-1.7674e-01, -4.8455e-02, -5.0287e-02, -7.7326e-02, -1.7325e-01,
                      -3.2889e-01,  8.0978e-02, -1.1028e-01,  2.5909e-02, -8.6514e-02,
                      -9.4157e-02, -1.0734e-02, -1.5403e-01, -6.4628e-02,  5.2658e-02,
                      -2.7716e-01, -1.7453e-01, -2.8183e-01, -1.3922e-01, -1.9502e-02,
                      -1.0951e-01, -2.3881e-01, -2

In [2]:
# HEURISTIC RULES (Derive new unit sales based on PED formula)
# PED = [ln Predicted_Demand - ln Optimal_Demand] / [ln Input_Price - ln Optimal_Price]
ped = 0.022
ped = -ped
print('PED: ', ped)
pred_unit_sales = 26
input_price = 3.13
optimal_price = 3.4

optimal_unit_sales_log = np.log(pred_unit_sales) - (ped * (np.log(input_price) - np.log(optimal_price)))
optimal_unit_sales = np.exp(optimal_unit_sales_log)

print('Optimal unit sales: ', optimal_unit_sales)
print('Revenue based on current price: ', input_price * pred_unit_sales)
print('Revenue based on optimal price: ', optimal_price * optimal_unit_sales)

PED:  -0.022
Optimal unit sales:  25.952714382567386
Revenue based on current price:  81.38
Revenue based on optimal price:  88.23922890072912
