In [3]:
## InventoryRotationEnv

In [10]:
import glob
import pandas as pd
import numpy as np
import random

In [22]:
# 1. DATA SELECTION
# Load the raw CSV exports and narrow them down to one category group and two
# stock locations. Produces: df_history, df_items, df_locations, df_promotions.

# Folder(s) holding the CSV exports. history_*.csv files are collected from
# every folder; the lookup tables are read from the last folder in the list
# (this is what the original loop-variable reuse did implicitly).
path = ['C:/Users/Win-10/Documents/Python Scripts/ku']

# Stock locations (slid) kept in the analysis.
location_ids = [262, 238]

frames = []
for p in path:
    # sorted() makes the concatenation order independent of the file system.
    for filename in sorted(glob.glob(p + "/history_*.csv")):
        frames.append(pd.read_csv(filename, header=0, sep=',', index_col=None, low_memory=False))

if not frames:
    # pd.concat([]) would fail with a far less helpful message.
    raise FileNotFoundError(f"No history_*.csv files found in {path}")

df_history = pd.concat(frames, ignore_index=True)

lookup_dir = path[-1]
df_items = pd.read_csv(lookup_dir + "/skus.csv", header=0, sep=',', index_col=None, low_memory=False)
df_locations = pd.read_csv(lookup_dir + "/locations.csv", header=0, sep=',', index_col=None, low_memory=False)
df_promotions = pd.read_csv(lookup_dir + "/promotions.csv", header=0, sep=',', index_col=None, low_memory=False)

# Remove temp variables from workspace
del path, frames, filename, p, lookup_dir

# Step 1: Keep only the items whose categoryGroup is in the list below.
# (The variable keeps its historical name; the filter is on categoryGroup.)
# Other groups used before: ['Kūno apsauginis kremas','Lūpų dažai','Parfumuotas vanduo (EDP)','Kreminė pudra','Dieninis kremas','Lūpų blizgesiai','Akių šešėliai mono 1sp.','Tualetinis vanduo (EDT)']
sku_names_to_filter = ['Dieninis kremas']
df_items = df_items[df_items['categoryGroup'].isin(sku_names_to_filter)]

# Step 2: Keep only the history rows of the selected items
df_history = df_history[df_history['skuID'].isin(df_items['skuid'])]

# Step 3: Keep only the selected locations, and only rows with stock or sales
df_locations = df_locations[df_locations['slid'].isin(location_ids)]
df_history = df_history[df_history['slid'].isin(location_ids)]
# .copy() so the column assignment below works on an independent frame
# instead of a slice of the concatenated data (avoids SettingWithCopyWarning).
df_history = df_history[(df_history['atSiteQnt'] > 0) | (df_history['consumption'] > 0)].copy()

# Set in chronological order (stable sort keeps the order of same-day rows)
df_history['updateDate'] = pd.to_datetime(df_history['updateDate'])
df_history = df_history.sort_values(by='updateDate', kind='stable').reset_index(drop=True)

In [31]:
df_history.shape

(71918, 7)

In [32]:
df_history.describe()  # 71_918 -> 10_584

Unnamed: 0,slid,skuID,updateDate,atSiteQnt,consumption,purchasingPrice,salesPrice
count,71918.0,71918.0,71918,71918.0,71918.0,71918.0,71918.0
mean,254.999694,30499.587892,2024-01-03 07:59:24.359408896,123.851289,0.700159,25.338845,53.140266
min,238.0,533.0,2022-09-27 00:00:00,0.0,0.0,0.01,0.0001
25%,238.0,23743.0,2023-09-06 00:00:00,2.0,0.0,7.56,23.95
50%,262.0,33030.0,2024-01-12 00:00:00,2.0,0.0,21.02,43.7603
75%,262.0,38118.0,2024-05-24 00:00:00,10.0,0.0,35.305,74.3802
max,262.0,54142.0,2024-10-02 00:00:00,13656.0,1415.0,284.711,438.0165
std,10.908928,12209.92622,,731.927536,9.800807,21.292089,36.930782


In [26]:
df_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71918 entries, 0 to 71917
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   slid             71918 non-null  int64         
 1   skuID            71918 non-null  int64         
 2   updateDate       71918 non-null  datetime64[ns]
 3   atSiteQnt        71918 non-null  float64       
 4   consumption      71918 non-null  float64       
 5   purchasingPrice  71918 non-null  float64       
 6   salesPrice       71918 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 3.8 MB


In [23]:
# Pull the daily history of one SKU to use as a small worked example.
is_sample_sku = df_history['skuID'].eq(48722)
temp = df_history.loc[is_sample_sku]
temp.head()

Unnamed: 0,slid,skuID,updateDate,atSiteQnt,consumption,purchasingPrice,salesPrice
33881,262,48722,2023-12-29,42.0,0.0,3.96,20.6198
34038,262,48722,2023-12-30,42.0,0.0,3.96,20.6198
34180,262,48722,2023-12-31,42.0,0.0,3.96,20.6198
34307,262,48722,2024-01-01,42.0,0.0,3.96,20.6198
34425,262,48722,2024-01-02,42.0,0.0,3.96,20.6198


In [24]:
def build_weekly_summary(daily, lock_weeks=12):
    """Aggregate daily stock/sales rows into one row per location, SKU and ISO week.

    Parameters
    ----------
    daily : pandas.DataFrame
        Daily rows with the columns slid, skuID, updateDate, atSiteQnt,
        consumption, purchasingPrice and salesPrice.
    lock_weeks : int, default 12
        Length of the lock period counted from the first week a SKU appears
        at a location.

    Returns
    -------
    pandas.DataFrame
        Columns: slid, skuID, year, week, consumption, purchasingPrice,
        salesPrice, lockWeeksRemaining, stock_start_week, stock_end_week,
        unique_items_count. The input frame is not modified.
    """
    group_keys = ['slid', 'skuID', 'year', 'week']

    df = daily.copy()
    df['updateDate'] = pd.to_datetime(df['updateDate'])
    # Rows without a date cannot be assigned to a week.
    df = df.dropna(subset=['updateDate'])

    # Year and week must BOTH come from the ISO calendar. Pairing the calendar
    # year with the ISO week puts e.g. 2023-01-01 (ISO 2022-W52) into
    # "2023, week 52" together with the last days of December 2023.
    iso = df['updateDate'].dt.isocalendar()
    df['year'] = iso['year'].astype('int32')
    df['week'] = iso['week'].astype('int32')
    # Monday of the ISO week; used to count elapsed weeks without assuming
    # that every year has exactly 52 weeks.
    df['week_start'] = df['updateDate'].dt.normalize() - pd.to_timedelta(df['updateDate'].dt.weekday, unit='D')

    # Chronological order inside each group so first/last are start/end of week.
    df = df.sort_values(by=group_keys + ['updateDate'])

    weekly = df.groupby(group_keys).agg(
        consumption=('consumption', 'sum'),            # Total weekly consumption
        purchasingPrice=('purchasingPrice', 'mean'),   # Average purchasing price
        salesPrice=('salesPrice', 'mean'),             # Average sales price
        stock_start_week=('atSiteQnt', 'first'),       # Stock on the first recorded day
        stock_end_week=('atSiteQnt', 'last'),          # Stock on the last recorded day
        week_start=('week_start', 'first'),
    ).reset_index()

    # Lock period: lock_weeks in the first week a SKU appears at a location,
    # decreasing by one per elapsed calendar week, never below zero.
    first_week_start = weekly.groupby(['slid', 'skuID'])['week_start'].transform('min')
    weeks_since_first = (weekly['week_start'] - first_week_start).dt.days // 7
    weekly['lockWeeksRemaining'] = (lock_weeks - weeks_since_first).clip(lower=0)

    # Number of distinct SKUs per location and week that had stock left at the
    # end of the week or recorded any consumption.
    active = weekly[(weekly['stock_end_week'] > 0) | (weekly['consumption'] > 0)]
    unique_items_count = (
        active.groupby(['slid', 'year', 'week'])['skuID']
        .nunique()
        .rename('unique_items_count')
        .reset_index()
    )
    weekly = weekly.merge(unique_items_count, on=['slid', 'year', 'week'], how='left')
    # Weeks without any active item get a count of 0.
    weekly['unique_items_count'] = weekly['unique_items_count'].fillna(0).astype(int)

    return weekly[[
        'slid', 'skuID', 'year', 'week', 'consumption', 'purchasingPrice', 'salesPrice',
        'lockWeeksRemaining', 'stock_start_week', 'stock_end_week', 'unique_items_count',
    ]]


# Replace the daily sample with its weekly summary.
temp = build_weekly_summary(temp)

In [25]:
temp.head()

Unnamed: 0,slid,skuID,year,week,consumption,purchasingPrice,salesPrice,lockWeeksRemaining,stock_start_week,stock_end_week,unique_items_count
0,262,48722,2023,52,0.0,3.96,20.6198,12,42.0,42.0,1
1,262,48722,2024,1,5.0,3.96,20.6198,11,42.0,37.0,1
2,262,48722,2024,2,7.0,3.96,20.6198,10,32.0,31.0,1
3,262,48722,2024,3,6.0,3.96,20.6198,9,30.0,25.0,1
4,262,48722,2024,4,7.0,3.96,20.6198,8,15.0,8.0,1


In [36]:
sorted(temp[['year', 'week']].drop_duplicates().values.tolist())

[[2023, 52],
 [2024, 1],
 [2024, 2],
 [2024, 3],
 [2024, 4],
 [2024, 5],
 [2024, 6],
 [2024, 18],
 [2024, 19],
 [2024, 20],
 [2024, 21],
 [2024, 22],
 [2024, 23],
 [2024, 24],
 [2024, 25],
 [2024, 26],
 [2024, 27],
 [2024, 28],
 [2024, 29],
 [2024, 30],
 [2024, 31],
 [2024, 32],
 [2024, 37],
 [2024, 38],
 [2024, 39],
 [2024, 40]]

In [None]:
class InventoryRotationEnv(gym.Env):
    """Environment that replays weekly sales history for SKU/location assortment decisions.

    An action holds one value in [0, 1] per SKU-location pair; the observation
    is a dict of (num_skus, num_locations) feature matrices.
    """

    def __init__(self, df_items, df_history, df_locations):
        """Convert the input frames to numpy arrays and define the spaces.

        Parameters
        ----------
        df_items : pandas.DataFrame
            Needs the columns 'skuid' and 'isGenerateFlow'.
        df_history : pandas.DataFrame
            Weekly history; needs the columns listed in the selection below.
            The frame passed in is not modified.
        df_locations : pandas.DataFrame
            Needs the column 'slid'.
        """
        super(InventoryRotationEnv, self).__init__()

        # Items and locations are sorted by id so that row i of each array is
        # exactly the id that sku_id_map / location_id_map assign to index i.
        # The other methods translate index -> id through these arrays and
        # id -> index through the maps; both directions have to agree.
        self.df_items = (
            df_items[['skuid', 'isGenerateFlow']].fillna(0).sort_values('skuid').to_numpy()
        )
        self.df_locations = df_locations[['slid']].sort_values('slid').to_numpy()

        # Column order matters: the other methods address this array by position
        # (0 skuID, 1 slid, 2 year, 3 week, 4 consumption, 5 purchasingPrice,
        #  6 salesPrice, 7 stock_start_week, 8 stock_end_week,
        #  9 lockWeeksRemaining, 10 unique_items_count).
        # year/week are cast on the selected copy, not on the caller's frame.
        history = df_history[['skuID', 'slid', 'year', 'week', 'consumption',
                              'purchasingPrice', 'salesPrice', 'stock_start_week', 'stock_end_week',
                              'lockWeeksRemaining', 'unique_items_count']]
        history = history.astype({'week': 'int32', 'year': 'int32'})
        self.df_history = history.to_numpy(dtype=np.float32)

        # Determine number of SKUs and locations
        self.num_skus = self.df_items.shape[0]
        self.num_locations = self.df_locations.shape[0]

        # Define the action space: continuous values between 0 and 1 for each SKU-location pair
        self.action_space = spaces.Box(low=0, high=1, shape=(self.num_skus * self.num_locations,), dtype=np.float32)

        # Define the observation space with various SKU-level features per location
        pair_shape = (self.num_skus, self.num_locations)
        self.observation_space = spaces.Dict({
            'inventory': spaces.Box(low=0, high=1, shape=pair_shape, dtype=np.float32),
            'salesPrice': spaces.Box(low=0, high=np.inf, shape=pair_shape, dtype=np.float32),
            'purchasingPrice': spaces.Box(low=0, high=np.inf, shape=pair_shape, dtype=np.float32),
            'delta_margin': spaces.Box(low=-np.inf, high=np.inf, shape=pair_shape, dtype=np.float32),
            'delta_stock': spaces.Box(low=-np.inf, high=np.inf, shape=pair_shape, dtype=np.float32),
            'delta_profit': spaces.Box(low=-np.inf, high=np.inf, shape=pair_shape, dtype=np.float32),
            'lockWeeksRemaining': spaces.Box(low=0, high=np.inf, shape=pair_shape, dtype=np.float32),
            'week': spaces.Box(low=1, high=53, shape=pair_shape, dtype=np.int16),
        })

        # Create mappings for SKUs and locations (id -> row/column index).
        # The arrays are already sorted, so enumerate() yields the sorted order.
        self.sku_id_map = {sku: idx for idx, sku in enumerate(self.df_items[:, 0])}
        self.location_id_map = {loc: idx for idx, loc in enumerate(self.df_locations[:, 0])}

        # Initialize state and time tracking variables
        self.state = None
        self.current_step = 0
        self.current_step_year = None
        self.current_step_week = None

        # Unique (year, week) rows in ascending order; one row per time step
        self.unique_year_weeks = np.unique(self.df_history[:, [2, 3]], axis=0)  # Columns for 'year' and 'week'
        self.max_steps = len(self.unique_year_weeks)

In [None]:
def reset(self, *, seed=None, options=None):
        """Reset the environment to a random week of the historical data.

        Parameters
        ----------
        seed : int or None
            Forwarded to the base class; the starting week is drawn from the
            generator seeded there, so equal seeds give equal starting weeks.
        options : dict or None
            Accepted for API compatibility; not used.

        Returns
        -------
        tuple
            (observation dict, empty info dict).
        """
        super().reset(seed=seed)

        # Step 1: Select a random step in history.
        # self.np_random is used instead of the global np.random so that the
        # seed argument has an effect (assumes the base class follows the
        # Gymnasium API, where reset(seed=...) seeds self.np_random).
        self.current_step = int(self.np_random.integers(0, self.max_steps))
        self.current_step_year, self.current_step_week = self.unique_year_weeks[self.current_step]

        # Step 2: Initialize the observation dictionary, each entry with the
        # dtype declared by its space ('week' is int16, the rest float32).
        obs = {key: np.zeros((self.num_skus, self.num_locations), dtype=self.observation_space[key].dtype)
               for key in self.observation_space.keys()}
        # The week applies to every pair (same as _update_observation) and
        # keeps the values inside the declared [1, 53] range.
        obs['week'][:] = self.current_step_week

        # Step 3: Filter current week's data from the numpy array
        current_week_mask = (self.df_history[:, 2] == self.current_step_year) & (self.df_history[:, 3] == self.current_step_week)
        current_week_data = self.df_history[current_week_mask]

        # Step 4: Populate the inventory based on stock_start_week > 0
        for row in current_week_data:
            sku_idx = self.sku_id_map.get(int(row[0]), -1)
            loc_idx = self.location_id_map.get(int(row[1]), -1)
            if sku_idx < 0 or loc_idx < 0:
                continue  # row belongs to a SKU or location outside the environment

            if row[7] > 0:  # stock_start_week
                obs['inventory'][sku_idx, loc_idx] = 1  # Mark as in stock

            # Store prices and lock periods in observation
            obs['salesPrice'][sku_idx, loc_idx] = row[6]
            obs['purchasingPrice'][sku_idx, loc_idx] = row[5]
            obs['lockWeeksRemaining'][sku_idx, loc_idx] = row[9]

        # Step 5: Rolling window (current week plus up to 4 preceding steps)
        rolling_window_steps = 4
        start_step = max(0, self.current_step - rolling_window_steps)
        rolling_year_weeks = self.unique_year_weeks[start_step:self.current_step + 1]  # Include current week

        if rolling_year_weeks.size > 0:
            # Match (year, week) as a pair. Testing the two columns separately
            # against the flattened window would also accept combinations such
            # as (first year, last week) that are not part of the window.
            history_keys = self.df_history[:, 2].astype(np.int64) * 100 + self.df_history[:, 3].astype(np.int64)
            window_keys = rolling_year_weeks[:, 0].astype(np.int64) * 100 + rolling_year_weeks[:, 1].astype(np.int64)
            rolling_data = self.df_history[np.isin(history_keys, window_keys)]

            # Step 6: Accumulate delta features for each SKU-location pair
            for row in rolling_data:
                sku_idx = self.sku_id_map.get(int(row[0]), -1)
                loc_idx = self.location_id_map.get(int(row[1]), -1)
                if sku_idx < 0 or loc_idx < 0:
                    continue

                consumption = row[4]
                unit_margin = row[6] - row[5]  # salesPrice - purchasingPrice
                obs['delta_stock'][sku_idx, loc_idx] += consumption
                obs['delta_profit'][sku_idx, loc_idx] += consumption * unit_margin
                obs['delta_margin'][sku_idx, loc_idx] += unit_margin

        # Step 7: Finalize state
        self.state = obs

        return self.state, {}

In [None]:
def step(self, action):
        """Advance one week using the given action.

        Parameters
        ----------
        action : array-like
            One score per SKU-location pair, num_skus * num_locations values.
            Per location, the highest-scoring SKUs are kept; the number kept
            equals the number of SKUs currently in that location's inventory.

        Returns
        -------
        tuple
            (observation, reward, done, truncated, info). When the history is
            exhausted the previous observation is returned with reward 0.
        """
        # Step 0: Move to the next time step first
        self.current_step += 1
        done = self.current_step >= self.max_steps
        truncated = False
        if done:
            return self.state, 0, done, truncated, {}

        # Step 1: Convert action into a (num_skus, num_locations) score matrix
        action_probs = np.asarray(action).reshape((self.num_skus, self.num_locations))

        # Step 2: Top-N selection with a per-location N
        current_inventory_per_location = np.sum(self.state['inventory'] == 1, axis=0)
        action_matrix = np.zeros_like(action_probs, dtype=np.int8)

        for loc_idx in range(self.num_locations):
            subset_size = int(current_inventory_per_location[loc_idx])
            if subset_size == 0:
                # A slice [-0:] is the whole array, so without this guard an
                # empty location would get every SKU selected.
                continue

            # Select top SKUs based on the scores for this location
            top_indices = np.argsort(action_probs[:, loc_idx])[-subset_size:]
            action_matrix[top_indices, loc_idx] = 1

        # Step 3: Apply inventory constraints (locks, permanent items)
        # Newly added items (previously 0, now suggested as 1)
        newly_added_mask = (action_matrix == 1) & (self.state['inventory'] == 0)

        # Items suggested for removal (previously 1, now suggested as 0)
        suggested_removal_mask = (action_matrix == 0) & (self.state['inventory'] == 1)

        # Lock newly added items for 12 weeks
        if np.any(newly_added_mask):
            self.state['lockWeeksRemaining'][newly_added_mask] = 12

        # Locked items stay in the assortment
        action_matrix[self.state['lockWeeksRemaining'] > 0] = 1

        # Important items (isGenerateFlow == 1, second column of df_items) stay
        # in the assortment at every location
        important_item_mask = np.asarray(self.df_items[:, 1] == 1, dtype=bool)
        action_matrix[important_item_mask, :] = 1

        # Removal is allowed only once the lock has expired, and never for
        # important items. Without the last condition this step would switch
        # the important items forced on above back off.
        removable_items_mask = (
            suggested_removal_mask
            & (self.state['lockWeeksRemaining'] == 0)
            & ~important_item_mask[:, None]
        )
        action_matrix[removable_items_mask] = 0

        # Step 4: Compute reward for the constrained action.
        # NOTE(review): at this point current_step_year/week and the state still
        # describe the previous step — confirm this is the intended timing.
        reward = self._calculate_profit(action_matrix)

        # Step 5: Decrease lock periods (moving towards being removable)
        self.state['lockWeeksRemaining'] = np.maximum(self.state['lockWeeksRemaining'] - 1, 0)

        # Step 6: Update year and week for the current step
        self.current_step_year, self.current_step_week = self.unique_year_weeks[self.current_step]

        # Step 7: Update observation directly based on the new action_matrix
        self._update_observation(action_matrix)

        return self.state, reward, done, truncated, {}

In [None]:
def _update_observation(self, action_matrix):
        """Update the observation based on the current action and historical data."""
    
        # Step 1: Update the current stock state based on the action
        self.state['inventory'] = action_matrix
    
        # Step 2: Set the current week
        self.state['week'][:] = self.current_step_week
    
        # Step 3: Filter historical data for the current week using numpy
        current_week_data = self.df_history[
            (self.df_history[:, 2] == self.current_step_year) &  # year
            (self.df_history[:, 3] == self.current_step_week)    # week
        ]
    
        # Step 4: Iterate through SKUs and locations to update observation features
        for sku_idx in range(self.num_skus):
            sku_id = self.df_items[sku_idx, 0]  # SKU ID from df_items
    
            for loc_idx in range(self.num_locations):
                loc_id = self.df_locations[loc_idx, 0]  # Location ID from df_locations
    
                # Check if this SKU-location is active (inventory action keeps it)
                if action_matrix[sku_idx, loc_idx] == 1:
                    # Filter data for the specific SKU-location pair in the current week
                    sku_loc_data = current_week_data[
                        (current_week_data[:, 0] == sku_id) &  # skuID
                        (current_week_data[:, 1] == loc_id)    # slid
                    ]
    
                    if sku_loc_data.size > 0:
                        # Update sales and purchasing prices (average in case of multiple entries)
                        self.state['salesPrice'][sku_idx, loc_idx] = sku_loc_data[:, 6].mean()  # salesPrice
                        self.state['purchasingPrice'][sku_idx, loc_idx] = sku_loc_data[:, 5].mean()  # purchasingPrice
                
                        # Accumulate delta features over time
                        # Delta stock: total consumption in the current week
                        self.state['delta_stock'][sku_idx, loc_idx] += sku_loc_data[:, 4].sum()  # consumption
                    
                        # Delta profit: (consumption * (salesPrice - purchasingPrice))
                        profit_per_unit = (sku_loc_data[:, 6] - sku_loc_data[:, 5])  # salesPrice - purchasingPrice
                        total_profit = (sku_loc_data[:, 4] * profit_per_unit).sum()
                        self.state['delta_profit'][sku_idx, loc_idx] += total_profit
                    
                        # Delta margin: average difference between salesPrice and purchasingPrice
                        self.state['delta_margin'][sku_idx, loc_idx] += profit_per_unit.mean()

In [None]:
def _calculate_profit(self, action_matrix):
        """Compute the reward (profit) based on the state after applying the action."""
        total_profit = 0
        unsold_penalty = 0
        sold_elsewhere_bonus = 0
        high_sales_bonus = 0  # Bonus for selling high quantities
    
        # Step 1: Identify active SKU-location pairs from action
        active_indices = np.argwhere(action_matrix == 1)  # Where action suggests to keep items
        
        for sku_idx, loc_idx in active_indices:
            # Access state variables
            consumption = self.state['delta_stock'][sku_idx, loc_idx]  # Total consumption
            sales_price = self.state['salesPrice'][sku_idx, loc_idx]
            purchase_price = self.state['purchasingPrice'][sku_idx, loc_idx]
            stock_end_week = self._get_stock_end_week(sku_idx, loc_idx)  # Retrieve from history or state
    
            # Step 2: Calculate profit for the sold items
            profit = consumption * (sales_price - purchase_price)
            total_profit += profit
    
            # Step 3: Apply unsold penalty
            if stock_end_week > 0:
                # Potential lost profit if we didn't sell remaining stock
                potential_unsold_profit = stock_end_week * (sales_price - purchase_price)
                penalty = potential_unsold_profit * 0.1  # Apply 10% penalty to reflect partial loss
                unsold_penalty += penalty
    
            # Step 4: Reward if items sold elsewhere
            sold_elsewhere_qty = self._get_sold_elsewhere_quantity(sku_idx, loc_idx)
            sold_elsewhere_bonus += sold_elsewhere_qty * 0.1  # Reward 10% for sales elsewhere
            
            # **Step 5: Reward for high sales volume (regardless of profit)**
            high_sales_bonus += consumption * 0.05  # Reward based on quantity sold
            
        # Dynamic weights
        profit_weight = 1.0
        unsold_penalty_weight = 1.0
        sales_volume_weight = 1.0
        diversification_bonus_weight = 1.0
    
        # Final reward calculation
        reward = (
            total_profit * profit_weight                            # Profit from sold items
            - unsold_penalty * unsold_penalty_weight                # Penalty for unsold inventory
            + sold_elsewhere_bonus * diversification_bonus_weight   # Bonus for selling in other locations
            + high_sales_bonus * sales_volume_weight                # Bonus for high sales volume
        )
    
        return reward

    def _get_stock_end_week(self, sku_idx, loc_idx):
        """Retrieve the stock at the end of the week from historical data."""
        sku_id = self.df_items[sku_idx, 0]
        loc_id = self.df_locations[loc_idx, 0]
    
        # Filter historical data for current step
        current_week_data = self.df_history[
            (self.df_history[:, 0] == sku_id) &  # skuID
            (self.df_history[:, 1] == loc_id) &  # slid
            (self.df_history[:, 2] == self.current_step_year) &
            (self.df_history[:, 3] == self.current_step_week)
        ]
    
        if current_week_data.size > 0:
            return current_week_data[0, 8]  # stock_end_week
        return 0
    
    def _get_sold_elsewhere_quantity(self, sku_idx, loc_idx):
        """Retrieve the quantity sold in other locations for the same SKU."""
        sku_id = self.df_items[sku_idx, 0]
        loc_id = self.df_locations[loc_idx, 0]
    
        # Filter for sales in other locations in the current week
        sold_elsewhere = self.df_history[
            (self.df_history[:, 0] == sku_id) &  # skuID
            (self.df_history[:, 1] != loc_id) &  # Sold in other locations
            (self.df_history[:, 2] == self.current_step_year) &
            (self.df_history[:, 3] == self.current_step_week)
        ]
    
        if sold_elsewhere.size > 0:
            return sold_elsewhere[:, 4].sum()  # Total consumption elsewhere
        return 0

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement
from stable_baselines3.common.monitor import Monitor

env = InventoryRotationEnv(df_items, df_history, df_locations)
print(env.observation_space)

# Evaluation runs on its own environment instance. Evaluating on the training
# environment would reset it in the middle of a rollout and mix evaluation
# episodes into the data the policy is trained on.
eval_env = Monitor(InventoryRotationEnv(df_items, df_history, df_locations))

# Stop training when the evaluation reward has not improved for 5 consecutive
# evaluations; the rule is not applied before 3 evaluations have been run.
early_stop_callback = StopTrainingOnNoModelImprovement(
    max_no_improvement_evals=5,
    min_evals=3,
    verbose=1
)

# Evaluate periodically and keep the best model seen so far
eval_callback = EvalCallback(
    eval_env,
    callback_after_eval=early_stop_callback,
    eval_freq=5000,                       # Evaluate the model every 5000 timesteps
    best_model_save_path='./best_model',  # Path to save the best model
    verbose=1
)

# TensorBoard: tensorboard --logdir=./ppo_tensorboard/  OR  cd toDir  THEN  tensorboard --logdir=logs
# Hyperparameters are those of Optuna trial 3 in the search log below.
# NOTE(review): the search used 4 parallel environments (rollout buffer of
# 4 * 512 steps); with this single environment the buffer holds 512 steps,
# which is smaller than batch_size — confirm this is intended.
model = PPO('MultiInputPolicy', env,
    batch_size=1024,                          # Minibatch size for each gradient update
    n_steps=512,                              # Steps collected per environment before each update
    learning_rate=2.0030562576078156e-05,
    gamma=0.9606577859392695,                 # Discount factor
    gae_lambda=0.95,                          # Generalized Advantage Estimation lambda
    ent_coef=0.05568596693799925,             # Entropy bonus coefficient
    verbose=1,
    normalize_advantage=True,
    tensorboard_log="./logs/"
)

# total_timesteps is the total number of environment steps to train on
model.learn(total_timesteps=50_000, reset_num_timesteps=False, tb_log_name='PPO', callback=eval_callback)

# Save/Load the trained model
#model.save("inventory_rotation_dien_kremas_128_dqn")
#model = DQN.load("inventory_rotation_dqn", tensorboard_log="./logs/")

In [None]:
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 54.9          |	The average episode length in timesteps (e.g., how many steps before an episode ends).
|    ep_rew_mean          | 6.97e+05      |	The average reward per episode (higher is better). Your rewards are large—check if they need normalization!
| time/                   |               |
|    fps                  | 1             |	Frames per second (very slow—may need optimization).
|    iterations           | 2             |	Number of policy updates so far.
|    time_elapsed         | 546           |	Time spent training (in seconds).
|    total_timesteps      | 2048          |	Total environment steps taken so far (taken actions) n_steps*iterations.
| train/                  |               |
|    approx_kl            | 0.00010914914 |	Measures how much the new policy deviates from the old one (should be small).
|    clip_fraction        | 0             |	Fraction of updates that were clipped (should be > 0, usually 0.1–0.3).
|    clip_range           | 0.2           |	The clipping range for PPO (default: 0.2).
|    entropy_loss         | -1.77e+03     |	Exploration vs. exploitation—higher entropy means more exploration.
|    explained_variance   | 5.96e-08      |	How well the value function explains variance (should be closer to 1).
|    learning_rate        | 0.0003        |	The learning rate used in training.
|    loss                 | 1.47e+10      |	Total training loss (policy, value and entropy terms combined); should decrease over time.
|    n_updates            | 30            |	The number of gradient updates applied to the neural network so far.
|    policy_gradient_loss | -0.00617      |	Measures how much policy gradients are changing (should be small and negative).
|    value_loss           | 3.2e+10       |	Error between the value function's predictions and the observed returns.
-------------------------------------------
Episode: 	A full sequence of steps from the start to termination (e.g., one complete inventory cycle).
Timestep: 	A single action taken by the agent in the environment.
n_steps: 	The number of timesteps collected from each environment before an update (the rollout length; PPO's minibatch size is the separate batch_size setting).
Iteration: 	One cycle of collecting n_step timesteps and updating the policy.

In [None]:
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 52.6     |
|    ep_rew_mean     | 1.17e+07 |
| time/              |          |
|    fps             | 7        |
|    iterations      | 1        |
|    time_elapsed    | 64       |
|    total_timesteps | 45512    |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 52.4          |
|    ep_rew_mean          | 1.17e+07      |
| time/                   |               |
|    fps                  | 7             |
|    iterations           | 2             |
|    time_elapsed         | 128           |
|    total_timesteps      | 46024         |
| train/                  |               |
|    approx_kl            | 1.1641532e-08 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -3.62e+03     |
|    explained_variance   | 5.96e-08      |
|    learning_rate        | 2e-05         |
|    loss                 | 4.45e+12      |
|    n_updates            | 880           |
|    policy_gradient_loss | -1.89e-05     |
|    std                  | 1             |
|    value_loss           | 8.9e+12       |
-------------------------------------------

In [None]:
import optuna
from stable_baselines3.common.evaluation import evaluate_policy

# Define the hyperparameter optimization function
def optimize_ppo(trial):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the sampled learning_rate, batch_size, gamma, ent_coef and
        n_steps values.

    Returns
    -------
    float
        Mean episode reward over 5 deterministic evaluation episodes
        (the study maximizes this value).
    """
    # Imported here so the cell does not depend on names left over from
    # earlier kernel state.
    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import SubprocVecEnv

    # Hyperparameter search space. suggest_float replaces the deprecated
    # suggest_loguniform / suggest_uniform and samples the same ranges.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [512, 1024, 2048])
    gamma = trial.suggest_float('gamma', 0.90, 0.99)
    ent_coef = trial.suggest_float('ent_coef', 0.01, 0.1)
    steps = trial.suggest_categorical('n_steps', [512, 1024, 2048, 4096])

    # Four environments, each running in its own subprocess
    env = SubprocVecEnv([lambda: InventoryRotationEnv(df_items, df_history, df_locations) for _ in range(4)])

    try:
        # Initialize PPO with suggested hyperparameters
        model = PPO(
            "MultiInputPolicy",
            env,
            learning_rate=learning_rate,
            batch_size=batch_size,
            gamma=gamma,
            ent_coef=ent_coef,
            n_steps=steps,
            verbose=0
        )

        # Train the model for a limited number of timesteps
        model.learn(total_timesteps=50_000)

        # Evaluate the model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5, deterministic=True)
    finally:
        # Shut the worker processes down after every trial, including failed
        # ones; otherwise each trial leaves four subprocesses behind.
        env.close()

    # Return the mean reward as the objective for Optuna to maximize
    return mean_reward

# Sampler options (pass as sampler=... to create_study; the default is TPE):
#   Bayesian optimization: optuna.samplers.TPESampler()
#   Grid search:           optuna.samplers.GridSampler(search_space)
#   Random search:         optuna.samplers.RandomSampler()

# Run the search, maximizing the value returned by optimize_ppo
study = optuna.create_study(direction='maximize')
study.optimize(
    optimize_ppo,
    n_trials=20,
)

# Report the best parameter set found
best_hyperparameters = study.best_params
print("Best Hyperparameters:", best_hyperparameters)

In [None]:
[I 2025-02-09 15:28:42,064] Trial 1 finished with value: 10118461.741243605 and parameters: {'learning_rate': 0.00010804851334937822, 'batch_size': 512, 'gamma': 0.9805491857819207, 'ent_coef': 0.06140474468859541, 'n_steps': 512}. Best is trial 0 with value: 13127977.01602819.
[I 2025-02-09 16:00:51,776] Trial 2 finished with value: 2913960.15687677 and parameters: {'learning_rate': 2.7703267840918958e-05, 'batch_size': 2048, 'gamma': 0.955762882575184, 'ent_coef': 0.014447912424832017, 'n_steps': 1024}. Best is trial 0 with value: 13127977.01602819.
[I 2025-02-09 16:32:24,726] Trial 3 finished with value: 18277061.827805273 and parameters: {'learning_rate': 2.0030562576078156e-05, 'batch_size': 1024, 'gamma': 0.9606577859392695, 'ent_coef': 0.05568596693799925, 'n_steps': 512}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 17:08:16,387] Trial 4 finished with value: 3155552.9795123767 and parameters: {'learning_rate': 0.0007005014244145663, 'batch_size': 512, 'gamma': 0.9461849573888284, 'ent_coef': 0.03523756402106294, 'n_steps': 2048}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 17:48:29,556] Trial 5 finished with value: 11494704.03203483 and parameters: {'learning_rate': 4.154184255691518e-05, 'batch_size': 1024, 'gamma': 0.9259218320633796, 'ent_coef': 0.013232834429614315, 'n_steps': 4096}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 18:19:33,033] Trial 6 finished with value: 9789264.44600026 and parameters: {'learning_rate': 0.00021356131269591918, 'batch_size': 1024, 'gamma': 0.9694232356377241, 'ent_coef': 0.039723395972970775, 'n_steps': 1024}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 18:58:52,347] Trial 7 finished with value: 8834690.509522133 and parameters: {'learning_rate': 8.87631497509632e-05, 'batch_size': 1024, 'gamma': 0.9328494690731493, 'ent_coef': 0.04732696217214691, 'n_steps': 4096}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 19:38:16,527] Trial 8 finished with value: 7217952.033473587 and parameters: {'learning_rate': 1.628451062175235e-05, 'batch_size': 1024, 'gamma': 0.982159988616355, 'ent_coef': 0.011689457205937405, 'n_steps': 4096}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 20:12:45,773] Trial 9 finished with value: 2653417.2911010124 and parameters: {'learning_rate': 0.00012853721842564773, 'batch_size': 2048, 'gamma': 0.9874262367370226, 'ent_coef': 0.04641196757280243, 'n_steps': 2048}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 20:44:12,272] Trial 10 finished with value: 6644564.890546517 and parameters: {'learning_rate': 1.0872087991477762e-05, 'batch_size': 512, 'gamma': 0.90501519096706, 'ent_coef': 0.09244041989116991, 'n_steps': 512}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 21:15:26,979] Trial 11 finished with value: 15341236.574456334 and parameters: {'learning_rate': 4.031301213150346e-05, 'batch_size': 1024, 'gamma': 0.91285138422288, 'ent_coef': 0.07353452918359735, 'n_steps': 512}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 21:46:33,651] Trial 12 finished with value: 10905961.62867527 and parameters: {'learning_rate': 2.0942808348796655e-05, 'batch_size': 1024, 'gamma': 0.9008557404408029, 'ent_coef': 0.07336660823696259, 'n_steps': 512}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 22:17:46,292] Trial 13 finished with value: 12919903.453447958 and parameters: {'learning_rate': 4.58617706507172e-05, 'batch_size': 1024, 'gamma': 0.9631885681416206, 'ent_coef': 0.0807585026470101, 'n_steps': 512}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 22:48:54,888] Trial 14 finished with value: 9933245.678310104 and parameters: {'learning_rate': 1.233838850938475e-05, 'batch_size': 1024, 'gamma': 0.9138089004709146, 'ent_coef': 0.06410973351123077, 'n_steps': 512}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 23:19:50,989] Trial 15 finished with value: 11027867.461857762 and parameters: {'learning_rate': 2.9812019726707137e-05, 'batch_size': 2048, 'gamma': 0.9415836707521893, 'ent_coef': 0.0916988347931032, 'n_steps': 512}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-09 23:51:02,052] Trial 16 finished with value: 7357536.864688158 and parameters: {'learning_rate': 0.00036914429468202836, 'batch_size': 1024, 'gamma': 0.9470162323568629, 'ent_coef': 0.07200852815207352, 'n_steps': 512}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-10 00:22:03,080] Trial 17 finished with value: 4123708.2076502605 and parameters: {'learning_rate': 5.8012772078618953e-05, 'batch_size': 1024, 'gamma': 0.916429247424199, 'ent_coef': 0.030750688043332343, 'n_steps': 512}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-10 00:54:17,576] Trial 18 finished with value: 14234896.283732299 and parameters: {'learning_rate': 2.116160938034785e-05, 'batch_size': 2048, 'gamma': 0.969923252890909, 'ent_coef': 0.0558884914489124, 'n_steps': 1024}. Best is trial 3 with value: 18277061.827805273.
[I 2025-02-10 01:28:59,399] Trial 19 finished with value: 13464429.320878655 and parameters: {'learning_rate': 7.753344968664924e-05, 'batch_size': 512, 'gamma': 0.9577830145443704, 'ent_coef': 0.08194708748617878, 'n_steps': 2048}. Best is trial 3 with value: 18277061.827805273.

In [None]:
from stable_baselines3 import PPO

# Smoke-test a saved PPO policy: roll it out for a few steps on a fresh
# environment and print which SKU/location inventory flags flipped each step.

# Load the trained model
model = PPO.load("inventory_rotation_dien_kremas_PPO_23", tensorboard_log="./logs/")

env = InventoryRotationEnv(df_items, df_history, df_locations)
obs, _ = env.reset()

MAX_TEST_STEPS = 10      # Limit the test run to 10 steps
MAX_CHANGES_SHOWN = 10   # Print at most this many inventory changes per step

reward_history = []  # Track rewards for each step
step_count = 0
total_reward = 0
done = False

# Ensure ordered mappings for SKU and location
# NOTE(review): this assumes the environment indexes SKUs and locations by
# ascending id -- confirm against InventoryRotationEnv.
sku_id_map = {sku: idx for idx, sku in enumerate(sorted(df_items['skuid'].unique()))}
location_id_map = {loc: idx for idx, loc in enumerate(sorted(df_locations['slid'].unique()))}

# Reverse lookup maps (matrix index -> original id)
sku_reverse_map = {v: k for k, v in sku_id_map.items()}
location_reverse_map = {v: k for k, v in location_id_map.items()}

while not done and step_count < MAX_TEST_STEPS:
    action, _ = model.predict(obs, deterministic=True)  # Choose action
    prev_sku_state = obs['inventory'].copy()  # Store state before step

    # Take action in the environment
    obs, reward, terminated, truncated, info = env.step(action)
    # An episode is over when it is terminated OR truncated; stepping past a
    # truncation without a reset is not valid, so stop on either flag.
    done = terminated or truncated

    # Decode action matrix to see item movements (kept for interactive inspection)
    action_matrix = action.reshape((env.num_skus, env.num_locations))

    # Check how the inventory has changed after the environment processed the action
    current_inventory = obs['inventory'].copy()

    # Count unique items per location
    unique_items_per_location = current_inventory.sum(axis=0)  # Sum along SKUs for each location

    # Log changes in inventory (0 -> 1 is an addition, 1 -> 0 is a removal)
    changes = []
    for sku_idx in range(env.num_skus):
        for loc_idx in range(env.num_locations):
            before = prev_sku_state[sku_idx, loc_idx]
            after = current_inventory[sku_idx, loc_idx]
            added = before == 0 and after == 1
            removed = before == 1 and after == 0
            if not (added or removed):
                continue

            # Resolve ids only for cells that actually changed
            sku_id = sku_reverse_map.get(sku_idx, f"Unknown SKU {sku_idx}")
            loc_id = location_reverse_map.get(loc_idx, f"Unknown Loc {loc_idx}")
            if added:
                changes.append(f"✅ Added {sku_id} to {loc_id}")
            else:
                changes.append(f"❌ Removed {sku_id} from {loc_id}")

    # Print step results
    print(f"\n📌 Step {step_count + 1}: Reward={reward:.2f}, Done={done}")
    print(f"   📅 Date: Year {env.current_step_year}, Week {env.current_step_week}")
    if changes:
        print("\n🔄 Inventory Changes:")
        for change in changes[:MAX_CHANGES_SHOWN]:
            print(f"   {change}")
    else:
        print("   ⚠ No inventory changes this step.")

    # Print unique items per location
    print("\n📊 Unique Items Per Location:")
    for loc_idx, unique_count in enumerate(unique_items_per_location):
        loc_id = location_reverse_map.get(loc_idx, f"Unknown Loc {loc_idx}")
        print(f"   📍 Location {loc_id}: {int(unique_count)} unique items in inventory.")

    # Track rewards
    reward_history.append(reward)
    total_reward += reward
    step_count += 1

print(f"\n🏆 Total Reward Over {step_count} Steps: {total_reward:.2f}")

In [None]:
❌ Removed 48963 from 238
✅ Added 49561 to 238
✅ Added 50429 to 262
✅ Added 50465 to 262
✅ Added 51181 to 262
✅ Added 51282 to 262
✅ Added 52338 to 238
✅ Added 52345 to 262
✅ Added 52557 to 262
✅ Added 52599 to 238
✅ Added 52607 to 262
✅ Added 53631 to 262