In [3]:
## InventoryRotationEnv

In [10]:
import glob
import pandas as pd
import numpy as np
import random

In [22]:
"""
    1. DATA SELECTION
"""
# Load the raw CSV exports and narrow them down to the product group and the
# stock locations under study.

# Folders holding the exported CSV files.
# NOTE(review): absolute, machine-specific path — point it at your own data folder.
data_dirs = ['C:/Users/Win-10/Documents/Python Scripts/ku']

# Daily history is spread over several `history_*.csv` files. Sorting the file
# names makes the load order deterministic (glob order is arbitrary).
history_files = sorted(
    filename
    for data_dir in data_dirs
    for filename in glob.glob(data_dir + "/history_*.csv")
)
if not history_files:
    # Fail with a clear message instead of the opaque "No objects to concatenate".
    raise FileNotFoundError(f"No history_*.csv files found in {data_dirs}")

df_history = pd.concat(
    (pd.read_csv(filename, header=0, sep=',', index_col=None, low_memory=False)
     for filename in history_files),
    ignore_index=True,
)

# The master tables are read from the last configured folder (this used to rely on
# the loop variable leaking out of the loading loop).
master_dir = data_dirs[-1]
df_items = pd.read_csv(master_dir + "/skus.csv", header=0, sep=',', index_col=None, low_memory=False)
df_locations = pd.read_csv(master_dir + "/locations.csv", header=0, sep=',', index_col=None, low_memory=False)
df_promotions = pd.read_csv(master_dir + "/promotions.csv", header=0, sep=',', index_col=None, low_memory=False)

# Step 1: Keep only the items whose `categoryGroup` is in the selected list
sku_names_to_filter = ['Dieninis kremas'] # ['Kūno apsauginis kremas','Lūpų dažai','Parfumuotas vanduo (EDP)','Kreminė pudra','Dieninis kremas','Lūpų blizgesiai','Akių šešėliai mono 1sp.','Tualetinis vanduo (EDT)']  # Example values to filter by
df_items = df_items[df_items['categoryGroup'].isin(sku_names_to_filter)]

# Step 2 + 3: Keep history rows of the selected items at the selected locations that
# either hold stock or record consumption.
location_ids = [262, 238]
df_locations = df_locations[df_locations['slid'].isin(location_ids)]
df_history = df_history[
    df_history['skuID'].isin(df_items['skuid'])
    & df_history['slid'].isin(location_ids)
    & ((df_history['atSiteQnt'] > 0) | (df_history['consumption'] > 0))
].copy()  # explicit copy: the column assignment below must not target a slice

# Set in chronological order
df_history['updateDate'] = pd.to_datetime(df_history['updateDate'])
df_history = df_history.sort_values(by='updateDate').reset_index(drop=True)

# Remove temp variables from workspace
del data_dirs, history_files, master_dir, location_ids

In [31]:
# (rows, columns) of the filtered daily history.
df_history.shape

(71918, 7)

In [32]:
# Summary statistics of the filtered daily history.
# NOTE(review): "71_918 -> 10_584" looks like a row count before/after an additional filter — confirm which one.
df_history.describe()  # 71_918 -> 10_584

Unnamed: 0,slid,skuID,updateDate,atSiteQnt,consumption,purchasingPrice,salesPrice
count,71918.0,71918.0,71918,71918.0,71918.0,71918.0,71918.0
mean,254.999694,30499.587892,2024-01-03 07:59:24.359408896,123.851289,0.700159,25.338845,53.140266
min,238.0,533.0,2022-09-27 00:00:00,0.0,0.0,0.01,0.0001
25%,238.0,23743.0,2023-09-06 00:00:00,2.0,0.0,7.56,23.95
50%,262.0,33030.0,2024-01-12 00:00:00,2.0,0.0,21.02,43.7603
75%,262.0,38118.0,2024-05-24 00:00:00,10.0,0.0,35.305,74.3802
max,262.0,54142.0,2024-10-02 00:00:00,13656.0,1415.0,284.711,438.0165
std,10.908928,12209.92622,,731.927536,9.800807,21.292089,36.930782


In [26]:
# Column dtypes and non-null counts of the filtered daily history.
df_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71918 entries, 0 to 71917
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   slid             71918 non-null  int64         
 1   skuID            71918 non-null  int64         
 2   updateDate       71918 non-null  datetime64[ns]
 3   atSiteQnt        71918 non-null  float64       
 4   consumption      71918 non-null  float64       
 5   purchasingPrice  71918 non-null  float64       
 6   salesPrice       71918 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 3.8 MB


In [23]:
# Pull out the daily history of one example SKU (id 48722) for closer inspection.
temp = df_history.loc[df_history['skuID'].eq(48722)]
temp.head()

Unnamed: 0,slid,skuID,updateDate,atSiteQnt,consumption,purchasingPrice,salesPrice
33881,262,48722,2023-12-29,42.0,0.0,3.96,20.6198
34038,262,48722,2023-12-30,42.0,0.0,3.96,20.6198
34180,262,48722,2023-12-31,42.0,0.0,3.96,20.6198
34307,262,48722,2024-01-01,42.0,0.0,3.96,20.6198
34425,262,48722,2024-01-02,42.0,0.0,3.96,20.6198


In [24]:
def build_weekly_summary(daily, lock_weeks=12):
    """Aggregate daily stock history into one row per location, SKU and ISO week.

    Parameters
    ----------
    daily : pandas.DataFrame
        Daily rows with the columns `slid`, `skuID`, `updateDate`, `atSiteQnt`,
        `consumption`, `purchasingPrice` and `salesPrice`. The frame is not modified.
    lock_weeks : int, default 12
        Length of the lock period counted from the first week a SKU appears at a
        location.

    Returns
    -------
    pandas.DataFrame
        One row per (`slid`, `skuID`, `year`, `week`) with the summed weekly
        consumption, mean purchasing/sales prices, `lockWeeksRemaining`, the stock at
        the first and last recorded day of the week, and `unique_items_count` (number
        of distinct SKUs per location and week that end the week with stock or
        recorded consumption).
    """
    week_keys = ['slid', 'skuID', 'year', 'week']

    daily = daily.copy()
    daily['updateDate'] = pd.to_datetime(daily['updateDate'])

    # Year and week must both come from the ISO calendar. Pairing the calendar year
    # with the ISO week puts e.g. 2023-01-01 (ISO 2022-W52) into the same bucket as
    # the last week of December 2023.
    iso = daily['updateDate'].dt.isocalendar()
    daily['year'] = iso['year'].astype('int32')
    daily['week'] = iso['week']

    # Running week counter based on the Monday of each week; unlike `year * 52 + week`
    # it stays contiguous across 53-week ISO years.
    week_start = daily['updateDate'].dt.normalize() - pd.to_timedelta(
        daily['updateDate'].dt.weekday, unit='D'
    )
    daily['absolute_week'] = (week_start - week_start.min()).dt.days // 7

    # Chronological order inside each week so that first/last are start/end of week.
    daily = daily.sort_values(by=week_keys + ['updateDate'])

    weekly = (
        daily.groupby(week_keys)
        .agg(
            consumption=('consumption', 'sum'),            # Total weekly consumption
            purchasingPrice=('purchasingPrice', 'mean'),   # Average purchasing price
            salesPrice=('salesPrice', 'mean'),             # Average sales price
            absolute_week=('absolute_week', 'first'),      # Same for every day of the week
            stock_start_week=('atSiteQnt', 'first'),       # Stock on the first recorded day
            stock_end_week=('atSiteQnt', 'last'),          # Stock on the last recorded day
        )
        .reset_index()
    )

    # Remaining lock period: `lock_weeks` in the week of first appearance, counting
    # down by one per elapsed week, never below zero.
    first_week = weekly.groupby(['slid', 'skuID'])['absolute_week'].transform('min')
    weekly['lockWeeksRemaining'] = (
        lock_weeks - (weekly['absolute_week'] - first_week)
    ).clip(lower=0)

    # Count distinct SKUs per location and week, considering only rows that end the
    # week with stock or that recorded consumption.
    in_assortment = weekly[(weekly['stock_end_week'] > 0) | (weekly['consumption'] > 0)]
    items_per_location = (
        in_assortment.groupby(['slid', 'year', 'week'])['skuID']
        .nunique()
        .rename('unique_items_count')
        .reset_index()
    )
    weekly = weekly.merge(items_per_location, on=['slid', 'year', 'week'], how='left')

    # Weeks without any qualifying item get a count of 0.
    weekly['unique_items_count'] = weekly['unique_items_count'].fillna(0).astype(int)

    return weekly[
        week_keys
        + ['consumption', 'purchasingPrice', 'salesPrice', 'lockWeeksRemaining',
           'stock_start_week', 'stock_end_week', 'unique_items_count']
    ]


# Replace the daily slice with its weekly summary.
# NOTE: this cell is not idempotent — `temp` must hold daily rows when it runs.
temp = build_weekly_summary(temp)

In [25]:
# First rows of the weekly summary built in the previous cell.
temp.head()

Unnamed: 0,slid,skuID,year,week,consumption,purchasingPrice,salesPrice,lockWeeksRemaining,stock_start_week,stock_end_week,unique_items_count
0,262,48722,2023,52,0.0,3.96,20.6198,12,42.0,42.0,1
1,262,48722,2024,1,5.0,3.96,20.6198,11,42.0,37.0,1
2,262,48722,2024,2,7.0,3.96,20.6198,10,32.0,31.0,1
3,262,48722,2024,3,6.0,3.96,20.6198,9,30.0,25.0,1
4,262,48722,2024,4,7.0,3.96,20.6198,8,15.0,8.0,1


In [36]:
# Distinct (year, week) pairs present in the weekly summary, in ascending order.
sorted(temp[['year', 'week']].drop_duplicates().values.tolist())

[[2023, 52],
 [2024, 1],
 [2024, 2],
 [2024, 3],
 [2024, 4],
 [2024, 5],
 [2024, 6],
 [2024, 18],
 [2024, 19],
 [2024, 20],
 [2024, 21],
 [2024, 22],
 [2024, 23],
 [2024, 24],
 [2024, 25],
 [2024, 26],
 [2024, 27],
 [2024, 28],
 [2024, 29],
 [2024, 30],
 [2024, 31],
 [2024, 32],
 [2024, 37],
 [2024, 38],
 [2024, 39],
 [2024, 40]]

In [None]:
import gym
import numpy as np
from gym import spaces

class InventoryRotationEnv(gym.Env):
    """Gym environment that replays weekly inventory history.

    An action is a flat binary vector with one entry per (SKU, location) pair.
    Observations are dictionaries of `(num_skus, num_locations)` arrays.

    Parameters
    ----------
    df_items : pandas.DataFrame
        Item master data; the `skuid` column defines the SKU axis.
    df_history : pandas.DataFrame
        History with `year` and `week` columns; the distinct (year, week) pairs
        define the timeline of an episode.
    df_locations : pandas.DataFrame
        Location master data; the `slid` column defines the location axis.

    Raises
    ------
    KeyError
        If `df_history` has no `year` or `week` column.
    """

    def __init__(self, df_items, df_history, df_locations):
        super(InventoryRotationEnv, self).__init__()

        # Fail with an explanatory message when the history has no week columns.
        missing = [col for col in ('year', 'week') if col not in df_history.columns]
        if missing:
            raise KeyError(
                f"df_history is missing the column(s) {missing}; "
                "the environment needs the weekly aggregated history"
            )

        # Store dataset references
        self.df_items = df_items
        self.df_history = df_history
        self.df_locations = df_locations

        # Define SKU & location count
        # NOTE(review): assumes `skuid` / `slid` are unique per row, otherwise these
        # counts exceed the number of entries in the id maps below — confirm.
        self.num_skus = len(df_items)
        self.num_locations = len(df_locations)
        grid_shape = (self.num_skus, self.num_locations)

        # Define action space (flattened MultiBinary)
        self.action_space = spaces.MultiBinary(self.num_skus * self.num_locations)  # spaces.Box

        # Define observation space (features per SKU-location pair)
        self.observation_space = spaces.Dict({
            'current_stock_state': spaces.MultiBinary(grid_shape),
            'delta_margin': spaces.Box(low=-np.inf, high=np.inf, shape=grid_shape, dtype=np.float32),
            'delta_stock': spaces.Box(low=-np.inf, high=np.inf, shape=grid_shape, dtype=np.float32),
            'delta_profit': spaces.Box(low=-np.inf, high=np.inf, shape=grid_shape, dtype=np.float32),
            # low=0: pairs without data for the current week are observed as 0,
            # which a lower bound of 1 would declare out of range.
            'week': spaces.Box(low=0, high=53, shape=grid_shape, dtype=np.int16),
            'unique_items_count': spaces.Box(low=0, high=np.inf, shape=grid_shape, dtype=np.float32)  # Track per location
        })

        # Create sorted mappings for SKU and location indices
        self.sku_id_map = {sku: idx for idx, sku in enumerate(sorted(df_items['skuid'].unique()))}
        self.location_id_map = {loc: idx for idx, loc in enumerate(sorted(df_locations['slid'].unique()))}

        # Initialize environment state
        self.state = None
        self.current_step = 0
        self.current_step_year = None
        self.current_step_week = None

        # Extract unique (year, week) pairs and sort them
        self.unique_year_weeks = sorted(df_history[['year', 'week']].drop_duplicates().values.tolist())
        self.max_steps = len(self.unique_year_weeks)  # Total possible timesteps in history

In [None]:
def reset(self, *, seed=None, options=None):
    """Reset the environment to a random starting week of the history.

    Parameters
    ----------
    seed : int, optional
        Seed forwarded to the base class; it also determines the starting week,
        so a seeded reset is reproducible.
    options : dict, optional
        Accepted for API compatibility; not used.

    Returns
    -------
    tuple
        `(observation, info)` where `info` is an empty dict.
    """
    # Explicit two-argument super(): this function is defined in its own cell,
    # outside the class body, where zero-argument super() cannot resolve the class.
    super(InventoryRotationEnv, self).reset(seed=seed)

    # Select a random week in history from the environment's seeded generator
    # (the global `random` module would ignore `seed`). The upper bound is exclusive.
    self.current_step = int(self.np_random.integers(0, self.max_steps))
    self.current_step_year, self.current_step_week = self.unique_year_weeks[self.current_step]

    # Filter available SKUs based on stock levels at the end of the week
    history = self.df_history
    in_stock_now = (
        (history['year'] == self.current_step_year)
        & (history['week'] == self.current_step_week)
        & (history['stock_end_week'] > 0)
    )
    filtered_data = history.loc[
        in_stock_now, ['slid', 'skuID', 'lockWeeksRemaining', 'unique_items_count']
    ].drop_duplicates()

    # Initialize lock periods (ensures all start at 0)
    self.lock_periods = np.zeros((self.num_skus, self.num_locations), dtype=np.int8)

    # Take over the historical lock period of every stocked SKU-location pair;
    # ids unknown to the index maps are skipped.
    lock_rows = filtered_data[['slid', 'skuID', 'lockWeeksRemaining']]
    for slid, sku_id, lock_weeks in lock_rows.itertuples(index=False, name=None):
        sku_idx = self.sku_id_map.get(sku_id)
        loc_idx = self.location_id_map.get(slid)
        if sku_idx is not None and loc_idx is not None:
            self.lock_periods[sku_idx, loc_idx] = lock_weeks

    # Remove unnecessary columns before passing to `_generate_observation`
    filtered_data = filtered_data[['slid', 'skuID', 'unique_items_count']].drop_duplicates()

    # Initialize state with updated filtered data
    self.state = self._generate_observation(filtered_data)

    # Return observation and empty info dictionary
    return self.state, {}

In [None]:
def _generate_observation(self, filtered_data, action_matrix=None):
    """Generate an observation based on filtered inventory data."""
    
    # Initialize observation with zeros
    obs = {key: np.zeros((self.num_skus, self.num_locations), dtype=np.float32) 
           for key in self.observation_space.keys()}
    
    rolling_window_steps = 4  # Use last 4 weeks
    
    # Get past 4 weeks + current week
    start_step = max(0, self.current_step - rolling_window_steps)
    end_step = self.current_step  # Current week is included
    rolling_year_weeks = self.unique_year_weeks[start_step:end_step + 1]  # Include current week
    
    if rolling_year_weeks:
        rolling_years, rolling_weeks = zip(*rolling_year_weeks)
    else:
        rolling_years, rolling_weeks = np.array([], dtype=np.int16), np.array([], dtype=np.int8)  # Empty arrays if no data

    # Populate observations
    for _, row in filtered_data.iterrows():
        sku_idx = self.sku_id_map.get(row['skuID'], -1)
        loc_idx = self.location_id_map.get(row['slid'], -1)

        if sku_idx >= 0 and loc_idx >= 0:
            # Update current stock state based on previous step's actions
            if action_matrix is not None:
                obs['current_stock_state'][sku_idx, loc_idx] = action_matrix[sku_idx, loc_idx]
            else:
                obs['current_stock_state'][sku_idx, loc_idx] = 1  # Default if no action history available
            
            # Store unique items per location (static, no accumulation)
            obs['unique_items_count'][sku_idx, loc_idx] = row['unique_items_count']
            
            # Add **week/year** information (seasonality feature)
            obs['week'][sku_idx, loc_idx] = self.current_step_week
            
            # Select all weeks in the rolling period
            sales_data = self.df_history[
                (self.df_history['skuID'] == row['skuID']) & 
                (self.df_history['slid'] == row['slid']) & 
                (self.df_history['year'].isin(rolling_years)) & 
                (self.df_history['week'].isin(rolling_weeks))
            ]

            if not sales_data.empty:
                obs['delta_stock'][sku_idx, loc_idx] = sales_data['consumption'].sum()
                obs['delta_profit'][sku_idx, loc_idx] = (
                    sales_data['consumption'] * 
                    (sales_data['salesPrice'] - sales_data['purchasingPrice'])
                ).mean()
                obs['delta_margin'][sku_idx, loc_idx] = (
                    sales_data['salesPrice'].mean() - sales_data['purchasingPrice'].mean()
                )

    return obs

In [None]:
def step(self, action):
    """Perform an environment step using the given action.

    Parameters
    ----------
    action : array-like
        Flat binary vector of length `num_skus * num_locations`; 1 keeps/adds the
        SKU at the location. The caller's array is never modified.

    Returns
    -------
    tuple
        `(observation, reward, done, truncated, info)`. Once the history is
        exhausted the previous observation is returned with reward 0 and
        `done=True`.
    """
    # **Step 1: Move to the next step first (ensures reward calculation is correct)**
    self.current_step += 1
    done = self.current_step >= self.max_steps
    truncated = False
    if done:
        return self.state, 0, done, truncated, {}

    # **Step 2: Convert action into (num_skus, num_locations) matrix**
    # np.array copies: the constraints below overwrite entries, and writing through
    # a reshape view would silently alter the action stored by the caller.
    action_matrix = np.array(action).reshape((self.num_skus, self.num_locations))

    # **Step 3: Apply inventory constraints (locks, permanent items)**
    newly_added_mask = (action_matrix == 1) & (self.state['current_stock_state'] == 0)

    if np.any(newly_added_mask):
        # Assign the lock period for newly added SKUs: 12 weeks, capped by the
        # number of weeks left in the history. (The previous countdown loop
        # overwrote its own assignment and always ended at 1.)
        lock_weeks = 12
        weeks_left = len(self.unique_year_weeks) - self.current_step
        self.lock_periods[newly_added_mask] = min(lock_weeks, weeks_left)

    # Ensure locked/permanent items are NOT removed
    action_matrix[self.lock_periods > 0] = 1
    # Resolve permanent items through `sku_id_map` so the rows agree with the index
    # used by the observations (row order of `df_items` may differ from it).
    is_permanent = self.df_items['isGenerateFlow'].fillna(0) == 1
    permanent_rows = [
        self.sku_id_map[sku_id]
        for sku_id in self.df_items.loc[is_permanent, 'skuid']
        if sku_id in self.sku_id_map
    ]
    if permanent_rows:
        action_matrix[permanent_rows, :] = 1

    # **Step 4: Compute reward for the previous step's action**
    reward = self._calculate_profit(action_matrix)

    # **Step 5: Reduce lock periods (items get closer to being removable)**
    self.lock_periods = np.maximum(self.lock_periods - 1, 0)

    # **Step 6: Update the current step year & week**
    self.current_step_year, self.current_step_week = self.unique_year_weeks[self.current_step]

    # **Step 7: Filter inventory data for the new step**
    history = self.df_history
    in_stock_now = (
        (history['year'] == self.current_step_year)
        & (history['week'] == self.current_step_week)
        & (history['stock_end_week'] > 0)
    )
    filtered_data = history.loc[
        in_stock_now, ['slid', 'skuID', 'lockWeeksRemaining', 'unique_items_count']
    ].drop_duplicates()

    # Refresh lock status for items stocked in the new week from the history
    lock_rows = filtered_data[['slid', 'skuID', 'lockWeeksRemaining']]
    for slid, sku_id, lock_weeks_remaining in lock_rows.itertuples(index=False, name=None):
        sku_idx = self.sku_id_map.get(sku_id)
        loc_idx = self.location_id_map.get(slid)
        if sku_idx is not None and loc_idx is not None:
            self.lock_periods[sku_idx, loc_idx] = lock_weeks_remaining

    # Drop unnecessary columns before generating new observations
    filtered_data = filtered_data[['slid', 'skuID', 'unique_items_count']].drop_duplicates()

    # **Step 8: Generate new observation state using the updated action_matrix**
    self.state = self._generate_observation(filtered_data, action_matrix)

    return self.state, reward, done, truncated, {}

In [None]:
def _calculate_profit(self, action_matrix):
    """ Compute the reward (profit) based on inventory decisions. """
    
    # Define the rolling window for the past 4 weeks
    rolling_window_steps = 4
    start_step = max(0, self.current_step - rolling_window_steps)
    end_step = self.current_step

    # Extract year-week combinations for the rolling window
    rolling_year_weeks = self.unique_year_weeks[start_step:end_step]
    rolling_years, rolling_weeks = zip(*rolling_year_weeks) if rolling_year_weeks else ([], [])

    # Filter historical data for these weeks
    history_window = self.df_history[
        (self.df_history['year'].isin(rolling_years)) &
        (self.df_history['week'].isin(rolling_weeks))
    ]

    # Find active SKUs & locations from the action matrix
    active_indices = np.argwhere(action_matrix == 1)
    active_pairs = [(self.df_items['skuid'].iloc[sku_idx], self.df_locations['slid'].iloc[loc_idx]) 
                    for sku_idx, loc_idx in active_indices]

    # Filter data for active SKU-location pairs
    active_data = history_window[history_window[['skuID', 'slid']].apply(tuple, axis=1).isin(active_pairs)].copy()

    # Find items sold in different locations (excluding active pairs)
    active_skus = list(set([sku for sku, _ in active_pairs]))
    items_sold_elsewhere = history_window[
        (history_window['skuID'].isin(active_skus)) &
        (~history_window[['skuID', 'slid']].apply(tuple, axis=1).isin(active_pairs))
    ].copy()

    # Compute mean profit for SKUs sold in other locations
    items_sold_elsewhere['sku_profit'] = (
        items_sold_elsewhere['consumption'] * (items_sold_elsewhere['salesPrice'] - items_sold_elsewhere['purchasingPrice'])
    )
    mean_profit_elsewhere = items_sold_elsewhere['sku_profit'].mean()

    # Initialize profit-related values
    total_profit, delta_profit, delta_sales, avg_past_profit = 0, 0, 0, 0
    unsold_penalty, inventory_penalty = 0, 0  

    if not active_data.empty:
        # Compute markup & profit per SKU
        active_data['markup'] = active_data['salesPrice'] - active_data['purchasingPrice']
        active_data['sku_profit'] = active_data['consumption'] * active_data['markup']

        # Compute rolling average profit per SKU-location
        rolling_avg_profit = active_data.groupby(['skuID', 'slid'])['sku_profit'].mean()

        # Compute changes in sales and profit
        active_data['delta_sales'] = active_data.groupby(['skuID', 'slid'])['consumption'].diff().fillna(0)
        active_data['delta_profit'] = active_data.groupby(['skuID', 'slid'])['sku_profit'].diff().fillna(0)

        # Aggregate profit-related values
        total_profit = active_data['sku_profit'].sum()
        delta_sales = active_data['delta_sales'].sum()
        delta_profit = active_data['delta_profit'].sum()
        avg_past_profit = rolling_avg_profit.mean()

        # Compute penalties for unsold inventory
        lost_profit_penalty_factor = 0.05  # 5% of potential lost profit
        stagnant_inventory_penalty = 0.02  # 2% extra penalty per unsold week
        max_unsold_penalty = 100  # Upper limit for penalty

        unsold_penalty = active_data.groupby(['skuID', 'slid']).apply(
            lambda x: 0 if (x['consumption'].sum() > 0 or x['stock_end_week'].mean() == 0) 
            else min(
                (lost_profit_penalty_factor * x['stock_end_week'].mean() * x['markup'].mean()) * 
                (1 + stagnant_inventory_penalty * x.shape[0]), 
                max_unsold_penalty
            )
        ).sum()

        # Compute inventory size penalty based on `unique_items_count`
        penalty_factor = 5  # Scaling factor for penalty
        tolerance = 0.1  # 10% tolerance

        for loc_idx in range(self.num_locations):
            selected_items = action_matrix[:, loc_idx].sum()
            expected_count = self.state['unique_items_count'][:, loc_idx].sum()  

            if expected_count > 0:  # Avoid division by zero
                deviation = abs(selected_items - expected_count) / expected_count
                if deviation > tolerance:
                    inventory_penalty += deviation * penalty_factor  # Apply penalty

    # Compute final reward (profit)
    profit = (
        total_profit
        + delta_profit
        - abs(delta_sales) * 0.1  # Penalize sales drop
        + avg_past_profit * 0.05  # Reward historical profitability
        - unsold_penalty  # Penalize stock without consumption
        - inventory_penalty  # Penalize inventory deviations
        + (0 if np.isnan(mean_profit_elsewhere) else mean_profit_elsewhere)  # Add profit from other locations
    )

    return profit

In [None]:
# Build the environment and train a PPO agent on it.
# NOTE(review): `PPO` (and `DQN` in the disabled block below) are not imported in any
# visible cell; they look like stable_baselines3 classes — confirm and import them
# in the import cell.
# NOTE(review): the environment reads `year` and `week` from the history it is given,
# but the `df_history.info()` output above lists neither column — confirm that the
# weekly aggregated frame is what gets passed here.
env = InventoryRotationEnv(df_items, df_history, df_locations)
print(env.observation_space)

# `n_steps` is the number of environment steps collected per rollout before each
# policy update; `batch_size` is the minibatch size used inside an update.
# NOTE(review): library defaults are presumably n_steps=2048 and batch_size=64 — confirm.
# Monitor training with: tensorboard --logdir=./logs/
model = PPO('MultiInputPolicy', env, batch_size=128, n_steps=512, verbose=1, tensorboard_log="./logs/") # logs are written to ./logs/
"""model = DQN(  
    'MlpPolicy',
    env,
    learning_rate=0.0005,
    buffer_size=50000,
    batch_size=64,
    gamma=0.99,
    exploration_fraction=0.1,
    exploration_final_eps=0.02,
    target_update_interval=1000,
    verbose=1
)"""
model.learn(total_timesteps=5_00, reset_num_timesteps=False, tb_log_name='PPO')  # total_timesteps: total number of env steps to train on (5_00 == 500)

# Save/Load the trained model (disabled)
#model.save("inventory_rotation_dien_kremas_128_dqn")
#model = DQN.load("inventory_rotation_dqn", tensorboard_log="./logs/")  

In [None]:
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 54.9          |	The average episode length in timesteps (e.g., how many steps before an episode ends).
|    ep_rew_mean          | 6.97e+05      |	The average reward per episode (higher is better). Your rewards are large—check if they need normalization!
| time/                   |               |
|    fps                  | 1             |	Frames per second (very slow—may need optimization).
|    iterations           | 2             |	Number of policy updates so far.
|    time_elapsed         | 546           |	Time spent training (in seconds).
|    total_timesteps      | 2048          |	Total environment steps taken so far (taken actions) n_steps*iterations.
| train/                  |               |
|    approx_kl            | 0.00010914914 |	Measures how much the new policy deviates from the old one (should be small).
|    clip_fraction        | 0             |	Fraction of updates that were clipped (should be > 0, usually 0.1–0.3).
|    clip_range           | 0.2           |	The clipping range for PPO (default: 0.2).
|    entropy_loss         | -1.77e+03     |	Exploration vs. exploitation—higher entropy means more exploration.
|    explained_variance   | 5.96e-08      |	How well the value function explains variance (should be closer to 1).
|    learning_rate        | 0.0003        |	The learning rate used in training.
|    loss                 | 1.47e+10      |	Total training loss (policy, value and entropy terms combined); should decrease over time.
|    n_updates            | 30            |	The number of gradient updates applied to the neural network so far.
|    policy_gradient_loss | -0.00617      |	Measures how much policy gradients are changing (should be small and negative).
|    value_loss           | 3.2e+10       |	Error between the value function's predictions and the observed returns.
-------------------------------------------
Episode: 	A full sequence of steps from the start to termination (e.g., one complete inventory cycle).
Timestep: 	A single action taken by the agent in the environment.
n_steps: 	The number of timesteps collected before each policy update (the rollout length); batch_size is the minibatch size used within an update.
Iteration: 	One cycle of collecting n_step timesteps and updating the policy.

In [None]:
Numpy:
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 47.7      |
|    ep_rew_mean     | -2.78e+03 |
| time/              |           |
|    fps             | 4         |
|    iterations      | 1         |
|    time_elapsed    | 117       |
|    total_timesteps | 512       |
----------------------------------
Dataframe:
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 44.5     |
|    ep_rew_mean     | 6.12e+05 |
| time/              |          |
|    fps             | 1        |
|    iterations      | 1        |
|    time_elapsed    | 298      |
|    total_timesteps | 512      |
---------------------------------

In [None]:
Rewards:
152.66014652014653
317.8796296296296
112.25535714285715
215.52357142857144
189.86863636363634
171.93217391304344
237.0790476190476
79.33852040816326
40.96407966101695
56.39324811320755
97.71670853658537
78.59267717391303
49.172781753246745
63.70759652173913
53.65741978021978
54.4723560523446
30.159305172413795
34.32607542372882
52.500909
17.701305785123967
16.43823025210084
27.808721359223302
33.19273603603604
18.535129770992366
30.551248201438845
33.46037787610619
16.70941549295775
16.98556235521236
27.21990440251572
29.328689230769232
15.952490000000001
35.78095887850468
35.98402353479853
34.25624535464536
41.97139268571428
17.70746530612245
70.74843543307085
40.116223170731715
41.31056976516634
24.728312126245847
110.86117879746836
36.17067669172933
30.053537289915965
32.34550205479452
39.305570942662776
18.86539038461538
28.105964655172414
50.901824603174596
40.82102487512488
37.97891896551724
19.087261111111115
15.36086351020408
13.799410000000002
86.00157671232877
50.63279205069124
45.06187518796992
69.95995571428571
40.4267188902007
45.65917895752896
87.66577348484849
102.78078347107439
37.14156229508197
59.480953703703705
50.14444966442953
18.834866911764703
18.01388474576271
78.48724341085273
29.03997911164466
63.280750000000005
40.20862016806722
56.18068983050846
23.303794736842104
33.96447450980392
96.92913664122138
36.862581203007515
42.506973076923074
64.38074583761563
18.206317333333335
55.565354128440354
87.08721341463415
29.178195614035083
88.24345609756097
32.380883179723504
16.283656521739132
40.30242583333333
59.19365642256903
63.236048
36.91558455598455
36.701528405315614
82.62635780075188
mean_profit_diff_locations = nan
total_profit = 0.0
delta_profit = 0.0
delta_sales = 0.0
avg_past_profit = 0.0
penalty = 0.9247449040412903
unsold_penalty = 1219.7554933333333