# Inventory Optimization Using Reinforcement Learning 

In [1]:
!pip install torch torchvision torchaudio



In [2]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import matplotlib.pyplot as plt

# ---------------------------------------------
# Data Preprocessing
# ---------------------------------------------

# Load the dataset
file_path = 'US_Regional_Sales_Data.csv'
data = pd.read_csv(file_path)

# Clean 'Unit Cost' and 'Unit Price' by removing $ signs and converting to float
data['Unit Cost'] = data['Unit Cost'].replace(r'[\$,]', '', regex=True).astype(float)  # Use raw string for regex
data['Unit Price'] = data['Unit Price'].replace(r'[\$,]', '', regex=True).astype(float)  # Use raw string for regex

# Ensure there are no remaining NaN values in the 'Unit Cost' column after conversion
data['Unit Cost'] = data['Unit Cost'].fillna(0)  # Use fillna without inplace=True

# Convert date columns to datetime format
data['OrderDate'] = pd.to_datetime(data['OrderDate'], format='%d/%m/%y', errors='coerce')
data['DeliveryDate'] = pd.to_datetime(data['DeliveryDate'], format='%d/%m/%y', errors='coerce')

# Feature Engineering: Create additional features
data['LeadTime'] = (data['DeliveryDate'] - data['OrderDate']).dt.days
data['TotalRevenue'] = data['Order Quantity'] * data['Unit Price']
data['TotalCost'] = data['Order Quantity'] * data['Unit Cost']

# Display the first few rows of the DataFrame to confirm changes
print(data.head())

   OrderNumber Sales Channel WarehouseCode ProcuredDate  OrderDate  ShipDate  \
0  SO - 000101      In-Store  WARE-UHY1004     31/12/17 2018-05-31   14/6/18   
1  SO - 000102        Online  WARE-NMK1003     31/12/17 2018-05-31   22/6/18   
2  SO - 000103   Distributor  WARE-UHY1004     31/12/17 2018-05-31   21/6/18   
3  SO - 000104     Wholesale  WARE-NMK1003     31/12/17 2018-05-31  2/6/2018   
4  SO - 000105   Distributor  WARE-NMK1003    10/4/2018 2018-05-31   16/6/18   

  DeliveryDate CurrencyCode  _SalesTeamID  _CustomerID  _StoreID  _ProductID  \
0   2018-06-19          USD             6           15       259          12   
1          NaT          USD            14           20       196          27   
2          NaT          USD            21           16       213          16   
3          NaT          USD            28           48       107          23   
4   2018-06-26          USD            22           49       111          26   

   Order Quantity  Discount Applied  U

In [3]:
# ---------------------------------------------
# Warehouse Environment Definition
# ---------------------------------------------

class WarehouseEnv:
    def __init__(self, data):
        self.data = data
        self.current_stock = {product_id: 0 for product_id in self.data['_ProductID'].unique()}
        self.action_space = [-1, 0, 1]  # -1: reduce stock, 0: maintain stock, 1: increase stock
        self.max_stock = 100  # Maximum stock level to prevent overstocking

    def reset(self):
        """Reset the environment to the initial state."""
        self.current_stock = {product_id: 0 for product_id in self.data['_ProductID'].unique()}
        return self.get_state()

    def get_state(self):
        """Get the current stock state."""
        return np.array([self.current_stock[product_id] for product_id in self.current_stock])

    def step(self, action):
        """Take an action and return the new state, reward, and done flag."""
        for product_id in self.current_stock:
            if action == 1 and self.current_stock[product_id] < self.max_stock:
                self.current_stock[product_id] += 1  # Increase stock
            elif action == -1 and self.current_stock[product_id] > 0:
                self.current_stock[product_id] -= 1  # Reduce stock

        reward = self.calculate_reward()
        done = False  # No terminal state for now
        return self.get_state(), reward, done, {}

    def calculate_reward(self):
        """Calculate the reward based on stock levels, carrying costs, and stockout penalties."""
        # Ensure that 'Unit Cost' is treated as a float
        carrying_costs = sum(
            stock * float(self.data.loc[self.data['_ProductID'] == product_id, 'Unit Cost'].values[0])
            for product_id, stock in self.current_stock.items()
        )

        # Stockout penalty (reduced penalty for stockouts to avoid overwhelming negative rewards)
        stockout_penalty = -sum(1 for stock in self.current_stock.values() if stock == 0) * 20

        # Reward for maintaining balanced stock levels (between 10 and 30 units)
        optimal_stock_reward = sum(1 for stock in self.current_stock.values() if 10 <= stock <= 30) * 30

        # Penalty for overstocking beyond 50 units
        overstock_penalty = -sum(1 for stock in self.current_stock.values() if stock > 50) * 10

        # Final reward calculation
        reward = -carrying_costs + stockout_penalty + optimal_stock_reward + overstock_penalty
        return reward

In [4]:
# ---------------------------------------------
# DQN Agent Definition
# ---------------------------------------------

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.98  # Slightly increased gamma to consider future rewards more
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995  # Slower decay for more exploration
        self.learning_rate = 0.0005  # Reduced learning rate for better optimization
        self.model = self._build_model()

    def _build_model(self):
        model = nn.Sequential(
            nn.Linear(self.state_size, 24),
            nn.ReLU(),
            nn.Linear(24, 24),
            nn.ReLU(),
            nn.Linear(24, self.action_size)
        )
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model(torch.FloatTensor(state))  # Predict Q-values
        return torch.argmax(act_values).item()

    def replay(self, batch_size=64):  # Increased batch size for more diverse training data
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target += self.gamma * torch.max(self.model(torch.FloatTensor(next_state))).item()
            target_f = self.model(torch.FloatTensor(state))
            target_f[action] = target

            criterion = nn.MSELoss()
            optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
            self.model.zero_grad()
            loss = criterion(self.model(torch.FloatTensor(state)), target_f)
            loss.backward()
            optimizer.step()

    def update_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [5]:
# Properly clean the 'Unit Cost' and 'Unit Price' columns by removing commas and dollar signs
data['Unit Cost'] = data['Unit Cost'].replace(r'[\$,]', '', regex=True).astype(float)
data['Unit Price'] = data['Unit Price'].replace(r'[\$,]', '', regex=True).astype(float)

In [6]:
print(data[['Unit Cost', 'Unit Price']].head())  # This should print numeric values

   Unit Cost  Unit Price
0    1001.18      1963.1
1    3348.66      3939.6
2     781.22      1775.5
3    1464.69      2324.9
4    1476.14      1822.4


In [7]:
# ---------------------------------------------
# Training the DQN Agent
# ---------------------------------------------

# Example usage
env = WarehouseEnv(data)
state_size = len(env.get_state())
action_size = len(env.action_space)

agent = DQNAgent(state_size, action_size)

# Training loop
episodes = 100  # Can increase this if needed
for e in range(episodes):
    state = env.reset()
    total_reward = 0
    for time in range(500):
        action = agent.act(state)
        new_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, new_state, done)
        state = new_state
        total_reward += reward
        if done:
            break
    if len(agent.memory) > 32:
        agent.replay(32)
    agent.update_epsilon()
    print(f"Episode: {e+1}/{episodes}, Total Reward: {total_reward:.2f}, Epsilon: {agent.epsilon:.2f}")

Episode: 1/100, Total Reward: -2823463999.92, Epsilon: 0.99
Episode: 2/100, Total Reward: -2777443389.76, Epsilon: 0.99
Episode: 3/100, Total Reward: -2754627118.48, Epsilon: 0.99
Episode: 4/100, Total Reward: -2738833694.48, Epsilon: 0.98
Episode: 5/100, Total Reward: -2848659436.80, Epsilon: 0.98
Episode: 6/100, Total Reward: -2752773230.00, Epsilon: 0.97
Episode: 7/100, Total Reward: -2738918734.00, Epsilon: 0.97
Episode: 8/100, Total Reward: -2757476571.20, Epsilon: 0.96
Episode: 9/100, Total Reward: -2708909536.40, Epsilon: 0.96
Episode: 10/100, Total Reward: -2638894822.16, Epsilon: 0.95
Episode: 11/100, Total Reward: -2597647113.20, Epsilon: 0.95
Episode: 12/100, Total Reward: -2496203880.00, Epsilon: 0.94
Episode: 13/100, Total Reward: -2536694723.76, Epsilon: 0.94
Episode: 14/100, Total Reward: -2808541150.64, Epsilon: 0.93
Episode: 15/100, Total Reward: -2812761075.20, Epsilon: 0.93
Episode: 16/100, Total Reward: -2667756736.96, Epsilon: 0.92
Episode: 17/100, Total Reward: -2

In [None]:
# ---------------------------------------------
# Testing the DQN Agent
# ---------------------------------------------

# Test the agent after training
test_reward = 0
state = env.reset()
for time in range(500):
    action = agent.act(state)
    new_state, reward, done, _ = env.step(action)
    test_reward += reward
    state = new_state
    if done:
        break

print(f"Total reward during test run: {test_reward:.2f}")

# ---------------------------------------------
# Visualize Training Progress
# ---------------------------------------------

# Optional: Add a visualization for rewards during training (example for better analysis)
rewards = []
for e in range(episodes):
    state = env.reset()
    total_reward = 0
    for time in range(500):
        action = agent.act(state)
        new_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, new_state, done)
        state = new_state
        total_reward += reward
        if done:
            break
    rewards.append(total_reward)
    if len(agent.memory) > 32:
        agent.replay(32)
    agent.update_epsilon()

# Plot rewards over episodes
plt.plot(range(episodes), rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward over Episodes')
plt.show()

Total reward during test run: -2030285242.96


In [None]:
# ---------------------------------------------
# Evaluate Model Performance
# ---------------------------------------------

# Load the model before evaluation if you've saved it
agent.model.load_state_dict(torch.load('dqn_agent_model.pth'))

# Example evaluation: Use the final model to simulate a test run
state = env.reset()  # Reset the environment
test_reward = 0

# Run the test loop
for time in range(500):
    action = agent.act(state)  # Use the trained agent to choose an action
    next_state, reward, done, _ = env.step(action)
    state = next_state
    test_reward += reward
    if done:
        break

print(f"Total reward during test run: {test_reward}")