In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import defaultdict


In [2]:
class RetailEnvironment:
    def __init__(self, data):
        self.data = data
        self.current_step = 0
        self.scaler = StandardScaler()
        self.scaled_data = self.scaler.fit_transform(self.data[['amount', 'units', 'total_sales']])
        
    def reset(self):
        self.current_step = 0
        return self._get_state()
    
    def step(self, action):
        # Action: 0 = lower price, 1 = keep price, 2 = raise price
        current_sale = self.scaled_data[self.current_step]
        
        if action == 0:
            reward = current_sale[2] * 0.9  # Lower price might increase sales but decrease profit
        elif action == 2:
            reward = current_sale[2] * 1.1  # Higher price might decrease sales but increase profit
        else:
            reward = current_sale[2]  # Keeping price stable
        
        self.current_step += 1
        done = self.current_step >= len(self.scaled_data)
        
        return self._get_state(), reward, done
    
    def _get_state(self):
        if self.current_step >= len(self.scaled_data):
            return np.zeros(3)
        return self.scaled_data[self.current_step]

In [3]:
class QLearningAgent:
    def __init__(self, state_size, action_size, learning_rate=0.1, discount_factor=0.95, epsilon=0.1):
        self.q_table = defaultdict(lambda: np.zeros(action_size))
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.action_size = action_size
    
    def get_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.action_size)
        return np.argmax(self.q_table[tuple(state)])
    
    def update(self, state, action, reward, next_state):
        current_q = self.q_table[tuple(state)][action]
        next_max_q = np.max(self.q_table[tuple(next_state)])
        new_q = current_q + self.learning_rate * (reward + self.discount_factor * next_max_q - current_q)
        self.q_table[tuple(state)][action] = new_q


In [4]:
def train(env, agent, episodes):
    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0
        
        while not done:
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            agent.update(state, action, reward, next_state)
            state = next_state
            total_reward += reward
        
        if episode % 100 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}")

In [5]:
# Load and preprocess data
data = pd.read_csv('../Data/prepared_sales_data.csv')  # Replace with your actual file name
relevant_features = ['amount', 'units', 'total_sales']
data = data[relevant_features]

In [6]:
# Create environment and agent
env = RetailEnvironment(data)
agent = QLearningAgent(state_size=3, action_size=3)

In [7]:
# Train the agent
train(env, agent, episodes=1000)

Episode 0, Total Reward: 27597.990654651057
Episode 100, Total Reward: 31423.78484045255
Episode 200, Total Reward: 29795.674808005813
Episode 300, Total Reward: 31992.984882472523
Episode 400, Total Reward: 30843.186618884774
Episode 500, Total Reward: 30166.97174502747
Episode 600, Total Reward: 30115.199668913567
Episode 700, Total Reward: 30082.56163507632
Episode 800, Total Reward: 30530.2654191619
Episode 900, Total Reward: 30442.764910130856


In [8]:
# Example of using the trained agent
state = env.reset()
done = False
total_reward = 0
while not done:
    action = agent.get_action(state)
    next_state, reward, done = env.step(action)
    total_reward += reward
    state = next_state
print(f"Total reward with trained agent: {total_reward}")

Total reward with trained agent: 12154.051262385985
