In [1]:
import numpy as np
import random
import gymnasium as gym
from gymnasium import spaces


In [2]:
class BudgetAllocationEnv(gym.Env):
    def __init__(self):
        super(BudgetAllocationEnv, self).__init__()
        
        # Define action and observation space
        # Actions: 0 = Increase, 1 = Maintain, 2 = Decrease
        self.action_space = gym.spaces.Discrete(3)
        
        # States: 0 = Low ROI, 1 = Moderate ROI, 2 = High ROI
        self.observation_space = gym.spaces.Discrete(3)
        
        # Initialize state
        self.state = 0
        
        # Rewards per state-action pair
        self.reward_matrix = {
        (0, 0): -10,   # Low ROI, Increase (bad choice)
        (0, 1): -5,    # Low ROI, Maintain (neutral choice)
        (0, 2): 10,    # Low ROI, Decrease (good choice)
        (1, 0): 20,    # Moderate ROI, Increase (good choice)
        (1, 1): 5,     # Moderate ROI, Maintain (neutral choice)
        (1, 2): -5,    # Moderate ROI, Decrease (bad choice)
        (2, 0): -5,    # High ROI, Increase (bad choice)
        (2, 1): 15,    # High ROI, Maintain (good choice)
        (2, 2): -10    # High ROI, Decrease (bad choice)
}
    
    def reset(self):
        # Reset to a random initial state (can be improved)
        self.state = np.random.choice([0, 1, 2])
        return self.state
    
    def step(self, action):
        # Get reward based on current state and action
        reward = self.reward_matrix.get((self.state, action), 0)
        
        # Transition logic: for simplicity, we'll keep it random
        if action == 0 and self.state < 2:
            next_state = self.state + 1
        elif action == 2 and self.state > 0:
            next_state = self.state - 1
        else:
            next_state = self.state
        
        # Check if the task is done (e.g., after 10 steps)
        done = np.random.rand() > 0.95  # Arbitrary condition for episode end
        
        # Update state
        self.state = next_state
        
        return next_state, reward, done, {}
    
    def render(self):
        state_names = ['Low ROI', 'Moderate ROI', 'High ROI']

        print(f'Current State: {state_names[self.state]}')

In [3]:

# RL_ Framework_solution 
def monte_carlo_learning(env, num_episodes=1000):
    Q_table = np.zeros((2, 2, 3))  # Two channels, three actions
    returns = { (s1, s2, a): [] for s1 in range(2) for s2 in range(2) for a in range(3) }  # Initialize returns

    for ep in range(num_episodes):
        episode = []
        state = env.reset()

        while True:
            action = env.action_space.sample()  # Choose an action randomly
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        # Process the episode to update the Q-values
        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = reward + 0.9 * G  # Discounted reward
            state_idx = state  # state_idx is directly the state in this simple case
            returns[(state[0], state[1], action)].append(G)  # Store returns
            Q_table[state[0], state[1], action] = np.mean(returns[(state[0], state[1], action)])  # Update Q-value

    return Q_table

In [4]:
env = BudgetAllocationEnv()
Q_table = monte_carlo_learning(env, num_episodes=1000)  # Train agent
print("Learned Q-table:")
print(Q_table)

IndexError: invalid index to scalar variable.