In [1]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

In [2]:
def discretize(state, bins):
    """Convert a continuous observation into a tuple of bin indices.

    Args:
        state: Indexable sequence of continuous values, one per dimension.
        bins: Sequence where ``bins[i]`` holds the bin edges for ``state[i]``.

    Returns:
        A tuple with one index per component of ``state``. Each index is the
        ``np.digitize`` position shifted down by one and clamped to the range
        ``[0, len(bins[i]) - 1]``, so out-of-range values land in the first or
        last bin instead of producing an invalid index.
    """
    indices = []
    for dim, value in enumerate(state):
        edges = bins[dim]
        last_index = len(edges) - 1
        # digitize counts how many edges are <= value; subtract one to make
        # the result zero-based.
        raw_index = np.digitize(value, edges) - 1
        indices.append(max(0, min(last_index, raw_index)))
    return tuple(indices)

In [3]:
def create_bins(num_bins):
    """Build evenly spaced bin edges for the two state dimensions.

    Args:
        num_bins: Number of edges generated per dimension.

    Returns:
        A two-element list ``[position_edges, velocity_edges]`` where the
        position edges span ``[-1.2, 0.6]`` and the velocity edges span
        ``[-0.07, 0.07]``, each as a NumPy array of length ``num_bins``.
    """
    # (low, high) limits in the order: position, velocity.
    limits = ((-1.2, 0.6), (-0.07, 0.07))
    return [np.linspace(low, high, num_bins) for low, high in limits]


In [4]:
def initialize_q_table(num_bins, num_actions):
    """Create an all-zero action-value table.

    Args:
        num_bins: Size of each of the two state axes.
        num_actions: Size of the action axis.

    Returns:
        A NumPy array of zeros with shape ``(num_bins, num_bins, num_actions)``.
    """
    table_shape = (num_bins, num_bins, num_actions)
    return np.zeros(table_shape)

In [11]:
def initialize_policy(bins, num_actions):
    """Assign a uniformly random action to every discrete state.

    Args:
        bins: Two-element sequence; ``len(bins[0])`` and ``len(bins[1])`` give
            the number of indices along the first and second state axis.
        num_actions: Actions are drawn from ``range(num_actions)``.

    Returns:
        A dict mapping each ``(pos, vel)`` index pair to an action drawn with
        ``np.random.choice`` (NumPy's global random state).
    """
    return {
        (pos_idx, vel_idx): np.random.choice(num_actions)
        for pos_idx in range(len(bins[0]))
        for vel_idx in range(len(bins[1]))
    }

In [12]:
def policy_evaluation(env, q_table, policy, bins, gamma, theta, alpha):
    """Estimate the action values of ``policy`` in place.

    Repeatedly sweeps every discrete state and every action. For each pair the
    environment is placed at the grid point, stepped once, and the table entry
    is moved towards the one-step target

        reward + gamma * Q(next_state, policy[next_state])

    (or just ``reward`` when the step ends the episode). Sweeping stops once
    the largest change made during a full sweep is below ``theta``.

    All actions are evaluated, not only ``policy[state]``, and the target
    follows the policy in the next state rather than taking the maximum over
    actions. This way the greedy step in ``policy_improvement`` compares
    values that were all computed for the same policy; otherwise untouched
    zero entries would always beat evaluated entries when rewards are negative.

    Args:
        env: Environment with ``reset()`` / ``step(action)`` whose
            ``env.unwrapped.state`` can be assigned to place the system at a
            chosen state.
        q_table: Array indexed as ``q_table[pos, vel, action]``; updated in
            place.
        policy: Dict mapping ``(pos, vel)`` index pairs to actions.
        bins: ``[position_edges, velocity_edges]``; the edge values are used
            as the representative continuous state of each cell.
        gamma: Discount factor.
        theta: Convergence threshold on the per-sweep maximum change.
        alpha: Step size blending the old value with the new target.

    Returns:
        None. ``q_table`` is modified in place.
    """
    num_actions = q_table.shape[-1]
    while True:
        delta = 0
        for pos in range(len(bins[0])):
            for vel in range(len(bins[1])):
                state = (pos, vel)
                for action in range(num_actions):
                    old_value = q_table[state + (action,)]
                    # reset() first so per-episode bookkeeping starts fresh
                    # (presumably a step limit in a wrapper -- confirm), then
                    # overwrite the sampled start state with this grid point.
                    env.reset()
                    env.unwrapped.state = (bins[0][pos], bins[1][vel])
                    next_state, reward, terminated, truncated, _ = env.step(action)
                    next_state = discretize(next_state, bins)
                    if terminated or truncated:
                        # No future return after the episode ends.
                        target = reward
                    else:
                        # Follow the policy in the next state: this is what
                        # makes the table an estimate of Q under `policy`.
                        next_action = policy[next_state]
                        target = reward + gamma * q_table[next_state + (next_action,)]
                    new_value = (1 - alpha) * old_value + alpha * target
                    q_table[state + (action,)] = new_value
                    delta = max(delta, np.abs(old_value - new_value))
        if delta < theta:
            break

In [13]:
def policy_improvement(q_table, policy, bins):
    """Make ``policy`` greedy with respect to ``q_table``, in place.

    Args:
        q_table: Array indexed as ``q_table[pos, vel, action]``.
        policy: Dict mapping ``(pos, vel)`` index pairs to actions; every
            entry is overwritten with the arg-max action for that state.
        bins: Two-element sequence whose lengths define the state grid.

    Returns:
        True if no entry of ``policy`` changed, False otherwise.
    """
    cells = (
        (pos_idx, vel_idx)
        for pos_idx in range(len(bins[0]))
        for vel_idx in range(len(bins[1]))
    )
    changed = False
    for cell in cells:
        previous_action = policy[cell]
        greedy_action = np.argmax(q_table[cell])
        policy[cell] = greedy_action
        if previous_action != greedy_action:
            changed = True
    return not changed

In [14]:
def policy_iteration(env, q_table, bins, gamma, theta, episodes, alpha):
    """Alternate evaluation and improvement, logging one rollout per round.

    Each round runs ``policy_evaluation`` and ``policy_improvement``, then
    plays a single episode with the current policy and records its summed
    reward. The loop ends after ``episodes`` rounds or after the first round
    in which the improvement step left the policy unchanged.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` and
            ``action_space.n``.
        q_table: Action-value table, updated in place by the evaluation step.
        bins: ``[position_edges, velocity_edges]`` used for discretization.
        gamma: Discount factor forwarded to ``policy_evaluation``.
        theta: Convergence threshold forwarded to ``policy_evaluation``.
        episodes: Maximum number of rounds.
        alpha: Step size forwarded to ``policy_evaluation``.

    Returns:
        A tuple ``(rewards, q_table, policy)`` where ``rewards`` lists the
        rollout return of each completed round.
    """
    policy = initialize_policy(bins, env.action_space.n)
    rewards = []
    for _ in range(episodes):
        policy_evaluation(env, q_table, policy, bins, gamma, theta, alpha)
        is_stable = policy_improvement(q_table, policy, bins)

        # Roll out one episode under the freshly improved policy.
        state = discretize(env.reset()[0], bins)
        episode_return = 0
        while True:
            observation, reward, terminated, truncated, _ = env.step(policy[state])
            state = discretize(observation, bins)
            episode_return += reward
            if terminated or truncated:
                break
        rewards.append(episode_return)

        if is_stable:
            break

    return rewards, q_table, policy

In [8]:
def plot_results(rewards, granularity, parameter, value, filename):
    """Save a line plot of per-round total reward to ``filename``.

    Args:
        rewards: Sequence of total rewards, plotted against their index.
        granularity: Bin count, shown in the legend label.
        parameter: Name of the varied hyper-parameter, shown in legend/title.
        value: Value of that hyper-parameter, shown in legend/title.
        filename: Path passed to ``plt.savefig``.
    """
    episode_axis = range(len(rewards))
    curve_label = f'Granularity: {granularity}, {parameter}: {value}'
    heading = f'Reward vs Episodes ({parameter}: {value})'

    plt.figure()
    plt.title(heading)
    plt.xlabel('Episodes')
    plt.ylabel('Total Reward')
    plt.plot(episode_axis, rewards, label=curve_label)
    plt.legend()
    plt.savefig(filename)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()

In [9]:
# Grid map of policy
def plot_policy(policy, bins, filename):
    """Save a heat map of the policy over the discretized state grid.

    The image has the position-bin index on the x-axis and the velocity-bin
    index on the y-axis, matching the axis labels.

    Args:
        policy: Dict mapping ``(pos, vel)`` index pairs to actions.
        bins: Two-element sequence whose lengths define the grid size.
        filename: Path passed to ``plt.savefig``.
    """
    num_pos, num_vel = len(bins[0]), len(bins[1])
    policy_grid = np.zeros((num_pos, num_vel))
    for pos in range(num_pos):
        for vel in range(num_vel):
            policy_grid[pos, vel] = policy[(pos, vel)]
    plt.figure()
    # imshow draws array rows along the y-axis and columns along the x-axis.
    # The grid is indexed [pos, vel], so transpose it to put position on x and
    # velocity on y as the labels below state.
    plt.imshow(policy_grid.T, origin='lower', cmap='viridis')
    # Tick positions assume three actions numbered 0-2 -- confirm if this is
    # reused with a different action space.
    plt.colorbar(ticks=[0, 1, 2], label='Action')
    plt.title('Policy Grid')
    plt.xlabel('Position')
    plt.ylabel('Velocity')
    plt.savefig(filename)
    plt.close()

In [16]:
# Fix the random sources so a fresh Restart & Run All reproduces the same
# curves: initialize_policy draws from NumPy's global RNG, and the environment
# samples its start state on reset.
seed = 42
np.random.seed(seed)

env = gym.make('MountainCar-v0')
env.reset(seed=seed)  # seeds the environment's RNG; later resets continue from it
num_actions = env.action_space.n
episodes = 5000  # upper bound on policy-iteration rounds (stops early once stable)
gamma = 0.99  # discount factor
theta = 0.0001  # Convergence threshold

# Experiment grid: number of bin edges per state dimension x learning rate.
granularities = [20, 30]
parameter = 'learning_rate'  # label used in plot legends/titles
values = [0.1, 0.5, 0.9]  # passed as `alpha` to policy_iteration

In [None]:
# Run policy iteration once per (granularity, learning-rate) pair, starting
# from a fresh Q-table each time, and save the reward curve for every run.
for granularity, value in [(g, lr) for g in granularities for lr in values]:
    bins = create_bins(granularity)
    q_table = initialize_q_table(granularity, num_actions)
    rewards, q_table, policy = policy_iteration(
        env, q_table, bins, gamma, theta, episodes, value
    )
    plot_results(
        rewards,
        granularity,
        parameter,
        value,
        f'rewards_granularity_{granularity}_lr_{value}.png',
    )
    # The policy heat map is only produced for the 20-bin runs.
    if granularity != 20:
        continue
    plot_policy(policy, bins, f'policy_grid_granularity_{granularity}_lr_{value}.png')