In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

# <b>Monte Carlo Algorithm</b>

In [None]:
def mc_policy_evaluation(env, policy, episodes, gamma=1.0):
    """Estimate the state-value function of ``policy`` with first-visit Monte Carlo.

    Each episode is rolled out to its end, then walked backwards to accumulate
    the discounted return ``G``. Only the return following the *first*
    occurrence of a state within an episode contributes to that state's mean.

    Args:
        env: Gymnasium-style environment exposing ``observation_space.n``,
            ``reset()`` returning ``(state, info)`` and ``step(action)``
            returning ``(next_state, reward, terminated, truncated, info)``.
        policy: Callable mapping a state to an action.
        episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        np.ndarray with one entry per state: the mean first-visit return
        observed for that state (0.0 for states that were never visited).
    """
    n_states = env.observation_space.n
    value_table = np.zeros(n_states)
    visit_counts = np.zeros(n_states, dtype=int)

    for _ in range(episodes):
        state = env.reset()[0]
        trajectory = []   # (state, reward) per time step
        first_visit = {}  # state -> index of its first occurrence in this episode
        done = False

        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            first_visit.setdefault(state, len(trajectory))
            trajectory.append((state, reward))
            state = next_state
            # A time-limit truncation ends the episode just like a terminal state;
            # ignoring it would keep stepping an environment that is already over.
            done = terminated or truncated

        G = 0.0
        for t in range(len(trajectory) - 1, -1, -1):
            visited, reward = trajectory[t]
            G = gamma * G + reward
            # Compare time indices, not tuple contents: identical
            # (state, reward) pairs may occur at several steps of an episode.
            if first_visit[visited] == t:
                visit_counts[visited] += 1
                # Incremental mean: avoids storing and re-averaging all returns.
                value_table[visited] += (G - value_table[visited]) / visit_counts[visited]

    return value_table

In [None]:
def random_policy(state, n_actions=6):
    """Pick an action uniformly at random, ignoring ``state``.

    Args:
        state: Current environment state. Unused; accepted so the function
            matches the ``policy(state)`` call signature used by the
            evaluation routines.
        n_actions: Number of discrete actions to choose from. Defaults to 6,
            the value previously hard-coded here.

    Returns:
        An integer action index in ``[0, n_actions)``.
    """
    return np.random.choice(n_actions)

In [None]:
# Build the Taxi environment once; later cells reuse `env` and `episodes`.
env = gym.make('Taxi-v3')
episodes = 5000

# Monte Carlo estimate of the random policy's value for every state.
value_table_mc = mc_policy_evaluation(
    env,
    random_policy,
    episodes,
)

# x-axis is the state index, y-axis the estimated value of that state.
plt.plot(value_table_mc)
plt.gca().set(
    xlabel='State',
    ylabel='Value',
    title='Monte Carlo Value Function',
)
plt.show()

# <b>TD Learning Algorithm</b>

In [None]:
def td_policy_evaluation(env, policy, episodes, alpha=0.1, gamma=1.0):
    """Estimate the state-value function of ``policy`` with tabular TD(0).

    After every transition the value of the state just left is moved a
    fraction ``alpha`` towards the one-step target
    ``reward + gamma * V(next_state)``.

    Args:
        env: Gymnasium-style environment exposing ``observation_space.n``,
            ``reset()`` returning ``(state, info)`` and ``step(action)``
            returning ``(next_state, reward, terminated, truncated, info)``.
        policy: Callable mapping a state to an action.
        episodes: Number of episodes to run.
        alpha: Step size of the TD update.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        np.ndarray with one value estimate per state (0.0 for states that
        were never updated).
    """
    value_table = np.zeros(env.observation_space.n)

    for _ in range(episodes):
        state = env.reset()[0]
        done = False

        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            # Nothing follows a terminal transition, so its target is the reward
            # alone. A truncated episode was merely cut short, so bootstrapping
            # from the next state's estimate is still appropriate there.
            bootstrap = 0.0 if terminated else value_table[next_state]
            td_target = reward + gamma * bootstrap
            value_table[state] += alpha * (td_target - value_table[state])
            state = next_state
            # Stop on either signal; the time limit must also end the episode.
            done = terminated or truncated

    return value_table

In [None]:
# TD(0) estimate of the random policy's value, using the default alpha and gamma.
value_table_td = td_policy_evaluation(
    env,
    random_policy,
    episodes,
)

# x-axis is the state index, y-axis the estimated value of that state.
plt.plot(value_table_td)
plt.gca().set(
    xlabel='State',
    ylabel='Value',
    title='TD Value Function',
)
plt.show()

# <b>Comparing Cumulative Rewards</b>

In [None]:
def simulate(env, policy, episodes, algo):
    """Roll out ``policy`` and record the running total of episode rewards.

    Args:
        env: Gymnasium-style environment whose ``reset()`` returns
            ``(state, info)`` and whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        policy: Callable mapping a state to an action.
        episodes: Number of episodes to run.
        algo: Label of the algorithm being simulated. It is not read by this
            function (the rollouts depend only on ``policy``); it is kept so
            existing callers continue to work.

    Returns:
        A list of length ``episodes`` whose i-th entry is the sum of all
        rewards collected in episodes ``0..i``.
    """
    rewards = []
    cumulative_reward = 0

    for _ in range(episodes):
        state = env.reset()[0]
        episode_reward = 0
        done = False

        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            episode_reward += reward
            state = next_state
            # End the episode on the time limit as well as on a terminal state,
            # so rewards from steps past the limit are not counted.
            done = terminated or truncated

        cumulative_reward += episode_reward
        rewards.append(cumulative_reward)

    return rewards

In [None]:
episodes = 5000

# Two independent rollouts of the same random policy. `simulate` never reads
# its label argument, so the curves differ only through random sampling.
mc_rewards, td_rewards = (
    simulate(env, random_policy, episodes, tag) for tag in ("MC", "TD")
)

# Running reward total after each episode, one curve per rollout.
plt.plot(mc_rewards, label='Monte Carlo')
plt.plot(td_rewards, label='TD Learning')
plt.gca().set(
    xlabel='Episode',
    ylabel='Cumulative Reward',
    title='Cumulative Reward vs Episode',
)
plt.legend()
plt.show()

# <b>Parameter Tuning</b>

In [None]:
# TD step sizes (alpha) compared in the alpha sweep below.
alphas = [0.1, 0.5, 0.9]
# Discount factors (gamma) compared in the gamma sweep below.
gammas = [0.9, 0.95, 1.0]

In [None]:
# Re-run TD evaluation once per step size and overlay the resulting value curves.
# Note: each iteration rebinds `value_table_td`; after the loop it holds the
# table for the last alpha.
for alpha in alphas:
    value_table_td = td_policy_evaluation(
        env,
        random_policy,
        episodes,
        alpha=alpha,
    )
    plt.plot(value_table_td, label=f'alpha={alpha}')

plt.gca().set(
    xlabel='State',
    ylabel='Value',
    title='TD Value Function with different alpha values',
)
plt.legend()
plt.show()

In [None]:
# Re-run TD evaluation once per discount factor and overlay the resulting value curves.
# Note: each iteration rebinds `value_table_td`; after the loop it holds the
# table for the last gamma.
for gamma in gammas:
    value_table_td = td_policy_evaluation(
        env,
        random_policy,
        episodes,
        gamma=gamma,
    )
    plt.plot(value_table_td, label=f'gamma={gamma}')

plt.gca().set(
    xlabel='State',
    ylabel='Value',
    title='TD Value Function with different gamma values',
)
plt.legend()
plt.show()