In [None]:
import gym
import numpy as np
import time
import random
import matplotlib.pyplot as plt
import gym_gridworld
import gymgrid
from IPython.display import clear_output # Used to clear the ouput of a Jupyter cell.

In [None]:
def train_sarsa(agent, env, n_episodes):
    """Train ``agent`` on ``env`` with n-step SARSA for ``n_episodes`` episodes.

    The loop stores the states, actions and rewards of the running episode
    and, once ``agent.n`` steps are available, hands the n-step return ``G``
    for time ``tau`` to ``agent.learn``.

    Parameters
    ----------
    agent : object
        Must provide ``act(state)``, ``learn(state, action, G)``, the
        attributes ``n`` (number of steps in the return) and ``gamma``
        (discount), and a table ``Q`` indexable as ``Q[state, action]``.
    env : object
        Environment whose ``reset()`` returns the initial state and whose
        ``step(action)`` returns a 4-tuple ``(state, reward, done, info)``.
    n_episodes : int
        Number of episodes to run.

    Returns
    -------
    total_rewards : numpy.ndarray
        Sum of the rewards collected in each episode.
    steps : numpy.ndarray
        Cumulative number of updates performed up to the end of each
        episode (the counter is not reset between episodes).
    """
    step = 0
    steps = np.zeros(n_episodes)  # Cumulative update count after each episode
    total_rewards = np.zeros(n_episodes)
    for episode in range(n_episodes):
        state = env.reset()
        states = [state]
        actions = [agent.act(state)]
        rewards = [0]  # Placeholder so that rewards[i] is the reward received at time i
        episode_reward = 0
        T = np.inf  # Episode length; unknown until the environment reports done
        t = 0

        while True:
            if t < T:
                # Always execute the action that was selected and stored for
                # time t, so the recorded trajectory matches the one actually
                # experienced, regardless of whether an update has happened yet.
                state_next, reward, done, _ = env.step(actions[t])
                states.append(state_next)
                rewards.append(reward)
                episode_reward += reward
                if done:
                    T = t + 1
                else:
                    actions.append(agent.act(state_next))

            tau = t - agent.n + 1  # Time step whose estimate is updated now
            if tau >= 0:
                # Discounted sum of the (up to n) rewards following time tau.
                G = 0
                for i in range(tau + 1, min(tau + agent.n + 1, T + 1)):
                    G += np.power(agent.gamma, i - tau - 1) * rewards[i]
                # Bootstrap from Q only if the episode has not ended within n steps.
                if (tau + agent.n) < T:
                    G += np.power(agent.gamma, agent.n) * agent.Q[states[tau + agent.n], actions[tau + agent.n]]
                agent.learn(states[tau], actions[tau], G)
                step += 1
            if tau == T - 1:
                break
            t += 1

        steps[episode] = step
        total_rewards[episode] = episode_reward

    return total_rewards, steps

In [None]:
class SARSA:
    """Tabular SARSA agent with an epsilon-greedy behaviour policy.

    The action values live in ``Q``, a zero-initialised array of shape
    ``(n_states, n_actions)``. ``n`` is only stored here; ``train_sarsa``
    reads it to decide how many steps go into each return.
    """

    def __init__(self, n_states, n_actions, gamma, alpha, epsilon, n=1):
        """Store the hyper-parameters and allocate the zeroed Q-table."""
        self.n_states, self.n_actions = n_states, n_actions
        self.gamma, self.alpha, self.epsilon = gamma, alpha, epsilon
        self.Q = np.zeros((n_states, n_actions))
        self.n = n

    def act(self, state):
        """Return an epsilon-greedy action for ``state``.

        A greedy candidate (argmax of the state's Q-row) and a uniformly
        random candidate are both drawn; the random one is returned with
        probability ``epsilon``, the greedy one otherwise.
        """
        greedy_action = np.argmax(self.Q[state, :])
        random_action = np.random.choice(self.n_actions)
        explore_prob = self.epsilon
        picked = np.random.choice(
            [greedy_action, random_action],
            1,
            p=[1 - explore_prob, explore_prob],
        )
        return picked[0]

    def learn(self, s, a, G):
        """Move ``Q[s, a]`` a fraction ``alpha`` of the way towards target ``G``."""
        target_error = G - self.Q[s, a]
        self.Q[s, a] += self.alpha * target_error

In [None]:
# Sweep over several n-step horizons on Taxi-v3. For every n, train
# `train_repetition` fresh agents and keep the per-episode reward averaged
# over those runs.
env = gym.make('Taxi-v3')
episodes = 1000
n_values = [1, 2, 5, 10, 50]
train_repetition = 20
# One row per entry of n_values, one column per episode.
concat_mean_rewards = np.zeros((len(n_values), episodes))

for counter, n_value in enumerate(n_values):
    # One row per independent training run for the current n.
    list_rewards = np.zeros((train_repetition, episodes))
    for training in range(train_repetition):
        agent = SARSA(
            n_states=env.observation_space.n,
            n_actions=env.action_space.n,
            gamma=1.0,
            alpha=0.1,
            epsilon=0.1,
            n=n_value,
        )
        total_rewards, _ = train_sarsa(agent, env, n_episodes=episodes)
        list_rewards[training] = total_rewards
    # Average across runs (column-wise) to get one reward curve for this n.
    mean_rewards = list_rewards.mean(axis=0)
    concat_mean_rewards[counter] = mean_rewards

In [None]:
# Draw one figure per n value: run-averaged total reward against episode number.
for counter, n_value in enumerate(n_values):
    fig, ax = plt.subplots()
    ax.plot(range(1, episodes + 1), concat_mean_rewards[counter])
    ax.set_title(f'n = {n_value}')
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Average total reward')
    plt.show()

In [None]:
# Longer experiment for n = 50 only: 10000 episodes per run, averaged over
# `train_repetition` independent runs.
new_episodes = 10000
list_rewards = np.zeros((train_repetition, new_episodes))
for training in range(train_repetition):
    agent = SARSA(
        n_states=env.observation_space.n,
        n_actions=env.action_space.n,
        gamma=1.0,
        alpha=0.1,
        epsilon=0.1,
        n=50,
    )
    total_rewards, _ = train_sarsa(agent, env, n_episodes=new_episodes)
    list_rewards[training] = total_rewards
# Column-wise mean gives the average reward per episode across runs.
mean_rewards = list_rewards.mean(axis=0)

fig, ax = plt.subplots()
ax.plot(range(1, new_episodes + 1), mean_rewards)
ax.set_title(f'n = 50')
ax.set_xlabel('Episodes')
ax.set_ylabel('Average total reward')
plt.show()