<a href="https://colab.research.google.com/github/LawZhou/mountain-car/blob/main/APS1080_A3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import gym
from tqdm import tqdm

In [None]:
class MountainCarBaseAgent():
    """Tabular action-value agent over a discretized 2-D observation.

    Each of the two observation components is mapped to one of ``bin``
    integer indices, and ``Q`` stores one value per
    (first index, second index, action).  This class supplies state
    discretization, epsilon-greedy action selection, learning-rate decay
    and greedy evaluation roll-outs.

    The environment is driven through the classic Gym interface:
    ``reset()`` returns the observation and ``step(action)`` returns the
    4-tuple ``(obs, reward, done, info)``.
    """

    def __init__(self, env, num_episodes, bin, min_lr, epsilon, lr,
                 discount_factor, decay):
        """Store hyperparameters and allocate the zero-initialized Q-table.

        Args:
            env: Environment exposing ``observation_space.low/high``,
                ``action_space.n``, ``action_space.sample()``, ``reset()``
                and ``step()``.
            num_episodes: Number of training episodes (stored for callers
                that implement training).
            bin: Number of grid points per observation dimension.
            min_lr: Lower floor for the decayed learning rate.
            epsilon: Probability of taking a random exploratory action.
            lr: Initial learning rate.
            discount_factor: Discount applied to future values.
            decay: Fraction of the learning rate removed per decay step.
        """
        self.bin = bin
        self.num_episodes = num_episodes
        self.min_lr = min_lr
        self.epsilon = epsilon
        self.discount_factor = discount_factor
        self.decay = decay
        self.learning_rate = lr

        self.env = env

        self.upper_bounds = self.env.observation_space.high
        self.lower_bounds = self.env.observation_space.low

        # Evenly spaced bin edges spanning each observation dimension.
        self.position_bins = np.linspace(self.lower_bounds[0], self.upper_bounds[0], num=self.bin)
        self.velocity_bins = np.linspace(self.lower_bounds[1], self.upper_bounds[1], num=self.bin)
        self.Q = np.zeros((self.bin, self.bin, self.env.action_space.n))

    def discretize_state(self, obs):
        """Map a continuous observation to a pair of valid Q-table indices.

        Args:
            obs: Sequence whose first two entries are the observation
                components matching ``position_bins`` and ``velocity_bins``.

        Returns:
            A tuple ``(pos_index, vel_index)`` of Python ints, each in
            ``[0, bin - 1]``.
        """
        discrete_pos = np.digitize(obs[0], bins=self.position_bins)
        discrete_vel = np.digitize(obs[1], bins=self.velocity_bins)
        # np.digitize returns len(bins) for values at or above the last edge,
        # which would index one past the end of the Q-table; clamp so that
        # boundary observations land in the last cell instead of raising.
        clamped = np.clip([discrete_pos, discrete_vel], 0, self.bin - 1)
        # The builtin ``int`` replaces the removed ``np.int`` alias.
        return tuple(int(index) for index in clamped)

    def choose_action(self, state, greedy=False):
        """Pick an action for ``state``.

        Args:
            state: Discrete state tuple as returned by ``discretize_state``.
            greedy: When True, always take the highest-valued action and
                draw no random number.

        Returns:
            The action index: a random sample from the action space with
            probability ``epsilon`` (non-greedy mode only), otherwise the
            argmax of ``Q[state]``.
        """
        if not greedy and np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.Q[state])

    def get_learning_rate(self):
        """Return the current learning rate reduced by ``decay``, floored at ``min_lr``."""
        return max(self.min_lr, self.learning_rate - self.learning_rate*self.decay)

    def run(self, max_steps=50000):
        """Play one episode greedily and return the accumulated reward.

        Args:
            max_steps: Upper bound on the number of environment steps, used
                as a safety cap in case the environment never reports done.

        Returns:
            Sum of the rewards received during the episode.
        """
        state = self.env.reset()
        total_reward = 0
        for _ in range(max_steps):
            state = self.discretize_state(state)
            action = self.choose_action(state, greedy=True)
            obs, reward, done, _info = self.env.step(action)
            total_reward += reward
            if done:
                break
            state = obs
        return total_reward

    def run_episodes(self, play_eps=500):
        """Run ``play_eps`` greedy episodes and print the mean result.

        The printed label says "steps", but the value is the mean of the
        total reward returned by ``run()``; the message text is kept as-is.
        """
        stepsRecorder = np.array([self.run() for _ in range(play_eps)])
        print(f'Finish with mean steps: {np.mean(stepsRecorder)} in {play_eps} episodes')

# Task 1
Develop a TD(0) controller using:

<ul>
<li>on-policy SARSA</li>
<li>on-policy expected SARSA</li>
<li>off-policy expected SARSA with a greedy control policy.</li>
</ul>

Compare the performance of your controllers.

In [None]:
class MountainCarTD0Agent(MountainCarBaseAgent):
    """One-step TD control agent with three selectable update rules.

    ``mode`` selects the training procedure:
      * ``'off_expected_sarsa'`` - target uses the maximum next-state value.
      * ``'sarsa'`` - target uses the value of the next action actually chosen.
      * anything else - target uses the epsilon-greedy expectation over
        next-state values (on-policy expected SARSA).
    """

    def __init__(self, env, mode='off_expected_sarsa', bin=20, num_episodes=1000, min_lr=0.1, epsilon=0.2, lr=1.0,
                 discount_factor=0.95, decay=0.25):
        """Forward the shared hyperparameters to the base agent and record ``mode``."""
        super().__init__(env, num_episodes, bin, min_lr, epsilon, lr,
                         discount_factor, decay)
        self.mode = mode

    def train(self):
        """Run the training loop that corresponds to ``self.mode``."""
        if self.mode == 'off_expected_sarsa':
            trainer = self.train_off_expected_sarsa
        elif self.mode == 'sarsa':
            trainer = self.train_sarsa
        else:
            # Any unrecognized mode falls back to on-policy expected SARSA.
            trainer = self.train_expected_sarsa
        trainer()

    def train_off_expected_sarsa(self):
        """Train for ``num_episodes`` episodes using the greedy (max) target."""
        for _ in tqdm(range(self.num_episodes)):
            current = self.discretize_state(self.env.reset())
            # The learning rate is decayed once per episode.
            self.learning_rate = self.get_learning_rate()
            while True:
                chosen = self.choose_action(current)
                observation, reward, finished, _info = self.env.step(chosen)
                successor = self.discretize_state(observation)
                self.update_off_expected_sarsa_Q(current, chosen, reward, successor)
                if finished:
                    break
                current = successor

    def train_sarsa(self):
        """Train for ``num_episodes`` episodes using the SARSA target."""
        for _ in tqdm(range(self.num_episodes)):
            current = self.discretize_state(self.env.reset())
            # The learning rate is decayed once per episode.
            self.learning_rate = self.get_learning_rate()
            chosen = self.choose_action(current)
            finished = False
            while not finished:
                observation, reward, finished, _info = self.env.step(chosen)
                successor = self.discretize_state(observation)
                # The next action is selected before the update because the
                # SARSA target is the value of that very action.
                upcoming = self.choose_action(successor)
                self.update_sarsa_Q(current, chosen, reward, successor, upcoming)
                current, chosen = successor, upcoming

    def train_expected_sarsa(self):
        """Train for ``num_episodes`` episodes using the epsilon-greedy expected target."""
        for _ in tqdm(range(self.num_episodes)):
            current = self.discretize_state(self.env.reset())
            # The learning rate is decayed once per episode.
            self.learning_rate = self.get_learning_rate()
            while True:
                chosen = self.choose_action(current)
                observation, reward, finished, _info = self.env.step(chosen)
                successor = self.discretize_state(observation)
                self.update_expected_sarsa_Q(current, chosen, reward, successor)
                if finished:
                    break
                current = successor

    def update_sarsa_Q(self, state, action, reward, next_state, next_action):
        """Move ``Q[state][action]`` toward ``reward + gamma * Q[next_state][next_action]``."""
        q_row = self.Q[state]
        td_target = reward + self.discount_factor * self.Q[next_state][next_action]
        q_row[action] += self.learning_rate * (td_target - q_row[action])

    def update_off_expected_sarsa_Q(self, state, action, reward, next_state):
        """Move ``Q[state][action]`` toward ``reward + gamma * max(Q[next_state])``."""
        q_row = self.Q[state]
        # Greedy target policy: bootstrap from the best next-state value.
        td_target = reward + self.discount_factor * np.max(self.Q[next_state])
        q_row[action] += self.learning_rate * (td_target - q_row[action])

    def update_expected_sarsa_Q(self, state, action, reward, next_state):
        """Move ``Q[state][action]`` toward the epsilon-greedy expected next value.

        The greedy action is weighted ``(1 - epsilon) + epsilon / n_actions``
        and every other action ``epsilon / n_actions``.
        """
        n_actions = self.env.action_space.n
        greedy_weight = 1 - self.epsilon
        explore_weight = self.epsilon * (1 / n_actions)
        next_values = self.Q[next_state]
        greedy_action = np.argmax(next_values)

        # Start from the greedy action's contribution, then add the
        # remaining actions in index order.
        expectation = (greedy_weight + explore_weight) * next_values[greedy_action]
        for candidate in range(n_actions):
            if candidate == greedy_action:
                continue
            expectation += explore_weight * next_values[candidate]

        q_row = self.Q[state]
        td_target = reward + self.discount_factor * expectation
        q_row[action] += self.learning_rate * (td_target - q_row[action])

In [None]:
# Shared environment and hyperparameters for the TD(0) experiments.
env = gym.make('MountainCar-v0')
# Grid resolution for state discretization; the name mirrors the agents'
# `bin` constructor parameter.  NOTE(review): shadows the builtin `bin`.
bin = 20
# Number of training episodes handed to each agent.
num_episodes=5000
# Floor below which the decayed learning rate is not allowed to drop.
min_lr=0.1
# Probability of a random exploratory action during training.
epsilon=0.2
# Initial learning rate.
lr=1.0
# Discount applied to bootstrapped future values.
discount_factor=0.95
# Passed to the agents as `decay`: fraction of the learning rate removed
# at the start of every training episode.
lr_decay=0.25

In [None]:
# Train and evaluate the off-policy expected SARSA (greedy target) agent.
print('train using off expected SARSA learning:')
env.reset()
# `bin` is forwarded explicitly so the configured discretization resolution
# reaches the agent instead of being replaced by the constructor default.
agent = MountainCarTD0Agent(env, mode='off_expected_sarsa',
                            bin=bin,
                            num_episodes=num_episodes,
                            min_lr=min_lr, epsilon=epsilon,
                            lr=lr, discount_factor=discount_factor,
                            decay=lr_decay)
agent.train()
# Greedy evaluation with the default number of play episodes.
agent.run_episodes()

  0%|          | 5/5000 [00:00<01:47, 46.67it/s]

train using off expected SARSA learning:


100%|██████████| 5000/5000 [01:41<00:00, 49.18it/s]


Finish with mean steps: -167.186 in 500 episodes


In [None]:
# Train and evaluate the on-policy SARSA agent.
print('train using SARSA: ')
env.reset()
# `bin` is forwarded explicitly so the configured discretization resolution
# reaches the agent instead of being replaced by the constructor default.
agent = MountainCarTD0Agent(env, mode='sarsa',
                            bin=bin,
                            num_episodes=num_episodes,
                            min_lr=min_lr, epsilon=epsilon,
                            lr=lr, discount_factor=discount_factor,
                            decay=lr_decay)
agent.train()
# Greedy evaluation with the default number of play episodes.
agent.run_episodes()

  0%|          | 5/5000 [00:00<01:40, 49.51it/s]

train using SARSA: 


100%|██████████| 5000/5000 [01:29<00:00, 56.03it/s]


Finish with mean steps: -155.772 in 500 episodes


In [None]:
# Train and evaluate the on-policy expected SARSA agent.
print('train using expected SARSA: ')
env.reset()
# `bin` is forwarded explicitly so the configured discretization resolution
# reaches the agent instead of being replaced by the constructor default.
agent = MountainCarTD0Agent(env, mode='expected_sarsa',
                            bin=bin,
                            num_episodes=num_episodes,
                            min_lr=min_lr, epsilon=epsilon,
                            lr=lr, discount_factor=discount_factor,
                            decay=lr_decay)
agent.train()
# Greedy evaluation with the default number of play episodes.
agent.run_episodes()

  0%|          | 4/5000 [00:00<02:13, 37.51it/s]

train using expected SARSA: 


100%|██████████| 5000/5000 [01:40<00:00, 49.98it/s]


Finish with mean steps: -135.36 in 500 episodes


|                           | Score  |
|---------------------------|--------|
| SARSA                     | -155.8 |
| on-policy expected SARSA  | -135.4 |
| off-policy expected SARSA | -167.2 |

On-policy expected SARSA achieves a higher score than SARSA, which is expected in general because it eliminates the variance introduced by the random action selection in SARSA. Off-policy expected SARSA with a greedy target policy behaves exactly the same as Q-learning, and it gets the worst result. On-policy often suffers from the diversity of the policy, resulting in not enough sampling of Q. The gap might also be due to the choice of hyperparameters.

# Task 2
Code controllers for TD(2), TD(3), and TD(4) using n-SARSA. Assess performance and compare against TD(0) and each other.

In [None]:
class MountainCarTDnAgent(MountainCarBaseAgent):
    """n-step SARSA agent built on the discretized Q-table of the base agent.

    States, actions and rewards for the most recent time steps are kept in
    three dicts keyed by ``time_index % (n + 1)``, so at most ``n + 1``
    entries per dict are live at any moment.
    """

    def __init__(self, env, n, bin=20, num_episodes=1000, discount_factor=0.95, min_lr=0.1, lr=1.0,
                 decay=0.25, epsilon=0.2):
        """Forward shared hyperparameters to the base agent and set the step count ``n``."""
        super().__init__(env, num_episodes, bin, min_lr, epsilon, lr,
                         discount_factor, decay)
        # Number of reward steps accumulated before bootstrapping from Q.
        self.n = n
        # Storage indexed by time step modulo (n + 1); created once here and
        # overwritten in place across episodes.
        self.state_store = {}
        self.action_store = {}
        self.reward_store = {}

    def train(self):
        """Run ``num_episodes`` episodes of n-step SARSA, updating ``Q`` in place."""

        for ep in tqdm(range(self.num_episodes)):
            # T is the episode length, unknown (infinite) until `done` is seen.
            T = np.inf
            # tau is the time step whose Q-value is being updated.
            tau = 0
            # t is incremented at the top of the loop, so the first step is t = 0.
            t = -1

            state = self.env.reset()

            state = self.discretize_state(state)
            action = self.choose_action(state)
            self.state_store[0] = state
            self.action_store[0] = action

            # The learning rate is decayed once per episode.
            self.learning_rate = self.get_learning_rate()

            # Continue until the update for the last pre-terminal step is done.
            while tau < (T - 1):
                t += 1
                if t < T:
                    # Still inside the episode: act and record S_{t+1}, R_{t+1}.
                    state, reward, done, _ = self.env.step(action)
                    state = self.discretize_state(state)
                    self.state_store[(t+1) % (self.n+1)] = state
                    self.reward_store[(t+1) % (self.n+1)] = reward


                    if done:
                        # Episode ended at step t + 1; no further action is chosen.
                        T = t + 1
                    else:
                        action = self.choose_action(state)
                        self.action_store[(t+1) % (self.n+1)] = action
                # The step being updated lags n - 1 steps behind t.
                tau = t - self.n + 1

                if tau >= 0:
                    # Discounted sum of the rewards R_{tau+1} .. R_{min(tau+n, T)}.
                    G = np.sum([self.discount_factor**(i-tau-1) * self.reward_store[i % (self.n+1)] for i in range(tau+1, min(tau+self.n, T)+1)])
                    if tau + self.n < T:
                        # Episode has not ended within n steps: bootstrap from
                        # the stored Q-value of the state-action at tau + n.
                        state_tau = self.state_store[(tau + self.n) % (self.n+1)]
                        action_tau = self.action_store[(tau + self.n) % (self.n+1)]
                        G += (self.discount_factor ** self.n) * self.Q[state_tau][action_tau]
                    # Move Q(S_tau, A_tau) toward the n-step return G.
                    state_tau, action_tau = self.state_store[tau % (self.n+1)], self.action_store[tau % (self.n+1)]
                    self.Q[state_tau][action_tau] += self.learning_rate * (G-self.Q[state_tau][action_tau])


In [None]:
# Shared environment and hyperparameters for the n-step SARSA experiments.
env = gym.make('MountainCar-v0')
# Grid resolution for state discretization; the name mirrors the agents'
# `bin` constructor parameter.  NOTE(review): shadows the builtin `bin`.
bin = 20
# Number of training episodes handed to each agent.
num_episodes=5000
# Floor below which the decayed learning rate is not allowed to drop.
min_lr=0.1
# Probability of a random exploratory action during training.
epsilon=0.2
# Initial learning rate.
lr=1.0
# Discount applied to bootstrapped future values.
discount_factor=0.95
# Passed to the agents as `decay`: fraction of the learning rate removed
# at the start of every training episode.
lr_decay=0.25

In [None]:
# Train and evaluate the 2-step SARSA agent.
print('train using n=2: ')
env.reset()
# `bin` is forwarded explicitly so the configured discretization resolution
# reaches the agent instead of being replaced by the constructor default.
agent = MountainCarTDnAgent(env, n=2,
                            bin=bin,
                            num_episodes=num_episodes,
                            min_lr=min_lr, epsilon=epsilon,
                            lr=lr, discount_factor=discount_factor,
                            decay=lr_decay)
agent.train()
# Greedy evaluation with the default number of play episodes.
agent.run_episodes()

  0%|          | 5/5000 [00:00<01:52, 44.51it/s]

train using n=2: 


100%|██████████| 5000/5000 [01:53<00:00, 44.03it/s]


Finish with mean steps: -155.938 in 500 episodes


In [None]:
# Train and evaluate the 3-step SARSA agent.
print('train using n=3: ')
env.reset()
# `bin` is forwarded explicitly so the configured discretization resolution
# reaches the agent instead of being replaced by the constructor default.
agent = MountainCarTDnAgent(env, n=3,
                            bin=bin,
                            num_episodes=num_episodes,
                            min_lr=min_lr, epsilon=epsilon,
                            lr=lr, discount_factor=discount_factor,
                            decay=lr_decay)
agent.train()
# Greedy evaluation with the default number of play episodes.
agent.run_episodes()

  0%|          | 5/5000 [00:00<01:53, 43.99it/s]

train using n=3: 


100%|██████████| 5000/5000 [01:48<00:00, 46.05it/s]


Finish with mean steps: -162.28 in 500 episodes


In [None]:
# Train and evaluate the 4-step SARSA agent.
print('train using n=4: ')
env.reset()
# `bin` is forwarded explicitly so the configured discretization resolution
# reaches the agent instead of being replaced by the constructor default.
agent = MountainCarTDnAgent(env, n=4,
                            bin=bin,
                            num_episodes=num_episodes,
                            min_lr=min_lr, epsilon=epsilon,
                            lr=lr, discount_factor=discount_factor,
                            decay=lr_decay)
agent.train()
# Greedy evaluation with the default number of play episodes.
agent.run_episodes()

  0%|          | 5/5000 [00:00<01:54, 43.58it/s]

train using n=4: 


100%|██████████| 5000/5000 [01:50<00:00, 45.14it/s]


Finish with mean steps: -156.316 in 500 episodes


|                           | Score  |
|---------------------------|--------|
| TD(2)                     | -155.9 |
|  TD(3)                    | -162.3 |
|  TD(4)                    | -156.3 |

Overall, the performance of the three n-step SARSA controllers does not vary much. Compared with TD(0), n-step SARSA achieves results similar to one-step SARSA, but on-policy expected SARSA still outperforms the other methods (-135.4 versus roughly -156 to -162).