# Mountain Car QL

[Reference](https://ha-nguyen-39691.medium.com/playing-mountain-car-with-q-learning-and-sarsa-4e7327f9e35c)

In [13]:
import gym
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import random


In [22]:
N_EPS = 100000
ALPHA = 0.1
GAMMA = 0.9
EPSILON = 0.2
INTERVAL = 1000
STEPS = 30

Q_LEARNING = "QLEARNING"
SARSA = "SARSA"

ENV_NAME = "MountainCar-v0"

In [21]:
class QTable:
    def __init__(
        self,
        steps=STEPS,
        agent_type=Q_LEARNING,
        alpha=ALPHA,
        gamma=GAMMA,
        env_name=ENV_NAME,
    ):
        shape = (steps, steps)

        env = gym.make(env_name)

        action_space_n = env.action_space.n
        shape += action_space_n

        self.q_table = np.zeros(shape)

        self.env_low = env.observation_space.low
        self.env_high = env.observation_space.high
        self.discretized_env = (self.env_high - self.env_low) / steps

        self.steps = steps
        self.agent_type = agent_type

        self.alpha = alpha
        self.gamma = gamma

    def _discretize(self, state):
        discretized_state = (state - self.env_low) / self.discretized_env
        discretized_state.astype(int)

        return discretized_state

    def _q_func(self, reward, q_current_value, q_forward_value):
        alpha = self.alpha
        gamma = self.gamma

        q_value = q_current_value + alpha * (
            reward + gamma * q_forward_value - q_current_value
        )
        return q_value

    def _get_sub_table(self, state):
        q_sub_table = self.q_table

        discrete_state = self._discretize(state)

        for index in discrete_state:
            q_sub_table = q_sub_table[index]

        return q_sub_table

    def get_value(self, state, action=None):
        q_value = self._get_sub_table(state)

        if action is not None:
            q_value = q_value[action]

        return q_value

    def _update_value(self, state, action, update_value):
        q_sub_table = self._get_sub_table(state)

        q_sub_table[action] = update_value

    def update_table(self, state, forward_state, action, reward):
        agent_type = self.agent_type

        q_current = self.get_value(state, action)
        q_forward = self.get_value(forward_state)
        
        if agent_type == Q_LEARNING:
            q_forward = np.max(q_forward)
        else:
            q_forward = np.average(q_forward)

        update_value = self._q_func(reward, q_current, q_forward)
        self._update_value(state, action, update_value)


In [15]:
# space discretization
def getState(state, env_low = env_low, env_high = env_high, bins = bins):
    """Returns the discretized position and velocity of an observation"""
    discretized_env = (env_high - env_low) / bins
    discretized_pos = int((state[0] - env_low[0]) / discretized_env[0])
    discretized_vel = int((state[1] - env_low[1]) / discretized_env[1])
    return discretized_pos, discretized_vel

In [17]:
# choose action
def chooseAction(pos, vel, q_table, epsilon = epsilon):
    """Choose action based on an epsilon greedy strategy"""
    if random.random() < epsilon: # explore
        action = env.action_space.sample()
    else: # exploit
        action = np.argmax(q_table[pos][vel])
    return action

In [18]:
env = gym.make(ENV_NAME)

q_table_q = QTable()
rewards_q = []  # training

for ep in range(N_EPS):
    state = env.reset()
    current_reward = 0
    done = False  # discretize the state

    while not done:
        # render for the last 10 episodes
        if ep >= (N_EPS - 10):
            env.render()  # next action
        action = np.argmax(q_table_q.get_value(state))
        next_state, reward, done, info = env.step(action)
        # discretize next state
        next_pos, next_vel = getState(next_state)
        if done and next_state[0] >= env.goal_position:
            q_table_q._update_value(next_state, action, reward)

        else:
            # update Q value: Q(S, A) <-- Q(S, A) + alpha [R + gamma * Q(S', A') - Q(S, A)]
            q_table_q.update_table(state, next_state, action, reward)

        # reassign state, action, reward
        state = next_state
        current_reward += reward  # update EPSILON
    if EPSILON > 0:
        EPSILON *= (N_EPS - 2) / N_EPS

    # periodically print out result
    if ep % INTERVAL == 0:
        print("Game no.: ", ep, "EPSILON: ", EPSILON, "with reward: ", current_reward)
    rewards_q.append(current_reward)
    env.close()


Game no.:  0 epsilon:  0.19999600003999962 with reward:  -200.0
Game no.:  10000 epsilon:  0.16374287572423812 with reward:  -154.0
Game no.:  20000 epsilon:  0.1340613279519637 with reward:  -200.0
Game no.:  30000 epsilon:  0.10976013199201175 with reward:  -148.0
Game no.:  40000 epsilon:  0.08986399552315864 with reward:  -169.0
Game no.:  50000 epsilon:  0.07357441672877993 with reward:  -168.0
Game no.:  60000 epsilon:  0.060237637615224854 with reward:  -176.0
Game no.:  70000 epsilon:  0.04931840640802185 with reward:  -146.0
Game no.:  80000 epsilon:  0.04037849601877588 with reward:  -149.0
Game no.:  90000 epsilon:  0.033059116453387716 with reward:  -166.0


error: display Surface quit

### Professor recommended to move on to Double DQN as continuous MountainCar is difficult to discretize and requires a lot of training