In [None]:
!pip install gym

In [1]:

# --- SANTIAGO BOBADILLA
# --- 13/10/2021
# ___________________________



# --- Libraries.
# ______________

import logging
import threading

import gym
import numpy as np
import matplotlib.pyplot as plt



# --- Thread Log.
# _______________

format = "%(asctime)s: %(message)s"
logging.basicConfig(format = format, level = logging.INFO, datefmt="%H:%M:%S")



# --- Q Learning Multi Thread.
# ______________________________

class QL (threading.Thread):



    # --- Static variables of the monitor.
    # ____________________________________

    # .. Lock for concurrency and counter for tests.

    _counter = 0
    _lock = threading.Lock()

    # .. Q Table and common parameters.
    #       .. Each thread has his environment, but we created an initial reference for the dimensions of the Q Table.

    _ref_inicial = gym.make("MountainCar-v0")
    _ref_inicial.reset()

    DESCRETE_OS_SIZE = [40] * len(_ref_inicial.observation_space.high)
    DESCRETE_OS_WIN_SIZE  = (_ref_inicial.observation_space.high - _ref_inicial.observation_space.low) / DESCRETE_OS_SIZE

    q_table = np.random.uniform(low = -2, high = 0, size = ( DESCRETE_OS_SIZE  + [_ref_inicial.action_space.n] ))

    LEARNING_RATE = 0.1 
    DISCOUNT = 0.95



    # --- Constructor of each thread
    # _______________________________

    def __init__ (self, id_thread: int):

        # .. ID's

        super().__init__()
        self.id_thread = id_thread

        # .. Environment

        self.env =gym.make("MountainCar-v0")
        self.env.reset()

        # .. Algorithm parameters

        self.EPISODES = 1000
        self.SHOW_EVERY = 100

        self.epsilon = 0.5
        self.START_EPSILON_DECAY = 1
        self.END_EPSILON_DECAY = self.EPISODES // 2
        self.epsilon_decay_value = self.epsilon / (self.END_EPSILON_DECAY - self.START_EPSILON_DECAY)

        # .. Result parameters

        self.eps_reward = []
        self.aggr_ep_reward = { 'eps' : [], 'avg' : [], 'min' : [], 'max' : [] }

        # .. Local own information

        self.discrete_state = 0
        self.episode_reward = 0
        self.discrete_state = 0
        self.done = False
        self.action = 0
        self.new_state = 0
        self.reward = 0
        self.new_discrete_state = 0
        self._state = 0
        self.average_reward = 0



    # --- Helper function to fit values to the Q ~ Table
    # ___________________________________________________

    def get_descrete_state(self, state):
        self._state = (state - self.env.observation_space.low) / self.DESCRETE_OS_WIN_SIZE
        return tuple(self._state.astype(np.int))


    # --- RUN Q Learning
    # ___________________

    def Q_Learning(self):

        for episode in range(self.EPISODES):

            self.episode_reward = 0

            self.discrete_state = self.get_descrete_state(self.env.reset())
            self.done = False

            while not self.done:

                if np.random.random() > self.epsilon:

                    # .. Concurrency lock
                    ## -->
                    QL._lock.acquire()

                    self.action = np.argmax(QL.q_table[self.discrete_state])

                    QL._lock.release()
                    # # <---

                else:                                                                                               
                    self.action = np.random.randint(0, self.env.action_space.n)

                self.new_state, self.reward, self.done, _ = self.env.step(self.action)
                self.episode_reward += self.reward
                self.new_discrete_state = self.get_descrete_state(self.new_state)

                # .. Concurrency lock
                # -->
                QL._lock.acquire()

                if not self.done:

                    new_q = (1-QL.LEARNING_RATE)* QL.q_table[self.discrete_state + (self.action,)] + QL.LEARNING_RATE * (self.reward + (np.max(QL.q_table[self.new_discrete_state]) * QL.DISCOUNT))
                    QL.q_table[self.discrete_state + (self.action,)] = new_q

                elif self.new_state[0] >= self.env.goal_position: 

                    print(f"We made it to the flag on episode {episode}")
                    QL.q_table[self.discrete_state + (self.action,)] = 0 

                QL._lock.release()
                # <---

                self.discrete_state = self.new_discrete_state

            if self.END_EPSILON_DECAY >= episode >= self.START_EPSILON_DECAY:
                self.epsilon -= self.epsilon_decay_value

            self.eps_reward.append(self.episode_reward)

            if not episode % self.SHOW_EVERY:

                average_reward = sum(self.eps_reward[-self.SHOW_EVERY:])/len(self.eps_reward[-self.SHOW_EVERY:])

                self.aggr_ep_reward['eps'].append(episode)
                self.aggr_ep_reward['avg'].append(average_reward)
                self.aggr_ep_reward['min'].append(min(self.eps_reward[-self.SHOW_EVERY:]))
                self.aggr_ep_reward['max'].append(max(self.eps_reward[-self.SHOW_EVERY:]))

                print(f"Thread {self.id_thread} --> Episode: {episode} Avg: {average_reward} Min: {min(self.eps_reward[-self.SHOW_EVERY:])} Max: {max(self.eps_reward[-self.SHOW_EVERY:])}" )

        self.env.close()

    # --- RUN 
    # _________

    def run(self):
        self.Q_Learning()


# --- MAIN
# _________

NUM_THREADS = 5

for i in range(NUM_THREADS):
    thread = QL(i)
    thread.start()



Thread 0 --> Episode: 0 Avg: -200.0 Min: -200.0 Max: -200.0
Thread 2 --> Episode: 0 Avg: -200.0 Min: -200.0 Max: -200.0
Thread 1 --> Episode: 0 Avg: -200.0 Min: -200.0 Max: -200.0
