# Reinforcement Learning exercise with Q-learning
by Mickey Krekels

The goal of this project is to learn how to implement Q-learning in combination with the game/simulation library Gym form OpenAI.

## Dowloading Libaries 


In [31]:
!pip install tensorflow
!pip install gym
!pip install keras
!pip install keras-rl2



## Loading imports


In [29]:
import gym
import numpy as np
import matplotlib.pyplot as plt

import random
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from collections import deque
from keras.models import Sequential
from keras.layers import InputLayer
from keras.layers import Dense

from os import system

# The Taxi game 

For this exercise, the game Taxi-v3 is going to be used from the gym library. <br> 

The goal of the game is to pick up passengers as the taxi player and bring them to the desired destination in the least amount of moves. <br> 

There are **6 actions** used in the game these include:
- [0] : go south
- [1] : go north
- [2] : go east 
- [3] : go west 
- [4] : pick up passenger
- [5] : drop off passenger





## Creating an costum environment

In [4]:
game_name = 'Taxi-v3'
env = gym.make(game_name)
action_space = env.action_space 
action_space.n

6

In [3]:
env = gym.make(game_name)
print(f'number of observation spaces: {env.observation_space}')
print(f'number of action spaces: {env.action_space.n}')

number of observation spaces: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
number of action spaces: 3


Testing the game environment

In [106]:
env = gym.make(game_name, render_mode='human') #<----- for rendering the environment

observation, _ = env.reset()
max_steps =20
for i in range(max_steps):
    action = env.action_space.sample()
    try:
        observation, reward, terminated, truncated, _ = env.step(action)
    except OverflowError:
        break
    
    if truncated:
        observation, _ = env.reset()

env.close()

Initial state of the environment when the reset method is called

In [None]:
env = gym.make(game_name)
print(env.reset())
env.close()

(array([-0.533374,  0.      ], dtype=float32), {})


The state of the environment when the step method is called (for example, a = 1).

In [108]:
env = gym.make(game_name)
env.reset()
print(env.step(1))
env.close()


(array([-5.8181584e-01,  4.3764384e-04], dtype=float32), -1.0, False, False, {})


Looking at the lenght of the options aviable

In [109]:
env = gym.make(game_name)
nb_actions = env.action_space.n
env.close()
nb_actions

3

Looking at the shape of the observation space

In [110]:
env = gym.make(game_name)
state_shape  = env.observation_space.shape
env.close()
state_shape

(2,)

## Using Q-learning 

In [55]:
env = gym.make(game_name) # 1. Load Environment and Q-table structure
Q = np.zeros([env.observation_space.n,env.action_space.n])
eta = .628
gma = .9
epis = 50000
rev_list = [] # rewards per episode calculate
for i in range(epis):
    s , _= env.reset()
    rAll = 0
    d = False
    j = 0
    while j < 99:
        j+=1
        a = np.argmax(Q[s,:] + np.random.randn(1,env.action_space.n)*(1./(i+1))) # 2. Choose an action by greedily (with noise) picking from Q table
        result = env.step(a) # collect new state & reward from environment
        s1 = result[0]
        r = result[1]
        d = result[2]
        Q[s,a] = Q[s,a] + eta*(r + gma*np.max(Q[s1,:]) - Q[s,a]) #Update Q-Table with new data
        rAll += r
        s = s1
        if d == True:
            break
    rev_list.append(rAll)
print("Reward Sum on all episodes " + str(sum(rev_list)/epis))
print("Final Values Q-Table")
print(Q)

Reward Sum on all episodes 7.27398
Final Values Q-Table
[[ 0.          0.          0.          0.          0.          0.        ]
 [-4.4888983  -4.77215384 -4.67332178 -4.77215384  1.62261467 -6.28      ]
 [-2.92267944 -3.22368354 -3.24928484 -3.22368354  7.7147     -6.28      ]
 ...
 [-2.76961539 -2.5877595  -2.76961539 -2.99686615 -6.28       -6.28      ]
 [-3.64923622 -3.43770778 -3.64923622 -4.04696578 -6.28       -6.28      ]
 [-1.2165616  -0.628      -0.628       8.87809126 -6.28       -6.28      ]]


In about 47 seconds, the model is trained to play Taxi-v3 game.

### testing the trained model 

In [61]:
env = gym.make(game_name, render_mode='human') #<----- for rendering the environment with visualisation
s,_ = env.reset()
d = False
while d != True:
    a = np.argmax(Q[s,:] + np.random.randn(1,env.action_space.n)*(1./(i+1))) # Choose action from Q table
    # Collect new state & reward from environment
    result = env.step(a)
    s1 = result[0]
    r = result[1]
    d = result[2]
    Q[s,a] = Q[s,a] + eta*(r + gma*np.max(Q[s1,:]) - Q[s,a]) #Update Q-Table with new data
    s = s1
env.close()

## Deep Q-learning

In [3]:
game_name = 'Taxi-v3'
env = gym.make(game_name)
discount_factor = 0.95
eps = 0.5
eps_decay_factor = 0.999
num_episodes=500

In [4]:
model = Sequential()
model.add(InputLayer(batch_input_shape=(1, env.observation_space.n)))
model.add(Dense(20, activation='relu'))
model.add(Dense(env.action_space.n, activation='linear'))
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

In [13]:
import time

for i in range(num_episodes):
    state = env.reset()
    eps *= eps_decay_factor
    j = 0
    done = False
    while j < 100:
        j+=1
        if np.random.random() < eps:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(model.predict(np.identity(env.observation_space.n)[state:state + 1]))
        # env.render()
        result = env.step(action)
        new_state = result[0]
        reward = result[1]
        done = result[2]
        target = reward + discount_factor * np.max(model.predict(np.identity(env.observation_space.n)[new_state:new_state + 1]))
        target_vector = model.predict(np.identity(env.observation_space.n)[state:state + 1])[0]
        target_vector[action] = target
        model.fit(np.identity(env.observation_space.n)[state:state + 1], target_vector.reshape(-1, env.action_space.n), epochs=1, verbose=0)
        state = new_state
        if done:
            print("Training done in episode: {}/{}, score: {}, e: {:.2}".format(i, num_episodes, j, eps))
            model.save('taxi_model.h5')
            break
    if i % 100 == 0:
        print("Episode {} finished after {} timesteps".format(i, j))
    if done:
        break

env.close()

Episode 0 finished after 100 timesteps
Training done in episode: 76/500, score: 29, e: 0.39


In [30]:
env = gym.make(game_name) #<----- for rendering the environment with visualisation
model.load_weights('taxi_model.h5')

s = env.reset()
d = False
while d != True:
    env.render(mode = "human")
    action = np.argmax(model.predict(np.identity(env.observation_space.n)[state:state + 1]))
    # Collect new state & reward from environment
    result = env.step(action)
    s1 = result[0]
    r = result[1]
    d = result[2]
    s = s1
    system('clear')
env.close()


+---------+
|R: | : :[35mG[0m|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

+---------+
|[43mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[43mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[43mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[43mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[43mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[43mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[43mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[43mR[0m: 

## Conclusion 

For this notebook exercise, I used Q-learning to train an AI to play the 'Taxi-v3' game that is available on the OpenAI gym. The code was simple to implement and most of the problems that occurred were caused by outdated documentation. <br>

This exercise showed me that the Gym library an excellent tool is for learning reinforcement learning. This newly learned knowledge will be used for my open program project!