# Setup the requirement libraries

In [None]:
!pip install gym
!pip install numpy

# Setup an environment

In [5]:
def setup_environment(env_name):
    import gym
    env = gym.make(env_name).env
    env.reset()  # reset environment to a new, random state
    env.render()
    print("Action Space {}".format(env.action_space))
    print("State Space {}".format(env.observation_space))
    return env

## Choose your environment from the available environments below

In [6]:
environment_names=["Taxi-v3","FrozenLake-v1","CliffWalking-v0"]
env=setup_environment(environment_names[0])
env.render()

  logger.warn(


Action Space Discrete(6)
State Space Discrete(500)


# Try to take random actions to achieve the goal

In [7]:
def random_action_to_end(env):
    epochs = 0
    penalties, reward = 0, 0
    frames = []  # for animation
    done = False
    while not done:
      # automatically selects one random action
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        if reward == -10:
            penalties += 1

        # Put each rendered frame into dict for animation
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
            }
        )
        epochs += 1

    print("Timesteps taken: {}".format(epochs))
    print("Penalties incurred: {}".format(penalties))
    return frames

In [8]:
def print_frames(frames):
    from IPython.display import clear_output
    from time import sleep
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        #print(frame['frame'].getvalue())
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)

In [13]:
frames=random_action_to_end(env)

Timesteps taken: 96
Penalties incurred: 19


In [14]:
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

Timestep: 96
State: 410
Action: 5
Reward: 20


# Train the agent using Q-learning algorithm

In [24]:
import random
from IPython.display import clear_output
import numpy as np
def train_the_agent(env,alpha,gamma,epsilon,training_steps,decay_steps,decay=False):
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    alpha_decay=1e-3
    gamma_decay=1e-3
    epsilon_decay=1e-3

    for i in range(1,training_steps ):
        state = env.reset()
        epochs, penalties, reward, = 0, 0, 0
        done = False
        if decay and not i % decay_steps :
            alpha-=alpha_decay
            gamma-=gamma_decay
            epsilon-=epsilon_decay
            print(alpha,gamma,epsilon)
        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample() # Explore action space
            else:
                action = np.argmax(q_table[state]) # Exploit learned values

            next_state, reward, done, info = env.step(action)

            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])

            new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            q_table[state, action] = new_value

            if reward == -10:
                penalties += 1

            state = next_state
            epochs += 1

        if i % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {i}")

    print("Training finished.\n")
    return q_table

In [54]:
q_table=train_the_agent(env,alpha=0.1,gamma=0.6,epsilon=0.1,decay_steps=10000,training_steps=100001,decay=True)

Episode: 100000
Training finished.



# Evaluation
Evaluate agent's performance after Q-learning

In [77]:
def evaluate(q_table,episodes):
    print("Start evaluation")
    total_epochs, total_penalties = 0, 0
    for _ in range(episodes):
        # Choose random initial state
        state = env.reset()
        epochs, penalties, reward = 0, 0, 0
        done = False
        frames=[]
        while not done:
            action = np.argmax(q_table[state])
            # print(action)
            state, reward, done, info = env.step(action)
            # Put each rendered frame into dict for animation
            frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
            }
            )

            if reward == -10:
                penalties += 1

            epochs += 1

        total_penalties += penalties
        total_epochs += epochs
    print_frames(frames)
    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {total_epochs / episodes}")
    print(f"Average penalties per episode: {total_penalties / episodes}")
    return total_epochs,total_penalties

In [79]:
total_epochs,total_penalties=evaluate(q_table=q_table,episodes=100)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Timestep: 9
State: 475
Action: 5
Reward: 20
Results after 100 episodes:
Average timesteps per episode: 13.16
Average penalties per episode: 0.0


# Grid search to get the best hyperparameter

In [43]:
alpha=[0.1,0.3,0.5,0.9]
gamma=[0.3,0.6,0.8,0.9]
epsilon=[0.7,0.6,0.8,0.9]
best_alpha,best_gamma,best_epsilon=0,0,0
mini_penalties=999999999999999999
mini_epochs=9999999999999999
for ep in epsilon :
    for al in alpha :
        for gm in gamma :
            returned_q_table=train_the_agent(env=env,alpha=al,gamma=gm,epsilon=ep,decay_steps=1000,training_steps=10000,decay=False)
            total_epochs,total_penalties=evaluate(returned_q_table,100)
            if total_penalties<=mini_penalties:
                mini_penalties=total_penalties
                best_alpha=al
                best_gamma=gm
                best_epsilon=ep
            if  total_epochs<=mini_epochs:
                total_epochs=mini_epochs
                best_alpha=al
                best_gamma=gm
                best_epsilon=ep
print(best_alpha,best_gamma,best_epsilon)

Episode: 9900
Training finished.

Start evaluation
Results after 100 episodes:
Average timesteps per episode: 13.09
Average penalties per episode: 0.0
0.9 0.9 0.9


# Train with the best hyper parameters

In [57]:
best_q_table=train_the_agent(env,best_alpha,best_gamma,best_epsilon,training_steps=100000,decay_steps=10000,decay=True)

Episode: 99900
Training finished.



# Visualization of trained model over number of episodes

In [59]:
total_epochs, total_penalties = 0, 0
episodes = 500
for episode in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0

    done = False
    frames=[]
    while not done:
        action = np.argmax(best_q_table[state])
        state, reward, done, info = env.step(action)
        # Put each rendered frame into dict for animation
        frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
        )
        if reward == -10:
            penalties += 1

        epochs += 1
    total_penalties += penalties
    total_epochs += epochs
    clear_output(wait=False)
    print_frames(frames=frames)
    print(f"Episode {episode}")

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

Timestep: 12
State: 410
Action: 5
Reward: 20
Episode 499
Results after 500 episodes:
Average timesteps per episode: 12.97
Average penalties per episode: 0.0
