# Q-Learning (Q-table)
 Erster Versuch einer Implementierung von Q-Learning
### Mithilfe des Tutorials von https://www.datacamp.com/tutorial/introduction-q-learning-beginner-tutorial

 Die wichtigsten Begriffe:
 - Environment  : Die Umgebung, welche auf die Aktionen des Agent reagiert
 - Agent        : Die Instanz, welche innerhalb des Environment handelt und Actions ausführt
 - State        : Der momentane Zustand des Environment
 - Reward       : Die Rückmeldung des Environment darauf, ob eine Action gute oder schlechte Resultate erbracht hat (reward/penalty)
 - Action       : Eine vom Agent ausgeführte Aktion, welche dann Einfluss auf das Environment und die weiteren Actions hat

 ![Reinforcement Learning](pics/RL_overview.webp)
 

## Install, initialize and import stuff

In [1]:
%%capture 
%pip install pyglet==1.5.1
%pip install PyOpenGL   # %apt install python-opengL
%pip install imageio[ffmpeg]    #%apt install ffmpeg
# %apt install xvfb
# %pip3 install pyvirtualdisplay


In [2]:
# use this if used with WSL (for gif)
# import os
# from pyvirtualdisplay import Display
# virtual_display = Display(visible=True, size=(1400,900))
# os.environ["DISPLAY"] = "localhost:0.0"  # Use "localhost:0.0" for Xming
# virtual_display.start()


In [3]:
# Core dependencies of this notebook.
# gym is pinned to 0.24: the training and evaluation code below unpacks env.step()
# into four values and uses the return value of env.reset() directly as the state.
%pip install gym==0.24
%pip install pygame
%pip install numpy
%pip install imageio imageio_ffmpeg
# tqdm provides the progress bar (trange) used by the training loop.
%pip install tqdm


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.



In [4]:
import numpy as np
import gym
import random
import imageio
from tqdm.notebook import trange



## Actual code

In [16]:
# Create the 4x4 FrozenLake environment with slipping disabled (is_slippery=False),
# reset it once and render the initial state.
env = gym.make("FrozenLake-v1",map_name="4x4",is_slippery=False)
env.reset()
env.render()
# The observation space is the set of all states the agent can observe in the environment.
print("Observation Space", env.observation_space)
# A randomly drawn state from the observation space.
print("Sample observation", env.observation_space.sample())

Observation Space Discrete(16)
Sample observation 2


In [6]:
# `action_space.n` is the number of discrete actions (printed under the label "Shape").
print("Action Space Shape", env.action_space.n)
# A randomly drawn action from the action space.
print("Action Space Sample", env.action_space.sample())

Action Space Shape 4
Action Space Sample 2


### Create and Initialize the Q-table


In [7]:
# Number of discrete states of the environment (rows of the Q-table).
state_space = env.observation_space.n
print("There are ", state_space, " possible states")

# Number of discrete actions of the environment (columns of the Q-table).
action_space = env.action_space.n
print("There are ", action_space, " possible actions")

# Q-table of shape (state_space, action_space), initialised with zeros;
# entry [s][a] holds the current value estimate for taking action a in state s.
Qtable_frozenlake = np.zeros((state_space, action_space))


There are  16  possible states
There are  4  possible actions


### Policies

In [8]:
def epsilon_greedy_policy(Qtable, state, epsilon):
  """
  Acting policy: pick an action for `state` with epsilon-greedy selection.

  1. Draws a random float between 0 and 1.
  2. If it is greater than `epsilon` -> exploitation: the action with the
     highest Q-value for the current state.
  3. Otherwise -> exploration: a uniformly random action index.

  The random action is sampled from the action dimension of the Q-table itself
  (len(Qtable[state])), so the function depends only on its arguments and not
  on a notebook-level environment object.

  Args:
    Qtable: table indexed as Qtable[state][action].
    state: index of the current state (row of the Q-table).
    epsilon: exploration threshold compared against the random draw.

  Returns:
    Index of the chosen action.
  """
  action_values = Qtable[state]
  if random.uniform(0, 1) > epsilon:
    # Exploit: best known action for this state.
    return np.argmax(action_values)
  # Explore: every action index of this state is equally likely.
  return random.randrange(len(action_values))

def greedy_policy(Qtable, state):
  """
  Updating policy: always exploit.

  Looks up the Q-values stored for `state` and returns the index of the
  largest one, i.e. the action currently estimated to be the most rewarding.
  """
  return np.argmax(Qtable[state])

### Hyperparameters

In [9]:
# Hyperparameters, finetuning will give better results

# Training parameters
n_training_episodes = 10000  # number of episodes the training loop runs
learning_rate = 0.7          # step size of each Q-value update

# Evaluation parameters -> one evaluation episode is one try from the start until termination (hole or goal)
n_eval_episodes = 100        # number of evaluation episodes

# Environment parameters
env_id = "FrozenLake-v1"     # environment name (the cells visible here create the env from a literal id instead)
max_steps = 99               # upper bound on the number of steps per episode
gamma = 0.95                 # discount factor applied to the best Q-value of the next state
eval_seed = []               # per-episode seeds for evaluation; empty -> episodes are reset without a seed

# Exploration parameters
max_epsilon = 1.0            # epsilon used for the first training episode
min_epsilon = 0.05           # value that epsilon decays towards
decay_rate = 0.0005          # exponential decay rate of epsilon per episode

### Model Training

In [10]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable,
          learning_rate=None, gamma=None):
  """
  Train a Q-table with tabular Q-learning and an exponentially decaying epsilon.

  For every episode epsilon is recomputed as
  min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode),
  the environment is reset, and at most `max_steps` epsilon-greedy steps are
  taken. After each step the visited entry is updated with
  Q[s][a] += learning_rate * (reward + gamma * max(Q[s']) - Q[s][a]).

  Args:
    n_training_episodes: number of episodes to run.
    min_epsilon, max_epsilon, decay_rate: parameters of the epsilon schedule.
    env: environment whose reset() returns the initial state and whose step()
      returns (new_state, reward, done, info).
    max_steps: upper bound on steps per episode.
    Qtable: table indexed as Qtable[state][action]; it is updated in place.
    learning_rate: step size of the update. If None, the notebook-level
      variable `learning_rate` is used.
    gamma: discount factor. If None, the notebook-level variable `gamma`
      is used.

  Returns:
    The same `Qtable` object after training.
  """
  # Backward compatibility: existing calls pass only the first seven arguments
  # and rely on the hyperparameters defined at notebook level.
  if learning_rate is None:
    learning_rate = globals()["learning_rate"]
  if gamma is None:
    gamma = globals()["gamma"]

  for episode in trange(n_training_episodes):
    # Exploration shrinks from max_epsilon towards min_epsilon as training progresses.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    state = env.reset()

    for _ in range(max_steps):
      action = epsilon_greedy_policy(Qtable, state, epsilon)
      new_state, reward, done, _info = env.step(action)

      # Move the current estimate towards the bootstrapped target.
      td_target = reward + gamma * np.max(Qtable[new_state])
      Qtable[state][action] = Qtable[state][action] + learning_rate * (td_target - Qtable[state][action])

      # If done, finish the episode
      if done:
        break

      # Our state is the new state
      state = new_state
  return Qtable

In [11]:
# Train the agent. train() updates the passed table in place and returns it, so
# re-running this cell continues from the already trained Q-table instead of from zeros.
Qtable_frozenlake = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable_frozenlake)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [12]:
# Show the learned Q-table (one row per state, one column per action).
Qtable_frozenlake

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.77378094, 0.77378094],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

### Evaluation

In [13]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
  """
  Run the greedy policy defined by `Q` and summarise the episode returns.

  Each of the `n_eval_episodes` episodes starts with a reset of `env`
  (seeded with seed[episode] when `seed` is non-empty, unseeded otherwise)
  and then follows the highest-valued action of the current state for at
  most `max_steps` steps, summing up the rewards until `done` is reported.

  Returns:
    (mean_reward, std_reward): mean and standard deviation of the
    per-episode reward sums.
  """
  returns_per_episode = []

  for episode_idx in range(n_eval_episodes):
    # A falsy `seed` (e.g. an empty list) means: reset without seeding.
    state = env.reset(seed=seed[episode_idx]) if seed else env.reset()
    episode_return = 0

    for _ in range(max_steps):
      # Always take the action with the highest stored value for this state.
      best_action = np.argmax(Q[state][:])
      state_next, reward, done, _ = env.step(best_action)
      episode_return += reward

      if done:
        break
      state = state_next

    returns_per_episode.append(episode_return)

  return np.mean(returns_per_episode), np.std(returns_per_episode)

In [14]:
# Evaluate the trained agent with the greedy policy. eval_seed is empty, so every
# evaluation episode is reset without an explicit seed.
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, Qtable_frozenlake, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")


Mean_reward=1.00 +/- 0.00


### Visualizing the Result

In [15]:
# TODO: change the code so that it does not create a GIF but outputs the individual frames as separate images