# Reinforcement Learning demo with Q-table
- Copy of RL_demo.ipynb 
- with the help of tutorial.ipynb and https://www.youtube.com/watch?v=Vrro7W7iW2w

## Import libraries, initialize and plot environment

In [None]:
# import gym
from enum import Enum
import time
import matplotlib.pyplot as plt
import numpy as np
import random
from tqdm.notebook import trange
# from gym.envs.toy_text.frozen_lake import generate_random_map
from IPython.display import clear_output, display
import pandas as pd
# from scipy.spatial.distance import cityblock


## Initialize Gym Environment (unused)


In [None]:
# # env=gym.make('FrozenLake-v1', desc=generate_random_map(size=8)) #for random map

# # desc=["SFFF", "FHFH", "FFFH", "HFFG"] # modify for different environment: S=Start, F=Frozen, H=Hole, G=Goal, Original Environment
# desc=["SFHF", "HFHH", "HFFF", "HHHG"] 
# env=gym.make("FrozenLake-v1",map_name="4x4",desc=desc,is_slippery=False)

# env.reset()
# env.render()

## Initialize own environment

In [None]:

# def Set_Env_Dict(dictionary:dict,key,value:int,hole:bool,start:bool,end:bool):
#     """
#     Do Stuff
#     """
#     dictionary[key]={"Q-Value":value,
#                     "Hole":hole,
#                     "Start":start,
#                     "End":end}
#     return dictionary



# env_dict = {0:{"Q-Value":0,
#                "Hole":False,
#                "Start":True,
#                "End":False}}
# for i in range(1,16):
#     Set_Env_Dict(env_dict,i,)


## Initialize some visualizing functions used further below

In [None]:
# TODO: adapt the axis labels to the states and actions

def plotQtable(data=None):
    """Draw a Q-table as a matplotlib table in the notebook output.

    The current cell output is cleared first, so a new call replaces the
    previous rendering instead of being stacked below it.

    Args:
        data: 2-D array-like of Q-values (rows = states, columns = actions).
            When omitted, a fresh 16 x 4 array of zeros is shown.
    """
    if data is None:
        # Built per call: an array created in the signature would be one
        # single object shared by every call that relies on the default.
        data = np.zeros((16, 4))

    clear_output(wait=True)
    fig, ax = plt.subplots()
    ax.table(cellText=data, loc='center')
    ax.axis("off")  # hide the empty axes frame, same as updateQtablePlot
    display(fig)
    plt.close(fig)  # the figure has been shown; do not keep it open in pyplot

def updateQtablePlot(Udata=None):
    """Redraw the Q-table plot, replacing the previous cell output.

    Args:
        Udata: 2-D array-like of Q-values (rows = states, columns = actions).
            When omitted, the plot is reset to a 16 x 4 table of zeros.
    """
    if Udata is None:
        # "Reset" case: show an empty table when no data is handed in.
        Udata = np.zeros((16, 4))

    fig, ax = plt.subplots()
    clear_output(wait=True)
    ax.table(cellText=Udata, loc='center')
    ax.axis("off")  # only the table is of interest, not the axes frame
    display(fig)
    plt.close(fig)  # the figure has been shown; do not keep it open in pyplot

## Test loop with random steps, no learning

In [None]:
# # ctrl+a ctrl+§ to toggle comment if in focus
# iteration=0
# while iteration<3:
#     randAct=env.action_space.sample()
#     returnVal=env.step(randAct)
#     env.render()
#     print(f"Iteration: {iteration}, Action: {randAct}")
#     time.sleep(1)
#     if returnVal[2]:
#         env.reset()
#         env.render()
#         iteration+=1
#         time.sleep(1)



## Policies

In [None]:

def epsilon_greedy_policy(Qtable, state, epsilon):
    """Acting policy: choose an action for ``state`` epsilon-greedily.

    1. Draw a random float between 0 and 1.
    2. If it is greater than ``epsilon`` -> exploitation: take the action
       with the highest value in the row of the current state.
    3. Otherwise -> exploration: take a uniformly random action.

    Args:
        Qtable: 2-D table indexed as ``Qtable[state][action]``.
        state: Row index of the current state.
        epsilon: Exploration threshold; larger values explore more often.

    Returns:
        The chosen action index as a plain ``int``.
    """
    action_values = Qtable[state]
    if random.uniform(0, 1) > epsilon:
        # int() so both branches hand back the same type.
        return int(np.argmax(action_values))
    # Size the random choice by the table row instead of a hard-coded 0..3,
    # so tables with a different number of actions are handled correctly.
    return random.randrange(len(action_values))

def greedy_policy(Qtable, state):
    """Updating policy: pick the highest-valued action for ``state``.

    Args:
        Qtable: 2-D table indexed as ``Qtable[state][action]``.
        state: Row index of the state to look up.

    Returns:
        Index of the largest entry in the state's row (the first one on ties).
    """
    values_for_state = Qtable[state]
    best_action = np.argmax(values_for_state)
    return best_action

## Create and Initialize the Q-table


In [None]:
class Action_Movements(Enum):
    """Offset added to a flat state index for each move on the 4x4 grid.

    Horizontal moves shift the index by one cell, vertical moves by one
    full row of four cells.
    """

    LEFT = -1   # one cell back within the row
    DOWN = 4    # one row further
    RIGHT = 1   # one cell forward within the row
    UP = -4     # one row back

# One state per grid cell, numbered 0 .. 15.
States = np.arange(16)
state_space = States.size
action_space = len(Action_Movements)

# Q-table with one row per state and one column per action (16 x 4), all zeros.
Qtable_frozenlake = np.zeros((state_space, action_space))
plotQtable(Qtable_frozenlake)  # TODO: adapt the plot with state & action descriptions

## Set up dictionary with information about Start, Hole and Goal

In [None]:
Start_state = 0
Goal_state = 15
# Self-defined placement of the holes; change this list for another layout.
hole_arr = [2, 3, 4, 6, 7, 8, 12, 13, 14]
state_history = []   # every visited state, in order (debugging)
action_history = []  # every chosen action, in order (debugging)

# One entry per cell 0 .. 15, flagging whether it is a hole, the start or the goal.
env_dict = {
    cell: {
        "Hole": cell in hole_arr,
        "Start": cell == Start_state,
        "Goal": cell == Goal_state,
    }
    for cell in range(16)
}


## Create own Step Function

In [None]:
def own_step(state, action):
    Goal = False

    match action:   # Converting action from [0,1,2,3] to [-1,4,1,-4] -> second way is what happens in a 4x4 field if certain move is made
        case 0:
            action=Action_Movements.LEFT.value
        case 1:
            action=Action_Movements.DOWN.value
        case 2:
            action=Action_Movements.RIGHT.value
        case 3:
            action=Action_Movements.UP.value

    newState = state + action   # newState is also the id or key for the dictionary
    
    if newState > state_space or newState < 0:
        newState = state  # stay in same position if trying to go over the boundaries

    if env_dict[newState]["Hole"]==True:    
        reward = -10        # reward if fallen into a hole
        newState = state    # stay in same state if trying to fall into a hole
    elif env_dict[newState]["Goal"]==True:
        reward=10       # reward if goal is reached
        Goal=True       # goal is reached
    else:
        reward = -1     # reward for everything besides hole and goal
    
    return newState, reward, Goal

## Hyperparameters

In [None]:
# Hyperparameters -- fine-tuning these will give better results.

# --- Training ---
n_training_episodes = 1_000  # reduced for demo purposes, original: 10000
learning_rate = 0.7          # step size of the Q-value update, original: 0.7

# --- Evaluation ---
# Number of tries from the start until termination (hole or goal).
n_eval_episodes = 100

# --- Environment ---
env_id = "FrozenLake-v1"
max_steps = 100   # upper bound of steps per episode
gamma = 0.95      # weight of the next state's best Q-value in the update
eval_seed = list()

# --- Exploration (epsilon decays from max_epsilon towards min_epsilon) ---
max_epsilon = 1.0
min_epsilon = 0.05
decay_rate = 5e-4  # original: 0.0005

### Show demo with rendering or show q-table, can't do both 

In [None]:
# Select what is shown: the resulting Q-table after training, or the
# per-episode console output during training (never both).
# If set to False: reduce n_training_episodes to 10 or 100; 10000 takes too
# long / has to be interrupted.
show_qtable = True
show_rendering = not show_qtable 

In [None]:
def state_to_coords(state: int, grid_size: int = 4):
    """Convert a flat state index into x/y grid coordinates.

    Intended for Manhattan-distance calculations. The origin is the top-left
    cell; x grows from left to right, y grows from top to bottom.
    Example on the default 4x4 grid: state 13 -> x = 13 % 4 = 1,
    y = 13 // 4 = 3.

    Args:
        state: Cell index, 0 .. grid_size * grid_size - 1.
        grid_size: Number of cells per row/column of the square grid
            (4 for the 4x4 environment, 8 for an 8x8 one, ...).

    Returns:
        ``[x, y]`` for a valid state, or ``False`` if ``state`` lies outside
        the grid (negative, or beyond the last cell).
    """
    # Reject both ends: a negative index would otherwise produce
    # meaningless coordinates through Python's modulo/floor semantics.
    if not 0 <= state < grid_size * grid_size:
        return False

    y, x = divmod(state, grid_size)
    return [x, y]

## Model Training

In [None]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, max_steps, Qtable):
    """Train the Q-table on the self-built environment.

    Every episode starts at ``Start_state`` and runs for at most
    ``max_steps`` moves or until ``own_step`` reports that the goal was
    reached. Visited states and chosen actions are appended to the global
    ``state_history`` / ``action_history`` lists.

    Args:
        n_training_episodes: Number of episodes to run.
        min_epsilon: Lower limit the exploration rate decays towards.
        max_epsilon: Exploration rate of the first episode.
        decay_rate: Exponential decay factor of the exploration rate.
        max_steps: Maximum number of moves per episode.
        Qtable: Table indexed as ``Qtable[state][action]``; updated in place.

    Returns:
        The same ``Qtable`` object after training.
    """
    epsilon_span = max_epsilon - min_epsilon

    for episode in trange(n_training_episodes):
        # Exploration rate shrinks exponentially with the episode number.
        epsilon = min_epsilon + epsilon_span * np.exp(-decay_rate * episode)

        # Every episode begins at the start cell.
        current = Start_state
        state_history.append(current)

        if show_rendering:
            print(f"outer loop, episode {episode}")

        for _ in range(max_steps):
            chosen = epsilon_greedy_policy(Qtable, current, epsilon)
            action_history.append(chosen)

            successor, reward, reached_goal = own_step(current, chosen)
            state_history.append(successor)

            # Move the old estimate towards reward + discounted best value
            # of the successor state.
            old_value = Qtable[current][chosen]
            td_target = reward + gamma * np.max(Qtable[successor])
            Qtable[current][chosen] = old_value + learning_rate * (td_target - old_value)

            if reached_goal:
                break

            current = successor

    if show_qtable:
        updateQtablePlot(Qtable)  # show the resulting Q-table
    return Qtable

In [None]:
# Train the agent on this specific environment.
# BEFORE RUNNING THIS CELL: check that the number of iterations is the one
# you want, to avoid waiting too long or having to interrupt the run.
Qtable_frozenlake = train(
    n_training_episodes=n_training_episodes,
    min_epsilon=min_epsilon,
    max_epsilon=max_epsilon,
    decay_rate=decay_rate,
    max_steps=max_steps,
    Qtable=Qtable_frozenlake,
)

### Save Q-Table to Excel

In [None]:
pdQtable = pd.DataFrame(Qtable_frozenlake)
# possible todo: append strings of states and/or actions to the Q-table so it is labeled in the Excel file
excel_path = 'Qtable_frozenlake.xlsx'
excel_sheet = 'noGymQTable1000'
try:
    # Add the table as a further sheet of the existing workbook. Appending is
    # done with the openpyxl engine; a sheet-name clash creates a new sheet.
    with pd.ExcelWriter(excel_path, mode='a', engine='openpyxl', if_sheet_exists="new") as writer:
        pdQtable.to_excel(writer, sheet_name=excel_sheet)
except FileNotFoundError:
    # First run: there is no workbook to append to yet, so create it.
    with pd.ExcelWriter(excel_path, mode='w') as writer:
        pdQtable.to_excel(writer, sheet_name=excel_sheet)

## Evaluation

In [None]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
    """Run the greedy policy for several episodes and summarize the rewards.

    Args:
        env: Object whose ``step(action)`` returns
            ``(new_state, reward, done, info)``, or ``None`` to evaluate on
            the notebook's own environment via ``own_step``.
            NOTE(review): ``env`` is not reset by this function; every
            episode simply restarts the tracked state at ``Start_state``.
        max_steps: Maximum number of moves per episode.
        n_eval_episodes: Number of episodes to evaluate.
        Q: Q-table indexed as ``Q[state][action]`` that is being evaluated.
        seed: Currently unused; kept so existing calls keep working.

    Returns:
        Tuple ``(mean_reward, std_reward)`` over the per-episode reward sums.
    """
    episode_rewards = []
    for _episode in range(n_eval_episodes):
        state = Start_state
        total_rewards_ep = 0

        for _step in range(max_steps):
            # Evaluate the table that was passed in, not the global one.
            action = greedy_policy(Q, state)
            if env is None:
                new_state, reward, done = own_step(state, action)
            else:
                new_state, reward, done, _info = env.step(action)
            total_rewards_ep += reward

            if done:
                break
            state = new_state
        episode_rewards.append(total_rewards_ep)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)
    return mean_reward, std_reward