# Setup

In [None]:
import numpy as np
import pickle
import random
from tensorflow.keras import layers
import gym
import gym_line_follower  # to register environment

# Create the line-follower environment (registered by the gym_line_follower import above)
env = gym.make("LineFollower-v0")

# Parameters
`x_step`, `y_step` and `rotor_step` define the number of discrete states and actions, and therefore the size of the Q-table.

In [None]:
# Discretization steps: smaller steps give more states/actions and therefore a larger Q-table
x_step = 0.025 # Step between discretized x states
y_step = 0.025 # Step between discretized y states
rotor_step = 0.25 # Step between discretized rotor values (previously 0.1)
# Q-learning hyperparameters
episodes = 1000 # Number of training episodes
epsilon = 0.1 # Probability of taking a random (exploratory) action
alpha = 0.1 # Learning rate
gamma = 0.6 # Discount factor

# Initializations

We define lists of the possible state values and the possible action values. They let us discretize the continuous observations and rotor values into a finite state space and action space.


In [None]:
# Discrete values that a single rotor can be set to
states_rotor = np.arange(0, 1 + rotor_step, rotor_step)

# Grid of discretized (x, y) states; x varies fastest within the list
states_x = np.arange(0, 0.3 + x_step, x_step)
states_y = np.arange(-0.2, 0.2 + y_step, y_step)
states = [(x, y) for y in states_y for x in states_x]

# Rotor combinations which we want to avoid
extreme_actions = [(1.0, 1.0), (0.0, 1.0), (1.0, 0.0), (0.0, 0.0), (0.75, 0.0), (0.0, 0.75)]
# Every other pair of rotor values is an available action
actions = [
    (first_rotor, second_rotor)
    for first_rotor in states_rotor
    for second_rotor in states_rotor
    if (first_rotor, second_rotor) not in extreme_actions
]

num_states = len(states)  # Number of states
num_actions = len(actions)  # Number of actions

# Q table: one row per state, one column per action, initialised to zero
Q = np.zeros((num_states, num_actions))

# Q-learning

### Helper functions
Our input is a set of 8 points that represent the position of the line, as shown in the image below.
![line_representation](img/line_representation.png "Line representation")

In [None]:
# Generates an index for random action  
def random_action(num_actions: int) -> int:
    """Return a random action index drawn from ``[0, num_actions)``."""
    return np.random.randint(low=0, high=num_actions)


def x_y_to_state_index(x: float, y: float) -> int:
    """
        Converts x, y into index from list of states.

        Each coordinate is snapped down to the largest grid value in
        ``states_x`` / ``states_y`` that does not exceed it. Coordinates
        beyond the end of a grid map to its last value, and coordinates
        below the start of a grid map to its first value.

        Args:
            x: x coordinate to discretize.
            y: y coordinate to discretize.

        Returns:
            Position of the matching ``(x, y)`` grid tuple in ``states``.
    """
    # searchsorted(..., side="right") counts the grid values <= coordinate;
    # subtracting one gives the index of the last grid value not above it.
    x_idx = int(np.searchsorted(states_x, x, side="right")) - 1
    y_idx = int(np.searchsorted(states_y, y, side="right")) - 1

    # Clamp at zero: an index of -1 would silently wrap around to the
    # *largest* grid value for coordinates below the grid minimum.
    x_idx = max(x_idx, 0)
    y_idx = max(y_idx, 0)

    return states.index((states_x[x_idx], states_y[y_idx]))
    
def observation_to_state(obs):
    """
        Maps an observation to a discrete state index.

        Only the first two entries of obs are used; they are treated as the
        x and y of the first of the 8 points which represent the position
        of the line.
    """
    return x_y_to_state_index(obs[0], obs[1])

## Training
#### Q-learning algorithm:
- Create table Q of size: number of states x number of actions
- For every episode do:
    - While episode not done
        1. With probability epsilon explore (take a random action); otherwise, with probability (1-epsilon), exploit (take the action with the highest Q-value for the current state).
        2. Perform this action using env.step(action) and get the reward.
        3. Get the observation from the environment and convert it to new_state.
        4. In Q-table update value under (state_index, action_index) address according to formula 1.1
        5. Update state to new state. <br><br>
Formula *1.1*     
![](img/qlearning.svg "Update cell in Q-table")

In [None]:
def train(env, Q=Q, episodes=episodes, alpha=alpha, gamma=gamma, epsilon=epsilon, checkpoint_name='q_table') -> None:
    """
    Body of Q-learning algorithm.

    Runs epsilon-greedy Q-learning on env, updating Q in place, and pickles
    the resulting table to a file named ``checkpoint_name`` followed by the
    number of episodes (e.g. ``q_table1000``).

    Args:
        env: Environment exposing ``reset()`` and a ``step(action)`` that
            returns ``(observation, reward, done, info)``.
        Q: Table of shape (number of states, number of actions). It is
            modified in place; the default is the module-level table.
        episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of taking a random action instead of the
            greedy one.
        checkpoint_name: Prefix of the file the Q-table is saved to.
    """
    for _ in range(episodes):
        obs = env.reset()
        state = observation_to_state(obs)

        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                # Exploration: doing random action
                action_idx = random_action(Q.shape[1])
            else:
                # Exploitation: doing the best known action for this state
                action_idx = int(np.argmax(Q[state]))

            # Perform the action and discretize the resulting observation
            action = actions[action_idx]
            obs, reward, done, _ = env.step(action)
            next_state = observation_to_state(obs)

            # Update the Q-table cell with the standard Q-learning rule:
            # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
            old_cell_value = Q[state, action_idx]
            next_max = np.max(Q[next_state])
            Q[state, action_idx] = (1 - alpha) * old_cell_value + alpha * (reward + gamma * next_max)

            state = next_state

    checkpoint_name += '{}'.format(episodes)

    # Save under the computed name (not the literal string 'checkpoint_name')
    with open(checkpoint_name, 'wb') as file:
        pickle.dump(Q, file)

In [None]:
# Run training with the default hyperparameters; train() pickles the Q-table to disk when it finishes
train(env)