# Setup

In [None]:
import numpy as np
import pickle
import random
from tensorflow.keras import layers
import gym
import gym_line_follower  # to register environment

# Create the line-follower environment (registered by the gym_line_follower import)
env = gym.make("LineFollower-v0")

# Parameters

In [None]:
# Number of training episodes
episodes = 1000

# Discretization resolution: spacing between neighbouring grid values
x_step = 0.025      # x states
y_step = 0.025      # y states
rotor_step = 0.25   # rotor values

# Q-learning hyperparameters
alpha = 0.1     # Learning rate
gamma = 0.6     # Discount factor
epsilon = 0.1   # Chance of exploration

# Definitions

In [None]:
# Discretization grids. np.arange excludes its stop value, so one extra step
# is added to the upper bound to keep that bound in the grid.
states_x = np.arange(0, 0.3 + x_step, x_step)  # Possible x states
states_y = np.arange(-0.2, 0.2 + y_step, y_step)  # Possible y states
states_rotor = np.arange(0, 1 + rotor_step, rotor_step)  # Possible rotor values

# Every possible (x, y) state; x varies fastest, y slowest
states = [(x_val, y_val) for y_val in states_y for x_val in states_x]
# Every possible action: one rotor value for each of the two rotors
actions = [(first, second) for first in states_rotor for second in states_rotor]

num_states = len(states)  # Number of states
num_actions = len(actions)  # Number of rotor actions

# Q table: one row per state, one column per action, initialised to zero
Q = np.zeros((num_states, num_actions))

# Q-learning

### Helper functions

In [None]:
# Picks a uniformly random action index
def random_action(num_actions):
    """Return a random integer index in the range [0, num_actions)."""
    return np.random.randint(low=0, high=num_actions)

# Turns given x and y into their index in the state list
def x_y_to_state_idx(x, y):
    """Map a continuous (x, y) position to the index of its discretized state.

    Each coordinate is assigned to the largest grid value in ``states_x`` /
    ``states_y`` that does not exceed it. Coordinates above the grid fall into
    the last bin and coordinates below the grid are clamped to the first bin
    (previously they produced index -1 and silently wrapped around to the
    last bin).

    Args:
        x: x coordinate to discretize.
        y: y coordinate to discretize.

    Returns:
        int: position of the matching ``(x, y)`` tuple in ``states``.
    """
    # searchsorted(side='right') counts the grid values <= coordinate, so
    # subtracting one gives the bin index; max(..., 0) clamps values that lie
    # below the first grid value.
    x_idx = max(int(np.searchsorted(states_x, x, side='right')) - 1, 0)
    y_idx = max(int(np.searchsorted(states_y, y, side='right')) - 1, 0)

    # `states` is built with y as the outer loop and x as the inner loop, so
    # the flat index can be computed directly instead of searching the list.
    return y_idx * len(states_x) + x_idx
    
# Transforms an observation from the environment into a state index
def observation_to_state(obs):
    """Return the discretized state index for an observation.

    The first two entries of ``obs`` are used as the x and y coordinates and
    passed to ``x_y_to_state_idx``.
    """
    return x_y_to_state_idx(obs[0], obs[1])

### Training

In [None]:
# TODO:
# - training progress information (reward, time etc.)
# - agent evaluation
def train(env, Q=Q, episodes=episodes, alpha=alpha, gamma=gamma, epsilon=epsilon, checkpoint_name='q_table'):
    """Train a tabular Q-learning agent and pickle the resulting Q-table.

    Runs ``episodes`` episodes with an epsilon-greedy policy and updates ``Q``
    in place after every step. When training finishes, the table is pickled to
    a file named ``checkpoint_name`` with the episode count appended
    (e.g. ``q_table1000``).

    The environment is used through the classic gym interface: ``reset()``
    returns the observation and ``step(action)`` returns a 4-tuple
    ``(observation, reward, done, info)``.

    Args:
        env: environment to train on.
        Q: Q-table indexed as ``Q[state, action_idx]``; modified in place.
            The default is the module-level table bound at definition time.
        episodes: number of episodes to run.
        alpha: learning rate.
        gamma: discount factor.
        epsilon: probability of taking a random (exploratory) action.
        checkpoint_name: file name prefix for the pickled Q-table.
    """
    for _ in range(episodes):
        obs = env.reset()
        state = observation_to_state(obs)

        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                # Exploration: take a random action
                action_idx = random_action(num_actions)
            else:
                # Exploitation: take the best known action
                action_idx = np.argmax(Q[state])

            action = actions[action_idx]
            next_obs, reward, done, _ = env.step(action)
            next_state = observation_to_state(next_obs)

            # Q-learning update: blend the old estimate with the observed
            # reward plus the discounted best value of the next state.
            old_value = Q[state, action_idx]
            next_max = np.max(Q[next_state])

            new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            Q[state, action_idx] = new_value

            state = next_state

    checkpoint_path = '{}{}'.format(checkpoint_name, episodes)

    # Write to the computed path; the previous code opened the literal string
    # 'checkpoint_name', so the name and episode count were never used.
    with open(checkpoint_path, 'wb') as file:
        pickle.dump(Q, file)

In [None]:
# Run training with the default Q-table and hyperparameters; the trained
# Q-table is pickled to disk when training finishes.
train(env)