## Task 1 - CartPole-v0
Implementer Q-læring og bruk det for å løse cartpole-environmentet

In [45]:
import gym 
import math 
import numpy as np 

In [46]:
# Create the CartPole-v0 environment instance that the cells below read from and step through.
env = gym.make('CartPole-v0')

In [47]:
# Observation components, in order: cart position, cart velocity, pole angle, pole tip velocity

# Hyperparameters 
# Number of buckets per observation component. A component with a single
# bucket always maps to index 0, so it is effectively ignored by the Q-table.
BUCKETS = (1, 1, 6, 12) 
# Number of training episodes to run.
EPISODES = 1000
# Floor for the learning rate returned by get_learning_rate.
MIN_LEARNING_RATE = 0.1
# Floor for the exploration probability returned by get_epsilon.
MIN_EPSILON = 0.1
# Factor applied to the best next-state Q-value in update_q.
DISCOUNT = 1.0
# Divisor inside the log10 schedules: both schedules stay at 1.0 until
# (episode + 1) exceeds this value, then shrink logarithmically.
DECAY = 25

# Visualization variables 
# Render every SHOW_ENV-th episode.
SHOW_ENV = 200
# Print a statistics row every SHOW_STATS-th episode.
SHOW_STATS = 50

In [48]:
# Q-table with one entry per (discretized state, action) pair: its shape is
# BUCKETS followed by the number of actions. Entries start as uniform random
# values in [0, 1). No random seed is set, so each run starts differently.
q_table = np.random.uniform(low=0, high=1, size=(BUCKETS + (env.action_space.n, )))

In [49]:
# Per-component bounds used by discretize_state to rescale observations.
# Position (index 0) and angle (index 2) take the environment's own limits;
# the two velocity components use hand-picked limits instead
# (NOTE(review): presumably because the environment's velocity limits are too
# wide to bucket usefully - confirm). The trailing "/ 1." is a no-op divisor.
upper_bounds = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50) / 1.]
lower_bounds = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50) / 1.]

In [50]:
# Discretizes the state 
def discretize_state(obs, buckets=None, lower=None, upper=None):
    """Map a continuous observation onto a tuple of bucket indices.

    Each component is rescaled linearly from ``[lower[i], upper[i]]`` onto
    ``[0, buckets[i] - 1]``, rounded to the nearest integer and clamped, so
    observations outside the bounds land in the first or last bucket.

    Args:
        obs: Sequence of observation values, one per component.
        buckets: Number of buckets per component. Defaults to ``BUCKETS``.
        lower: Lower bound per component. Defaults to ``lower_bounds``.
        upper: Upper bound per component. Defaults to ``upper_bounds``.

    Returns:
        Tuple of ints, one bucket index per component of ``obs``.
    """
    if buckets is None:
        buckets = BUCKETS
    if lower is None:
        lower = lower_bounds
    if upper is None:
        upper = upper_bounds

    discretized = []
    for i, value in enumerate(obs):
        # Offset by the lower bound itself. Adding abs(lower[i]) gives the
        # same result only when the lower bound is <= 0.
        scaling = (value - lower[i]) / (upper[i] - lower[i])
        index = int(round((buckets[i] - 1) * scaling))
        # Clamp so out-of-range observations still index a valid bucket.
        discretized.append(min(buckets[i] - 1, max(0, index)))

    return tuple(discretized)

In [51]:
# Epsilon-greedy action selection
def choose_action(state):
    """Return an action for ``state`` using an epsilon-greedy rule.

    With probability ``epsilon`` (a module-level variable set by the training
    loop) a random action is sampled from the environment's action space;
    otherwise the action with the highest Q-value for ``state`` is returned.
    """
    explore = np.random.random() < epsilon
    if not explore:
        # Exploit: best known action for this state
        return np.argmax(q_table[state])
    # Explore: random action
    return env.action_space.sample()

In [52]:
# Q-learning update for a single transition
def update_q(state, action, reward, new_state):
    """Update the Q-table entry for ``(state, action)`` in place.

    The entry is moved towards ``reward + DISCOUNT * max(Q[new_state])`` by a
    fraction ``learning_rate`` (a module-level variable set by the training
    loop).
    """
    best_next_value = np.max(q_table[new_state])
    target = reward + DISCOUNT * best_next_value
    td_error = target - q_table[state][action]
    q_table[state][action] += learning_rate * td_error

In [53]:
# Exploration schedule (logarithmically decreasing)
def get_epsilon(episode):
    """Return the exploration probability for the given episode index.

    The value is ``1 - log10((episode + 1) / DECAY)``, capped at 1.0 from
    above and at ``MIN_EPSILON`` from below.
    """
    decayed = 1. - math.log10((episode + 1) / DECAY)
    capped = min(1., decayed)
    return max(MIN_EPSILON, capped)

In [54]:
# Learning-rate schedule (logarithmically decreasing)
def get_learning_rate(episode):
    """Return the learning rate for the given episode index.

    The value is ``1 - log10((episode + 1) / DECAY)``, capped at 1.0 from
    above and at ``MIN_LEARNING_RATE`` from below.
    """
    decayed = 1. - math.log10((episode + 1) / DECAY)
    capped = min(1., decayed)
    return max(MIN_LEARNING_RATE, capped)

In [55]:
# Trains the agent for EPISODES episodes and reports the first episode at
# which the 100-episode average score reached 195.
print('Episode  Score   Average')

scores = []
successfulEpisode = -1  # Sentinel: no episode has met the success criterion yet

try:
    for episode in range(EPISODES):
        # Only every SHOW_ENV-th episode is rendered
        render = episode % SHOW_ENV == 0

        # Resets the state
        current_state = discretize_state(env.reset())

        # Updates learning rate and epsilon; choose_action and update_q read
        # these two module-level variables
        learning_rate = get_learning_rate(episode)
        epsilon = get_epsilon(episode)

        score = 0

        # Plays the game
        done = False
        while not done:

            # Renders the current state
            if render:
                env.render()

            action = choose_action(current_state)              # Chooses action
            obs, reward, done, _ = env.step(action)            # Performs action
            new_state = discretize_state(obs)                  # Discretizes state
            update_q(current_state, action, reward, new_state) # Updates Q-Table
            current_state = new_state                          # Updates the current state
            score += reward                                    # Updates the score

        scores.append(score)

        # Sum of the last (up to) 100 scores divided by a fixed 100, so the
        # value is understated while fewer than 100 episodes have been played
        average = sum(scores[-100:]) / 100
        if average >= 195.0 and successfulEpisode < 0:
            successfulEpisode = episode

        # Prints some statistics for every SHOW_STATS-th episode
        if episode % SHOW_STATS == 0:
            print(f'{episode}\t {score}\t {average}')
finally:
    # Releases the render window even if training is interrupted
    env.close()

# Prints the result; >= 0 matches the -1 sentinel and the "< 0" check above
if successfulEpisode >= 0:
    print(f'\nCompleted on episode {successfulEpisode}')
else:
    print('\nUnable to complete game')

Episode  Score   Average
0	 16.0	 0.16
50	 27.0	 13.12
100	 89.0	 31.8
150	 180.0	 94.97
200	 200.0	 174.65
250	 200.0	 198.52
300	 200.0	 200.0
350	 200.0	 200.0
400	 200.0	 200.0
450	 200.0	 200.0
500	 200.0	 200.0
550	 200.0	 199.33
600	 200.0	 199.33
650	 200.0	 200.0
700	 200.0	 200.0
750	 200.0	 200.0
800	 200.0	 200.0
850	 200.0	 200.0
900	 200.0	 200.0
950	 200.0	 200.0

Completed on episode 228
