The OpenAI Gym (https://gym.openai.com) provides many different examples and games in which to train a learning agent. The task here is to develop one such agent. We will create a neural network that, given the state of the game (actually, two consecutive states), outputs a quality value (Q-value) for each possible next move. The move with the highest Q-value is chosen and performed in the game. This theoretical formalism was taken from https://www.nervanasys.com/demystifying-deep-reinforcement-learning/

In [None]:
# INITIALIZATION: libraries, parameters, network...
from keras.models import Sequential      # One layer after the other
from keras.layers import Dense, Flatten  # Dense layers are fully connected layers, Flatten layers flatten out multidimensional inputs
from collections import deque            # For storing moves 
from __future__ import division

import sys
import numpy as np
sys.path.append('/home/xbucha02/libraries')
import gym                                # To train our network
from gym import wrappers
env = gym.make('MountainCar-v0')          # Choose game (any in the gym should work)
env = wrappers.Monitor(env, '/home/lachubcz/tmp/cartpole-experiment-1', force=True)

import random     # For sampling batches from the observations

# Create network. Input is two consecutive game states, output is Q-values of the possible moves.
# The layers are handed to Sequential as a list and are stacked in that order.
model = Sequential([
    # First fully connected layer, applied to the (2, 2) input
    # (two stacked observations with two values each).
    Dense(20, activation="relu", kernel_initializer="uniform", input_shape=(2, 2)),
    # Collapse the per-frame outputs into a single vector per sample.
    Flatten(),
    Dense(18, activation="relu", kernel_initializer="uniform"),
    Dense(10, activation="relu", kernel_initializer="uniform"),
    # Linear output layer: same number of outputs as possible actions.
    Dense(3, activation="linear", kernel_initializer="uniform"),
])

# Mean squared error loss with the Adam optimizer.
# NOTE(review): the 'accuracy' metric is requested but its value is never read in this notebook.
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

# Parameters

# --- Training ---
episodes = 5000                  # Number of training episodes
observetime = 2000               # Number of timesteps acted on the game (and observed) per episode
mb_size = 50                     # Learning minibatch size
gamma = 0.8                      # Discount for future reward: how much we care about steps further in time

# --- Exploration ---
# Probability of doing a random move; epsilonBackup keeps the starting value
# so the per-episode decay step can be derived from it later on.
epsilon = epsilonBackup = 0.55

# --- Evaluation ---
numberOfGames = 20               # Number of games played with the trained network

observation = env.reset()        # Game begins

In [None]:
# FIRST STEP: Knowing what each action does (Observing)
# Every episode acts for `observetime` steps with an epsilon-greedy policy,
# stores the transitions, then trains the network on one random minibatch.
for eps in range(episodes):

    env.render()
    D = deque()  # Replay memory; recreated (emptied) at the start of every episode
    print('{}' .format(len(D)))  # Always prints 0 here, the memory was just created

    # Build the two-frame network input from the most recent observation.
    # `observation` is kept in sync with the environment below, so this matches
    # the game the environment is actually in (also when a game is in progress).
    obs = np.expand_dims(observation, axis=0)     # Make the observation the first element of a batch of inputs
    state = np.stack((obs, obs), axis=1)
    done = False

    for t in range(observetime):
        # Epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            action = np.random.randint(0, env.action_space.n, size=1)[0]
        else:
            Q = model.predict(state)          # Q-values predictions
            action = np.argmax(Q)             # Move with highest Q-value is the chosen one

        observation_new, reward, done, info = env.step(action)     # See state of the game, reward... after performing the action
        # NOTE(review): treats any reward other than -1.0 as reaching the goal;
        # confirm the chosen environment ever returns such a reward.
        if reward != -1.0:
            print('{}. Uspech!'.format(t))
        obs_new = np.expand_dims(observation_new, axis=0)
        # Newest frame goes first, the previously newest frame becomes the second one
        state_new = np.append(np.expand_dims(obs_new, axis=0), state[:, :1, :], axis=1)
        D.append((state, action, reward, state_new, done))         # 'Remember' action and consequence
        state = state_new                 # Update state
        observation = observation_new     # Keep the latest observation in sync with the environment

        if done:
            # Restart the game and rebuild the input from the observation that
            # reset() returns (previously the return value was discarded and a
            # stale observation was reused).
            observation = env.reset()
            env.render()
            obs = np.expand_dims(observation, axis=0)
            state = np.stack((obs, obs), axis=1)

    # Linear decay of the exploration rate towards 0.1 over all episodes
    epsilon -= (epsilonBackup - 0.1) * (1/episodes)

    # SECOND STEP: Learning from the observations (Experience replay)
    minibatch = random.sample(D, mb_size)  # Draw mb_size random transitions from D

    # Each stored state already carries a leading batch axis of size 1, so
    # concatenating along axis 0 yields one row per sampled transition.
    inputs = np.concatenate([transition[0] for transition in minibatch], axis=0)
    next_inputs = np.concatenate([transition[3] for transition in minibatch], axis=0)
    actions = np.array([transition[1] for transition in minibatch])
    rewards = np.array([transition[2] for transition in minibatch], dtype=float)
    finished = np.array([transition[4] for transition in minibatch], dtype=bool)

    # Build Bellman targets for the Q function with two batched predictions
    # instead of two predictions per sample.
    targets = model.predict(inputs)                           # Start from the current estimates
    best_next_q = np.max(model.predict(next_inputs), axis=1)  # max over actions of Q(s', a')
    # Terminal transitions get the bare reward, all others the discounted bootstrap.
    targets[np.arange(mb_size), actions] = np.where(
        finished, rewards, rewards + gamma * best_next_q)

    model.train_on_batch(inputs, targets)  # Train network to output the Q function
    print('{}. Episode Finished'.format(eps))

0
0. Episode Finished
0
1. Episode Finished
0
2. Episode Finished
0
3. Episode Finished
0
4. Episode Finished
0
5. Episode Finished
0
6. Episode Finished
0
7. Episode Finished
0
8. Episode Finished
0
9. Episode Finished
0
10. Episode Finished
0
11. Episode Finished
0
12. Episode Finished
0
13. Episode Finished
0
14. Episode Finished
0
15. Episode Finished
0
16. Episode Finished
0
17. Episode Finished
0
18. Episode Finished
0
19. Episode Finished
0
20. Episode Finished
0
21. Episode Finished
0
22. Episode Finished
0
23. Episode Finished
0
24. Episode Finished
0
25. Episode Finished
0
26. Episode Finished
0
27. Episode Finished
0
28. Episode Finished
0
29. Episode Finished
0
30. Episode Finished
0
31. Episode Finished
0
32. Episode Finished
0
33. Episode Finished
0
34. Episode Finished
0
35. Episode Finished
0
36. Episode Finished
0
37. Episode Finished
0
38. Episode Finished
0
39. Episode Finished
0
40. Episode Finished
0
41. Episode Finished
0
42. Episode Finished
0
43. Episode Finishe

In [None]:
# THIRD STEP: Play!
# Plays numberOfGames games, always taking the action with the highest predicted
# Q-value (no random moves), and reports the total reward of each game.
# NOTE(review): the first game starts from whatever `observation` currently
# holds; confirm it matches the environment's actual state at this point.
for x in range(numberOfGames):

    # Duplicate the current observation to form the initial two-frame input
    frame = np.expand_dims(observation, axis=0)
    state = np.stack((frame, frame), axis=1)
    tot_reward = 0.0
    done = False
    while True:
        env.render()                    # Show the game while it is running
        Q = model.predict(state)
        action = np.argmax(Q)
        print('Action: {}'.format(action))
        observation, reward, done, info = env.step(action)
        # Newest frame goes first, the previously newest frame becomes the second one
        newest = np.expand_dims(np.expand_dims(observation, axis=0), axis=0)
        state = np.concatenate((newest, state[:, :1, :]), axis=1)
        tot_reward += reward
        if done:
            break
    # Prepare the next game; its first input is built from this observation
    observation = env.reset()
    print('{}. Game ended! Total reward: {}'.format(x, tot_reward))