In [1]:
#Importing
import os
import gym
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import LeakyReLU
from IPython.display import clear_output, display

from collections import deque
from timeit import default_timer as timer

import os
from matplotlib.ticker import FormatStrFormatter

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Neural Network

In [2]:
## Creating Deep-Q network
def q_network(numInput, numHidden, numHiddenLayers, numOutput, optimiserFunction=tf.train.AdamOptimizer, \
              alpha=0.00025, lossFunction="mse", hiddenActivation="relu", outputActivation="linear"): 
    """Build and compile a fully connected Keras network mapping a state to one value per action.

    Parameters
    ----------
    numInput : int
        Size of the input (state) vector.
    numHidden : int
        Number of units in every hidden layer.
    numHiddenLayers : int
        Number of hidden layers. Zero (or less) yields a single Dense layer
        going straight from the input to the output.
    numOutput : int
        Number of output units.
    optimiserFunction :
        Accepted for backward compatibility but not used: the model is always
        compiled with the Keras Adam optimiser.
    alpha : float
        Learning rate handed to Adam.
    lossFunction : str
        Loss passed to ``compile``.
    hiddenActivation, outputActivation : str
        Activations of the hidden layers and of the output layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    network = Sequential()

    #(units, activation) for every layer in order: the hidden layers, then the output layer
    layerSpecs = [(numHidden, hiddenActivation)] * numHiddenLayers
    layerSpecs.append((numOutput, outputActivation))

    for layerIndex, (units, activation) in enumerate(layerSpecs):
        if layerIndex == 0:
            #Whichever layer comes first must declare the input size, including
            #the output layer when there are no hidden layers at all
            network.add(Dense(units, input_dim=numInput, activation=activation))
        else:
            network.add(Dense(units, activation=activation))

    #Defining the loss function and the optimiser
    network.compile(loss=lossFunction, optimizer=Adam(lr=alpha))

    return network

### Memory

In [3]:
class memory():
    """Bounded FIFO store of (state, action, reward, state_prime, done) transitions.

    Once ``maxSize`` transitions are held, appending a new one drops the oldest.
    A separate ``memory_rewards`` list is kept alongside; ``add`` does not write to it.
    """

    def __init__(self, maxSize=2000):
        #An empty deque capped at maxSize entries
        self.memory_buffer = deque([], maxSize)
        self.memory_rewards = list()

    def add(self, state, action, reward, state_prime, done):
        """Append one transition tuple to the buffer."""
        transition = (state, action, reward, state_prime, done)
        self.memory_buffer.append(transition)

    def getBuffer(self):
        """Return the underlying deque of transitions (not a copy)."""
        return self.memory_buffer

    def getRewards(self):
        """Return the current rewards list."""
        return self.memory_rewards

    def resetRewards(self):
        """Replace the rewards list with a fresh empty list."""
        self.memory_rewards = list()

### Agent

In [4]:
class rocketMan():
    """Deep Q-learning agent: an online network, a target network and a replay buffer.

    The online network (``self.network``) picks actions and is the one being
    fitted; the target network (``self.targetNetwork``) supplies the bootstrap
    values and is pulled towards the online network by ``softWeightUpdate``.
    """

    def __init__(self, numInput, numHidden, numHiddenLayers, numOutput, modelFileName, \
                 alpha=0.0001, gamma=0.99, epsilon=0.8, epsilonEnd=0, epsilonDecay=0.995, batchSize=32, \
                 learningPoint=5000, load=False, save=False):
        """Create both networks and the replay buffer.

        numInput, numHidden, numHiddenLayers, numOutput -- network shape, forwarded to q_network.
        modelFileName  -- weights file read when ``load`` is true.
        alpha          -- learning rate of the networks' optimiser.
        gamma          -- discount factor used in the training targets.
        epsilon        -- probability of choosing a random action in getAction.
        epsilonEnd, epsilonDecay -- stored for the caller's epsilon schedule; not used inside the class.
        batchSize      -- number of transitions sampled per training step.
        learningPoint  -- minimum number of stored transitions before train() does anything.
        load           -- when true, restore weights from modelFileName.
        save           -- accepted but currently unused.
        """
        self.numInput = numInput
        self.numHidden = numHidden
        self.numHiddenLayers = numHiddenLayers
        self.numOutput = numOutput

        #alpha has to be forwarded explicitly, otherwise q_network falls back to its own default
        self.network = q_network(numInput, numHidden, numHiddenLayers, numOutput, alpha=alpha)
        self.targetNetwork = q_network(numInput, numHidden, numHiddenLayers, numOutput, alpha=alpha)

        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilonEnd = epsilonEnd
        self.epsilonDecay = epsilonDecay
        self.batchSize = batchSize

        self.replay = memory(250000) #Replay capacity (memory's own default is 2000)

        self.learningPoint = learningPoint

        #Number of transitions recorded through addMemory, across all episodes
        self.steps = 0

        #addMemory soft-updates the target network every updateFreq steps
        self.updateFreq = 500

        if(load):
            #Restore the acting network as well as the target: actions come from
            #self.network, so loading only the target would leave the policy random
            self.network.load_weights(modelFileName)
            self.targetNetwork.load_weights(modelFileName)

    def getAction(self, state):
        """Epsilon-greedy choice: a uniformly random action index with probability
        epsilon, otherwise the argmax of the online network's output for ``state``."""
        if np.random.uniform(0, 1) < self.epsilon:
            #Drawn from the agent's own action count, so no module-level env is needed
            return np.random.randint(self.numOutput)

        qValues = self.network.predict(state.reshape(1,-1))[0]
        return np.argmax(qValues)

    def addMemory(self, state, action, reward, state_prime, done):
        """Store one transition and, every ``updateFreq`` calls, soft-update the target network."""
        self.replay.add(state, action, reward, state_prime, done)

        if(self.steps % self.updateFreq) == 0:
            self.softWeightUpdate(0.5)
        self.steps += 1

    def softWeightUpdate(self, tau):
        """Set target weights to ``tau * online + (1 - tau) * target``, layer by layer."""
        onlineWeights = self.network.get_weights()
        targetWeights = self.targetNetwork.get_weights()

        #The per-layer arrays have different shapes, so they are blended one at a
        #time instead of being packed into a single (ragged) numpy array
        blended = [(online * tau) + (target * (1 - tau))
                   for online, target in zip(onlineWeights, targetWeights)]
        self.targetNetwork.set_weights(blended)

    def train(self):
        """Fit the online network on one sampled batch, then soft-update the target.

        Does nothing until the replay buffer holds at least ``learningPoint`` transitions.
        """
        if(len(self.replay.getBuffer()) >= self.learningPoint):
            training, target = self.getTrainingDataset()
            self.network.fit(training, target, batch_size=self.batchSize, epochs=1, verbose=0)
            self.softWeightUpdate(0.5)

    def getTrainingDataset(self):
        """Sample ``batchSize`` transitions and build the (inputs, targets) pair for fit.

        Each target row is the online network's current output for the state, with
        the entry of the action taken replaced by ``reward`` (terminal transition) or
        ``reward + gamma * max(targetNetwork(state_prime))`` (otherwise).
        """
        batch = random.sample(self.replay.getBuffer(), self.batchSize)
        states = [transition[0] for transition in batch]
        nextStates = [transition[3] for transition in batch]

        #One forward pass per network for the whole batch, not two per sample
        targets = self.network.predict(np.asarray(states).reshape(len(batch), -1))
        nextQ = self.targetNetwork.predict(np.asarray(nextStates).reshape(len(batch), -1))
        bestNext = np.max(nextQ, axis=1)

        for row, (_, action, reward, _, done) in enumerate(batch):
            if done:
                targets[row][action] = reward
            else:
                targets[row][action] = reward + (self.gamma * bestNext[row])

        return np.asarray(states), np.asarray(targets)

### Training

In [6]:
numEpisodes = 1000
minReward = -300 #If an episode's summed reward drops below this, the episode is ended

#Repeatability
seed = 32

#Loading the gym environment
env = gym.make('LunarLander-v2')

# Set seed for PRN generator of numpy, random module and gym env.
np.random.seed(seed)
random.seed(seed)
env.seed(seed)
#In gym releases where each space carries its own random generator, env.seed()
#leaves it unseeded, so action_space.sample() would differ from run to run.
#Older releases have no Space.seed, hence the guard.
if hasattr(env.action_space, "seed"):
    env.action_space.seed(seed)

#Weights file name handed to the agent
modelFileName = "LunarLanderWeights.h5"

numInput = env.observation_space.shape[0]
numActions = env.action_space.n

#Number of hidden neurons and layers
numHidden = 128
numHiddenLayers = 1

#Creating agent (rocketMan)
rocketManAgent = rocketMan(numInput, numHidden, numHiddenLayers, numActions,modelFileName)

#Intended per-episode time limit in seconds (not currently enforced by the loop)
maxTime = 10

#Information for plotting
rewardList = []
meanRewards = []      #100-episode moving average of the reward
meanRewards1 = []     #20-episode moving average of the reward
epsilonList = []
scores_window = deque(maxlen=100)
scores_window1 = deque(maxlen=20)

#Runtime (RT) bookkeeping
#TODO: these names are getting a bit confusing, rename.
RTmean = []           #Runtime of every non-rendered episode
RTwindow = deque(maxlen=20)
RTwindow1 = deque(maxlen=100)
meanRTWindow = []     #20-episode moving average of the runtime
meanRTWindow1 = []    #100-episode moving average of the runtime


#Every 'n' episodes, render the lander
renderEvery = 100
renderLander = True

avgEpisodeRT = 0 #The average runtime for each episode
saveRecordings = False #Do you want to save the file

#The per-episode progress plots are saved here; savefig fails if the folder is missing
os.makedirs('plotImages', exist_ok=True)

start = timer()
for episodeNum in range(numEpisodes):
    #Render only every renderEvery-th episode, and only when rendering is enabled
    renderEpsiode = bool(renderLander) and ((episodeNum % renderEvery) == 0)

    state = env.reset()
    step = 0
    episodeReward = 0
    episodeStartTime = timer()

    #---- Play one episode ----
    done = False
    while not done:
        if(renderEpsiode):
            env.render()

        chosenAction = rocketManAgent.getAction(state)

        #Taking action in environment
        state_prime, reward, done, info = env.step(chosenAction)

        #Adding the state information to the replay buffer
        rocketManAgent.replay.add(state, chosenAction, reward, state_prime, done)

        state = state_prime
        episodeReward += reward
        step += 1

        #One training step per environment step (a no-op until the buffer is full enough)
        rocketManAgent.train()

        #Give up on the episode once the summed reward is hopeless
        if(episodeReward < minReward):
            done = True

    #---- Episode finished: bookkeeping ----
    #Decay epsilon, but never below epsilonEnd
    if rocketManAgent.epsilon > rocketManAgent.epsilonEnd:
        rocketManAgent.epsilon = max(rocketManAgent.epsilonEnd,
                                     rocketManAgent.epsilon * rocketManAgent.epsilonDecay)

    rewardList.append(episodeReward)
    epsilonList.append(rocketManAgent.epsilon)
    scores_window.append(episodeReward)
    scores_window1.append(episodeReward)

    meanRewards.append(np.mean(scores_window))
    meanRewards1.append(np.mean(scores_window1))

    episodeEndTime = timer()

    #Rendered episodes are left out of the runtime statistics
    if not renderEpsiode:
        epRT = (episodeEndTime - episodeStartTime)
        RTmean.append(epRT)
        RTwindow.append(epRT)
        RTwindow1.append(epRT)
        meanRTWindow.append(np.mean(RTwindow))
        meanRTWindow1.append(np.mean(RTwindow1))

    #---- Episode finished: progress plots ----
    clear_output(wait=True)

    fig = plt.figure(figsize=(20, 10))
    plt.subplots_adjust(top=1.0)
    #NOTE(review): no axes exist yet at this point, so this call looks like it has no effect
    plt.tight_layout()

    #Top-left: reward and epsilon per episode on twin y-axes
    ax = plt.subplot(2,2,1)
    ax2 = ax.twinx()
    ax.set_title("Epsilon & Reward vs Episode")
    ax.set_ylabel("Reward")
    ax2.set_ylabel("Epsilon")
    ax2.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    ax.set_xlabel("Episode (Ep)")
    lns1 = ax.plot(range(episodeNum+1), rewardList, 'g-', label="Reward")
    lns2 = ax2.plot(range(episodeNum+1), epsilonList, 'b-', label="Epsilon")

    #One legend covering the lines of both y-axes
    lns = lns1+lns2
    labs = [l.get_label() for l in lns]
    ax2.legend(lns, labs, loc=2)

    #Top-right: moving averages of the reward
    ax = plt.subplot(2,2,2)
    ax.set_title("Simple Moving Average (SMA) vs Episode")
    ax.set_ylabel("SMA reward")
    ax.set_xlabel("Episode (Ep)")
    ax.plot(range(len(meanRewards1)), meanRewards1, color='orange', label="20 Ep SMA")
    ax.plot(range(len(meanRewards)), meanRewards, color='blue', label="100 Ep SMA")
    plt.legend(loc=2)

    #Bottom-left: episode runtime and its moving averages (once there is data)
    if(len(RTmean) > 0):
        ax = plt.subplot(2,2,3)
        ax.set_title("Runtime (RT) & SMA vs Episode")
        ax.set_ylabel("Runtime/s (RT)")
        ax.set_xlabel("Episode (Ep)")

        #Each dashed horizontal line marks the latest value of its series
        ax.plot((range(len(RTmean))), RTmean, color='r', label="Current RT", linewidth=0.5)
        plt.axhline(y=RTmean[-1], color='r', linestyle='--')

        ax.plot((range(len(meanRTWindow))), meanRTWindow, color='orange', label="20 Ep SMA", linewidth=3.0)
        plt.axhline(y=meanRTWindow[-1], color='orange', linestyle='--')

        ax.plot((range(len(meanRTWindow1))), meanRTWindow1, color='blue', label="100 EP SMA", linewidth=3.0)
        plt.axhline(y=meanRTWindow1[-1], color='blue', linestyle='--')

        plt.legend(loc=2)
    plt.savefig('plotImages/' + str(episodeNum) + '.png')
    plt.show()
    #A new figure is made every episode; release it so figures do not pile up
    plt.close(fig)

    env.close()
end = timer()

#Creating video
os.system("ffmpeg -f image2 -r 12 -i plotImages/%d.png -y -an recording/timelapse.mp4")




#Delete files in folder
import glob

files = glob.glob('plotImages/*')
for f in files:
    os.remove(f)

    
print("RT: ", (end - start))

KeyboardInterrupt: 

KeyboardInterrupt: 

In [None]:
#Notes for next time
#https://stackoverflow.com/questions/8827016/matplotlib-savefig-in-jpeg-format
#https://stackoverflow.com/questions/753190/programmatically-generate-video-or-animated-gif-in-python
#https://github.com/openai/gym/issues/1254    
#https://moodle.bath.ac.uk/pluginfile.php/1100399/course/section/175216/rl18.pdf