In [None]:
#Importing
import os
import gym
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import LeakyReLU
from IPython.display import clear_output, display

from collections import deque
from timeit import default_timer as timer

import os
from matplotlib.ticker import FormatStrFormatter

### Neural Network

In [None]:
## Creating Deep-Q network
def q_network(numInput, numHidden, numHiddenLayers, numOutput, optimiserFunction=None,
              alpha=0.00025, lossFunction="mse", hiddenActivation="relu", outputActivation="linear"):
    """Build and compile a fully connected Keras network for Q-value regression.

    Layout: one fixed 64-unit layer, then ``numHiddenLayers`` layers of
    ``numHidden`` units each, then a ``numOutput``-unit output layer.

    Parameters
    ----------
    numInput : int
        Size of the input vector; declared as the input shape of the first
        layer so the model's weights exist as soon as it is constructed.
    numHidden : int
        Number of units in each of the additional hidden layers.
    numHiddenLayers : int
        Number of additional hidden layers added after the fixed 64-unit layer.
    numOutput : int
        Number of units in the output layer.
    optimiserFunction
        Accepted for backward compatibility only; it is not used. The network
        is always compiled with Keras ``Adam``.
    alpha : float
        Learning rate handed to ``Adam``.
    lossFunction : str
        Loss passed to ``compile``.
    hiddenActivation, outputActivation : str
        Activations for the hidden layers and the output layer respectively.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    network = Sequential()

    # First hidden layer. Its width is fixed at 64 regardless of numHidden;
    # callers account for this extra layer when choosing numHiddenLayers.
    network.add(Dense(64, input_shape=(numInput,), activation=hiddenActivation))

    #Adding 'n' hidden layers
    for _ in range(numHiddenLayers):
        network.add(Dense(numHidden, activation=hiddenActivation))

    #Creating output layer
    network.add(Dense(numOutput, activation=outputActivation))

    #Defining the loss function and the optimiser.
    network.compile(loss=lossFunction, optimizer=Adam(lr=alpha))

    return network

### Memory

In [None]:
class memory:
    """Experience store for the agent.

    Holds two things side by side:
      * a bounded deque of (state, action, reward, state_prime, done) tuples,
        which drops its oldest entry once ``maxSize`` is reached;
      * a plain list of every reward added since the last ``resetRewards``.
    """

    def __init__(self, maxSize=500000):
        # The reward log is not bounded by maxSize; it only shrinks on reset.
        self.memory_rewards = list()
        self.memory_buffer = deque([], maxSize)

    def add(self, state, action, reward, state_prime, done):
        """Record one transition and log its reward."""
        transition = (state, action, reward, state_prime, done)
        self.memory_buffer.append(transition)
        self.memory_rewards += [reward]

    def getBuffer(self):
        """Return the underlying deque of transitions (not a copy)."""
        return self.memory_buffer

    def getRewards(self):
        """Return the list of rewards logged since the last reset (not a copy)."""
        return self.memory_rewards

    def resetRewards(self):
        """Start a fresh reward log; the transition buffer is left untouched."""
        self.memory_rewards = list()

### Agent

In [None]:
class rocketMan():
    """Epsilon-greedy deep Q-learning agent with an online and a target network.

    The agent owns two identically configured networks built by ``q_network``,
    a ``memory`` replay buffer, and the exploration / discount settings.

    Parameters
    ----------
    numInput, numHidden, numHiddenLayers, numOutput
        Forwarded unchanged to ``q_network`` for both networks.
    alpha
        Stored on the instance only.
        NOTE(review): it is not forwarded to ``q_network``, so both networks
        are compiled with ``q_network``'s own default learning rate - confirm
        whether that is intended before wiring it through.
    gamma : float
        Discount applied to the bootstrapped next-state value.
    epsilon : float
        Probability of taking a random action in ``getAction``.
    epsilonEnd, epsilonDecay : float
        Stored for the caller; this class never decays epsilon itself.
    batchSize : int
        Number of transitions sampled for each training step.
    """

    def __init__(self, numInput, numHidden, numHiddenLayers, numOutput, \
                 alpha=0.0001, gamma=0.99, epsilon= 1, epsilonEnd=0.01, epsilonDecay=0.99, batchSize=64):

        self.network = q_network(numInput, numHidden, numHiddenLayers, numOutput)
        self.targetNetwork = q_network(numInput, numHidden, numHiddenLayers, numOutput)

        self.numInput = numInput
        self.numHidden = numHidden
        self.numHiddenLayers = numHiddenLayers
        self.numOutput = numOutput

        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilonEnd = epsilonEnd
        self.epsilonDecay = epsilonDecay
        self.batchSize = batchSize

        self.replay = memory(500000) #Changed from 2000 to 500,000

        self.minsamples=65            # minimum buffered transitions before training starts
        self.steps = 0                # number of addMemory calls so far
        self.update_target_freq = 600 # addMemory blends the target network every this many steps

    def getAction(self, state):
        """Pick an action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a random action is drawn from the
        module-level ``env``'s action space; otherwise the index of the largest
        output of the online network is returned.
        """
        if np.random.uniform(0, 1) < self.epsilon:
            return(env.action_space.sample())
        else:
            return(np.argmax(self.network.predict(state.reshape(1,-1))[0]))

    def addMemory(self, state, action, reward, state_prime, done):
        """Store a transition and blend the target network every ``update_target_freq`` calls."""
        self.replay.add(state, action, reward, state_prime, done)

        if self.steps % self.update_target_freq == 0:
            self.softWeightUpdate(0.5)
        self.steps += 1

    def softWeightUpdate(self, tau):
        """Move the target network towards the online network.

        Every weight tensor becomes ``tau * online + (1 - tau) * target``.
        """
        online_weights = self.network.get_weights()
        target_weights = self.targetNetwork.get_weights()

        # Blend tensor by tensor. The tensors have different shapes, so packing
        # the whole list into a single ndarray is rejected by recent NumPy.
        blended = [(online * tau) + (target * (1 - tau))
                   for online, target in zip(online_weights, target_weights)]
        self.targetNetwork.set_weights(blended)

    def train(self):
        """Run one fitting pass on a sampled batch, then blend the target network.

        Does nothing until the buffer holds at least ``minsamples`` transitions
        and at least ``batchSize`` of them, since a batch cannot be sampled
        from fewer.
        """
        required = max(self.minsamples, self.batchSize)
        if len(self.replay.getBuffer()) < required:
            return

        targetX, targetY = self.getTrainingSet()

        self.network.fit(targetX, targetY, batch_size=self.batchSize, epochs=1, verbose=0)
        self.softWeightUpdate(0.5)

    def getTrainingSet(self):
        """Sample ``batchSize`` transitions and build the regression targets.

        Returns
        -------
        (states, targets)
            ``states`` is the array of sampled states. ``targets`` holds the
            online network's predictions for those states, with the entry of
            the action actually taken replaced by ``reward`` (terminal
            transition) or ``reward + gamma * max(target-network prediction
            for state_prime)`` (non-terminal transition).
        """
        batch = random.sample(self.replay.getBuffer(), self.batchSize)
        stateArray, actionArray, rewardArray, state_primeArray, doneArray = zip(*batch)

        stateArray = np.array(stateArray)
        state_primeArray = np.array(state_primeArray)

        currentQ = self.network.predict(stateArray)
        nextQ = self.targetNetwork.predict(state_primeArray)

        outputY = []
        for reward, action, done, qRow, nextRow in zip(rewardArray, actionArray, doneArray, currentQ, nextQ):
            target_for_action = reward # correct as-is when the transition is terminal

            if not done:
                # Otherwise add the discounted best next-state value.
                target_for_action += (self.gamma * np.max(nextRow))

            # Only the taken action's value is changed, so the other outputs
            # match the network's own prediction.
            qRow[action] = target_for_action
            outputY.append(qRow)

        return (stateArray, np.array(outputY))


### Training

In [None]:
# Training run 1: train the agent on LunarLander-v2 and redraw the reward /
# epsilon plots after every episode.
numEpisodes = 1000
minReward = -300 #If the episode's summed reward drops below this, end the episode

#Repeatability
seed = 32

#Loading the gym environment
env = gym.make('LunarLander-v2')

# Seed the PRNGs of numpy, the random module and the gym env.
np.random.seed(seed)
random.seed(seed)
env.seed(seed)

numInput = env.observation_space.shape[0]
numActions = env.action_space.n

#Number of hidden neurons and layers
numHidden = 128
numHiddenLayers = 1 #q_network always adds one 64-unit layer first, so 1 here means 2 hidden layers (must change this)

#Creating agent (rocketMan)
rocketManAgent = rocketMan(numInput, numHidden, numHiddenLayers, numActions)

#Information for plotting
rewardList = []
meanRewards = []     # mean reward over the last 100 episodes
meanRewards1 = []    # mean reward over the last 20 episodes
epsilonList = []
scores_window = deque(maxlen=100)
scores_window1 = deque(maxlen=20)

for episodeNum in range(numEpisodes):

    # Past episode 300, stop taking random actions: there has been enough exploration.
    if episodeNum > 300:
        rocketManAgent.epsilon = 0

    state = env.reset()

    step = 0
    # Running total for this episode; avoids re-summing the replay's whole
    # reward list on every single step.
    episode_rewards = 0

    while True:
        chosenAction = rocketManAgent.getAction(state)

        #Taking action in environment
        state_prime, reward, done, info = env.step(chosenAction)

        #Adding the state information to the replay buffer
        rocketManAgent.replay.add(state, chosenAction, reward, state_prime, done)

        state = state_prime
        episode_rewards += reward
        step += 1

        rocketManAgent.train()

        # Give up on episodes that are going badly.
        if episode_rewards < minReward:
            done = True

        if done:
            #Updating epsilon. Once epsilon has been forced to 0 above, this
            #guard keeps it there.
            if rocketManAgent.epsilon >= rocketManAgent.epsilonEnd:
                rocketManAgent.epsilon *= rocketManAgent.epsilonDecay

            rewardList.append(episode_rewards)
            epsilonList.append(rocketManAgent.epsilon)
            scores_window.append(episode_rewards)
            scores_window1.append(episode_rewards)

            meanRewards.append(np.mean(scores_window))
            meanRewards1.append(np.mean(scores_window1))

            clear_output(wait=True)

            fig = plt.figure(figsize=(20, 10))
            ax = plt.subplot(2,2,1)
            ax2 = ax.twinx()
            ax.plot(range(episodeNum+1), rewardList, 'g-')
            ax2.plot(range(episodeNum+1), epsilonList, 'b-')

            ax = plt.subplot(2,2,2)
            ax.plot(range(len(meanRewards)), meanRewards, label="100 mean")
            ax.plot(range(len(meanRewards1)), meanRewards1, label="20 mean")
            plt.legend()
            plt.show()
            # One figure is created per episode; release it explicitly.
            plt.close(fig)

            #Resetting the replay's reward log so it does not grow across episodes
            rocketManAgent.replay.resetRewards()

            break

### Training 2

In [None]:
# Training run 2: same loop as above, plus per-episode runtime tracking, optional
# rendering/recording, and a timelapse video assembled from the saved plots.
import glob

numEpisodes = 1000   # defined here so this cell does not depend on an earlier cell
minReward = -300 #If the episode's summed reward drops below this, end the episode

#Repeatability
seed = 32

#Loading the gym environment. baseEnv always refers to the unwrapped environment;
#env is what the loop (and the agent's random action sampling) actually uses.
baseEnv = gym.make('LunarLander-v2')
env = baseEnv

# Seed the PRNGs of numpy, the random module and the gym env.
np.random.seed(seed)
random.seed(seed)
env.seed(seed)

#Where to save the network weights
#NOTE(review): nothing in this cell saves weights yet, so this name is unused.
modelFileName = "LunarLanderWeights.h5"

#Output folders; created up front so savefig / ffmpeg do not fail on a fresh run
plotDir = 'plotImages'
recordingDir = 'recording'
os.makedirs(plotDir, exist_ok=True)
os.makedirs(recordingDir, exist_ok=True)

numInput = env.observation_space.shape[0]
numActions = env.action_space.n

#Number of hidden neurons and layers
numHidden = 64
numHiddenLayers = 2

#Creating agent (rocketMan). The weights file name is deliberately not passed:
#the fifth positional parameter of rocketMan is the learning rate (alpha).
rocketManAgent = rocketMan(numInput, numHidden, numHiddenLayers, numActions)

#Stop if episode takes longer than this time (currently not enforced)
maxTime = 10

#Information for plotting
rewardList = []
meanRewards = []     # mean reward over the last 100 episodes
meanRewards1 = []    # mean reward over the last 20 episodes
epsilonList = []
scores_window = deque(maxlen=100)
scores_window1 = deque(maxlen=20)

#Runtime (RT) statistics. These are getting a bit confusing, rename.
RTmean = []                    # runtime of every non-rendered episode
RTwindow = deque(maxlen=20)
RTwindow1 = deque(maxlen=100)
meanRTWindow = []              # mean runtime over the last 20 episodes
meanRTWindow1 = []             # mean runtime over the last 100 episodes

#Every 'n' episodes, render the lander
renderEvery = 100
renderLander = False

avgEpisodeRT = 0 #The average runtime for each episode
saveRecordings = False #Do you want to save the file

start = timer()
for episodeNum in range(numEpisodes):
    #By default, do not render the episode
    renderEpsiode = renderLander and ((episodeNum % renderEvery) == 0)

    if renderEpsiode:
        # Always wrap the bare environment so Monitor wrappers never stack up.
        env = gym.wrappers.Monitor(baseEnv, recordingDir + '/' + str(episodeNum) + '/', force=True)

    state = env.reset()
    step = 0
    episodeReward = 0
    episodeStartTime = timer()

    while True:
        if renderEpsiode:
            env.render()

        chosenAction = rocketManAgent.getAction(state)

        #Taking action in environment
        state_prime, reward, done, info = env.step(chosenAction)

        #Adding the state information to the replay buffer
        rocketManAgent.replay.add(state, chosenAction, reward, state_prime, done)

        state = state_prime
        episodeReward += reward
        step += 1

        rocketManAgent.train()

        # Give up on episodes that are going badly.
        if episodeReward < minReward:
            done = True

        if done:
            #Updating epsilon
            if rocketManAgent.epsilon >= rocketManAgent.epsilonEnd:
                rocketManAgent.epsilon *= rocketManAgent.epsilonDecay

            rewardList.append(episodeReward)
            epsilonList.append(rocketManAgent.epsilon)
            scores_window.append(episodeReward)
            scores_window1.append(episodeReward)

            meanRewards.append(np.mean(scores_window))
            meanRewards1.append(np.mean(scores_window1))

            episodeEndTime = timer()

            # Rendered episodes are excluded from the runtime statistics.
            if not renderEpsiode:
                epRT = (episodeEndTime - episodeStartTime)
                RTmean.append(epRT)
                RTwindow.append(epRT)
                RTwindow1.append(epRT)
                meanRTWindow.append(np.mean(RTwindow))
                meanRTWindow1.append(np.mean(RTwindow1))

            clear_output(wait=True)

            fig = plt.figure(figsize=(20, 12))
            plt.subplots_adjust(top=0.88)
            # NOTE(review): this runs before any axes exist, so it probably has
            # no visible effect - confirm before moving it after the subplots.
            plt.tight_layout()

            ax = plt.subplot(2,2,1)
            ax2 = ax.twinx()
            ax.set_title("Epsilon & Reward vs Episode")
            ax.set_ylabel("Reward")
            ax2.set_ylabel("Epsilon")
            ax2.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
            ax.set_xlabel("Episode (Ep)")
            lns1 = ax.plot(range(episodeNum+1), rewardList, 'g-', label="Reward")
            lns2 = ax2.plot(range(episodeNum+1), epsilonList, 'b-', label="Epsilon")

            # Merge the handles of both y-axes into a single legend.
            lns = lns1+lns2
            labs = [l.get_label() for l in lns]
            ax2.legend(lns, labs, loc=2)

            ax = plt.subplot(2,2,2)
            ax.set_title("Simple Moving Average (SMA) vs Episode")
            ax.set_ylabel("SMA reward")
            ax.set_xlabel("Episode (Ep)")
            ax.plot(range(len(meanRewards1)), meanRewards1, color='orange', label="20 Ep SMA")
            ax.plot(range(len(meanRewards)), meanRewards, color='blue', label="100 Ep SMA")
            plt.legend(loc=2)

            if(len(RTmean) > 0):
                ax = plt.subplot(2,2,3)
                ax.set_title("Runtime (RT) & SMA vs Episode")
                ax.set_ylabel("Runtime/s (RT)")
                ax.set_xlabel("Episode (Ep)")

                # Each series also gets a dashed horizontal line at its latest value.
                ax.plot((range(len(RTmean))), RTmean, color='r', label="Current RT", linewidth=0.5)
                plt.axhline(y=RTmean[-1], color='r', linestyle='--')

                ax.plot((range(len(meanRTWindow))), meanRTWindow, color='orange', label="20 Ep SMA", linewidth=3.0)
                plt.axhline(y=meanRTWindow[-1], color='orange', linestyle='--')

                ax.plot((range(len(meanRTWindow1))), meanRTWindow1, color='blue', label="100 Ep SMA", linewidth=3.0)
                plt.axhline(y=meanRTWindow1[-1], color='blue', linestyle='--')

                plt.legend(loc=2)

            # One frame per episode; these are stitched into the timelapse below.
            plt.savefig(plotDir + '/' + str(episodeNum) + '.png')
            plt.show()
            plt.close(fig)

            #The episode total is tracked in episodeReward, so the replay's
            #reward log is only cleared here to stop it growing without bound.
            rocketManAgent.replay.resetRewards()

            if renderEpsiode:
                # Closing the Monitor finalises the recording; carry on with
                # the bare environment for the following episodes.
                env.close()
                env = baseEnv

            break

env.close()
end = timer()

#Creating video
ffmpegStatus = os.system("ffmpeg -f image2 -r 24 -i plotImages/%d.png -y -an recording/timelapse.mp4")

#Delete the frames only once the video has actually been written
if ffmpegStatus == 0:
    for framePath in glob.glob(plotDir + '/*'):
        os.remove(framePath)
else:
    print("ffmpeg exited with status", ffmpegStatus, "- keeping the frames in", plotDir)

print("RT: ", (end - start))