In [None]:
import copy
import pylab
import numpy as np
import time
import sys
from environment3 import Env
from keras.layers import Dense
from keras.optimizers import Adam
#from keras.optimizers import SGD
from keras.models import Sequential
from keras import backend as K
%run environment3.py

# Number of training episodes executed by the main loop below.
EPISODES = 2500
# Number of agents; sizes the per-agent `done` and `score` lists in the main loop.
number_agents = 3
# this is RL agent for the GridWorld
class ReinforceAgent:
    """Policy-gradient (REINFORCE-style) agent for one GridWorld player.

    The agent owns a small softmax policy network, buffers the
    (state, action, reward) samples of the running episode and performs one
    policy update per episode from the normalised discounted returns.
    """

    # Pre-trained weight file for each known agent number.  Agent numbers
    # that are not listed here start from freshly initialised weights.
    _WEIGHT_FILES = {
        0: './save_model/3Agent10times10_32n_1.h5',
        1: './save_model/3Agent10times10_32n_2.h5',
        2: './save_model/3Agent10times10_32n_3.h5',
    }

    def __init__(self, agentNr):
        """Build the policy network and optionally load saved weights.

        Args:
            agentNr: Zero-based agent number; selects which weight file is
                loaded when ``self.load_model`` is true.
        """
        self.load_model = True
        # actions which agent can do [l,r,u,d,ss]
        self.action_space = [0, 1, 2, 3, 4]
        # get size of state and action
        self.action_size = len(self.action_space)
        self.state_size = 21
        self.discount_factor = 0.99
        self.learning_rate = 0.001

        self.model = self.build_model()
        # NOTE: this rebinds the instance attribute `optimizer` from the
        # method to the compiled training function it returns; train_model
        # relies on that, so the name is kept for compatibility.
        self.optimizer = self.optimizer()
        self.states, self.actions, self.rewards = [], [], []

        weight_file = self._WEIGHT_FILES.get(agentNr)
        if self.load_model and weight_file is not None:
            self.model.load_weights(weight_file)

    def build_model(self):
        """Return the policy network: state in, action probabilities out."""
        model = Sequential()
        model.add(Dense(32, input_dim=self.state_size, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(self.action_size, activation='softmax'))
        model.summary()
        return model

    def optimizer(self):
        """Create the error function and the backend training function.

        Returns:
            A backend function taking ``[states, one_hot_actions,
            discounted_rewards]`` that applies one Adam update to the model.
        """
        # Placeholder width follows action_size instead of a literal 5 so
        # the network output and the one-hot actions cannot drift apart.
        action = K.placeholder(shape=[None, self.action_size])
        discounted_rewards = K.placeholder(shape=[None, ])

        # Probability the policy assigned to the action actually taken,
        # weighted in log-space by the (normalised) return.
        action_prob = K.sum(action * self.model.output, axis=1)
        cross_entropy = K.log(action_prob) * discounted_rewards
        loss = -K.sum(cross_entropy)

        # create training function
        optimizer = Adam(lr=self.learning_rate)
        updates = optimizer.get_updates(self.model.trainable_weights, [],
                                        loss)
        train = K.function([self.model.input, action, discounted_rewards], [],
                           updates=updates)

        return train

    def get_action(self, state):
        """Sample an action index from the policy for ``state``.

        Args:
            state: Batch holding a single state; only the first row of the
                prediction is used.
        """
        policy = self.model.predict(state)[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def discount_rewards(self, rewards):
        """Return the discounted return for every time step.

        Args:
            rewards: Sequence of per-step rewards of one episode.

        Returns:
            A float64 array ``g`` with
            ``g[t] = rewards[t] + discount_factor * g[t + 1]``.
        """
        # Always allocate floats: np.zeros_like on a list of ints would give
        # an integer array and silently truncate every discounted value.
        discounted_rewards = np.zeros(len(rewards), dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    def append_sample(self, state, action, reward):
        """Buffer one step of the running episode.

        Args:
            state: Batch holding a single state; its first row is stored.
            action: Index of the action taken; stored one-hot encoded.
            reward: Reward received for the step.
        """
        self.states.append(state[0])
        self.rewards.append(reward)
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)

    def train_model(self):
        """Update the policy from the buffered episode and clear the buffers.

        Does nothing when no samples have been buffered.
        """
        if not self.rewards:
            return

        discounted_rewards = np.float32(self.discount_rewards(self.rewards))
        discounted_rewards -= np.mean(discounted_rewards)
        # A one-step episode (or identical returns) has zero spread; dividing
        # by it would feed NaNs into the update, so the centred values are
        # left unscaled in that case.
        spread = np.std(discounted_rewards)
        if spread > 0:
            discounted_rewards /= spread

        self.optimizer([self.states, self.actions, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []


if __name__ == "__main__":
    # Training driver: every agent acts in turn in the shared environment and
    # updates its policy once its own episode has finished.
    #
    # NOTE: this is deliberately kept as flat top-level code rather than a
    # function, because later notebook cells read `agents`, `scores` and
    # `global_step` as module-level names.

    env = Env()

    # One agent per player; the agent number selects its weight file.
    agents = [ReinforceAgent(i) for i in range(number_agents)]

    global_step = 0
    scores, episodes = [], []

    for e in range(EPISODES):
        episodes.append(e + 1)
        done = [False] * number_agents
        score = [0] * number_agents
        # fresh env: reset every agent first, then shape each state as a
        # single-row batch for the policy network
        state = [env.reset(i) for i in range(number_agents)]
        state = [np.reshape(s, [1, agent.state_size])
                 for s, agent in zip(state, agents)]

        # The episode lasts until every agent has reported `done`.
        while not all(done):
            for i, agent in enumerate(agents):
                if done[i]:
                    continue
                global_step += 1
                if global_step % 10000 == 0:
                    print(global_step)
                # get action for the current state and go one step in environment
                action = agent.get_action(state[i])
                next_state, reward, done[i] = env.step(action, i)
                next_state = np.reshape(next_state, [1, agent.state_size])

                agent.append_sample(state[i], action, reward)
                score[i] += reward
                state[i] = copy.deepcopy(next_state)

                if done[i]:
                    # update policy neural network for each episode
                    agent.train_model()
                    score[i] = round(score[i], 2)
                    if e % 100 == 0:
                        print("Episode:", e + 1, " score for agent ",i+1,":",score[i], "at time_step:", global_step)

        scores.append(score)
        # Periodic checkpointing is disabled here; weights are saved manually
        # in a separate cell.  The former `if e == 2500: close()` branch was
        # removed: `e` never reaches 2500 inside range(EPISODES) and `close`
        # is not defined in this file.

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                704       
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 165       
Total params: 1,925
Trainable params: 1,925
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 32)                704       
_________________________________________________________________
dense_5 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_6 

# Plot test

In [None]:
import pylab
import pickle

# Load the pickled episode numbers and per-episode score lists, then plot one
# reward curve per agent and store the figure as EPS.
# NOTE(review): pickle.load executes code embedded in the file - only open
# files that were produced by this notebook.
with open('./save_data/episodes', 'rb') as episodes_file:
    episodes = pickle.load(episodes_file)
with open('./save_data/3Agent10times10_32n_report', 'rb') as report_file:
    scores = pickle.load(report_file)

# Column k of the score table is the reward series of agent k + 1.
s1 = [scores[k][0] for k in range(2500)]
s2 = [scores[k][1] for k in range(2500)]
s3 = [scores[k][2] for k in range(2500)]

for series, colour in ((s1, 'b'), (s2, 'g'), (s3, 'r')):
    pylab.plot(episodes, series, colour, linewidth=0.1)

pylab.xlabel('Episodes')
pylab.ylabel('Reward')
pylab.ylim((-20, 3))
pylab.legend(('Agent 1', 'Agent 2', 'Agent 3'), loc='lower right')

pylab.savefig('./save_graph/3Agent10times10_32n_143864steps',
              format='eps', dpi=900)

# Saving/writing lists

In [None]:
import pickle

# Persist the per-episode score lists so the plotting cell can reload them
# later with pickle.load on the same path.
with open('./save_data/3Agent10times10_32n_report', 'wb') as report_file:
    pickle.dump(scores, report_file)
    


In [None]:
import pickle
# Reload an earlier score table and report the best episode score of agent 3.
# NOTE(review): pickle.load executes code embedded in the file - only open
# files that were produced by this notebook.
with open('./save_data/3Agent10times10notseed', 'rb') as report_file:
    scores = pickle.load(report_file)

# Column k of the score table is the reward series of agent k + 1.
s1 = [scores[k][0] for k in range(2500)]
s2 = [scores[k][1] for k in range(2500)]
s3 = [scores[k][2] for k in range(2500)]

best_agent3 = max(s3)
print(best_agent3)

In [None]:
# Write the current policy weights of the three agents to their .h5 files
# (the same paths ReinforceAgent loads from on construction).
weight_files = (
    "./save_model/3Agent10times10_32n_1.h5",
    "./save_model/3Agent10times10_32n_2.h5",
    "./save_model/3Agent10times10_32n_3.h5",
)
for agent_idx, weight_file in enumerate(weight_files):
    agents[agent_idx].model.save_weights(weight_file)

In [None]:
# Show the total number of environment steps counted by the training loop.
print(global_step)