In [8]:
import pygame, sys, random
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
import math
import matplotlib.pyplot as plt
import pylab
import datetime

# ---------------------------------------------------------------------------
# Experiment configuration (module-level constants read by the cells below).
# ---------------------------------------------------------------------------

# Number of training episodes run by the main loop.
num_episodes = 3601

# Radii (pixels) of the circles drawn for the agent/goal and the obstacles.
# Their sum is the contact distance used for goal and collision checks.
agentRadius = 10
obstacleRadius = 10

# Square arena: top-left corner and side lengths, in screen pixels.
boundaryPos = [80, 80]
boundaryLength = [200, 200]

# The goal sits at the centre of the arena; the agent starts 100 px to the
# right of it, on the same horizontal line.
# NOTE(review): `/` is integer division on ints under Python 2 and true
# division under Python 3, so these coordinates become floats on Python 3.
initPosGoal = [boundaryPos[0] + boundaryLength[0] / 2,
               boundaryPos[1] + boundaryLength[1] / 2]
initPosAgent = [initPosGoal[0] + 100, initPosGoal[1]]

# Problem dimensions.
state_size = 2      # two features per state (see stateGenerator)
action_size = 8     # eight discrete moves (see takeAction)
obsNumber = 4       # number of obstacles placed around the goal

# Obstacle-motion settings.  moveObstacles is not referenced by the code
# visible in this notebook; obsAngleUnit is the step added to obsAngleIdx
# whenever the obstacles are re-placed at the end of an episode.
moveObstacles = False
obsAngleUnit = 1

In [9]:
# A2C(Advantage Actor-Critic) agent
class A2CAgent:
    """Advantage Actor-Critic agent backed by two small Keras networks.

    The actor maps a state to a probability for each discrete action; the
    critic maps a state to a single scalar value estimate.  Both networks are
    updated online, one transition at a time, by `train_model`.
    """

    def __init__(self, state_size, action_size, load_model=False):
        """Build the actor and critic networks.

        state_size  -- number of input features per state
        action_size -- number of discrete actions
        load_model  -- when True, restore both networks from the weight files
                       under ./Practice004_DataSave/ (defaults to False, which
                       matches the previous hard-coded behaviour)
        """
        self.load_model = load_model

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1

        # These are hyper parameters for the Policy Gradient
        self.discount_factor = 0.99
        self.actor_lr = 0.002
        self.critic_lr = 0.005

        # create model for policy network
        self.actor = self.build_actor()
        self.critic = self.build_critic()

        if self.load_model:
            self.actor.load_weights("./Practice004_DataSave/Actor_PEARL.h5")
            self.critic.load_weights("./Practice004_DataSave/Critic_PEARL.h5")

    # approximate policy and value using Neural Network
    # actor: state is input and probability of each action is output of model
    def build_actor(self):
        """Return the compiled policy network (state -> action probabilities)."""
        actor = Sequential()
        # NOTE(review): the hidden layer has a single ReLU unit -- confirm this
        # bottleneck is intended.
        actor.add(Dense(1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_normal'))
        actor.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_normal'))
        actor.summary()
        # See note regarding crossentropy in cartpole_reinforce.py
        # train_model passes a vector that is zero except for the advantage at
        # the chosen action, so this loss reduces to -advantage * log(p[action]).
        actor.compile(loss='categorical_crossentropy', optimizer=Adam(lr=self.actor_lr))
        return actor

    # critic: state is input and value of state is output of model
    def build_critic(self):
        """Return the compiled value network (state -> scalar value)."""
        critic = Sequential()
        critic.add(Dense(1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_normal'))
        critic.add(Dense(self.value_size, activation='linear', kernel_initializer='glorot_normal'))
        critic.summary()
        critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))
        return critic

    # using the output of policy network, pick action stochastically
    def get_action(self, state):
        """Sample an action index from the actor's output for `state`.

        `state` is passed straight to `actor.predict`; the callers in this
        notebook supply an array of shape (1, state_size).
        """
        policy = self.actor.predict(state, batch_size=1).flatten()
        # Renormalise in float64 so rounding in the network's softmax output
        # cannot trip np.random.choice's "probabilities do not sum to 1" check.
        probabilities = np.asarray(policy, dtype=np.float64)
        probabilities = probabilities / probabilities.sum()
        return np.random.choice(self.action_size, 1, p=probabilities)[0]

    # update policy network every step
    def train_model(self, state, action, reward, next_state, done):
        """Run one actor update and one critic update from a single transition.

        The critic target is `reward` on a terminal step and
        `reward + discount_factor * V(next_state)` otherwise; the advantage fed
        to the actor is that target minus V(state).
        """
        target = np.zeros((1, self.value_size))
        advantages = np.zeros((1, self.action_size))

        # Pull the predictions out as plain floats so the assignments below
        # store scalars rather than relying on size-1 array conversion.
        value = float(np.ravel(self.critic.predict(state))[0])

        if done:
            # Terminal transition: nothing to bootstrap from, so the next
            # state is not evaluated at all.
            td_target = reward
        else:
            next_value = float(np.ravel(self.critic.predict(next_state))[0])
            td_target = reward + self.discount_factor * next_value

        advantages[0][action] = td_target - value
        target[0][0] = td_target

        self.actor.fit(state, advantages, epochs=1, verbose=0)
        self.critic.fit(state, target, epochs=1, verbose=0)

In [10]:
def ckWall(xPos, yPos):
    """Keep a position inside the rectangular arena.

    Each coordinate is moved onto the nearest edge of the rectangle described
    by the module-level `boundaryPos` / `boundaryLength` when it lies outside.

    Returns [x, y, flagWall] where x and y are the (possibly corrected)
    coordinates rounded to int, and flagWall is -1 if either coordinate had to
    be corrected and 0 otherwise.
    """
    def _confine(coord, lower, upper):
        # Returns (coordinate inside [lower, upper], True when it was moved).
        if coord < lower:
            return lower, True
        if coord > upper:
            return upper, True
        return coord, False

    xPos, xMoved = _confine(xPos, boundaryPos[0], boundaryPos[0] + boundaryLength[0])
    yPos, yMoved = _confine(yPos, boundaryPos[1], boundaryPos[1] + boundaryLength[1])

    flagWall = -1 if (xMoved or yMoved) else 0
    return [int(round(xPos)), int(round(yPos)), flagWall]

In [11]:
def stateGenerator(intenPref, distPref):
    """Pack the two scalar features into a (1, state_size) array.

    The training loop passes the squared agent-goal distance as `intenPref`
    and the inverse squared distance to the nearest obstacle as `distPref`.
    """
    features = np.asarray([intenPref, distPref])
    return features.reshape([1, state_size])

In [12]:
def rangeFinder(allObsPos, rangeCenter):
    """Return the index of the obstacle closest to `rangeCenter`.

    Only the first `obsNumber` entries of `allObsPos` are considered; each
    entry and `rangeCenter` are indexed as [x, y].  Ties go to the lowest
    index (np.argmin returns the first minimum).
    """
    distances = [
        math.sqrt((allObsPos[k][0] - rangeCenter[0])**2 + (allObsPos[k][1] - rangeCenter[1])**2)
        for k in range(obsNumber)
    ]
    return np.argmin(distances)

In [13]:
def takeAction(action):
    """Translate a discrete action index into a one-pixel displacement.

    Returns [xAction, yAction] with each component in {-1, 0, 1}.  Indices
    0-7 cover the eight neighbouring directions; any other value yields
    [0, 0] (no movement).
    """
    # (action index, x step, y step), checked in ascending index order.
    moves = (
        (0, 1, 0),
        (1, 1, 1),
        (2, 1, -1),
        (3, -1, 1),
        (4, -1, 0),
        (5, -1, -1),
        (6, 0, -1),
        (7, 0, 1),
    )
    for code, xAction, yAction in moves:
        if action == code:
            return [xAction, yAction]
    return [0, 0]

In [None]:
# Open a 300x300 pygame window and paint it light grey.
pygame.init()
screen = pygame.display.set_mode([300,300])
screen.fill([200, 200, 200])

# Build the A2C agent for the configured state/action sizes.
agent = A2CAgent(state_size, action_size)

# Per-run bookkeeping: rList records 1 (goal) / 0 (collision) per finished
# episode, episodes records the episode numbers.
rList = []
episodes = []
obsAngleIdx = 0

# Initial obstacle layout: one obstacle posOffset pixels from the goal in each
# of the +x, +y, -x and -y directions.
posOffset = 60
obstaclePos = [[0, 0] for _ in range(obsNumber)]
obstaclePos[0][:] = [int(initPosGoal[0] + posOffset), int(initPosGoal[1])]
obstaclePos[1][:] = [int(initPosGoal[0]), int(initPosGoal[1] + posOffset)]
obstaclePos[2][:] = [int(initPosGoal[0] - posOffset), int(initPosGoal[1])]
obstaclePos[3][:] = [int(initPosGoal[0]), int(initPosGoal[1] - posOffset)]

# Main training loop: one pass of the outer loop is one episode, one pass of
# the inner loop is one agent step (and one rendered frame).
for e in range(num_episodes):
    # Per-episode reset: the agent always restarts from initPosAgent.
    done = False
    score = 0
    x = initPosAgent[0]
    y = initPosAgent[1]
    print("Episode ", e, "Starts!")

    # Initial state: squared distance to the goal and inverse squared distance
    # to the nearest obstacle.
    idx = rangeFinder(obstaclePos, [x, y])
    FeatureVec1 = (x - initPosGoal[0])**2 + (y - initPosGoal[1])**2
    FeatureVec2 = 1.0 / (0 + (x - obstaclePos[idx][0])**2 + (y - obstaclePos[idx][1])**2)
    state = stateGenerator(FeatureVec1, FeatureVec2)
    while not done:
        # Service the window's event queue once per frame so the OS does not
        # consider the pygame window unresponsive during long training runs.
        pygame.event.pump()

        action = agent.get_action(state)
        [xMove, yMove] = takeAction(action)

        # Move one step, then keep the agent inside the arena.  wallFlag is
        # currently unused: touching the wall neither ends the episode nor
        # changes the reward.
        x = x + xMove
        y = y + yMove
        [x,y, wallFlag] = ckWall(x,y)

        collisionFlag = 0
        # The screen is cleared at the end of every frame, so the goal has to
        # be redrawn each step to stay visible alongside the agent.
        pygame.draw.circle(screen, [100, 255, 100], initPosGoal, agentRadius, 0)
        pygame.draw.circle(screen, [100, 100, 255], [x,y], agentRadius, 0)
        idx = rangeFinder(obstaclePos, [x, y])
        FeatureVec1 = (x - initPosGoal[0])**2 + (y - initPosGoal[1])**2
        FeatureVec2 = 1.0 / (0 + (x - obstaclePos[idx][0])**2 + (y - obstaclePos[idx][1])**2)
        next_state = stateGenerator(FeatureVec1, FeatureVec2)

        # Goal test: circles touch when the centre distance is at most the
        # sum of the two radii.
        if(math.sqrt((x - initPosGoal[0])**2 + (y - initPosGoal[1])**2) <= obstacleRadius + agentRadius):
            print("Goal Reached!")
            collisionFlag = 1
            done = True
        # Draw every obstacle and test it for contact with the agent.  A
        # collision overrides a goal hit detected above.
        for i in range(0,obsNumber):
            pygame.draw.circle(screen, [255, 50, 50], obstaclePos[i], obstacleRadius, 0)
            if math.sqrt((x - obstaclePos[i][0])**2 + (y - obstaclePos[i][1])**2) <= obstacleRadius + agentRadius:
                print("Collision!", idx)
                collisionFlag = -1
                done = True

        if not done:
            # Shaping reward: small penalty proportional to the squared
            # distance to the goal.
            reward = -1.0 * ((x - initPosGoal[0])**2 + (y - initPosGoal[1])**2) / 100000.0
        else:
            # Terminal reward; rList logs 1 for a goal and 0 for a collision.
            if collisionFlag == 1:
                reward = 10000
                rList.append(1)
            elif collisionFlag == -1:
                reward = -30
                rList.append(0)

        agent.train_model(state, action, reward, next_state, done)

        score += reward
        state = next_state
        if done:
            # Re-place the obstacles for the next episode: obstacle i sits at
            # angle 90*i degrees on a circle of radius posOffset around the goal.
            obstaclePos = [[0, 0] for _ in range(obsNumber)]
            for i in range(0,obsNumber):
                obsRadius = posOffset
                obsAngleIdx += obsAngleUnit

                obsAngle = (90 * i) * math.pi / 180
                obstaclePos[i][0] = int(initPosGoal[0] + obsRadius * math.cos(obsAngle))
                obstaclePos[i][1] = int(initPosGoal[1] + obsRadius * math.sin(obsAngle))

            episodes.append(e)
        # Show the finished frame, then clear the back buffer for the next one.
        pygame.display.flip()
        screen.fill([200,200,200])
    print(datetime.datetime.now().strftime('%H:%M:%S'), score)
    print("====================================================================================================")
    # save the model every 50 episodes (episode 0 included)
    if e % 50 == 0:
        agent.actor.save_weights("./Practice004_DataSave/Actor_PEARL.h5")
        agent.critic.save_weights("./Practice004_DataSave/Critic_PEARL.h5")


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 1)                 3         
_________________________________________________________________
dense_6 (Dense)              (None, 8)                 16        
Total params: 19
Trainable params: 19
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 1)                 3         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 2         
Total params: 5
Trainable params: 5
Non-trainable params: 0
_________________________________________________________________
('Episode ', 0, 'Starts!')
('Collision!', 0)
('22:17:24', -69.93207999999996)
('Episod