In [1]:
import pygame, sys, random
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
import math
import matplotlib.pyplot as plt
import pylab
from fractions import Fraction

# Number of episodes the simulation cell runs.
num_episodes = 500

# Radii (pixels) of the drawn obstacle and agent circles.
obstacleRadius = 10
agentRadius = 10

# --- Geometry of the scene (all values in screen pixels) ---

# boundaryPos / boundaryLength are only referenced from commented-out code
# in the visible cells.
boundaryPos = [100, 100]
boundaryLength = [70,70]
# Radius around the agent inside which rangeFinder counts an obstacle as
# "in range"; goalFinder also places the temporary goal at this distance.
boundaryRadius = 40
# Size handed to pygame.display.set_mode.
dispSize = [1280, 960]
# Default start position of the agent: 100 px in from the right edge, y = 100.
initPosAgentStandard = [dispSize[0] - 100, 100]  # alternative y that was tried: dispSize[1]
# Start position actually used; aliases the default above.
initPosAgent = initPosAgentStandard  # alternative: [boundaryPos[0] + boundaryLength[0] / 2, boundaryPos[1] + boundaryLength[1] / 2]
# Final goal the agent has to reach.
goalPos = [100, 900]
# NOTE(review): goalFinder assigns its own local goalAngle, so this
# module-level value does not appear to be read by the visible cells.
goalAngle = 0  # alternative: random.randrange(0, 360) * math.pi / 180

# Only referenced from commented-out obstacle placement code.
obstacleRandomRange = 1000

# When True, every obstacle is shifted by a random -1/0/+1 step per axis on each frame.
moveObstacles = True
# Number of discrete actions; takeAction maps indices 0..8 to [dx, dy] moves.
action_size = 9
# Number of obstacles in the scene.
obsNumber = 200
# Network input width; stateGenerator produces a single [dx, dy] pair.
state_size = 2
# state_size = obsNumber + 1

Using TensorFlow backend.


In [2]:
# A2C(Advantage Actor-Critic) agent
class A2CAgent:
    """Advantage Actor-Critic agent holding two actor/critic network pairs.

    The first pair (``actor`` / ``critic``) and the second pair
    (``actorGoal`` / ``criticGoal``) share the same architecture: one hidden
    Dense layer of 128 ReLU units followed by a softmax (actor) or linear
    (critic) output layer.

    Parameters
    ----------
    state_size : int
        Width of the state vector fed to every network.
    action_size : int
        Number of discrete actions (width of the actor output).
    load_model : bool, optional
        When True (the default, matching the previous hard-coded behaviour)
        the four networks are initialised from the ``.h5`` weight files under
        ``./Practice004_DataSave``. Pass False to start from freshly
        initialised weights, e.g. when those files do not exist.
    """

    def __init__(self, state_size, action_size, load_model=True):
        self.load_model = load_model

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1

        # These are hyper parameters for the Policy Gradient
        self.discount_factor = 0.99
        self.actor_lr = 0.00002
        self.critic_lr = 0.00005

        # create model for policy network
        self.actor = self.build_actor()
        self.critic = self.build_critic()
        self.actorGoal = self.build_actorGoal()
        self.criticGoal = self.build_criticGoal()

        if self.load_model:
            self.actor.load_weights("./Practice004_DataSave/Actor_For.h5")
            self.critic.load_weights("./Practice004_DataSave/Critic_For.h5")
            self.actorGoal.load_weights("./Practice004_DataSave/Actor_Rev.h5")
            self.criticGoal.load_weights("./Practice004_DataSave/Critic_Rev.h5")

    def _build_policy_network(self):
        """Create and compile a state -> action-probability network."""
        net = Sequential()
        net.add(Dense(128, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_normal'))
        net.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_normal'))
        net.summary()
        # Categorical cross-entropy against an advantage-weighted one-hot
        # target yields the policy-gradient update used in train_model.
        net.compile(loss='categorical_crossentropy', optimizer=Adam(lr=self.actor_lr))
        return net

    def _build_value_network(self):
        """Create and compile a state -> state-value network."""
        net = Sequential()
        net.add(Dense(128, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_normal'))
        net.add(Dense(self.value_size, activation='linear', kernel_initializer='glorot_normal'))
        net.summary()
        net.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))
        return net

    # approximate policy and value using Neural Network
    # actor: state is input and probability of each action is output of model
    def build_actor(self):
        """Build the first policy network."""
        return self._build_policy_network()

    # critic: state is input and value of state is output of model
    def build_critic(self):
        """Build the first value network."""
        return self._build_value_network()

    def build_actorGoal(self):
        """Build the second ("Goal") policy network; same architecture as build_actor."""
        return self._build_policy_network()

    def build_criticGoal(self):
        """Build the second ("Goal") value network; same architecture as build_critic."""
        return self._build_value_network()

    def get_action(self, state):
        """Return the flattened action-probability vector of ``self.actor``.

        Despite the name no action is sampled here; the caller receives the
        full distribution and samples from it itself.
        """
        return self.actor.predict(state, batch_size=1).flatten()

    def get_actionGoal(self, state):
        """Return the flattened action-probability vector of ``self.actorGoal``."""
        return self.actorGoal.predict(state, batch_size=1).flatten()

    def train_model(self, state, action, reward, next_state, done):
        """Run one actor and one critic update from a single transition.

        Parameters
        ----------
        state, next_state : array of shape (1, state_size)
            States before and after taking ``action``.
        action : int
            Index of the action that was taken.
        reward : number
            Reward received for the transition.
        done : bool
            True when ``next_state`` is terminal; the bootstrap term is then
            dropped and the critic is not evaluated on ``next_state``.
        """
        target = np.zeros((1, self.value_size))
        advantages = np.zeros((1, self.action_size))

        # predict() returns a (1, value_size) array; pull the scalar out
        # explicitly instead of relying on NumPy's deprecated implicit
        # 1-element-array -> scalar conversion on assignment.
        value = float(self.critic.predict(state)[0][0])

        if done:
            td_target = reward
        else:
            next_value = float(self.critic.predict(next_state)[0][0])
            td_target = reward + self.discount_factor * next_value

        # Only the taken action carries a non-zero advantage.
        advantages[0][action] = td_target - value
        target[0][0] = td_target

        self.actor.fit(state, advantages, epochs=1, verbose=0)
        self.critic.fit(state, target, epochs=1, verbose=0)

In [3]:
def stateGenerator(obsPosition, agtPosition, idx):
    """Build the network input: the agent's offset from a reference point.

    obsPosition is a single [x, y] point when idx == -1; otherwise it is a
    sequence of points and entry idx is used as the reference.
    Returns the [dx, dy] offset reshaped to [1, state_size].
    """
    reference = obsPosition if idx == -1 else obsPosition[idx]
    offset = [agtPosition[0] - reference[0], agtPosition[1] - reference[1]]
    return np.reshape(offset, [1, state_size])

In [4]:
def takeAction(action):
    """Translate an action index into a one-pixel [dx, dy] move.

    Indices 0-2 move right, 3-5 move left; 1, 3, 7 add +1 in y and
    2, 5, 6 add -1 in y. Index 8 (and any unknown index) stands still.
    """
    if action in (0, 1, 2):
        xAction = 1
    elif action in (3, 4, 5):
        xAction = -1
    else:
        xAction = 0

    if action in (1, 3, 7):
        yAction = 1
    elif action in (2, 5, 6):
        yAction = -1
    else:
        yAction = 0

    return [xAction, yAction]

In [5]:
def rangeFinder(allObsPos, rangeCenter):
    """Find the obstacles close to rangeCenter and the single closest one.

    Looks at the first obsNumber entries of allObsPos. Returns
    [count, positions, index] where count is how many obstacles lie strictly
    within boundaryRadius of rangeCenter, positions is a list of length
    obsNumber whose first count entries are those obstacles (the remainder
    is padded with [0, 0]), and index is the position in allObsPos of the
    obstacle nearest to rangeCenter.
    """
    centerX, centerY = rangeCenter[0], rangeCenter[1]
    distances = []
    nearby = []
    for k in range(obsNumber):
        obstacle = allObsPos[k]
        dist = math.sqrt((obstacle[0] - centerX)**2 + (obstacle[1] - centerY)**2)
        distances.append(dist)
        if dist < boundaryRadius:
            nearby.append(obstacle)

    found = len(nearby)
    padded = nearby + [[0, 0] for _ in range(obsNumber - found)]
    return [found, padded, np.argmin(distances)]

In [6]:
def goalFinder(agtPos):
    """Return a temporary goal boundaryRadius pixels from agtPos toward goalPos.

    The heading from the agent to goalPos is computed first; the result is
    the integer (floored) point at distance boundaryRadius along that heading.
    """
    dx = goalPos[0] - agtPos[0]
    dy = goalPos[1] - agtPos[1]

    if dx == 0:
        # Goal is straight along the y axis: atan would divide by zero.
        heading = (90 if dy > 0 else -90) * math.pi / 180
    else:
        heading = math.atan(1.0 * dy / dx)
        if dx < 0:
            # atan only covers the right half-plane; rotate for a goal on the left.
            heading += math.pi

    return [
        int(math.floor(agtPos[0] + boundaryRadius * math.cos(heading))),
        int(math.floor(agtPos[1] + boundaryRadius * math.sin(heading))),
    ]

In [7]:
def nearestAction(actionIdx):
    """Return the two action indices listed as neighbours of actionIdx.

    Indices 0-7 each map to a fixed pair; anything else (including the
    stand-still action 8) maps to [8, 8].
    """
    neighbours = (
        (1, 2),  # 0
        (0, 7),  # 1
        (0, 6),  # 2
        (4, 7),  # 3
        (3, 5),  # 4
        (4, 6),  # 5
        (5, 2),  # 6
        (1, 3),  # 7
    )
    for idx, pair in enumerate(neighbours):
        if actionIdx == idx:
            return list(pair)
    return [8, 8]

In [8]:
def _makeObstacleGrid():
    """Return obsNumber obstacle positions on a 14-column grid with 80 px pitch."""
    # Explicit floor division keeps the row index an integer on Python 3 too
    # (the previous `i / 14` only truncated under Python 2).
    return [[150 + (i % 14) * 80, (i // 14) * 80] for i in range(obsNumber)]


def _combinePolicies(avoidPolicy, goalPolicy):
    """Multiply two action distributions and renormalise the product.

    If the product has no usable probability mass (it underflowed to zero or
    is not finite) the obstacle-avoidance distribution alone is used, and a
    uniform distribution as the last resort, so that np.random.choice always
    receives valid probabilities.
    """
    avoid = np.asarray(avoidPolicy, dtype=np.float64)
    combined = avoid * np.asarray(goalPolicy, dtype=np.float64)
    total = np.sum(combined)
    if not np.isfinite(total) or total <= 0:
        combined = avoid
        total = np.sum(combined)
        if not np.isfinite(total) or total <= 0:
            return np.full(len(avoid), 1.0 / len(avoid))
    return combined / total


pygame.init()
screen = pygame.display.set_mode(dispSize)
screen.fill([200, 200, 200])

# make A2C agent
agent = A2CAgent(state_size, action_size)
agentGoal = A2CAgent(state_size, action_size)
rList, episodes = [], []

# Make Obstacles (obsNumber)
obstaclePos = _makeObstacleGrid()

# This cell only evaluates the loaded weights: no train_model call is made
# and no weights are written back.
for e in range(num_episodes):
    # Initialize. The start position is read from the config and never
    # reassigned here, so re-running the cell after an interrupted run
    # starts from the configured position again.
    done = False
    score = 0
    x = initPosAgent[0]
    y = initPosAgent[1]
    print("Episode ", e, "Starts!")

    while not done:
        # Let pygame process its internal event queue so the window is not
        # flagged as unresponsive during long runs.
        pygame.event.pump()

        rangeObsNumber, rangeObsPos, _nearestIdx = rangeFinder(obstaclePos, [x, y])

        # Obstacle avoidance: product of the policies obtained for each
        # obstacle in range; uniform when nothing is in range.
        if rangeObsNumber == 0:
            avoidPolicy = np.full(action_size, 1.0 / action_size)
        else:
            avoidPolicy = np.ones(action_size)
            for i in range(rangeObsNumber):
                state = stateGenerator(rangeObsPos, [x, y], i)
                avoidPolicy = avoidPolicy * agent.get_action(state)

        # Goal seeking: policy toward a temporary goal placed
        # boundaryRadius pixels from the agent in the goal direction.
        tmpGoalPos = goalFinder([x, y])
        state = stateGenerator(tmpGoalPos, [x, y], -1)
        goalPolicy = agentGoal.get_actionGoal(state)

        actionProb = _combinePolicies(avoidPolicy, goalPolicy)
        action = np.random.choice(action_size, 1, p=actionProb)[0]

        xMove, yMove = takeAction(action)
        x = x + xMove
        y = y + yMove

        # 1 = goal reached, -1 = collision, 0 = episode still running.
        collisionFlag = 0
        pygame.draw.circle(screen, [100, 100, 255], [x, y], 10, 0)

        if math.sqrt((x - goalPos[0])**2 + (y - goalPos[1])**2) <= agentRadius:
            print("Goal Reached!")
            collisionFlag = 1
            done = True

        # No early exit on collision: every obstacle still has to be moved
        # and drawn for this frame.
        for i in range(0, obsNumber):
            if moveObstacles:
                obstaclePos[i][0] = obstaclePos[i][0] + random.randrange(-1, 2)
                obstaclePos[i][1] = obstaclePos[i][1] + random.randrange(-1, 2)

            pygame.draw.circle(screen, [255, 50, 50], obstaclePos[i], obstacleRadius, 0)
            # 19 px is just under agentRadius + obstacleRadius with the
            # configured radii of 10 each.
            if math.sqrt((x - obstaclePos[i][0])**2 + (y - obstaclePos[i][1])**2) < 19:
                print("Collision!")
                collisionFlag = -1
                done = True

        reward = 0
        if done:
            if collisionFlag == 1:
                reward = 1000
                rList.append(1)
            elif collisionFlag == -1:
                reward = -1000
                rList.append(0)

        score += reward

        # Sensing range around the agent.
        pygame.draw.circle(screen, [255, 100, 100], [x, y], boundaryRadius, 2)

        if done:
            # Reset the obstacle field for the next episode.
            obstaclePos = _makeObstacleGrid()
            episodes.append(e)

        # circle(Surface, color, pos, radius, width=0)
        pygame.draw.circle(screen, [100, 255, 100], goalPos, 10, 2)
        pygame.draw.circle(screen, [0, 255, 200], tmpGoalPos, 5, 5)
        pygame.display.flip()
        screen.fill([220, 220, 220])

    # Function-call form works on both Python 2 and Python 3.
    print(score)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               384       
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 1161      
Total params: 1,545
Trainable params: 1,545
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 128)               384       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 513
Trainable params: 513
Non-trainable params

Goal Reached!
1000
('Episode ', 61, 'Starts!')
Goal Reached!
1000
('Episode ', 62, 'Starts!')
Goal Reached!
1000
('Episode ', 63, 'Starts!')
Collision!
-1000
('Episode ', 64, 'Starts!')
Goal Reached!
1000
('Episode ', 65, 'Starts!')
Goal Reached!
1000
('Episode ', 66, 'Starts!')
Collision!
-1000
('Episode ', 67, 'Starts!')
Collision!
-1000
('Episode ', 68, 'Starts!')
Goal Reached!
1000
('Episode ', 69, 'Starts!')
Goal Reached!
1000
('Episode ', 70, 'Starts!')
Goal Reached!
1000
('Episode ', 71, 'Starts!')
Goal Reached!
1000
('Episode ', 72, 'Starts!')
Goal Reached!
1000
('Episode ', 73, 'Starts!')
Goal Reached!
1000
('Episode ', 74, 'Starts!')
Goal Reached!
1000
('Episode ', 75, 'Starts!')
Goal Reached!
1000
('Episode ', 76, 'Starts!')
Collision!
-1000
('Episode ', 77, 'Starts!')
Goal Reached!
1000
('Episode ', 78, 'Starts!')
Goal Reached!
1000
('Episode ', 79, 'Starts!')
Goal Reached!
1000
('Episode ', 80, 'Starts!')
Goal Reached!
1000
('Episode ', 81, 'Starts!')
Goal Reached!
1000
('

Goal Reached!
1000
('Episode ', 235, 'Starts!')
Goal Reached!
1000
('Episode ', 236, 'Starts!')
Goal Reached!
1000
('Episode ', 237, 'Starts!')
Goal Reached!
1000
('Episode ', 238, 'Starts!')
Collision!
-1000
('Episode ', 239, 'Starts!')
Goal Reached!
1000
('Episode ', 240, 'Starts!')
Goal Reached!
1000
('Episode ', 241, 'Starts!')
Collision!
-1000
('Episode ', 242, 'Starts!')
Collision!
-1000
('Episode ', 243, 'Starts!')
Goal Reached!
1000
('Episode ', 244, 'Starts!')
Collision!
-1000
('Episode ', 245, 'Starts!')
Goal Reached!
1000
('Episode ', 246, 'Starts!')
Goal Reached!
1000
('Episode ', 247, 'Starts!')
Goal Reached!
1000
('Episode ', 248, 'Starts!')
Goal Reached!
1000
('Episode ', 249, 'Starts!')
Collision!
-1000
('Episode ', 250, 'Starts!')
Goal Reached!
1000
('Episode ', 251, 'Starts!')
Goal Reached!
1000
('Episode ', 252, 'Starts!')
Collision!
-1000
('Episode ', 253, 'Starts!')
Goal Reached!
1000
('Episode ', 254, 'Starts!')
Collision!
-1000
('Episode ', 255, 'Starts!')
Collis

Goal Reached!
1000
('Episode ', 409, 'Starts!')
Goal Reached!
1000
('Episode ', 410, 'Starts!')
Goal Reached!
1000
('Episode ', 411, 'Starts!')
Goal Reached!
1000
('Episode ', 412, 'Starts!')
Goal Reached!
1000
('Episode ', 413, 'Starts!')
Goal Reached!
1000
('Episode ', 414, 'Starts!')
Goal Reached!
1000
('Episode ', 415, 'Starts!')
Goal Reached!
1000
('Episode ', 416, 'Starts!')
Collision!
-1000
('Episode ', 417, 'Starts!')
Goal Reached!
1000
('Episode ', 418, 'Starts!')
Goal Reached!
1000
('Episode ', 419, 'Starts!')
Goal Reached!
1000
('Episode ', 420, 'Starts!')
Collision!
-1000
('Episode ', 421, 'Starts!')
Goal Reached!
1000
('Episode ', 422, 'Starts!')
Collision!
-1000
('Episode ', 423, 'Starts!')
Collision!
-1000
('Episode ', 424, 'Starts!')
Goal Reached!
1000
('Episode ', 425, 'Starts!')
Goal Reached!
1000
('Episode ', 426, 'Starts!')
Goal Reached!
1000
('Episode ', 427, 'Starts!')
Goal Reached!
1000
('Episode ', 428, 'Starts!')
Goal Reached!
1000
('Episode ', 429, 'Starts!')


In [9]:
# Use the number of episodes that actually recorded an outcome as the
# denominator: if the simulation cell was interrupted early, dividing by the
# configured num_episodes would understate the success rate.
finishedEpisodes = len(rList)
successPercent = 100.0 * sum(rList) / finishedEpisodes if finishedEpisodes else 0.0
print("Percent of successful episodes: " + str(successPercent) + "%")

# plt.bar(range(len(rList)), rList, color = "Blue",width = 0.00001)
# plt.show()

Percent of successful episodes: 70.0%
