In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.animation as animation

import time
import random

## Environments (Planet)

In [2]:
# Action index -> (row delta, column delta), in the order Up, Down, Left, Right.
ACTIONS = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}


class Planet(object):
    """Grid world that a robot explores.

    ``self.planet`` is a 6x6 integer array. Cell codes used in this notebook:
    0 = empty, 1 = trap, 2 = robot, 3 = point of interest (POI). Only the
    robot (at row 0, column 0) is placed by default; traps and POIs can be
    added by writing 1 or 3 into ``self.planet`` before calling
    ``construct_allowed_states()`` again.

    Positions are always ``(row, column)`` tuples, i.e. ``(y, x)``.
    """

    def __init__(self):
        self.planet = np.zeros((6, 6)).astype(int)
        self.planet[0, 0] = 2  # the robot starts in the top-left cell

        self.robot_positions = [(0, 0)]  # one (y, x) entry per robot
        self.robot_0_done = False        # set by the driver loop to end an episode
        self.steps = 0                   # number of steps taken, maintained by the driver loop
        self.allowed_states = None       # filled in by construct_allowed_states()
        self.construct_allowed_states()

    def _in_bounds(self, y, x):
        """Return True when (y, x) is a real cell of the grid."""
        rows, cols = self.planet.shape
        return 0 <= y < rows and 0 <= x < cols

    def is_allowed_move(self, state, action):
        """Return True if taking ``action`` from ``state`` stays on the board.

        Bounds come from the grid's shape rather than a hard-coded size.
        Cell contents are not consulted, so moving onto a trap is allowed.
        """
        dy, dx = ACTIONS[action]
        return self._in_bounds(state[0] + dy, state[1] + dx)

    def construct_allowed_states(self):
        """Build ``self.allowed_states``: {(y, x): [legal action indices]}.

        Trap cells (code 1) get no entry. Keys are inserted in row-major order.
        """
        rows, cols = self.planet.shape
        allowed_states = {}
        for y in range(rows):
            for x in range(cols):
                if self.planet[y, x] == 1:
                    continue
                allowed_states[(y, x)] = [
                    action for action in ACTIONS
                    if self.is_allowed_move((y, x), action)
                ]
        self.allowed_states = allowed_states

    def update_planet(self, action, robot_num):
        """Move robot ``robot_num`` one cell in the direction of ``action``.

        Raises:
            IndexError: if the move would leave the grid. Without this check a
                move off the top or left edge would wrap around through NumPy
                negative indexing and record an off-grid position.
        """
        old_y, old_x = self.robot_positions[robot_num]
        y, x = self.invalid_move(action, robot_num)
        if not self._in_bounds(y, x):
            raise IndexError(
                "action %r would move robot %r off the grid to %r"
                % (action, robot_num, (y, x))
            )
        self.planet[y, x] = 2
        self.planet[old_y, old_x] = 0
        self.robot_positions[robot_num] = (y, x)

    def invalid_move(self, action, robot_num):
        """Return the (y, x) cell ``action`` would lead to, without moving.

        The result is not bounds-checked and may lie outside the grid.
        """
        y, x = self.robot_positions[robot_num]
        dy, dx = ACTIONS[action]
        return (y + dy, x + dx)

    def is_game_over(self):
        """Return True once the driver loop has flagged robot 0 as done."""
        return bool(self.robot_0_done)

    def give_reward(self, state_history, robot_num, visited_locations):
        """Return 1 if the robot stands on an unvisited cell, else -0.05.

        ``state_history`` is accepted for interface compatibility and unused.
        """
        location = self.robot_positions[robot_num]
        return 1 if location not in visited_locations else -0.05

    def get_state_and_reward(self, state_history, robot_num, visited_locations):
        """Return ``(current position, reward)`` for robot ``robot_num``."""
        reward = self.give_reward(state_history, robot_num, visited_locations)
        return self.robot_positions[robot_num], reward

## Agent Class (Robots)

In [3]:
# Action index -> (row delta, column delta), in the order Up, Down, Left, Right.
ACTIONS = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}


class Agent(object):
    """Tabular Q-learning agent that prefers cells it has not visited yet.

    The Q-table is a list of ``GRID_SIZE ** 2`` rows (one per cell, row-major)
    with one value per action in ``ACTIONS``.
    """

    # Side length of the square grid the Q-table is laid out for.
    GRID_SIZE = 6

    def __init__(self, states, alpha=0.15, gamma = 0.9, random_factor=0.2, robot_num=9):
        """Create an agent.

        Args:
            states: accepted for interface compatibility; not used.
            alpha: initial learning rate.
            gamma: discount factor applied to the best next-state value.
            random_factor: probability of taking a random allowed move.
            robot_num: index of this robot in ``Planet.robot_positions``.
        """
        self.state_history = [((0, 0), 0, 0, (0, 0))] # state, reward, action, next_state
        self.initial_alpha = alpha
        self.alpha = alpha
        self.learning_decay = 1
        self.gamma = gamma
        self.random_factor = random_factor
        self.robot_num = robot_num

        # Q-values, all starting at 0.0.
        self.qtable = np.zeros((self.GRID_SIZE ** 2, len(ACTIONS))).tolist()

        self.visited_locations = [(0, 0)]
        self.no_visited_locations = 0
        self.seen_locations = []

    def _state_index(self, state):
        """Map a (y, x) cell to its row in the Q-table (row-major)."""
        y, x = state
        return (y * self.GRID_SIZE) + x

    def _in_grid(self, cell):
        """Return True when the (y, x) cell lies on the grid."""
        y, x = cell
        return 0 <= y < self.GRID_SIZE and 0 <= x < self.GRID_SIZE

    def update_state_history(self, state, reward, action, next_state):
        """Append one (state, reward, action, next_state) transition."""
        self.state_history.append((state, reward, action, next_state))

    def trapped(self, reward):
        """Overwrite the reward of the most recent transition in the history.

        Raises IndexError if the history is empty (``learn`` empties it).
        """
        last = list(self.state_history.pop())
        last[1] = reward
        self.state_history.append(tuple(last))

    def reset_count(self):
        """Forget all visited cells and zero the visited counter.

        NOTE(review): unlike ``__init__``, this does not pre-mark (0, 0) as
        visited; confirm that is intended for the start cell.
        """
        self.visited_locations = []
        self.no_visited_locations = 0

    def learn(self, state, reward, action, next_state, episode):
        """Apply one Q-learning update and decay the exploration/learning rates.

        Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))

        Side effects: empties ``state_history``, multiplies ``random_factor``
        by 0.9995 (never below 0.01), and sets
        ``alpha = initial_alpha / (1 + learning_decay * episode)``.
        """
        state_index = self._state_index(state)
        # Use the same row-major index for the next state as for the current
        # one. Multiplying the row by 5 pointed the bootstrap at the wrong row.
        next_state_index = self._state_index(next_state)

        current_q = self.qtable[state_index][action]
        best_next_q = max(self.qtable[next_state_index])
        td_target = reward + (self.gamma * best_next_q)
        self.qtable[state_index][action] = current_q + (self.alpha * (td_target - current_q))

        self.state_history = [] # reset the state_history
        self.random_factor = max(self.random_factor * 0.9995, 0.01)
        self.alpha = (1 / (1 + self.learning_decay * episode)) * self.initial_alpha

    def choose_action(self, state, allowed_moves, state_history):
        """Pick the next action for ``state``.

        With probability ``random_factor`` a uniformly random entry of
        ``allowed_moves`` is returned. Otherwise the highest-valued action
        leading to an on-grid cell not in ``visited_locations`` is chosen; if
        every neighbour was visited, the highest-valued on-grid action is
        chosen. Ties go to the larger action index.

        ``state_history`` is accepted for interface compatibility and unused.

        Returns:
            (action, valid): ``valid`` is always True.
        """
        if np.random.uniform() < self.random_factor:
            return int(np.random.choice(allowed_moves)), True

        # On-grid neighbour reached by each action.
        targets = {}
        for action, (dy, dx) in ACTIONS.items():
            cell = (state[0] + dy, state[1] + dx)
            if self._in_grid(cell):
                targets[action] = cell

        unvisited = [a for a, cell in targets.items() if cell not in self.visited_locations]
        candidates = unvisited if unvisited else list(targets)

        q_row = self.qtable[self._state_index(state)]
        next_move = max(candidates, key=lambda a: (q_row[a], a))
        return next_move, True

## Learning Loop

In [None]:
if __name__ == '__main__':
    # Hyper-parameters
    Learning_Rate = 0.001    # passed to Agent as alpha
    Error_Rate = 0.9         # passed to Agent as gamma (discount factor)
    Epsilon = 0.5            # passed to Agent as random_factor
    Episodes = 4000
    Movement_Budget = 500    # maximum number of steps per episode
    completed_count = 0      # episodes that covered every cell since the last report

    planet = Planet()
    robot = Agent(planet.planet, alpha=Learning_Rate, gamma=Error_Rate, random_factor=Epsilon, robot_num=0)
    moveHistory = []
    epsilonHistory = []

    # Number of cells the robot has to visit to finish an episode, taken from
    # the grid itself instead of being repeated as a literal.
    total_cells = planet.planet.size

    for i in range(Episodes):
        # Progress report every 1000 episodes; the completion counter restarts
        # after each report.
        if i % 1000 == 0:
            print("Episode",i)
            print("Epsilon",robot.random_factor)
            print("Completed", completed_count)
            completed_count = 0
        movementBudget = Movement_Budget
        robot.reset_count()  # empties the visited list and zeroes the counter

        while not planet.is_game_over():
            sHistory = robot.state_history
            # Current state; the reward returned here is ignored.
            state, _ = planet.get_state_and_reward(sHistory, robot.robot_num, robot.visited_locations)
            # Explore or exploit.
            action, valid = robot.choose_action(state, planet.allowed_states[state], sHistory)
            if valid:
                planet.update_planet(action, robot.robot_num)
                # The reward is computed before next_state is marked visited.
                next_state, reward = planet.get_state_and_reward(sHistory, robot.robot_num, robot.visited_locations)
                if next_state not in robot.visited_locations:
                    robot.visited_locations.append(next_state)
                    robot.no_visited_locations += 1
                    if robot.no_visited_locations >= total_cells:
                        completed_count += 1
                        reward = 10  # bonus for covering the whole grid
                    if next_state in robot.seen_locations:
                        robot.seen_locations.remove(next_state)

                # The Q-table is updated after every step.
                robot.learn(state, reward, action, next_state, i)

            planet.steps += 1
            movementBudget -= 1
            # The episode ends on full coverage or when the budget runs out.
            if robot.no_visited_locations >= total_cells or movementBudget <= 0:
                planet.robot_0_done = True

        moveHistory.append(planet.steps) # steps taken this episode, plotted below
        epsilonHistory.append(robot.random_factor)
        planet = Planet() # fresh planet for the next episode

    # Final report and learning curve. These use names defined above, so they
    # live inside the same guard.
    print("Episode",i)
    print("Epsilon",robot.random_factor)
    print("Completed", completed_count)

    print("Visited Locations:", robot.no_visited_locations, "Steps:", moveHistory[-1])
    fig, ax = plt.subplots(figsize=(18, 10))
    ax.semilogy(moveHistory, color="b")
    ax.set_title("Steps per training episode")
    ax.set_xlabel("Episode")
    ax.set_ylabel("Steps taken (log scale)")
    plt.show()

## Testing and Simulation

In [None]:
testHistory = []


def _cells_with(grid, code):
    """Return (xs, ys) lists of the cells in ``grid`` equal to ``code``, row-major."""
    ys, xs = np.nonzero(grid == code)
    return xs.tolist(), ys.tolist()


for i in range(1):
    if i % 100 == 0:
        print(i)

    planet = Planet()

    # Static layout: traps (1), points of interest (3) and the robot (2).
    tx, ty = _cells_with(planet.planet, 1)
    px, py = _cells_with(planet.planet, 3)
    rx, ry = _cells_with(planet.planet, 2)

    # One entry per recorded frame; each entry lists the robot coordinates.
    x_history, y_history = [rx], [ry]

    plt.scatter(rx, ry, marker = 'o', color = 'g')
    plt.scatter(tx, ty, marker = 'x', color = 'r')
    plt.scatter(px, py, marker = '*', color = 'y')
    plt.xlim([-1, 6])
    plt.ylim([-1, 6])
    plt.show()

    movementBudget = Movement_Budget
    robot.reset_count()
    robot.random_factor = 0  # evaluate without random exploration
    total_cells = planet.planet.size
    action_history = []

    # Roll out the learned policy. robot.learn() is deliberately not called:
    # it would change the Q-table under test and lift random_factor from 0
    # back to its 0.01 floor after the first step.
    while not planet.is_game_over():
        sHistory = robot.state_history
        state, _ = planet.get_state_and_reward(sHistory, robot.robot_num, robot.visited_locations)
        action, valid = robot.choose_action(state, planet.allowed_states[state], sHistory)
        if valid:
            planet.update_planet(action, robot.robot_num)
            action_history.append(action)
            next_state, _ = planet.get_state_and_reward(sHistory, robot.robot_num, robot.visited_locations)
            if next_state not in robot.visited_locations:
                robot.visited_locations.append(next_state)
                robot.no_visited_locations += 1
                if next_state in robot.seen_locations:
                    robot.seen_locations.remove(next_state)

        planet.steps += 1
        movementBudget -= 1
        # Stop on full coverage or when the movement budget is used up.
        if robot.no_visited_locations >= total_cells or movementBudget <= 0:
            planet.robot_0_done = True

        rx, ry = _cells_with(planet.planet, 2)
        x_history.append(rx)
        y_history.append(ry)

    # Report the number of steps actually taken in this rollout.
    count = planet.steps
    print("Count", count)
    testHistory.append(planet.steps) # history of steps taken, for later plotting

    fig, ax = plt.subplots()

    def animate(frame):
        """Draw frame ``frame``: POIs, the path walked so far, and the robot."""
        ax.clear()
        ax.set_xlim([-1, 6])
        ax.set_ylim([-1, 6])
        # The trail is rebuilt from the history on every call, so redraws and
        # repeats do not accumulate duplicate points.
        trail_x = px + [x for xs in x_history[:frame + 1] for x in xs]
        trail_y = py + [y for ys in y_history[:frame + 1] for y in ys]
        ax.scatter(trail_x, trail_y, marker = '*', color = 'y')
        ax.scatter(x_history[frame], y_history[frame], marker = 'o', color = 'g')
        return fig,

    # One frame per recorded position, including the final one.
    ani = animation.FuncAnimation(fig, animate, repeat=True,
                                  frames=len(x_history), interval=500)
    # To save the animation using Pillow as a gif
    writer = animation.PillowWriter(fps=5,
                                    metadata=dict(artist='Me'),
                                    bitrate=1800)
    ani.save('path.gif', writer=writer)
    plt.show()
        