In [3]:
import numpy as np
import operator
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:


class GridWorld:
    """
    A 5x5 grid environment with one gold cell and one bomb cell.

    Every cell carries a reward of -1 except the bomb (-10) and the gold (+10).
    Locations are (row, column) tuples with row 0 at the top of the grid.
    The agent starts in the bottom row at a random column.
    """

    # (row delta, column delta) applied by each action; row 0 is the top row,
    # so 'UP' decreases the row index.
    _MOVES = {
        'UP': (-1, 0),
        'DOWN': (1, 0),
        'LEFT': (0, -1),
        'RIGHT': (0, 1),
    }

    def __init__(self):
        """Build the reward grid and place the agent at a random bottom-row cell."""
        self.height = 5
        self.width = 5
        self.grid = np.zeros((self.height, self.width)) - 1  # Initialize grid with -1 rewards
        self.current_location = (4, np.random.randint(0, self.width))  # Agent starts at a random column in the bottom row

        self.bomb_location = (1, 3)
        self.gold_location = (0, 3)
        self.terminal_states = [self.bomb_location, self.gold_location]

        # Set specific rewards for bomb and gold locations
        self.grid[self.bomb_location[0], self.bomb_location[1]] = -10
        self.grid[self.gold_location[0], self.gold_location[1]] = 10

        self.actions = ['UP', 'DOWN', 'LEFT', 'RIGHT']

    def get_available_actions(self):
        """Returns the list of action names the environment accepts."""
        return self.actions

    def agent_on_map(self):
        """Returns a grid of zeros with a 1 at the agent's current location (used for debugging)."""
        grid = np.zeros((self.height, self.width))
        grid[self.current_location[0], self.current_location[1]] = 1
        return grid

    def get_reward(self, location):
        """Returns the reward stored in the grid for a (row, column) location."""
        return self.grid[location[0], location[1]]

    def make_step(self, action):
        """
        Moves the agent one cell in the specified direction.

        If the move would leave the grid, the agent stays where it is and
        receives the reward of the cell it is already standing on.

        Args:
            action: one of 'UP', 'DOWN', 'LEFT', 'RIGHT'.

        Returns:
            The reward of the cell the agent occupies after the step.

        Raises:
            ValueError: if `action` is not a known action name.
        """
        if action not in self._MOVES:
            # Previously an unknown action fell through every branch and failed
            # with an UnboundLocalError on `reward`; fail with a clear message instead.
            raise ValueError(
                "Unknown action %r; expected one of %s" % (action, self.actions)
            )

        row_delta, col_delta = self._MOVES[action]
        new_row = self.current_location[0] + row_delta
        new_col = self.current_location[1] + col_delta

        # Only move when the target cell is inside the grid; otherwise stay put.
        if 0 <= new_row < self.height and 0 <= new_col < self.width:
            self.current_location = (new_row, new_col)

        return self.get_reward(self.current_location)

    def check_state(self):
        """
        Check if the agent is in a terminal state (gold or bomb).
        Returns 'TERMINAL' if so, otherwise 'ONGOING'.
        """
        if self.current_location in self.terminal_states:
            return 'TERMINAL'
        return 'ONGOING'


In [5]:
class RandomAgent():
  """Baseline agent that ignores the environment state and acts uniformly at random."""

  def choose_action(self,available_actions):
    """
    Returns a random choice of the available actions.

    Args:
      available_actions: non-empty sequence of action names to pick from.

    Returns:
      One element of `available_actions`, chosen with np.random.choice.
    """
    # The parameter is the list of actions itself; the body previously read a
    # differently named variable and only worked if a global with that name existed.
    return np.random.choice(available_actions)

In [7]:
class Q_Agent():
  """Tabular Q-learning agent with epsilon-greedy action selection."""

  def __init__(self,environment, episilon =0.05,alpha=0.1, gamma =1):
    """
    Args:
      environment: object exposing `height`, `width` and `current_location`.
      episilon: probability of taking a random (exploratory) action.
      alpha: learning rate used when blending old and new Q-value estimates.
      gamma: discount factor applied to the best Q-value of the next state.
    """
    self.environment=environment
    # One entry per grid cell, keyed by (row, column), holding the current
    # value estimate of every action; all estimates start at 0.
    self.q_table = dict()
    for x in range(environment.height):
      for y in range(environment.width):
        self.q_table[(x,y)]={'UP':0,'DOWN':0,'LEFT':0,'RIGHT':0}

    self.episilon=episilon
    self.alpha=alpha
    self.gamma=gamma

  def choose_action(self,available_actions):
    """
    Returns the action to take from the environment's current location.

    With probability `episilon` a random entry of `available_actions` is
    returned (exploration). Otherwise the action with the highest Q-value for
    the current location is returned, with ties broken at random.
    """
    if np.random.uniform(0,1)<self.episilon:
      action = available_actions[np.random.randint(0,len(available_actions))]
    else:
      q_values_of_state = self.q_table[self.environment.current_location]
      max_value = max(q_values_of_state.values())
      # Collect every action tied for the maximum so ties are broken randomly.
      action = np.random.choice([k for k, v in q_values_of_state.items() if v == max_value])

    # Return from both branches; the exploratory branch previously fell through
    # and returned None.
    return action

  def learn(self,old_state, reward, new_state, action):
    """
    Update the Q-value table using the Q-learning rule.

    Q(old_state, action) becomes
    (1 - alpha) * Q(old_state, action) + alpha * (reward + gamma * max_a Q(new_state, a)).
    """
    q_values_of_state = self.q_table[new_state]
    max_q_value_in_new_state = max(q_values_of_state.values())
    current_q_value = self.q_table[old_state][action]

    self.q_table[old_state][action]=(1-self.alpha)*current_q_value+self.alpha*(reward + self.gamma*max_q_value_in_new_state)


In [15]:
def play(environment,agent,trials=500, max_steps_per_episode=1000,learn=False):
  """
  Runs `trials` episodes of `agent` acting in `environment`.

  Each episode ends when the environment reports 'TERMINAL' or after
  `max_steps_per_episode` steps, whichever comes first.

  Args:
    environment: object exposing `current_location`, `actions`,
      `make_step(action)` and `check_state()`.
    agent: object exposing `choose_action(actions)` and, when `learn` is
      true, `learn(old_state, reward, new_state, action)`.
    trials: number of episodes to run.
    max_steps_per_episode: step cap for a single episode.
    learn: when true, the agent's `learn` method is called after every step.

  Returns:
    A list with the total reward collected in each episode.
  """
  reward_per_episode = []

  for trial in range(trials):
    cumulative_reward = 0
    step = 0
    game_over = False
    while step < max_steps_per_episode and not game_over:
      old_state = environment.current_location
      action = agent.choose_action(environment.actions)
      reward = environment.make_step(action)
      new_state = environment.current_location

      if learn:
        agent.learn(old_state,reward,new_state,action)

      # Bookkeeping happens on every step, whether or not the agent learns;
      # otherwise a non-learning run would never advance the step counter.
      cumulative_reward += reward
      step += 1

      if environment.check_state() == 'TERMINAL':
        # Re-run the constructor to reset the environment for the next episode.
        environment.__init__()
        game_over = True

    reward_per_episode.append(cumulative_reward)

  # Return only after all trials have been played.
  return reward_per_episode


In [16]:
# Demo: take a single random step in a fresh GridWorld and show the result.
env = GridWorld()
agent = RandomAgent()

# Show where the agent starts (a 1 marks its cell on the printed grid).
print("Current position of thr agent =", env.current_location)
print(env.agent_on_map())
available_actions=env.get_available_actions()
print("Available Actions =",available_actions)
# Let the random agent pick one of the available actions and apply it.
choosen_action =agent.choose_action(available_actions)
print("Randomly chosen action = ",choosen_action)
reward = env.make_step(choosen_action)

# Report the reward returned by the step and the agent's new position.
print("Rward Obtained :",reward)
print("Current postion of the agent =",env.current_location)
print(env.agent_on_map())



Current position of thr agent = (4, 2)
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]]
Available Actions = ['UP', 'DOWN', 'LEFT', 'RIGHT']
Randomly chosen action =  LEFT
Rward Obtained : -1.0
Current postion of the agent = (4, 1)
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]]


In [None]:
# Train a Q-learning agent on a fresh GridWorld and plot the reward per episode.
environment = GridWorld()
# agentQ must be a Q_Agent: it owns the q_table that is inspected afterwards,
# which RandomAgent does not have.
agentQ = Q_Agent(environment)

# learn=True so play() updates the agent's Q-table after every step.
reward_per_episode=play(environment,agentQ, trials =500, learn=True)
plt.plot(reward_per_episode)
plt.xlabel("Episode")
plt.ylabel("Cumulative reward");

In [None]:
def pretty(d,indent=0):
  """
  Print a (possibly nested) dict as a tab-indented tree.

  Each key is printed on its own line at `indent` tabs. A dict value is
  printed recursively one level deeper; any other value is printed on the
  following line at `indent + 1` tabs.
  """
  key_pad = '\t' * indent
  value_pad = '\t' * (indent + 1)
  for entry in d.items():
    name, payload = entry
    print(key_pad + str(name))
    if not isinstance(payload, dict):
      # Leaf value: show it one level below its key.
      print(value_pad + str(payload))
      continue
    # Nested dict: descend with one more level of indentation.
    pretty(payload, indent + 1)
# Dump the agent's Q-table as an indented tree: state, then action, then value.
# NOTE(review): requires agentQ to expose a `q_table` attribute (Q_Agent defines one; RandomAgent does not).
pretty(agentQ.q_table)