# Reinforcement Learning in Gridworld Environment using Markov decision process (mdp)

In [617]:
import numpy as np
class Environment:
    def __init__(self,height,width,start,goal):
        self.width = width
        self.height = height
        
        # R - possible reward space
        self.R = np.zeros((height,width)) -1
        # Obstacles
        self.R[2,:-1] = np.nan
        # Goal
        self.goal = goal
        self.R[self.goal] = 20
        
        # S - possible state states
        self.S = []       
        for index, reward in np.ndenumerate(self.R):
            # Not an obstacle
            if(not np.isnan(reward)):
                self.S.append(index)
        self.S = np.asarray(self.S)

        # A - Actions Space
        self.A = ['u','d','l','r']
        
        # s - Ini State - list for easier indexing
        self.s = list(start)
        
        # r - cumulative Reward
        self.G_t = 0
        
        self.is_done = False
        
    def get_rewards(self):
        return self.G_t
    
    def get_possible_rewards(self):
        return self.R
    
    def get_possible_states(self):
        return self.S
    
    def get_state(self):
        return self.s
    
    def get_actions(self):
        return self.A
    
    def get_grid_world(self):
        grid_world = np.chararray((self.width, self.height),unicode=True)
        grid_world[:] = '-'
        grid_world[self.s[0],self.s[1]]= 'a'
        grid_world[2,:-1] = 'o'
        grid_world[self.goal] = 'g'
        return grid_world
     
    def step(self, action,print_info=True):
        new_s,new_r = self.take_step(self.s,action)
#         self.G_t += new_r
        self.s = new_s
        if(print_info):
            print(f'old_s:{self.s} new_s:{new_s} r_t:{new_r} G_t:{self.G_t}')
            print(f"World \n {self.get_grid_world()}")
        
        self.is_done = (self.s == self.goal)
            
        return new_r,new_s,self.is_done
        
    def check_if_s_in_S(self,s):
        for possible_s in self.S:
            if(possible_s[0] == s[0] and possible_s[1] == s[1]):
                return True
        return False
    
    def take_step(self,s,a):
        if(a not in self.A):
            raise ValueError('Unknown action', a)
        new_s = s.copy()
        if(a == 'u'):
            new_s[0] -= 1
        if(a == 'd'):
            new_s[0] += 1
        if(a == 'l'):
            new_s[1] -= 1
        if(a == 'r'):
            new_s[1] += 1
         
        # Out of bounds - stay in same place
        if((new_s[0] < 0 or new_s[0]>self.height-1) or (new_s[1] < 0 or new_s[1]>self.width-1)):
            return s,-1
        # Check Obstacles - not in possible states - stay in same place
        elif(not self.check_if_s_in_S(new_s)):
            return s,-1
        return new_s,self.R[new_s[0],new_s[1]]
    
    


In [618]:
import random
# class Agent(object):
#     '''Agent base class'''

#     def __init__(self, actions):
#         self.actions = actions
#         

#     def step(self, obs, reward, done, info):
#         raise NotImplementedError
        
class RandomAgent():
    '''Agent that samples actions uniformly at random'''

    def __init__(self, actions):
        self.actions = actions
        self.num_actions = len(actions)
        
        self.r_t = np.array([])
        # r - cumulative Reward
        self.G_t = 0
    
    def step(self):
        return self.actions[random.randint(0, self.num_actions-1)]
    
    def update(self,new_s,reward):
        self.s = new_s
        self.r_t = np.append(self.r_t,reward)
        self.G_t += reward
        return self.r_t,self.G_t

In [619]:
class Experiment():
    def __init__(self,env,agent):
        self.env = env
        self.agent = agent
        
    def run(self, num_steps):
        steps = 0
        done = False
        reward = .0
        rewards = np.array([])
        losses = []

        while steps < num_steps:
            print(f"*** Step: {steps} ***")
            action = self.agent.step()
            reward,new_s,is_done = self.env.step(action)
            r_t, cumulative_r = self.agent.update(new_s,reward)
            print(f"Cumulative_r:{cumulative_r}")
            steps += 1
            
            if(is_done):
                print(f"At Goal:{new_s}")
                break

In [620]:
height=7
width=7

env = Environment(height=7,width=7,start=(height-1,0),goal=(0,0))
random_agent = RandomAgent(env.get_actions())
exp = Experiment(agent=random_agent,env=env)

In [621]:
exp.run(50)

*** Step: 0 ***
old_s:[5, 0] new_s:[5, 0] r_t:-1.0 G_t:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['a' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
Cumulative_r:-1.0
*** Step: 1 ***
old_s:[5, 0] new_s:[5, 0] r_t:-1 G_t:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['a' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
Cumulative_r:-2.0
*** Step: 2 ***
old_s:[5, 1] new_s:[5, 1] r_t:-1.0 G_t:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' 'a' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
Cumulative_r:-3.0
*** Step: 3 ***
old_s:[5, 0] new_s:[5, 0] r_t:-1.0 G_t:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['

In [545]:
for i in np.arange(4):
    env.step('u')

old_s:[5, 0] new_s:[5, 0] r_t:-1.0 G_t:-1.0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['a' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[4, 0] new_s:[4, 0] r_t:-1.0 G_t:-2.0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['a' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[3, 0] new_s:[3, 0] r_t:-1.0 G_t:-3.0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['a' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[3, 0] new_s:[3, 0] r_t:0 G_t:-3.0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['a' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-'

In [539]:
for i in np.arange(4):
    env.step('r')

old_s:[3, 6] new_s:[3, 6] r:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' 'a']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[3, 6] new_s:[3, 6] r:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' 'a']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[3, 6] new_s:[3, 6] r:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' 'a']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[3, 6] new_s:[3, 6] r:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' 'a']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-