# Reinforcement Learning in Gridworld Environment using Markov decision process (mdp)

In [543]:
import numpy as np
class Environment:
    def __init__(self,height,width,start,goal):
        self.width = width
        self.height = height
        
        # R - possible reward space
        self.R = np.zeros((height,width)) -1
        # Obstacles
        self.R[2,:-1] = np.nan
        # Goal
        self.goal = goal
        self.R[self.goal] = 20
        
        # S - possible state states
        self.S = []       
        for index, reward in np.ndenumerate(self.R):
            # Not an obstacle
            if(not np.isnan(reward)):
                self.S.append(index)
        self.S = np.asarray(self.S)

        # A - Actions Space
        self.A = ['u','d','l','r']
        
        # s - Ini State - list for easier indexing
        self.s = list(start)
        
        # r - cumulative Reward
        self.G_t = 0
        
        
    def get_rewards(self):
        return self.G_t
    
    def get_possible_rewards(self):
        return self.R
    
    def get_possible_states(self):
        return self.S
    
    def get_state(self):
        return self.s
    
    def get_actions(self):
        return self.actions
    
    def get_grid_world(self):
        grid_world = np.chararray((self.width, self.height),unicode=True)
        grid_world[:] = '-'
        grid_world[self.s[0],self.s[1]]= 'a'
        grid_world[2,:-1] = 'o'
        grid_world[self.goal] = 'g'
        return grid_world
     
    def step(self, action,print_info=True):
        new_s,new_r = self.take_step(self.s,action)
        self.G_t += new_r
        self.s = new_s
        if(print_info):
            print(f'old_s:{self.s} new_s:{new_s} r_t:{new_r} G_t:{self.G_t}')
            print(f"World \n {self.get_grid_world()}")
        
    def check_if_s_in_S(self,s):
        for possible_s in self.S:
            if(possible_s[0] == s[0] and possible_s[1] == s[1]):
                return True
        return False
    
    def take_step(self,s,a):
        if(a not in self.A):
            raise ValueError('Unknown action', a)
        new_s = s.copy()
        if(a == 'u'):
            new_s[0] -= 1
        if(a == 'd'):
            new_s[0] += 1
        if(a == 'l'):
            new_s[1] -= 1
        if(a == 'r'):
            new_s[1] += 1
         
        # Out of bounds - stay in same place
        if((new_s[0] < 0 or new_s[0]>self.height-1) or (new_s[1] < 0 or new_s[1]>self.width-1)):
            return s,0
        # Check Obstacles - not in possible states - stay in same place
        elif(not self.check_if_s_in_S(new_s)):
            return s,0
        return new_s,self.R[new_s[0],new_s[1]]

In [544]:
height=7
width=7
env = Environment(height=7,width=7,start=(height-1,0),goal=(0,0))
print(f"World \n {env.get_grid_world()}")

World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['a' '-' '-' '-' '-' '-' '-']]


In [545]:
for i in np.arange(4):
    env.step('u')

old_s:[5, 0] new_s:[5, 0] r_t:-1.0 G_t:-1.0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['a' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[4, 0] new_s:[4, 0] r_t:-1.0 G_t:-2.0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['a' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[3, 0] new_s:[3, 0] r_t:-1.0 G_t:-3.0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['a' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[3, 0] new_s:[3, 0] r_t:0 G_t:-3.0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['a' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-'

In [539]:
for i in np.arange(4):
    env.step('r')

old_s:[3, 6] new_s:[3, 6] r:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' 'a']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[3, 6] new_s:[3, 6] r:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' 'a']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[3, 6] new_s:[3, 6] r:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' 'a']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']]
old_s:[3, 6] new_s:[3, 6] r:0
World 
 [['g' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['o' 'o' 'o' 'o' 'o' 'o' '-']
 ['-' '-' '-' '-' '-' '-' 'a']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-']
 ['-' '-