In [1]:
import numpy as np
import random
from tabulate import tabulate
from tqdm import tqdm
import ast

In [2]:
def generate_grid_world(length, width,path_lenght,holes_number,Random_State):
    
    random.seed(Random_State)
    #store all cells in a list
    Grid_Cells = []
    for row in range(length):
        for col in range(width):
            Grid_Cells.append([row,col])


    #specify the number of holes in the gridworld
    
    #specify the start point as a random cell
    start = [random.randint(0, length), random.randint(0, width)]

    #create a path from start point
    """instead of defining start and goal points,
      we define just a start point and a random path with a random lenght to
       another point and name it as goal point"""
    
    def random_path(Start, Path_Lenght,length, width):
        
        Path = []
        Path.append(Start)
        for i in range(Path_Lenght):
            
            #there are two moves that take us on a random cell named Goal [1,0], [0,1]
            
            move = random.choice([[1,0], [0,1]])
            
            #update the start cell/point by the above move
            Start = [x + y for x, y in zip(Start, move)]
            
            #if the movement take us out of our gridworld, we reverse the change in the start point
            if Start[0] < 0 or Start[1] < 0 or Start[0] > length-1 or Start[1] > width-1:

                Start = [x - y for x, y in zip(Start, move)]

            else:
                
                #create a path history
                Path.append(Start)

        Goal = Start

        return Goal,Path
    

    GoalPath = random_path(start, path_lenght,length, width)

    goal = GoalPath[0]
    path = GoalPath[1]

    #now we must eliminate the path cells from the Grid_Cells to choose hole cells from remaining cells

    FreeCells = [x for x in Grid_Cells if x not in path]

    Holes = random.sample(FreeCells, holes_number)

    #Also, we can visualize our gridworld in a simple way

    def mark_holes(holes):
        marked_data = [["Hole" if [row, col] in holes else [row, col] for col in range(width)] for row in range(length)]
        return marked_data
    
    marked_matrix = mark_holes(Holes)

    print(tabulate(marked_matrix, tablefmt="grid"))

    
    return length, width, start, goal, Holes, path,Grid_Cells

In [3]:
environment = generate_grid_world(5, 4,4,4,39)
environment

+--------+--------+--------+--------+
| Hole   | [0, 1] | [0, 2] | [0, 3] |
+--------+--------+--------+--------+
| [1, 0] | [1, 1] | [1, 2] | [1, 3] |
+--------+--------+--------+--------+
| Hole   | [2, 1] | [2, 2] | [2, 3] |
+--------+--------+--------+--------+
| Hole   | [3, 1] | Hole   | [3, 3] |
+--------+--------+--------+--------+
| [4, 0] | [4, 1] | [4, 2] | [4, 3] |
+--------+--------+--------+--------+


(5,
 4,
 [1, 2],
 [4, 3],
 [[2, 0], [3, 2], [3, 0], [0, 0]],
 [[1, 2], [1, 3], [2, 3], [3, 3], [4, 3]],
 [[0, 0],
  [0, 1],
  [0, 2],
  [0, 3],
  [1, 0],
  [1, 1],
  [1, 2],
  [1, 3],
  [2, 0],
  [2, 1],
  [2, 2],
  [2, 3],
  [3, 0],
  [3, 1],
  [3, 2],
  [3, 3],
  [4, 0],
  [4, 1],
  [4, 2],
  [4, 3]])

In [6]:
def probability_distribution(grid_size,randomness):
    #random.seed(40)
    
    #by this function we generate probabilities which their sum is equal to 1
    def generate_probabilities(n):

        numbers = [random.random() for _ in range(n)]
        total_sum = sum(numbers)
        scaled_numbers = [num / total_sum for num in numbers]
        
        return scaled_numbers
    
    cells_prob = {}
    if randomness == 'stochastic':
        for cell in range(grid_size):
            
            #we set the number of probs to 4 due to 4 possible action for each cell (go to its neighbors)
            probs = generate_probabilities(4)

            cells_prob[cell] = probs
    elif randomness == 'equal probable':

        for cell in range(grid_size):

            cells_prob[cell] = [0.25,0.25,0.25,0.25]
    
    elif randomness == 'deterministic':
        for cell in range(grid_size):

            cells_prob[cell] = [0.03,0.06,0.01,0.9] #[0,0,0,1] ##[0.15,.15,0.1,0.6]


    #Note that we consider the correspondence between probabilities and actions as below:
    #probs = [p1, p2, p3, p4] ---> [[1,0],[-1,0],[0,1],[0,-1]]

    return cells_prob

def neighbor_cells(cell):

    grid_cells = environment[6]
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]

    Neighbors = []
    Actions_Neighbors = []
    for action in Actions:

        neighbor = [x + y for x, y in zip(cell, action)]
        #if neighbor not in environment[4]:
        Neighbors.append(neighbor)
        Actions_Neighbors.append(action)

    return Neighbors, Actions_Neighbors

#Note
"""As we want to use monte carlo method for estimating the state values
   it has been assumed that we have not any knowledge about the environment.
   Therefore, we should consider the transitions into the holes cells
   (against the case of policy iteration)"""

def arbitrary_policy(randomness):
    #random.seed(randomness)
    
    policy = {}
    policy_action = {}
    for state in environment[6]:

        if state not in environment[4]:

            neighbors = neighbor_cells(state)[0]
            Actions_Neighbors = neighbor_cells(state)[1]

            allowed_positions = []

            for neighbor in neighbors:
                
                if neighbor in environment[6] and neighbor not in environment[4]:
                    
                    allowed_positions.append(neighbor)
        
            next_state = random.choice(allowed_positions)

            row = next_state[0] - state[0]
            col = next_state[1] - state[1]
            PolicyAction = [row, col]

            policy['{}'.format(state)] = next_state
            policy_action['{}'.format(state)] = PolicyAction


    return policy, policy_action

def state_reward(next_state):

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 100
    
    elif next_state not in environment[6]:

        r = -2
    
    else:

        r = -1
    
    return r

def reverse_dictionary(dict):
    reverse_dict = {}
    for key in list(dict.keys()):
        val = dict[key]
        reverse_dict[val] = key
    return reverse_dict


state_indice_dict = {}
counter = 0
for state in environment[6]:

    state = str(state)
    state_indice_dict[state] = counter
    counter = counter + 1


def generate_trajectory(policy,randomness,environment_stochasticity):

    policy_action = policy[1]
    probs = probability_distribution(environment[0]*environment[1],environment_stochasticity)
    start = environment[2]
    terminate = start
    trajectory = [start]
    pure_trajectory = [start]
    c = 0
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]
        action = policy_action[str(terminate)]
        Actions.remove(action)
        sorted_actions = Actions + [action]
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()

        selected_action = random.choices(sorted_actions, actions_prob)[0]
        
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        pure_trajectory.append(next_state)
        
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:
            next_state = terminate
        
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:
            next_state = start  

        terminate = next_state
        trajectory.append(terminate)
        c = c+1
    
    trajectory.append(environment[3])
    pure_trajectory.append(environment[3])

    return trajectory,pure_trajectory

## n-step TD for estimating $V \approx  v_{\pi}$

In [5]:
def n_step_TD(num_trials, n, policy, gamma, alpha, environment_stochasticity):

    V = {}
    for state in environment[6]:
    
        if state not in environment[4]:

            V[str(state)] = 0
    
    indice_state_dict = {}
    counter = 0
    for state in environment[6]:

        #state = str(state)
        indice_state_dict[counter] = state
        counter = counter + 1
    
    state_policy = policy[0]
    action_policy = policy[1]

    for trial in tqdm(range(num_trials)):

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)

        trajectory = TRAJECTORY[0]
        pure_trajectory = TRAJECTORY[1]
        
        T = float('inf')
        tau = 0
        t = -1
        while tau != T - 1:

            t = t + 1

            if t < T:
                
                #t_state = indice_state_dict[t]
                #next_state = state_policy[str(t_state)]
                

                if pure_trajectory[t+1] == environment[3]:
                    T = t + 1
            
            tau = t - n + 1

            if tau >= 0:

                G = 0

                for i in range(tau+1, min(tau+n,T)+1):

                    r = state_reward(pure_trajectory[i+1])

                    G = G + (gamma ** (i-tau-1)) * r

                if tau + n < T:

                    #print(tau , n)
                    
                    tau_n_state = trajectory[tau + n] #indice_state_dict[tau + n]

                    if tau_n_state not in environment[4] and tau_n_state != environment[3]:

                        G = G + (gamma ** n) * V[str(tau_n_state)]
                
                tau_state = trajectory[tau] #indice_state_dict[tau]
                #tau
                if tau_state not in environment[4]:
                    #print(type(tau_state))


                    V[str(tau_state)] = V[str(tau_state)] + alpha * (G - V[str(tau_state)])
        

    
    return V


In [26]:
policy_0 = arbitrary_policy(41)

n_step_TD(1000, 2, policy_0, 0.3, 0.9, 'deterministic')

100%|██████████| 1000/1000 [03:21<00:00,  4.95it/s]


{'[0, 1]': -1.4285724302339287,
 '[0, 2]': -1.443169885479679,
 '[0, 3]': -2.3288197695616843,
 '[1, 0]': -1.4289050421621532,
 '[1, 1]': -1.428804696583943,
 '[1, 2]': -1.42903754063062,
 '[1, 3]': -1.4504922034868155,
 '[2, 1]': -1.4287574405696546,
 '[2, 2]': 34.05655312979737,
 '[2, 3]': 116.85704179643528,
 '[3, 1]': 0,
 '[3, 3]': 100.0,
 '[4, 0]': 0,
 '[4, 1]': 0,
 '[4, 2]': 0,
 '[4, 3]': 0}

In [27]:
n_step_TD(1000, 2, policy_0, 0.3, 0.9, 'stochastic')

100%|██████████| 1000/1000 [00:36<00:00, 27.23it/s]


{'[0, 1]': -1.4301128100715403,
 '[0, 2]': -1.575520135560438,
 '[0, 3]': -2.5984038493598653,
 '[1, 0]': -1.4629465272519007,
 '[1, 1]': -3.2805724318680447,
 '[1, 2]': 8.202714452634787,
 '[1, 3]': -1.4410685563616235,
 '[2, 1]': -1.4347263015381202,
 '[2, 2]': 34.05743741875119,
 '[2, 3]': 128.81786799436864,
 '[3, 1]': -1.5939095375377672,
 '[3, 3]': 100.0,
 '[4, 0]': -1.633465751738598,
 '[4, 1]': 117.33555165854531,
 '[4, 2]': 94.45066221705233,
 '[4, 3]': 0}

## n-step Sarsa for estimating $Q \approx q_{*}$ or $q_{\pi}$

In [37]:
def n_step_sarsa(num_trials, n, policy, gamma, alpha, environment_stochasticity,epsilon):
    
    grid_size = environment[0]*environment[1]

    probs = probability_distribution(grid_size,environment_stochasticity)

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    
    def state_action_nextstate(current_state):

        if type(current_state) == str:

            state = ast.literal_eval(current_state)
        else:
            state = current_state
        #Choose action using policy derived from Q===================================
        value_action_state = reverse_dictionary(Q[str(state)])
        Max_val = max(list(value_action_state.keys()))
        best_action = value_action_state[Max_val]
        best_action = ast.literal_eval(best_action)

        #============================================================================
        #Epsilon Greedy
        if random.uniform(0, 1) > epsilon:

            selected_action = best_action
        
        else:
            Actions = [[1,0],[-1,0],[0,1],[0,-1]]
            Actions.remove(best_action)
            epsilon_action = random.choice(Actions)

            selected_action = epsilon_action
        #============================================================================
        
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]
        Actions.remove(selected_action)
        sorted_actions = Actions + [selected_action]
        state_indice = state_indice_dict[str(state)]
        actions_prob = probs[state_indice]
        actions_prob.sort()
        #due to stochasticity of the environment
        Final_action = random.choices(sorted_actions, actions_prob)[0]
        #print(type(state), type(Final_action))
        
        next_state = [x + y for x, y in zip(state, Final_action)]

        if next_state not in environment[6] or next_state in environment[4]:

            next_state = current_state

        return Final_action, next_state


    state_t  = environment[2]

    for trial in tqdm(range(num_trials)):
        
        T = float('inf')
        tau = 0
        t = -1
        trajectory = []
        state_t  = environment[2]
        while tau != T - 1:

            t = t + 1

            if t < T:
                trajectory.append(state_t)
                next_state = state_action_nextstate(state_t)[1]
                #last_state = state_t
                state_t = next_state
                

                if next_state == environment[3]:
                    T = t + 1
                
                #else:
                #    next_action = 

            
            tau = t - n + 1

            if tau >= 0:

                G = 0
                
                state_tau = trajectory[tau]
                state_i = state_tau
                action_tau = state_action_nextstate(state_tau)[0]

                action_ii = 0
                for i in range(tau+1, min(tau+n,T)+1):

                    #actionTN = action_ii

                    state_ii = state_action_nextstate(state_i)[1]
                    action_ii = state_action_nextstate(state_i)[0]

                    #stateTN = state_i #s_(t+n) this is store for the next part
                     

                    #n_state_action.append((state_ii, action_ii))

                    #state_i = state_ii
                    r = state_reward(state_ii)

                    G = G + (gamma ** (i-tau-1)) * r

                    if i < min(tau+n,T):

                        state_i = state_ii

                if tau + n < T:

                    stateTN = state_action_nextstate(state_ii)[1]
                    actionTN = state_action_nextstate(state_ii)[0]

                    #print(tau , n)
                    
                    #tau_n_state = trajectory[tau + n] #indice_state_dict[tau + n]

                    #if state_ii not in environment[4] and state_ii != environment[3]:

                    G = G + (gamma ** n) * Q[str(stateTN)][str(actionTN)]

                if state_tau not in environment[4] and state_tau != environment[3]:
                    #print(type(tau_state))


                    Q[str(state_tau)][str(action_tau)] = Q[str(state_tau)][str(action_tau)] + alpha * (G - Q[str(state_tau)][str(action_tau)])
        
    return Q


In [38]:
policy_0 = arbitrary_policy(41)

n_step_sarsa(10000, 2, policy_0, 0.3, 0.9, 'deterministic', 0.1)

100%|██████████| 10000/10000 [00:21<00:00, 458.04it/s]


{'[0, 1]': {'[1, 0]': -1.428538743289225,
  '[-1, 0]': -1.428571382649282,
  '[0, 1]': -1.428571363476915,
  '[0, -1]': -1.428571408947251},
 '[0, 2]': {'[1, 0]': -1.1790629533322383,
  '[-1, 0]': -1.4285713738801578,
  '[0, 1]': -1.4242859548653708,
  '[0, -1]': -1.1823206519733565},
 '[0, 3]': {'[1, 0]': -1.4267677718062353,
  '[-1, 0]': -1.4281419564667566,
  '[0, 1]': -1.4285713828090096,
  '[0, -1]': -0.6054348238710867},
 '[1, 0]': {'[1, 0]': -1.4285689358860627,
  '[-1, 0]': -1.4285708559569708,
  '[0, 1]': -1.4085860176048621,
  '[0, -1]': -1.428571402832552},
 '[1, 1]': {'[1, 0]': -1.4285685499900997,
  '[-1, 0]': -1.4285706337105024,
  '[0, 1]': -1.1572832050906312,
  '[0, -1]': 1.0361208416774685},
 '[1, 2]': {'[1, 0]': -1.4270509458148666,
  '[-1, 0]': -1.4262360455742509,
  '[0, 1]': -1.0900850574840457,
  '[0, -1]': -1.4276095453266708},
 '[1, 3]': {'[1, 0]': -1.3080445650818473,
  '[-1, 0]': -1.4285709941430913,
  '[0, 1]': -1.3128570685844432,
  '[0, -1]': -1.4285702806

In [39]:
n_step_sarsa(10000, 2, policy_0, 0.3, 0.9, 'stochastic', 0.1)

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [05:31<00:00, 30.17it/s]


{'[0, 1]': {'[1, 0]': -1.428571427680361,
  '[-1, 0]': -1.428571428513213,
  '[0, 1]': -1.4285714228677544,
  '[0, -1]': -1.428571428429193},
 '[0, 2]': {'[1, 0]': -1.4285714285622721,
  '[-1, 0]': -1.4285714284678062,
  '[0, 1]': -1.4285714274459735,
  '[0, -1]': -1.4285714281031252},
 '[0, 3]': {'[1, 0]': -1.4285714283113913,
  '[-1, 0]': -1.428571428370475,
  '[0, 1]': -1.4285714279549002,
  '[0, -1]': -1.4285714280566173},
 '[1, 0]': {'[1, 0]': -1.4285714234890952,
  '[-1, 0]': -1.4285708590859978,
  '[0, 1]': -1.4285713716186095,
  '[0, -1]': -1.428571422141252},
 '[1, 1]': {'[1, 0]': -1.4285714215035155,
  '[-1, 0]': -1.4285714284505917,
  '[0, 1]': -1.4285714266113527,
  '[0, -1]': -1.4285713582165407},
 '[1, 2]': {'[1, 0]': -1.4285713775326154,
  '[-1, 0]': -1.4285714236830338,
  '[0, 1]': -1.428570819915354,
  '[0, -1]': -1.4285714223602444},
 '[1, 3]': {'[1, 0]': -1.428570520570798,
  '[-1, 0]': -1.4285708195510765,
  '[0, 1]': -1.4285712471656926,
  '[0, -1]': -1.42857142494