In [1]:
import numpy as np
import random
from tabulate import tabulate
from tqdm import tqdm
import ast

In [2]:
def generate_grid_world(length, width,path_lenght,holes_number,Random_State):
    
    random.seed(Random_State)
    #store all cells in a list
    Grid_Cells = []
    for row in range(length):
        for col in range(width):
            Grid_Cells.append([row,col])


    #specify the number of holes in the gridworld
    
    #specify the start point as a random cell
    start = [random.randint(0, length), random.randint(0, width)]

    #create a path from start point
    """instead of defining start and goal points,
      we define just a start point and a random path with a random lenght to
       another point and name it as goal point"""
    
    def random_path(Start, Path_Lenght,length, width):
        
        Path = []
        Path.append(Start)
        for i in range(Path_Lenght):
            
            #there are two moves that take us on a random cell named Goal [1,0], [0,1]
            
            move = random.choice([[1,0], [0,1]])
            
            #update the start cell/point by the above move
            Start = [x + y for x, y in zip(Start, move)]
            
            #if the movement take us out of our gridworld, we reverse the change in the start point
            if Start[0] < 0 or Start[1] < 0 or Start[0] > length-1 or Start[1] > width-1:

                Start = [x - y for x, y in zip(Start, move)]

            else:
                
                #create a path history
                Path.append(Start)

        Goal = Start

        return Goal,Path
    

    GoalPath = random_path(start, path_lenght,length, width)

    goal = GoalPath[0]
    path = GoalPath[1]

    #now we must eliminate the path cells from the Grid_Cells to choose hole cells from remaining cells

    FreeCells = [x for x in Grid_Cells if x not in path]

    Holes = random.sample(FreeCells, holes_number)

    #Also, we can visualize our gridworld in a simple way

    def mark_holes(holes):
        marked_data = [["Hole" if [row, col] in holes else [row, col] for col in range(width)] for row in range(length)]
        return marked_data
    
    marked_matrix = mark_holes(Holes)

    print(tabulate(marked_matrix, tablefmt="grid"))

    
    return length, width, start, goal, Holes, path,Grid_Cells

In [3]:
#environment = generate_grid_world(50, 40,1300,400,39)
environment = generate_grid_world(5, 4,4,4,39)

environment

+--------+--------+--------+--------+
| Hole   | [0, 1] | [0, 2] | [0, 3] |
+--------+--------+--------+--------+
| [1, 0] | [1, 1] | [1, 2] | [1, 3] |
+--------+--------+--------+--------+
| Hole   | [2, 1] | [2, 2] | [2, 3] |
+--------+--------+--------+--------+
| Hole   | [3, 1] | Hole   | [3, 3] |
+--------+--------+--------+--------+
| [4, 0] | [4, 1] | [4, 2] | [4, 3] |
+--------+--------+--------+--------+


(5,
 4,
 [1, 2],
 [4, 3],
 [[2, 0], [3, 2], [3, 0], [0, 0]],
 [[1, 2], [1, 3], [2, 3], [3, 3], [4, 3]],
 [[0, 0],
  [0, 1],
  [0, 2],
  [0, 3],
  [1, 0],
  [1, 1],
  [1, 2],
  [1, 3],
  [2, 0],
  [2, 1],
  [2, 2],
  [2, 3],
  [3, 0],
  [3, 1],
  [3, 2],
  [3, 3],
  [4, 0],
  [4, 1],
  [4, 2],
  [4, 3]])

In [4]:
def probability_distribution(grid_size,randomness):
    #random.seed(40)
    
    #by this function we generate probabilities which their sum is equal to 1
    def generate_probabilities(n):

        numbers = [random.random() for _ in range(n)]
        total_sum = sum(numbers)
        scaled_numbers = [num / total_sum for num in numbers]
        
        return scaled_numbers
    
    cells_prob = {}
    if randomness == 'stochastic':
        for cell in range(grid_size):
            
            #we set the number of probs to 4 due to 4 possible action for each cell (go to its neighbors)
            probs = generate_probabilities(4)

            cells_prob[cell] = probs
    elif randomness == 'equal probable':

        for cell in range(grid_size):

            cells_prob[cell] = [0.25,0.25,0.25,0.25]
    
    elif randomness == 'deterministic':
        for cell in range(grid_size):

            cells_prob[cell] = [0.03,0.06,0.01,0.9] #[0,0,0,1] ##[0.15,.15,0.1,0.6]


    #Note that we consider the correspondence between probabilities and actions as below:
    #probs = [p1, p2, p3, p4] ---> [[1,0],[-1,0],[0,1],[0,-1]]

    return cells_prob

def neighbor_cells(cell):

    grid_cells = environment[6]
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]

    Neighbors = []
    Actions_Neighbors = []
    for action in Actions:

        neighbor = [x + y for x, y in zip(cell, action)]
        #if neighbor not in environment[4]:
        Neighbors.append(neighbor)
        Actions_Neighbors.append(action)

    return Neighbors, Actions_Neighbors

def arbitrary_policy(randomness):

        #random.seed(randomness)
        
    policy = {}
    policy_action = {}
    for state in environment[6]:

        if state not in environment[4]:

            neighbors = neighbor_cells(state)[0]
            Actions_Neighbors = neighbor_cells(state)[1]

            allowed_positions = []

            for neighbor in neighbors:
                
                if neighbor in environment[6] and neighbor not in environment[4]:
                    
                    allowed_positions.append(neighbor)
            
            if len(allowed_positions) > 0:
                
                next_state = random.choice(allowed_positions)
                row = next_state[0] - state[0]
                col = next_state[1] - state[1]
                PolicyAction = [row, col]

                policy['{}'.format(state)] = next_state
                policy_action['{}'.format(state)] = PolicyAction



    return policy, policy_action

def state_reward(next_state):

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 10
    
    elif next_state not in environment[6]:

        r = -2
    
    else:

        r = -1
    
    return r

def reverse_dictionary(dict):
    reverse_dict = {}
    for key in list(dict.keys()):
        val = dict[key]
        reverse_dict[val] = key
    return reverse_dict


state_indice_dict = {}
counter = 0
for state in environment[6]:

    state = str(state)
    state_indice_dict[state] = counter
    counter = counter + 1

def generate_trajectory(policy,randomness,environment_stochasticity):

    policy_action = policy[1]
    probs = probability_distribution(environment[0]*environment[1],environment_stochasticity)
    start = environment[2]
    terminate = start
    trajectory = []
    pure_trajectory = [start]
    c = 0
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]
        action = policy_action[str(terminate)]
        Actions.remove(action)
        sorted_actions = Actions + [action]
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()

        selected_action = random.choices(sorted_actions, actions_prob)[0]
        current_state = terminate
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        pure_trajectory.append(next_state)
        
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:
            next_state = terminate
        
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:
            next_state = start  

        terminate = next_state
        trajectory.append((current_state))
        c = c+1
    
    trajectory.append((environment[3]))
    pure_trajectory.append(environment[3])

    return trajectory,pure_trajectory

def extract_features(state):

    goal = environment[3]
    max_length = environment[0]
    max_width = environment[1]

    w1 = (goal[0] - state[0]) / max_width
    w2 = (goal[1] - state[1]) / max_length

    return abs(w1), abs(w2)

## Semi-gradient $TD(\lambda)$ for Estimating $\hat{v} \approx v_{\pi}$

In [5]:
def semi_gradient_TD_lambda(num_trials, gamma, policy, Lambda, alpha, environment_stochasticity):

    W = {}
    for state in environment[6]:

        if state not in environment[4]:

            W[str(state)] = {}

            Features = extract_features(state)

            for element in [0,1]:
                
                W[str(state)][element] = Features[element] + random.uniform(1e-9, 1e-8)


    V = {}
    state_observed = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            V[str(state)] = 0
            state_observed[str(state)] = 0
    
    for trial in tqdm(range(num_trials)):

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)
        

        trajectory = TRAJECTORY[0]

        z1 , z2 = 0 , 0

        for step_indx in range(len(trajectory[:-1])):

            step = trajectory[step_indx]
            next_step = trajectory[step_indx+1]

            r = state_reward(next_step)
            #print(W[str(step)][0],W[str(step)][1])
            gradient_w1 = -np.sin(abs(W[str(step)][0]) + abs(W[str(step)][1])) * (W[str(step)][0]/abs(W[str(step)][0]))
            gradient_w2 = -np.sin(abs(W[str(step)][1]) + abs(W[str(step)][0])) * (W[str(step)][1]/abs(W[str(step)][1]))
            
            z1 = gamma * Lambda * z1 + gradient_w1
            z2 = gamma * Lambda * z2 + gradient_w2

            delta = r + gamma * np.cos(abs(W[str(next_step)][0]) + abs(W[str(next_step)][1]))\
                         - np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))

            #alpha = 1/math.log(step_indx+2)
            W[str(step)][0] = W[str(step)][0] + alpha * delta * z1

            W[str(step)][1] = W[str(step)][1] + alpha * delta * z2

            
            V[str(step)] = V[str(step)] + np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))
            state_observed[str(step)] = state_observed[str(step)] + 1
    
    for state in environment[6]:

        if state not in environment[4]:

            if state_observed[str(state)] > 0:

                V[str(state)] = V[str(state)] / state_observed[str(state)]


    return V



In [8]:
policy_0 = arbitrary_policy(41)

semi_gradient_TD_lambda(10000, 0.9, policy_0, 0.5, 0.2, 'deterministic')

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [03:16<00:00, 50.86it/s]


{'[0, 1]': -0.9999932232602464,
 '[0, 2]': -0.999830208355103,
 '[0, 3]': -0.9987997898611931,
 '[1, 0]': -0.9999562590044944,
 '[1, 1]': -0.9999901506851769,
 '[1, 2]': -0.9999728473887616,
 '[1, 3]': -0.9999159239509892,
 '[2, 1]': -0.9999854721077955,
 '[2, 2]': -0.9997137018491942,
 '[2, 3]': -0.9999188898910035,
 '[3, 1]': -0.9999915704768668,
 '[3, 3]': -0.9259746815859576,
 '[4, 0]': -0.9999990261167743,
 '[4, 1]': -0.9999976129506686,
 '[4, 2]': -0.9893220441738694,
 '[4, 3]': 0}

## True online $TD(\lambda)$ for Estimating $W^T X \approx v_{\pi}$

In [42]:
def true_online_TD_lambda(num_trials, gamma, policy, Lambda, alpha, environment_stochasticity):

    W = {}
    for state in environment[6]:

        if state not in environment[4]:

            Features = extract_features(state)
            
            W[str(state)] = np.array([0, 0])

    V = {}
    state_observed = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            V[str(state)] = 0
            state_observed[str(state)] = 0
    

    for trial in tqdm(range(num_trials)):

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)

        trajectory = TRAJECTORY[0]
        z = np.array([0,0])
        v_old = 0

        for step_indx in range(len(trajectory[:-1])):

            step = trajectory[step_indx]
            next_step = trajectory[step_indx+1]

            r = state_reward(next_step)

            x_features = np.array([extract_features(step)[0],extract_features(step)[1]])
            xprim_features = np.array([extract_features(next_step)[0],extract_features(next_step)[1]])

            v = np.matmul(np.transpose(W[str(step)]) , x_features)
            v_prime = np.matmul(np.transpose(W[str(step)]) , xprim_features)

            delta = r + gamma * v_prime - v 
            #print(gamma * Lambda * z)
            #print(np.matmul(z , x_features))
            #print( np.matmul(1 - alpha * gamma * Lambda * np.matmul(np.transpose(z) , x_features) , x_features))
            z = gamma * Lambda * z +\
                  (1 - alpha * gamma * Lambda * np.matmul(np.transpose(z) , x_features)) * x_features
            
            W[str(step)] = W[str(step)] +\
                            alpha * (delta + v - v_old) * z - alpha * (v - v_old) * x_features
            #print(W[str(step)])
            v_old = v_prime
            V[str(step)] = V[str(step)] + np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))
            state_observed[str(step)] = state_observed[str(step)] + 1
    



    return V,W

        


In [46]:
true_online_TD_lambda(500, 0.9, policy_0, 0.03, 0.02, 'deterministic')

100%|██████████| 500/500 [00:04<00:00, 106.03it/s]


({'[0, 1]': 9.43536194556366,
  '[0, 2]': 28.276216443317836,
  '[0, 3]': 13.634662721241174,
  '[1, 0]': 36.79849297251622,
  '[1, 1]': -433.0221765091339,
  '[1, 2]': 1.1925062965601612,
  '[1, 3]': -28.51072569144077,
  '[2, 1]': 390.2462587758875,
  '[2, 2]': 72.00122084088336,
  '[2, 3]': 41.42228789851967,
  '[3, 1]': -108.14759638409062,
  '[3, 3]': 1.999935447527856,
  '[4, 0]': -1608.9836268413972,
  '[4, 1]': 219.91119694671116,
  '[4, 2]': 472.81097515783796,
  '[4, 3]': 0},
 {'[0, 1]': array([-6.79903526e+09, -1.26539568e+09]),
  '[0, 2]': array([1.25862686e+12, 3.67348767e+11]),
  '[0, 3]': array([-1.46972616e+09, -3.03282254e+08]),
  '[1, 0]': array([-6.06875029e+09, -3.16127388e+09]),
  '[1, 1]': array([7.85841292e+12, 1.76526918e+12]),
  '[1, 2]': array([-1.10314764e+14, -1.45604276e+15]),
  '[1, 3]': array([9.21434942e+10, 2.71493775e+10]),
  '[2, 1]': array([-7.43474150e+10, -3.66317043e+10]),
  '[2, 2]': array([531.02895986,  -1.26522396]),
  '[2, 3]': array([-391982