In [1]:
import numpy as np
import random
import math
from tabulate import tabulate
from tqdm import tqdm
import ast

In [2]:
def generate_grid_world(length, width,path_lenght,holes_number,Random_State):
    
    random.seed(Random_State)
    #store all cells in a list
    Grid_Cells = []
    for row in range(length):
        for col in range(width):
            Grid_Cells.append([row,col])


    #specify the number of holes in the gridworld
    
    #specify the start point as a random cell
    start = [random.randint(0, length), random.randint(0, width)]

    #create a path from start point
    """instead of defining start and goal points,
      we define just a start point and a random path with a random lenght to
       another point and name it as goal point"""
    
    def random_path(Start, Path_Lenght,length, width):
        
        Path = []
        Path.append(Start)
        for i in range(Path_Lenght):
            
            #there are two moves that take us on a random cell named Goal [1,0], [0,1]
            
            move = random.choice([[1,0], [0,1]])
            
            #update the start cell/point by the above move
            Start = [x + y for x, y in zip(Start, move)]
            
            #if the movement take us out of our gridworld, we reverse the change in the start point
            if Start[0] < 0 or Start[1] < 0 or Start[0] > length-1 or Start[1] > width-1:

                Start = [x - y for x, y in zip(Start, move)]

            else:
                
                #create a path history
                Path.append(Start)

        Goal = Start

        return Goal,Path
    

    GoalPath = random_path(start, path_lenght,length, width)

    goal = GoalPath[0]
    path = GoalPath[1]

    #now we must eliminate the path cells from the Grid_Cells to choose hole cells from remaining cells

    FreeCells = [x for x in Grid_Cells if x not in path]

    Holes = random.sample(FreeCells, holes_number)

    #Also, we can visualize our gridworld in a simple way

    def mark_holes(holes):
        marked_data = [["Hole" if [row, col] in holes else [row, col] for col in range(width)] for row in range(length)]
        return marked_data
    
    marked_matrix = mark_holes(Holes)

    print(tabulate(marked_matrix, tablefmt="grid"))

    
    return length, width, start, goal, Holes, path,Grid_Cells

In [3]:
#environment = generate_grid_world(50, 40,1300,400,39)
environment = generate_grid_world(5, 4,4,4,39)

environment

+--------+--------+--------+--------+
| Hole   | [0, 1] | [0, 2] | [0, 3] |
+--------+--------+--------+--------+
| [1, 0] | [1, 1] | [1, 2] | [1, 3] |
+--------+--------+--------+--------+
| Hole   | [2, 1] | [2, 2] | [2, 3] |
+--------+--------+--------+--------+
| Hole   | [3, 1] | Hole   | [3, 3] |
+--------+--------+--------+--------+
| [4, 0] | [4, 1] | [4, 2] | [4, 3] |
+--------+--------+--------+--------+


(5,
 4,
 [1, 2],
 [4, 3],
 [[2, 0], [3, 2], [3, 0], [0, 0]],
 [[1, 2], [1, 3], [2, 3], [3, 3], [4, 3]],
 [[0, 0],
  [0, 1],
  [0, 2],
  [0, 3],
  [1, 0],
  [1, 1],
  [1, 2],
  [1, 3],
  [2, 0],
  [2, 1],
  [2, 2],
  [2, 3],
  [3, 0],
  [3, 1],
  [3, 2],
  [3, 3],
  [4, 0],
  [4, 1],
  [4, 2],
  [4, 3]])

In [4]:
def probability_distribution(grid_size,randomness):
    #random.seed(40)
    
    #by this function we generate probabilities which their sum is equal to 1
    def generate_probabilities(n):

        numbers = [random.random() for _ in range(n)]
        total_sum = sum(numbers)
        scaled_numbers = [num / total_sum for num in numbers]
        
        return scaled_numbers
    
    cells_prob = {}
    if randomness == 'stochastic':
        for cell in range(grid_size):
            
            #we set the number of probs to 4 due to 4 possible action for each cell (go to its neighbors)
            probs = generate_probabilities(4)

            cells_prob[cell] = probs
    elif randomness == 'equal probable':

        for cell in range(grid_size):

            cells_prob[cell] = [0.25,0.25,0.25,0.25]
    
    elif randomness == 'deterministic':
        for cell in range(grid_size):

            cells_prob[cell] = [0.07,0.09,0.04,0.8] #[0,0,0,1] ##[0.15,.15,0.1,0.6]


    #Note that we consider the correspondence between probabilities and actions as below:
    #probs = [p1, p2, p3, p4] ---> [[1,0],[-1,0],[0,1],[0,-1]]

    return cells_prob

def neighbor_cells(cell):

    grid_cells = environment[6]
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]

    Neighbors = []
    Actions_Neighbors = []
    for action in Actions:

        neighbor = [x + y for x, y in zip(cell, action)]
        #if neighbor not in environment[4]:
        Neighbors.append(neighbor)
        Actions_Neighbors.append(action)

    return Neighbors, Actions_Neighbors

def arbitrary_policy(randomness):

        #random.seed(randomness)
        
    policy = {}
    policy_action = {}
    for state in environment[6]:

        if state not in environment[4]:

            neighbors = neighbor_cells(state)[0]
            Actions_Neighbors = neighbor_cells(state)[1]

            allowed_positions = []

            for neighbor in neighbors:
                
                if neighbor in environment[6] and neighbor not in environment[4]:
                    
                    allowed_positions.append(neighbor)
            
            if len(allowed_positions) > 0:
                
                next_state = random.choice(allowed_positions)
                row = next_state[0] - state[0]
                col = next_state[1] - state[1]
                PolicyAction = [row, col]

                policy['{}'.format(state)] = next_state
                policy_action['{}'.format(state)] = PolicyAction



    return policy, policy_action

def state_reward(next_state):

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 10
    
    elif next_state not in environment[6]:

        r = -2
    
    else:

        r = -1
    
    return r

def reverse_dictionary(dict):
    reverse_dict = {}
    for key in list(dict.keys()):
        val = dict[key]
        reverse_dict[val] = key
    return reverse_dict


state_indice_dict = {}
counter = 0
for state in environment[6]:

    state = str(state)
    state_indice_dict[state] = counter
    counter = counter + 1

def generate_trajectory(policy,randomness,environment_stochasticity):

    policy_action = policy[1]
    probs = probability_distribution(environment[0]*environment[1],environment_stochasticity)
    start = environment[2]
    terminate = start
    trajectory = []
    trajectory_actions = []
    pure_trajectory = [start]
    c = 0
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]
        action = policy_action[str(terminate)]
        Actions.remove(action)
        sorted_actions = Actions + [action]
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()

        selected_action = random.choices(sorted_actions, actions_prob)[0]

        trajectory_actions.append(selected_action)
        current_state = terminate
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        pure_trajectory.append(next_state)
        
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:
            next_state = terminate
        
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:
            next_state = start  

        terminate = next_state
        trajectory.append((current_state))
        c = c+1
    
    trajectory.append((environment[3]))
    pure_trajectory.append(environment[3])

    return trajectory,trajectory_actions

def extract_features(state):

    goal = environment[3]
    max_length = environment[0]
    max_width = environment[1]

    w1 = (goal[0] - state[0]) / max_width
    w2 = (goal[1] - state[1]) / max_length

    return abs(w1), abs(w2)

In [5]:
def H_theta(environment):
    
    policy = {}
    policy_action = {}

    for state in environment[6]:

        if state != environment[3] and state not in environment[4]:

            Neighbors = neighbor_cells(state)

            Distances = {}

            for neighbor in Neighbors[0]:

                if neighbor not in environment[4] and neighbor in environment[6]:

                    distance = np.cos(extract_features(neighbor)[0]+extract_features(neighbor)[1])
                    Distances[distance] = neighbor
            
            #closest to the terminate state
            if list(Distances.keys()) != []:

                best_neighbor = Distances[max(list(Distances.keys()))]
            
            else:
                best_neighbor = state

            policy[str(state)] = best_neighbor

            row = best_neighbor[0] - state[0]
            col = best_neighbor[1] - state[1]
            PolicyAction = [row,col]
            policy_action[str(state)] = PolicyAction
    

    return policy, policy_action

In [6]:
H_theta(environment)

({'[0, 1]': [1, 1],
  '[0, 2]': [1, 2],
  '[0, 3]': [1, 3],
  '[1, 0]': [1, 1],
  '[1, 1]': [2, 1],
  '[1, 2]': [2, 2],
  '[1, 3]': [2, 3],
  '[2, 1]': [3, 1],
  '[2, 2]': [2, 3],
  '[2, 3]': [3, 3],
  '[3, 1]': [4, 1],
  '[3, 3]': [4, 3],
  '[4, 0]': [4, 1],
  '[4, 1]': [4, 2],
  '[4, 2]': [4, 3]},
 {'[0, 1]': [1, 0],
  '[0, 2]': [1, 0],
  '[0, 3]': [1, 0],
  '[1, 0]': [0, 1],
  '[1, 1]': [1, 0],
  '[1, 2]': [1, 0],
  '[1, 3]': [1, 0],
  '[2, 1]': [1, 0],
  '[2, 2]': [0, 1],
  '[2, 3]': [1, 0],
  '[3, 1]': [1, 0],
  '[3, 3]': [1, 0],
  '[4, 0]': [0, 1],
  '[4, 1]': [0, 1],
  '[4, 2]': [0, 1]})

## REINFORCCE: Monte-Carlo Policy-Gradient Control (episodic) for $\pi_{*}$

In [6]:
def pi_theta(environment,Theta):

    policy = {}
    policy_action = {}

    Actions = [[1, 0],[-1, 0],[0, 1],[0, -1]]

    for state in environment[6]:

        if state != environment[3] and state not in environment[4]:

            Distances = {}

            for action in Actions:

                distance = np.cos(abs(Theta[str(state)][str(action)][0]) + abs(Theta[str(state)][str(action)][1]))
                #print(distance)

                Distances[distance] = action
            
            #closest to the terminate state
            if list(Distances.keys()) != []:

                best_action = Distances[max(list(Distances.keys()))]
                next_state = [x + y for x, y in zip(state, best_action)]
                
            
            else:
                next_state = state
                best_action = random.choices(Actions)
    
            policy[str(state)] = next_state
            policy_action[str(state)] = best_action
    

    return policy, policy_action

In [8]:
Theta = {}
for state in environment[6]:

    if state not in environment[4]:

        Theta[str(state)] = {} 

        for action in [[1, 0],[-1, 0],[0, 1],[0, -1]]:

            next_state = [x + y for x, y in zip(state, action)]
            Features = extract_features(next_state)
            
            Theta[str(state)][str(action)] = [Features[0],Features[1]]

pi_theta(environment,Theta)

({'[0, 1]': [1, 1],
  '[0, 2]': [1, 2],
  '[0, 3]': [1, 3],
  '[1, 0]': [2, 0],
  '[1, 1]': [2, 1],
  '[1, 2]': [2, 2],
  '[1, 3]': [2, 3],
  '[2, 1]': [3, 1],
  '[2, 2]': [3, 2],
  '[2, 3]': [3, 3],
  '[3, 1]': [4, 1],
  '[3, 3]': [4, 3],
  '[4, 0]': [4, 1],
  '[4, 1]': [4, 2],
  '[4, 2]': [4, 3]},
 {'[0, 1]': [1, 0],
  '[0, 2]': [1, 0],
  '[0, 3]': [1, 0],
  '[1, 0]': [1, 0],
  '[1, 1]': [1, 0],
  '[1, 2]': [1, 0],
  '[1, 3]': [1, 0],
  '[2, 1]': [1, 0],
  '[2, 2]': [1, 0],
  '[2, 3]': [1, 0],
  '[3, 1]': [1, 0],
  '[3, 3]': [1, 0],
  '[4, 0]': [0, 1],
  '[4, 1]': [0, 1],
  '[4, 2]': [0, 1]})

In [20]:
def monte_carlo_policy_gradient(num_trials, gamma, alpha, environment_stochasticity):
    
    Theta = {}
    for state in environment[6]:

        if state not in environment[4]:

            Theta[str(state)] = {} 

            for action in [[1, 0],[-1, 0],[0, 1],[0, -1]]:

                next_state = [x + y for x, y in zip(state, action)]
                Features = extract_features(next_state)
                
                Theta[str(state)][str(action)] = [Features[0],Features[1]] #Features[element] + random.uniform(1e-9, 1e-8)

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    
    Optimal_Policy = {}
    for state in environment[6]:

        if state not in environment[4]:

            Optimal_Policy[str(state)] = state

    
    for trial in tqdm(range(num_trials)):

        policy = pi_theta(environment,Theta)

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)

        trajectory = TRAJECTORY[0]
        #print(len(trajectory))
        #print(trajectory[-100:])
        actions = TRAJECTORY[1]
        

        G = 0

        for step_indx in range(len(trajectory[:-1])):

            step = trajectory[step_indx]
            
            next_step = trajectory[step_indx+1]

            done_action = actions[step_indx]

            for k in range(step_indx+1,len(trajectory)):

                
                step_k = trajectory[k]

                #next_step_k = trajectory[k+1]
            
                r = state_reward(step_k)

                G = G + gamma ** (k - step_indx - 1) * r
        
            softmax_denominator = 0.0001
            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                softmax_denominator = softmax_denominator +\
                    -np.sin(Theta[str(step)][action][0] + Theta[str(step)][action][1]) *\
                math.exp(np.cos(Theta[str(step)][action][0] + Theta[str(step)][action][1]))

            #print('softmax_denominators',softmax_denominator)
            gradient = (-np.sin(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]) *\
            math.exp(np.cos(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]))) - softmax_denominator

            #print('gradient',gradient)
                
            t1 = Theta[str(step)][str(done_action)][0] +\
                alpha * (gamma ** step_indx) * G * gradient
            
            t2 = Theta[str(step)][str(done_action)][1] +\
                alpha * (gamma ** step_indx) * G * gradient

            Theta[str(step)][str(done_action)] = [t1,t2]

            Q[str(step)][str(done_action)] = np.cos(abs(t1)+abs(t2))
    
    for state in environment[6]:

        if str(state) in list(Q.keys()):

            value_action_state = reverse_dictionary(Q[str(state)])
            Max_val = max(list(value_action_state.keys()))
            best_action = value_action_state[Max_val]
            best_action = ast.literal_eval(best_action)
            next_state = [x + y for x, y in zip(state, best_action)]

            if next_state not in environment[4] and next_state in environment[6]:

                Optimal_Policy[str(state)] = next_state

            else:

                Optimal_Policy[str(state)] = state



    return Q, Optimal_Policy

In [22]:
#My trial and error shows that we should consider the gamma lower than 0.9
monte_carlo_policy_gradient(1000, 0.05, 0.1, 'deterministic')

100%|██████████| 1000/1000 [00:03<00:00, 283.24it/s]


({'[0, 1]': {'[1, 0]': 0.7241384675723698,
   '[-1, 0]': -0.049247033421538536,
   '[0, 1]': 0.38252184475653783,
   '[0, -1]': 0.04122590484344676},
  '[0, 2]': {'[1, 0]': -0.3220543517165513,
   '[-1, 0]': -0.3255366687491661,
   '[0, 1]': -0.3323150246661916,
   '[0, -1]': -0.33351601522310276},
  '[0, 3]': {'[1, 0]': 0.7316888688738197,
   '[-1, 0]': 0.3153223623952687,
   '[0, 1]': 0.4473784650569547,
   '[0, -1]': 0.43470464016508953},
  '[1, 0]': {'[1, 0]': 0.6506196690364913,
   '[-1, 0]': 0.0016581378741039362,
   '[0, 1]': 0.4184760470827733,
   '[0, -1]': 0.044824492491251526},
  '[1, 1]': {'[1, 0]': -0.5275736590079866,
   '[-1, 0]': -0.5280258785296158,
   '[0, 1]': -0.525051468583322,
   '[0, -1]': -0.5335159921654302},
  '[1, 2]': {'[1, 0]': -0.9995943954566024,
   '[-1, 0]': -0.9388472248098347,
   '[0, 1]': -0.7644298947571735,
   '[0, -1]': -0.8250009455325726},
  '[1, 3]': {'[1, 0]': -0.33185118551951004,
   '[-1, 0]': -0.3341941158153252,
   '[0, 1]': -0.34011277597

## REINFORCCE with Baseline (episodic) for estimating $\pi_{\theta} \approx \pi_{*}$

In [51]:
def baseline(num_trials, gamma, w_alpha, t_alpha, environment_stochasticity):

    Theta = {}
    for state in environment[6]:

        if state not in environment[4]:

            Theta[str(state)] = {}

            for action in [[1, 0],[-1, 0],[0, 1],[0, -1]]:

                next_state = [x + y for x, y in zip(state, action)]
                Features = extract_features(next_state)
                
                Theta[str(state)][str(action)] = [Features[0],Features[1]]
    
    W = {}
    for state in environment[6]:

        if state not in environment[4]:

            Features = extract_features(state)
            
            W[str(state)] = [Features[0]+random.uniform(1e-9, 1e-8),Features[1]+random.uniform(1e-9, 1e-8)]
    
    V = {}
    state_observed = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            V[str(state)] = 0
            state_observed[str(state)] = 0

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    
    Optimal_Policy = {}
    for state in environment[6]:

        if state not in environment[4]:

            Optimal_Policy[str(state)] = state

    
    for trial in tqdm(range(num_trials)):

        policy = pi_theta(environment,Theta)

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)

        trajectory = TRAJECTORY[0]

        actions = TRAJECTORY[1]
        

        G = 0

        for step_indx in range(len(trajectory[:-1])):

            step = trajectory[step_indx]
            
            next_step = trajectory[step_indx+1]

            done_action = actions[step_indx]

            for k in range(step_indx+1,len(trajectory)):

                
                step_k = trajectory[k]

                #next_step_k = trajectory[k+1]
            
                r = state_reward(step_k)

                G = G + gamma ** (k - step_indx - 1) * r
            
            v_hat = np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))

            delta = G - v_hat

            gradient_w1 = -np.sin(abs(W[str(step)][0]) + abs(W[str(step)][1])) * (W[str(step)][0]/abs(W[str(step)][0]))
            gradient_w2 = -np.sin(abs(W[str(step)][1]) + abs(W[str(step)][0])) * (W[str(step)][1]/abs(W[str(step)][1]))

            W[str(step)][0] = W[str(step)][0] + w_alpha * delta * gradient_w1

            W[str(step)][1] = W[str(step)][1] + w_alpha * delta * gradient_w2

            
            V[str(step)] = V[str(step)] + np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))


            softmax_denominator = 0.0001
            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                softmax_denominator = softmax_denominator +\
                    -np.sin(Theta[str(step)][action][0] + Theta[str(step)][action][1]) *\
                math.exp(np.cos(Theta[str(step)][action][0] + Theta[str(step)][action][1]))

            #print('softmax_denominators',softmax_denominator)
            gradient = (-np.sin(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]) *\
            math.exp(np.cos(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]))) - softmax_denominator

            #print('gradient',gradient)
                
            t1 = Theta[str(step)][str(done_action)][0] +\
                t_alpha * (gamma ** step_indx) * delta * gradient
            
            t2 = Theta[str(step)][str(done_action)][1] +\
                t_alpha * (gamma ** step_indx) * delta * gradient

            Theta[str(step)][str(done_action)] = [t1,t2]

            Q[str(step)][str(done_action)] = np.cos(abs(t1)+abs(t2))
    
    for state in environment[6]:

        if str(state) in list(Q.keys()):

            value_action_state = reverse_dictionary(Q[str(state)])
            Max_val = max(list(value_action_state.keys()))
            best_action = value_action_state[Max_val]
            best_action = ast.literal_eval(best_action)
            next_state = [x + y for x, y in zip(state, best_action)]

            if next_state not in environment[4] and next_state in environment[6]:

                Optimal_Policy[str(state)] = next_state

            else:

                Optimal_Policy[str(state)] = state



    return V, Q, Optimal_Policy

In [54]:
baseline(1000, 0.005,0.1, 0.1, 'deterministic')

100%|██████████| 1000/1000 [00:01<00:00, 892.26it/s]


({'[0, 1]': -95.04639579379811,
  '[0, 2]': -207.58500704757856,
  '[0, 3]': -42.5969616575847,
  '[1, 0]': -164.2549703346718,
  '[1, 1]': -1348.2026981109395,
  '[1, 2]': -2010.9988097002827,
  '[1, 3]': -629.8141142535152,
  '[2, 1]': -1058.1237556579745,
  '[2, 2]': -207.10434582756073,
  '[2, 3]': -564.1032737270275,
  '[3, 1]': -917.0104795138152,
  '[3, 3]': 494.8171595699261,
  '[4, 0]': -74.55085932384843,
  '[4, 1]': -932.3273933671558,
  '[4, 2]': -722.1284663003171,
  '[4, 3]': 0},
 {'[0, 1]': {'[1, 0]': 0.4095107246606044,
   '[-1, 0]': -0.079045574106728,
   '[0, 1]': 0.3624368801630644,
   '[0, -1]': -0.02912633039806283},
  '[0, 2]': {'[1, 0]': 0.747224915222459,
   '[-1, 0]': 0.1409596020356665,
   '[0, 1]': 0.5403023058681398,
   '[0, -1]': 0.21778030944790022},
  '[0, 3]': {'[1, 0]': 0.7316888688738211,
   '[-1, 0]': 8.947011370440381e-09,
   '[0, 1]': 0.36235775447676305,
   '[0, -1]': 0.3624339375521967},
  '[1, 0]': {'[1, 0]': 0.454870946519627,
   '[-1, 0]': -0.0

# Actor-Critic Methods

## One-step Actor-Critic (episodic), for estimating $\pi_{\theta} \approx \pi_{*}$

In [63]:
def one_step_actor_critic(num_trials, gamma, w_alpha, t_alpha, environment_stochasticity):

    Theta = {}
    for state in environment[6]:

        if state not in environment[4]:

            Theta[str(state)] = {}

            for action in [[1, 0],[-1, 0],[0, 1],[0, -1]]:

                next_state = [x + y for x, y in zip(state, action)]
                Features = extract_features(next_state)
                
                Theta[str(state)][str(action)] = [Features[0],Features[1]]
    
    W = {}
    for state in environment[6]:

        if state not in environment[4]:

            Features = extract_features(state)
            
            W[str(state)] = [Features[0]+random.uniform(1e-9, 1e-8),Features[1]+random.uniform(1e-9, 1e-8)]
    
    V = {}
    state_observed = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            V[str(state)] = 0
            state_observed[str(state)] = 0

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    
    Optimal_Policy = {}
    for state in environment[6]:

        if state not in environment[4]:

            Optimal_Policy[str(state)] = state

    
    for trial in tqdm(range(num_trials)):

        policy = pi_theta(environment,Theta)

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)

        trajectory = TRAJECTORY[0]

        actions = TRAJECTORY[1]
        

        G = 0
        I = 1

        for step_indx in range(len(trajectory[:-1])):

            step = trajectory[step_indx]
            
            next_step = trajectory[step_indx+1]

            done_action = actions[step_indx]
            
            v_hat_step = np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))
            v_hat_next_step = np.cos(abs(W[str(next_step)][0]) + abs(W[str(next_step)][1]))
            r = state_reward(next_step)
            delta = r + gamma * v_hat_next_step - v_hat_step

            gradient_w1 = -np.sin(abs(W[str(step)][0]) + abs(W[str(step)][1])) * (W[str(step)][0]/abs(W[str(step)][0]))
            gradient_w2 = -np.sin(abs(W[str(step)][1]) + abs(W[str(step)][0])) * (W[str(step)][1]/abs(W[str(step)][1]))

            W[str(step)][0] = W[str(step)][0] + w_alpha * delta * gradient_w1

            W[str(step)][1] = W[str(step)][1] + w_alpha * delta * gradient_w2

            
            V[str(step)] = V[str(step)] + np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))


            softmax_denominator = 0.0001
            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                softmax_denominator = softmax_denominator +\
                    -np.sin(Theta[str(step)][action][0] + Theta[str(step)][action][1]) *\
                math.exp(np.cos(Theta[str(step)][action][0] + Theta[str(step)][action][1]))

            #print('softmax_denominators',softmax_denominator)
            gradient = (-np.sin(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]) *\
            math.exp(np.cos(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]))) - softmax_denominator

            #print('gradient',gradient)
                
            t1 = Theta[str(step)][str(done_action)][0] +\
                t_alpha * I * delta * gradient
            
            t2 = Theta[str(step)][str(done_action)][1] +\
                t_alpha * I * delta * gradient

            Theta[str(step)][str(done_action)] = [t1,t2]

            Q[str(step)][str(done_action)] = np.cos(abs(t1)+abs(t2))
            I = gamma * I
    
    for state in environment[6]:

        if str(state) in list(Q.keys()):

            value_action_state = reverse_dictionary(Q[str(state)])
            Max_val = max(list(value_action_state.keys()))
            best_action = value_action_state[Max_val]
            best_action = ast.literal_eval(best_action)
            next_state = [x + y for x, y in zip(state, best_action)]

            if next_state not in environment[4] and next_state in environment[6]:

                Optimal_Policy[str(state)] = next_state

            else:

                Optimal_Policy[str(state)] = state



    return V, Q, Optimal_Policy


In [67]:
one_step_actor_critic(1000, 0.05,0.5, 0.7, 'deterministic')

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:02<00:00, 379.33it/s]


({'[0, 1]': -860.3076210582119,
  '[0, 2]': -4180.09966920436,
  '[0, 3]': -2765.974007313246,
  '[1, 0]': -406.2906355155226,
  '[1, 1]': -3784.105547676202,
  '[1, 2]': -10520.660373814468,
  '[1, 3]': -5153.514297126877,
  '[2, 1]': -1301.722967067913,
  '[2, 2]': -1512.5480462175228,
  '[2, 3]': -491.18959751862985,
  '[3, 1]': -1034.3398536782017,
  '[3, 3]': -15.676458542581848,
  '[4, 0]': -102.41434506539863,
  '[4, 1]': -937.0345853798509,
  '[4, 2]': -92.13313501093741,
  '[4, 3]': 0},
 {'[0, 1]': {'[1, 0]': 0.4272020238645434,
   '[-1, 0]': -0.07654376852531099,
   '[0, 1]': 0.36289728262255055,
   '[0, -1]': -0.025365826770949246},
  '[0, 2]': {'[1, 0]': 0.46947105847960385,
   '[-1, 0]': 0.22622475853815943,
   '[0, 1]': 0.5403023058681399,
   '[0, -1]': 0.31734085450485133},
  '[0, 3]': {'[1, 0]': 0.7316888688738201,
   '[-1, 0]': 0.315322362395268,
   '[0, 1]': 0.37420844144266896,
   '[0, -1]': 0.36635506376672133},
  '[1, 0]': {'[1, 0]': 0.4607614513807729,
   '[-1, 0]

## Actor-Critic with Eligibility Traces (episodic), for estimating $\pi_{\theta} \approx \pi_{*}$

In [75]:
def eligibility_traces_actor_critic_episodic(num_trials, gamma, w_alpha, t_alpha, environment_stochasticity,w_lambda,t_lambda):

    Theta = {}
    for state in environment[6]:

        if state not in environment[4]:

            Theta[str(state)] = {}

            for action in [[1, 0],[-1, 0],[0, 1],[0, -1]]:

                next_state = [x + y for x, y in zip(state, action)]
                Features = extract_features(next_state)
                
                Theta[str(state)][str(action)] = [Features[0]+random.uniform(1e-9, 1e-8),Features[1]+random.uniform(1e-9, 1e-8)]
    
    
    W = {}
    for state in environment[6]:

        if state not in environment[4]:

            Features = extract_features(state)
            
            W[str(state)] = [Features[0]+random.uniform(1e-9, 1e-8),Features[1]+random.uniform(1e-9, 1e-8)]
    
    V = {}
    state_observed = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            V[str(state)] = 0
            state_observed[str(state)] = 0

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    
    Optimal_Policy = {}
    for state in environment[6]:

        if state not in environment[4]:

            Optimal_Policy[str(state)] = state

    
    for trial in tqdm(range(num_trials)):

        policy = pi_theta(environment,Theta)

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)

        trajectory = TRAJECTORY[0]

        actions = TRAJECTORY[1]
        t_Z = {}
        for state in environment[6]:

            if state not in environment[4]:

                t_Z[str(state)] = {}
                
                for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                    t_Z[str(state)][action] = [random.uniform(1e-9, 1e-8),random.uniform(1e-9, 1e-8)]
        
        w_Z = {}
        for state in environment[6]:

            if state not in environment[4]:

                w_Z[str(state)] = {}
                
                for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                    w_Z[str(state)][action] = [random.uniform(1e-9, 1e-8),random.uniform(1e-9, 1e-8)]

        I = 1

        for step_indx in range(len(trajectory[:-1])):

            step = trajectory[step_indx]
            
            next_step = trajectory[step_indx+1]

            done_action = actions[step_indx]


            v_hat_step = np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))
            v_hat_next_step = np.cos(abs(W[str(next_step)][0]) + abs(W[str(next_step)][1]))
            r = state_reward(next_step)
            delta = r + gamma * v_hat_next_step - v_hat_step


            gradient_w1 = -np.sin(abs(W[str(step)][0]) + abs(W[str(step)][1])) * (W[str(step)][0]/abs(W[str(step)][0]))
            gradient_w2 = -np.sin(abs(W[str(step)][1]) + abs(W[str(step)][0])) * (W[str(step)][1]/abs(W[str(step)][1]))

            w_Z[str(step)][str(done_action)][0] = gamma * w_lambda * w_Z[str(step)][str(done_action)][0] + gradient_w1
            w_Z[str(step)][str(done_action)][1] = gamma * w_lambda * w_Z[str(step)][str(done_action)][1] + gradient_w2
            

            softmax_denominator = 0.0001
            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                softmax_denominator = softmax_denominator +\
                    -np.sin(Theta[str(step)][action][0] + Theta[str(step)][action][1]) *\
                math.exp(np.cos(Theta[str(step)][action][0] + Theta[str(step)][action][1]))

            #print('softmax_denominators',softmax_denominator)
            gradient_t1 = (-np.sin(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]) * (Theta[str(step)][str(done_action)][0]/abs(Theta[str(step)][str(done_action)][0])) *\
            math.exp(np.cos(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]))) - softmax_denominator

            gradient_t2 = (-np.sin(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]) * (Theta[str(step)][str(done_action)][1]/abs(Theta[str(step)][str(done_action)][1])) *\
            math.exp(np.cos(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]))) - softmax_denominator

            t_Z[str(step)][str(done_action)][0] = gamma * t_lambda * t_Z[str(step)][str(done_action)][0] + I * gradient_t1
            t_Z[str(step)][str(done_action)][1] = gamma * t_lambda * t_Z[str(step)][str(done_action)][1] + I * gradient_t2


            W[str(step)][0] = W[str(step)][0] + w_alpha * delta * w_Z[str(step)][str(done_action)][0]

            W[str(step)][1] = W[str(step)][1] + w_alpha * delta * w_Z[str(step)][str(done_action)][1]

            
            V[str(step)] = V[str(step)] + np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))




            #print('gradient',gradient)
                
            t1 = Theta[str(step)][str(done_action)][0] +\
                t_alpha * delta * t_Z[str(step)][str(done_action)][0]
            
            t2 = Theta[str(step)][str(done_action)][1] +\
                t_alpha * delta * t_Z[str(step)][str(done_action)][1]

            Theta[str(step)][str(done_action)] = [t1,t2]

            Q[str(step)][str(done_action)] = np.cos(abs(t1)+abs(t2))
            I = gamma * I
    
    for state in environment[6]:

        if str(state) in list(Q.keys()):

            value_action_state = reverse_dictionary(Q[str(state)])
            Max_val = max(list(value_action_state.keys()))
            best_action = value_action_state[Max_val]
            best_action = ast.literal_eval(best_action)
            next_state = [x + y for x, y in zip(state, best_action)]

            if next_state not in environment[4] and next_state in environment[6]:

                Optimal_Policy[str(state)] = next_state

            else:

                Optimal_Policy[str(state)] = state



    return V, Q, Optimal_Policy


In [76]:
eligibility_traces_actor_critic_episodic(1000, 0.05, 0.5, 0.5, 'deterministic',0.2,0.3)

100%|██████████| 1000/1000 [00:01<00:00, 636.62it/s]


({'[0, 1]': -14.39216157934931,
  '[0, 2]': -140.09687714983772,
  '[0, 3]': -23.02240390315338,
  '[1, 0]': -12.390073257275487,
  '[1, 1]': -326.11383275593835,
  '[1, 2]': -2449.719999354992,
  '[1, 3]': -242.61224376397573,
  '[2, 1]': -1302.738746048939,
  '[2, 2]': -1977.4942454828454,
  '[2, 3]': -434.1199101146315,
  '[3, 1]': -1158.3426521579377,
  '[3, 3]': -43.119299527347636,
  '[4, 0]': -114.40912538945888,
  '[4, 1]': -1075.1407400691994,
  '[4, 2]': -90.62618658786363,
  '[4, 3]': 0},
 {'[0, 1]': {'[1, 0]': 0.4227127204380705,
   '[-1, 0]': 7.772026331095774e-09,
   '[0, 1]': 5.1586720274743505e-09,
   '[0, -1]': -0.027304196578897228},
  '[0, 2]': {'[1, 0]': 0.8161741550199557,
   '[-1, 0]': 0.1407669568008603,
   '[0, 1]': 0.5208312380812664,
   '[0, -1]': 0.21585772322029495},
  '[0, 3]': {'[1, 0]': 0.7302858422890514,
   '[-1, 0]': 0.3153223689217506,
   '[0, 1]': 4.0423561430237565e-09,
   '[0, -1]': 0.36291320935694865},
  '[1, 0]': {'[1, 0]': 0.46747128664727994,


## Actor-Critic with Eligibility Traces (continuing), for estimating $\pi_{\theta} \approx \pi_{*}$

### Note that our task is not continuing!

In [7]:
def eligibility_traces_actor_critic_continuing(num_trials, w_alpha, t_alpha, r_alpha, environment_stochasticity,w_lambda,t_lambda):

    Theta = {}
    for state in environment[6]:

        if state not in environment[4]:

            Theta[str(state)] = {}

            for action in [[1, 0],[-1, 0],[0, 1],[0, -1]]:

                next_state = [x + y for x, y in zip(state, action)]
                Features = extract_features(next_state)
                
                Theta[str(state)][str(action)] = [Features[0]+random.uniform(1e-9, 1e-8),Features[1]+random.uniform(1e-9, 1e-8)]
    
    
    W = {}
    for state in environment[6]:

        if state not in environment[4]:

            Features = extract_features(state)
            
            W[str(state)] = [Features[0]+random.uniform(1e-9, 1e-8),Features[1]+random.uniform(1e-9, 1e-8)]
    
    V = {}
    state_observed = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            V[str(state)] = 0
            state_observed[str(state)] = 0

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    
    Optimal_Policy = {}
    for state in environment[6]:

        if state not in environment[4]:

            Optimal_Policy[str(state)] = state

    R_bar = 0
    for trial in tqdm(range(num_trials)):

        policy = pi_theta(environment,Theta)

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)

        trajectory = TRAJECTORY[0]

        actions = TRAJECTORY[1]
        t_Z = {}
        for state in environment[6]:

            if state not in environment[4]:

                t_Z[str(state)] = {}
                
                for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                    t_Z[str(state)][action] = [random.uniform(1e-9, 1e-8),random.uniform(1e-9, 1e-8)]
        
        w_Z = {}
        for state in environment[6]:

            if state not in environment[4]:

                w_Z[str(state)] = {}
                
                for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                    w_Z[str(state)][action] = [random.uniform(1e-9, 1e-8),random.uniform(1e-9, 1e-8)]


        for step_indx in range(len(trajectory[:-1])):

            step = trajectory[step_indx]
            
            next_step = trajectory[step_indx+1]

            done_action = actions[step_indx]


            v_hat_step = np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))
            v_hat_next_step = np.cos(abs(W[str(next_step)][0]) + abs(W[str(next_step)][1]))
            r = state_reward(next_step)
            delta = r - R_bar + v_hat_next_step - v_hat_step
            R_bar = R_bar + r_alpha * delta


            gradient_w1 = -np.sin(abs(W[str(step)][0]) + abs(W[str(step)][1])) * (W[str(step)][0]/abs(W[str(step)][0]))
            gradient_w2 = -np.sin(abs(W[str(step)][1]) + abs(W[str(step)][0])) * (W[str(step)][1]/abs(W[str(step)][1]))

            w_Z[str(step)][str(done_action)][0] =  w_lambda * w_Z[str(step)][str(done_action)][0] + gradient_w1
            w_Z[str(step)][str(done_action)][1] =  w_lambda * w_Z[str(step)][str(done_action)][1] + gradient_w2
            

            softmax_denominator = 0.0001
            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                softmax_denominator = softmax_denominator +\
                    -np.sin(Theta[str(step)][action][0] + Theta[str(step)][action][1]) *\
                math.exp(np.cos(Theta[str(step)][action][0] + Theta[str(step)][action][1]))

            #print('softmax_denominators',softmax_denominator)
            gradient_t1 = (-np.sin(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]) * (Theta[str(step)][str(done_action)][0]/abs(Theta[str(step)][str(done_action)][0])) *\
            math.exp(np.cos(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]))) - softmax_denominator

            gradient_t2 = (-np.sin(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]) * (Theta[str(step)][str(done_action)][1]/abs(Theta[str(step)][str(done_action)][1])) *\
            math.exp(np.cos(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]))) - softmax_denominator

            t_Z[str(step)][str(done_action)][0] =  t_lambda * t_Z[str(step)][str(done_action)][0] +  gradient_t1
            t_Z[str(step)][str(done_action)][1] =  t_lambda * t_Z[str(step)][str(done_action)][1] +  gradient_t2


            W[str(step)][0] = W[str(step)][0] + w_alpha * delta * w_Z[str(step)][str(done_action)][0]

            W[str(step)][1] = W[str(step)][1] + w_alpha * delta * w_Z[str(step)][str(done_action)][1]

            
            V[str(step)] = V[str(step)] + np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))




            #print('gradient',gradient)
                
            t1 = Theta[str(step)][str(done_action)][0] +\
                t_alpha * delta * t_Z[str(step)][str(done_action)][0]
            
            t2 = Theta[str(step)][str(done_action)][1] +\
                t_alpha * delta * t_Z[str(step)][str(done_action)][1]

            Theta[str(step)][str(done_action)] = [t1,t2]

            Q[str(step)][str(done_action)] = np.cos(abs(t1)+abs(t2))

    
    for state in environment[6]:

        if str(state) in list(Q.keys()):

            value_action_state = reverse_dictionary(Q[str(state)])
            Max_val = max(list(value_action_state.keys()))
            best_action = value_action_state[Max_val]
            best_action = ast.literal_eval(best_action)
            next_state = [x + y for x, y in zip(state, best_action)]

            if next_state not in environment[4] and next_state in environment[6]:

                Optimal_Policy[str(state)] = next_state

            else:

                Optimal_Policy[str(state)] = state



    return V, Q, Optimal_Policy


In [12]:
eligibility_traces_actor_critic_continuing(100, 0.5, 0.5, 0.9, 'deterministic',0.2,0.3)

100%|██████████| 100/100 [00:45<00:00,  2.22it/s]


({'[0, 1]': -9537.358598356795,
  '[0, 2]': -33013.95947753112,
  '[0, 3]': -115298.72837895376,
  '[1, 0]': -23625.51886426553,
  '[1, 1]': -51091.70222123474,
  '[1, 2]': -70144.72982007058,
  '[1, 3]': -86856.12667171471,
  '[2, 1]': -30764.326726770818,
  '[2, 2]': -18167.48556223864,
  '[2, 3]': -11180.351199213515,
  '[3, 1]': -1308.4932121574852,
  '[3, 3]': -1303.4130824263973,
  '[4, 0]': -461.47098533403175,
  '[4, 1]': -126.95997859395781,
  '[4, 2]': -51.452545155353114,
  '[4, 3]': 0},
 {'[0, 1]': {'[1, 0]': -0.9913036104874409,
   '[-1, 0]': -0.9979499910385808,
   '[0, 1]': 0.8996328057517945,
   '[0, -1]': 0.7380657960431292},
  '[0, 2]': {'[1, 0]': -0.9461479686241047,
   '[-1, 0]': -0.5799126115615092,
   '[0, 1]': 0.9471481347759487,
   '[0, -1]': 0.5652217362417044},
  '[0, 3]': {'[1, 0]': 0.6857748319118119,
   '[-1, 0]': -0.0022510836229773933,
   '[0, 1]': 0.052745396270233946,
   '[0, -1]': 0.37000012819363953},
  '[1, 0]': {'[1, 0]': -0.1506198515145001,
   '[-