In [1]:
import numpy as np
import random
from tabulate import tabulate
from tqdm import tqdm
import ast

In [2]:
def generate_grid_world(length, width,path_lenght,holes_number,Random_State):
    
    random.seed(Random_State)
    #store all cells in a list
    Grid_Cells = []
    for row in range(length):
        for col in range(width):
            Grid_Cells.append([row,col])


    #specify the number of holes in the gridworld
    
    #specify the start point as a random cell
    start = [random.randint(0, length), random.randint(0, width)]

    #create a path from start point
    """instead of defining start and goal points,
      we define just a start point and a random path with a random lenght to
       another point and name it as goal point"""
    
    def random_path(Start, Path_Lenght,length, width):
        
        Path = []
        Path.append(Start)
        for i in range(Path_Lenght):
            
            #there are two moves that take us on a random cell named Goal [1,0], [0,1]
            
            move = random.choice([[1,0], [0,1]])
            
            #update the start cell/point by the above move
            Start = [x + y for x, y in zip(Start, move)]
            
            #if the movement take us out of our gridworld, we reverse the change in the start point
            if Start[0] < 0 or Start[1] < 0 or Start[0] > length-1 or Start[1] > width-1:

                Start = [x - y for x, y in zip(Start, move)]

            else:
                
                #create a path history
                Path.append(Start)

        Goal = Start

        return Goal,Path
    

    GoalPath = random_path(start, path_lenght,length, width)

    goal = GoalPath[0]
    path = GoalPath[1]

    #now we must eliminate the path cells from the Grid_Cells to choose hole cells from remaining cells

    FreeCells = [x for x in Grid_Cells if x not in path]

    Holes = random.sample(FreeCells, holes_number)

    #Also, we can visualize our gridworld in a simple way

    def mark_holes(holes):
        marked_data = [["Hole" if [row, col] in holes else [row, col] for col in range(width)] for row in range(length)]
        return marked_data
    
    marked_matrix = mark_holes(Holes)

    print(tabulate(marked_matrix, tablefmt="grid"))

    
    return length, width, start, goal, Holes, path,Grid_Cells

In [3]:
#environment = generate_grid_world(50, 40,1300,400,39)
environment = generate_grid_world(5, 4,4,4,39)

environment

+--------+--------+--------+--------+
| Hole   | [0, 1] | [0, 2] | [0, 3] |
+--------+--------+--------+--------+
| [1, 0] | [1, 1] | [1, 2] | [1, 3] |
+--------+--------+--------+--------+
| Hole   | [2, 1] | [2, 2] | [2, 3] |
+--------+--------+--------+--------+
| Hole   | [3, 1] | Hole   | [3, 3] |
+--------+--------+--------+--------+
| [4, 0] | [4, 1] | [4, 2] | [4, 3] |
+--------+--------+--------+--------+


(5,
 4,
 [1, 2],
 [4, 3],
 [[2, 0], [3, 2], [3, 0], [0, 0]],
 [[1, 2], [1, 3], [2, 3], [3, 3], [4, 3]],
 [[0, 0],
  [0, 1],
  [0, 2],
  [0, 3],
  [1, 0],
  [1, 1],
  [1, 2],
  [1, 3],
  [2, 0],
  [2, 1],
  [2, 2],
  [2, 3],
  [3, 0],
  [3, 1],
  [3, 2],
  [3, 3],
  [4, 0],
  [4, 1],
  [4, 2],
  [4, 3]])

In [4]:
def probability_distribution(grid_size,randomness):
    #random.seed(40)
    
    #by this function we generate probabilities which their sum is equal to 1
    def generate_probabilities(n):

        numbers = [random.random() for _ in range(n)]
        total_sum = sum(numbers)
        scaled_numbers = [num / total_sum for num in numbers]
        
        return scaled_numbers
    
    cells_prob = {}
    if randomness == 'stochastic':
        for cell in range(grid_size):
            
            #we set the number of probs to 4 due to 4 possible action for each cell (go to its neighbors)
            probs = generate_probabilities(4)

            cells_prob[cell] = probs
    elif randomness == 'equal probable':

        for cell in range(grid_size):

            cells_prob[cell] = [0.25,0.25,0.25,0.25]
    
    elif randomness == 'deterministic':
        for cell in range(grid_size):

            cells_prob[cell] = [0.03,0.06,0.01,0.9] #[0,0,0,1] ##[0.15,.15,0.1,0.6]


    #Note that we consider the correspondence between probabilities and actions as below:
    #probs = [p1, p2, p3, p4] ---> [[1,0],[-1,0],[0,1],[0,-1]]

    return cells_prob

def neighbor_cells(cell):

    grid_cells = environment[6]
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]

    Neighbors = []
    Actions_Neighbors = []
    for action in Actions:

        neighbor = [x + y for x, y in zip(cell, action)]
        #if neighbor not in environment[4]:
        Neighbors.append(neighbor)
        Actions_Neighbors.append(action)

    return Neighbors, Actions_Neighbors

def arbitrary_policy(randomness):

        #random.seed(randomness)
        
    policy = {}
    policy_action = {}
    for state in environment[6]:

        if state not in environment[4]:

            neighbors = neighbor_cells(state)[0]
            Actions_Neighbors = neighbor_cells(state)[1]

            allowed_positions = []

            for neighbor in neighbors:
                
                if neighbor in environment[6] and neighbor not in environment[4]:
                    
                    allowed_positions.append(neighbor)
            
            if len(allowed_positions) > 0:
                
                next_state = random.choice(allowed_positions)
                row = next_state[0] - state[0]
                col = next_state[1] - state[1]
                PolicyAction = [row, col]

                policy['{}'.format(state)] = next_state
                policy_action['{}'.format(state)] = PolicyAction



    return policy, policy_action

def state_reward(next_state):

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 10
    
    elif next_state not in environment[6]:

        r = -2
    
    else:

        r = -1
    
    return r

def reverse_dictionary(dict):
    reverse_dict = {}
    for key in list(dict.keys()):
        val = dict[key]
        reverse_dict[val] = key
    return reverse_dict


state_indice_dict = {}
counter = 0
for state in environment[6]:

    state = str(state)
    state_indice_dict[state] = counter
    counter = counter + 1

def generate_trajectory(policy,randomness,environment_stochasticity):

    policy_action = policy[1]
    probs = probability_distribution(environment[0]*environment[1],environment_stochasticity)
    start = environment[2]
    terminate = start
    trajectory = []
    pure_trajectory = [start]
    c = 0
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]
        action = policy_action[str(terminate)]
        Actions.remove(action)
        sorted_actions = Actions + [action]
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()

        selected_action = random.choices(sorted_actions, actions_prob)[0]
        current_state = terminate
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        pure_trajectory.append(next_state)
        
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:
            next_state = terminate
        
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:
            next_state = start  

        terminate = next_state
        trajectory.append((current_state))
        c = c+1
    
    trajectory.append((environment[3]))
    pure_trajectory.append(environment[3])

    return trajectory,pure_trajectory

def extract_features(state):

    goal = environment[3]
    max_length = environment[0]
    max_width = environment[1]

    w1 = (goal[0] - state[0]) / max_width
    w2 = (goal[1] - state[1]) / max_length

    return abs(w1), abs(w2)

## Semi-gradient $TD(\lambda)$ for Estimating $\hat{v} \approx v_{\pi}$

In [5]:
def semi_gradient_TD_lambda(num_trials, gamma, policy, Lambda, alpha, environment_stochasticity):

    W = {}
    for state in environment[6]:

        if state not in environment[4]:

            W[str(state)] = {}

            Features = extract_features(state)

            for element in [0,1]:
                
                W[str(state)][element] = Features[element] + random.uniform(1e-9, 1e-8)


    V = {}
    state_observed = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            V[str(state)] = 0
            state_observed[str(state)] = 0
    
    for trial in tqdm(range(num_trials)):

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)
        

        trajectory = TRAJECTORY[0]

        z1 , z2 = 0 , 0

        for step_indx in range(len(trajectory[:-1])):

            step = trajectory[step_indx]
            next_step = trajectory[step_indx+1]

            r = state_reward(next_step)
            #print(W[str(step)][0],W[str(step)][1])
            gradient_w1 = -np.sin(abs(W[str(step)][0]) + abs(W[str(step)][1])) * (W[str(step)][0]/abs(W[str(step)][0]))
            gradient_w2 = -np.sin(abs(W[str(step)][1]) + abs(W[str(step)][0])) * (W[str(step)][1]/abs(W[str(step)][1]))
            
            z1 = gamma * Lambda * z1 + gradient_w1
            z2 = gamma * Lambda * z2 + gradient_w2

            delta = r + gamma * np.cos(abs(W[str(next_step)][0]) + abs(W[str(next_step)][1]))\
                         - np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))

            #alpha = 1/math.log(step_indx+2)
            W[str(step)][0] = W[str(step)][0] + alpha * delta * z1

            W[str(step)][1] = W[str(step)][1] + alpha * delta * z2

            
            V[str(step)] = V[str(step)] + np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))
            state_observed[str(step)] = state_observed[str(step)] + 1
    
    for state in environment[6]:

        if state not in environment[4]:

            if state_observed[str(state)] > 0:

                V[str(state)] = V[str(state)] / state_observed[str(state)]


    return V



In [6]:
policy_0 = arbitrary_policy(41)

In [7]:
semi_gradient_TD_lambda(10000, 0.9, policy_0, 0.5, 0.2, 'deterministic')

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:24<00:00, 400.93it/s]


{'[0, 1]': -0.9999979959477547,
 '[0, 2]': -0.9999943864821053,
 '[0, 3]': -0.9998482418602734,
 '[1, 0]': -0.9994320720238307,
 '[1, 1]': -0.9998064299228693,
 '[1, 2]': -0.9999734493958976,
 '[1, 3]': -0.9999874784926045,
 '[2, 1]': -0.9998100019500074,
 '[2, 2]': -0.9995345080561471,
 '[2, 3]': -0.9998861917275234,
 '[3, 1]': -0.9997765349035409,
 '[3, 3]': -0.9994380990624914,
 '[4, 0]': -0.997328139417871,
 '[4, 1]': -0.9979638265415661,
 '[4, 2]': -0.10095922794665614,
 '[4, 3]': 0}

## True online $TD(\lambda)$ for Estimating $W^T X \approx v_{\pi}$

In [7]:
def true_online_TD_lambda(num_trials, gamma, policy, Lambda, alpha, environment_stochasticity):

    W = {}
    for state in environment[6]:

        if state not in environment[4]:

            Features = extract_features(state)
            
            W[str(state)] = np.array([0, 0])

    V = {}
    state_observed = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            V[str(state)] = 0
            state_observed[str(state)] = 0
    

    for trial in tqdm(range(num_trials)):

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)

        trajectory = TRAJECTORY[0]
        z = np.array([0,0])
        v_old = 0

        for step_indx in range(len(trajectory[:-1])):

            step = trajectory[step_indx]
            next_step = trajectory[step_indx+1]

            r = state_reward(next_step)

            x_features = np.array([extract_features(step)[0],extract_features(step)[1]])
            xprim_features = np.array([extract_features(next_step)[0],extract_features(next_step)[1]])

            v = np.matmul(np.transpose(W[str(step)]) , x_features)
            v_prime = np.matmul(np.transpose(W[str(step)]) , xprim_features)

            delta = r + gamma * v_prime - v 
            #print(gamma * Lambda * z)
            #print(np.matmul(z , x_features))
            #print( np.matmul(1 - alpha * gamma * Lambda * np.matmul(np.transpose(z) , x_features) , x_features))
            z = gamma * Lambda * z +\
                  (1 - alpha * gamma * Lambda * np.matmul(np.transpose(z) , x_features)) * x_features
            
            W[str(step)] = W[str(step)] +\
                            alpha * (delta + v - v_old) * z - alpha * (v - v_old) * x_features
            #print(W[str(step)])
            v_old = v_prime
            V[str(step)] = V[str(step)] + np.cos(abs(W[str(step)][0]) + abs(W[str(step)][1]))
            state_observed[str(step)] = state_observed[str(step)] + 1
    



    return V,W

        


In [None]:
true_online_TD_lambda(500, 0.9, policy_0, 0.03, 0.02, 'deterministic')

100%|██████████| 500/500 [00:04<00:00, 106.03it/s]


({'[0, 1]': 9.43536194556366,
  '[0, 2]': 28.276216443317836,
  '[0, 3]': 13.634662721241174,
  '[1, 0]': 36.79849297251622,
  '[1, 1]': -433.0221765091339,
  '[1, 2]': 1.1925062965601612,
  '[1, 3]': -28.51072569144077,
  '[2, 1]': 390.2462587758875,
  '[2, 2]': 72.00122084088336,
  '[2, 3]': 41.42228789851967,
  '[3, 1]': -108.14759638409062,
  '[3, 3]': 1.999935447527856,
  '[4, 0]': -1608.9836268413972,
  '[4, 1]': 219.91119694671116,
  '[4, 2]': 472.81097515783796,
  '[4, 3]': 0},
 {'[0, 1]': array([-6.79903526e+09, -1.26539568e+09]),
  '[0, 2]': array([1.25862686e+12, 3.67348767e+11]),
  '[0, 3]': array([-1.46972616e+09, -3.03282254e+08]),
  '[1, 0]': array([-6.06875029e+09, -3.16127388e+09]),
  '[1, 1]': array([7.85841292e+12, 1.76526918e+12]),
  '[1, 2]': array([-1.10314764e+14, -1.45604276e+15]),
  '[1, 3]': array([9.21434942e+10, 2.71493775e+10]),
  '[2, 1]': array([-7.43474150e+10, -3.66317043e+10]),
  '[2, 2]': array([531.02895986,  -1.26522396]),
  '[2, 3]': array([-391982

## Sarsa($ \lambda $) with binary features and linear function approximation for estimating $W^T X \approx q_{\pi}$ or $q_{*}$

In [7]:
def state_action_nextstate(Q,current_state,epsilon,environment_stochasticity):

    grid_size = environment[0]*environment[1]

    probs = probability_distribution(grid_size,environment_stochasticity)
    #print(probs)

    if type(current_state) == str:

        state = ast.literal_eval(current_state)
    else:
        state = current_state
    #Choose action using policy derived from Q===================================
    value_action_state = reverse_dictionary(Q[str(state)])
    Max_val = max(list(value_action_state.keys()))
    best_action = value_action_state[Max_val]
    best_action = ast.literal_eval(best_action)

    #============================================================================
    #Epsilon Greedy
    if random.uniform(0, 1) > epsilon:

        selected_action = best_action
    
    else:
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]
        Actions.remove(best_action)
        epsilon_action = random.choice(Actions)

        selected_action = epsilon_action
    #============================================================================
    
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]
    Actions.remove(selected_action)
    sorted_actions = Actions + [selected_action]
    state_indice = state_indice_dict[str(state)]
    #print(probs)
    #print(state_indice)
    actions_prob = probs[state_indice]
    actions_prob.sort()
    #due to stochasticity of the environment
    Final_action = random.choices(sorted_actions, actions_prob)[0]
    #print(type(state), type(Final_action))
    
    next_state = [x + y for x, y in zip(state, Final_action)]

    if next_state not in environment[6] or next_state in environment[4]:

        next_state = current_state
    
    value_action_state = reverse_dictionary(Q[str(next_state)])
    #max Q(s',s)
    Max_q_val = max(list(value_action_state.keys()))
    best_action = value_action_state[Max_q_val]
    best_action = ast.literal_eval(best_action)

    return Final_action, next_state, Max_q_val
    
def active_features(state):

    actives = []

    feature = extract_features(state)

    for i in range(2):

        if feature[i] != environment[3][i]:

            actives.append(i)
    
    return actives

def sarsa_lambda(num_trials, gamma, Lambda, alpha, environment_stochasticity,epsilon):

    W  = {}
    for state in environment[6]:

        if state not in environment[4]:

            Features = extract_features(state)
            W[str(state)] =  {}
            
            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                W[str(state)][action] = [Features[0] + random.uniform(1e-9, 1e-8),Features[1] + random.uniform(1e-9, 1e-8)]
    

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)

    
    for trial in tqdm(range(num_trials)):

        Z = {}
        for state in environment[6]:

            if state not in environment[4]:

                Z[str(state)] = {}
                
                for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                    Z[str(state)][action] = [random.uniform(1e-9, 1e-8),random.uniform(1e-9, 1e-8)]


        current_state = environment[2] #start

        while current_state != environment[3]:

            epsilon_policy = state_action_nextstate(Q,current_state,epsilon,environment_stochasticity)
            action = epsilon_policy[0]
            next_state = epsilon_policy[1]
            
            r = state_reward(next_state)
            delta = r

            ActiveFeatures = active_features(current_state)

            for i in ActiveFeatures:

                delta = delta - W[str(current_state)][str(action)][i]
                Z[str(current_state)][str(action)][i] = Z[str(current_state)][str(action)][i] + 1
            
            if next_state == environment[3]:

                for i in range(2):

                    W[str(current_state)][str(action)][i] = W[str(current_state)][str(action)][i] +\
                    alpha * alpha * Z[str(current_state)][str(action)][i]
            

            NextActiveFeatures = active_features(next_state)

            for i in NextActiveFeatures:

                delta = delta - gamma * W[str(next_state)][str(action)][i]

                #print(i,type(W[str(next_state)][i]), type(Z[str(next_state)][i]))
                #print( alpha * delta * Z[str(next_state)][i])
            for i in NextActiveFeatures:

                W[str(next_state)][str(action)][i] = W[str(next_state)][str(action)][i] + alpha * delta * Z[str(next_state)][str(action)][i]

                Z[str(next_state)][str(action)][i] = gamma * Lambda * Z[str(next_state)][str(action)][i]

        
            Q[str(current_state)][str(action)] = np.cos(abs(W[str(current_state)][str(action)][0]) +\
                                                abs(W[str(current_state)][str(action)][1]))
            
            current_state = next_state
    
    return Q


In [10]:
sarsa_lambda(100, 0.9, 0.5, 0.2, 'deterministic',0.1)

  0%|          | 0/1000 [00:00<?, ?it/s]

  Q[str(current_state)][str(action)] = np.cos(abs(W[str(current_state)][str(action)][0]) +\
100%|██████████| 1000/1000 [24:25<00:00,  1.47s/it] 


{'[0, 1]': {'[1, 0]': 0.16996713642209252,
  '[-1, 0]': nan,
  '[0, 1]': 0.16996713184861328,
  '[0, -1]': 0.825335615063446},
 '[0, 2]': {'[1, 0]': 0.36235774007940674,
  '[-1, 0]': nan,
  '[0, 1]': -0.8893265739110678,
  '[0, -1]': -0.6063209331792516},
 '[0, 3]': {'[1, 0]': 0.5403023007241335,
  '[-1, 0]': 0.693904951076342,
  '[0, 1]': 0.5403022785946978,
  '[0, -1]': 0.5403022955776547},
 '[1, 0]': {'[1, 0]': 0.8646637019492132,
  '[-1, 0]': 0.8646637019492132,
  '[0, 1]': 0.21900667727102666,
  '[0, -1]': -0.3045487830204501},
 '[1, 1]': {'[1, 0]': -0.8893265715530619,
  '[-1, 0]': nan,
  '[0, 1]': -0.862563540369753,
  '[0, -1]': -0.9710976940697086},
 '[1, 2]': {'[1, 0]': -0.766676259804487,
  '[-1, 0]': nan,
  '[0, 1]': -0.6191603902293098,
  '[0, -1]': -0.3650142742096699},
 '[1, 3]': {'[1, 0]': -0.6063209277752457,
  '[-1, 0]': 0.36535032369881004,
  '[0, 1]': 0.08772729918717495,
  '[0, -1]': 0.7316888581537256},
 '[2, 1]': {'[1, 0]': -0.27736992032089147,
  '[-1, 0]': 0.81

## True online Sarsa($ \lambda $) for estimating $W^T X \approx q_{\pi}$ or $q_{*}$

In [18]:
def greedy_w(W,current_state,epsilon,environment_stochasticity):

    grid_size = environment[0]*environment[1]

    probs = probability_distribution(grid_size,environment_stochasticity)
    new_w = {}
    for state in environment[6]:

        if state not in environment[4]:

            new_w[str(state)] =  {}
            
            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                new_w[str(state)][action] = np.cos(abs(W[str(state)][str(action)][0]) +\
                                                abs(W[str(state)][str(action)][1]))
    #print(probs)

    if type(current_state) == str:

        state = ast.literal_eval(current_state)
    else:
        state = current_state
    
    #Choose action using policy derived from Q===================================
    value_action_state = reverse_dictionary(new_w[str(state)])
    Max_val = max(list(value_action_state.keys()))
    best_action = value_action_state[Max_val]
    best_action = ast.literal_eval(best_action)

    #============================================================================
    #Epsilon Greedy
    if random.uniform(0, 1) > epsilon:

        selected_action = best_action
    
    else:
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]
        Actions.remove(best_action)
        epsilon_action = random.choice(Actions)

        selected_action = epsilon_action
    #============================================================================
    
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]
    Actions.remove(selected_action)
    sorted_actions = Actions + [selected_action]
    state_indice = state_indice_dict[str(state)]
    #print(probs)
    #print(state_indice)
    actions_prob = probs[state_indice]
    actions_prob.sort()
    #due to stochasticity of the environment
    Final_action = random.choices(sorted_actions, actions_prob)[0]
    #print(type(state), type(Final_action))
    
    next_state = [x + y for x, y in zip(state, Final_action)]

    if next_state not in environment[6] or next_state in environment[4]:

        next_state = current_state
    
    value_action_state = reverse_dictionary(new_w[str(state)])
    #max Q(s',s)
    Max_q_val = max(list(value_action_state.keys()))
    best_action = value_action_state[Max_q_val]
    best_action = ast.literal_eval(best_action)

    return Final_action, next_state, Max_q_val


def true_online_sarsa(num_trials, gamma, Lambda, alpha, environment_stochasticity,epsilon):

    W  = {}
    for state in environment[6]:

        if state not in environment[4]:

            Features = extract_features(state)
            W[str(state)] =  {}
            
            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                W[str(state)][action] = np.array([Features[0] + random.uniform(1e-9, 1e-8),Features[1] + random.uniform(1e-9, 1e-8)])
    

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    
    
    for trial in tqdm(range(num_trials)):

        

        Z = {}
        for state in environment[6]:

            if state not in environment[4]:

                Z[str(state)] = {}
                
                for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                    Z[str(state)][action] = np.array([0,0])
        
        Q_old = {}
        for state in environment[6]:

            if state not in environment[4]:

                Q_old[str(state)] = {}
                
                for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                    Q_old[str(state)][action] = 0
        
        Q_prime = {}
        for state in environment[6]:

            if state not in environment[4]:

                Q_prime[str(state)] = {}
                
                for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                    Q_prime[str(state)][action] = 0
        
        current_state = environment[2] #start
        
        while current_state != environment[3]:

            

            greedy_policy = greedy_w(W,current_state,epsilon,environment_stochasticity)
            action = greedy_policy[0]
            next_state = greedy_policy[1]
            #print(current_state,action)

            x = np.array([extract_features(next_state)[0],extract_features(next_state)[1]])

            next_next_state = greedy_w(W,next_state,epsilon,environment_stochasticity)[1]

            x_prime = np.array([extract_features(next_next_state)[0],extract_features(next_next_state)[1]])

            Q[str(current_state)][str(action)] = np.matmul(np.transpose(W[str(current_state)][str(action)]),x)
            Q_prime[str(current_state)][str(action)] = np.matmul(np.transpose(W[str(current_state)][str(action)]),x_prime)
            
            r = state_reward(next_state)
            delta = r + gamma * Q_prime[str(current_state)][str(action)] - Q[str(current_state)][str(action)]

            Z[str(current_state)][str(action)] = gamma * Lambda * Z[str(current_state)][str(action)] +\
                ((1 - alpha * gamma * Lambda * np.matmul(np.transpose(Z[str(current_state)][str(action)]) , x)) * x)

            W[str(current_state)][str(action)] = W[str(current_state)][str(action)] +\
                alpha * (delta + Q[str(current_state)][str(action)] - Q_old[str(current_state)][str(action)]) * Z[str(current_state)][str(action)]\
                     - alpha * (Q[str(current_state)][str(action)] - Q_old[str(current_state)][str(action)]) * x

            Q_old[str(current_state)][str(action)] = Q_prime[str(current_state)][str(action)]

            current_state = next_state
            x = x_prime
            #print(current_state)

    

    return Q

In [21]:
true_online_sarsa(5000, 0.9, 0.5, 0.2, 'deterministic',0.2)

  0%|          | 0/5000 [00:00<?, ?it/s]

100%|██████████| 5000/5000 [01:03<00:00, 78.57it/s] 


{'[0, 1]': {'[1, 0]': -76.89262328227105,
  '[-1, 0]': -7.561396121660657,
  '[0, 1]': -7.903332838177607,
  '[0, -1]': -7.628312257172707},
 '[0, 2]': {'[1, 0]': -9.974782542185366,
  '[-1, 0]': -7.944849854234295,
  '[0, 1]': -4.633836947346454,
  '[0, -1]': -6.895198670678157},
 '[0, 3]': {'[1, 0]': -5.304753784834761,
  '[-1, 0]': -7.626240113567025,
  '[0, 1]': -7.499625973759261,
  '[0, -1]': -8.363630903980031},
 '[1, 0]': {'[1, 0]': -9.635010951461,
  '[-1, 0]': -9.431716817872601,
  '[0, 1]': -20.288480998326,
  '[0, -1]': -9.352078423176987},
 '[1, 1]': {'[1, 0]': -10.280961714230699,
  '[-1, 0]': -6.706821101653243,
  '[0, 1]': -10.582696582646017,
  '[0, -1]': -7.7027253978061285},
 '[1, 2]': {'[1, 0]': -6.14846605195029,
  '[-1, 0]': -7.315489855670303,
  '[0, 1]': -5.044813393371052,
  '[0, -1]': -17.900661163748822},
 '[1, 3]': {'[1, 0]': -2.483009231300176,
  '[-1, 0]': -4.64324189342688,
  '[0, 1]': -10.542516465922022,
  '[0, -1]': -8.885592278498086},
 '[2, 1]': {'[1