In [1]:
import numpy as np
import random
from tabulate import tabulate
from tqdm import tqdm
import ast

In [2]:
def generate_grid_world(length, width,path_lenght,holes_number,Random_State):
    
    random.seed(Random_State)
    #store all cells in a list
    Grid_Cells = []
    for row in range(length):
        for col in range(width):
            Grid_Cells.append([row,col])


    #specify the number of holes in the gridworld
    
    #specify the start point as a random cell
    start = [random.randint(0, length), random.randint(0, width)]

    #create a path from start point
    """instead of defining start and goal points,
      we define just a start point and a random path with a random lenght to
       another point and name it as goal point"""
    
    def random_path(Start, Path_Lenght,length, width):
        
        Path = []
        Path.append(Start)
        for i in range(Path_Lenght):
            
            #there are two moves that take us on a random cell named Goal [1,0], [0,1]
            
            move = random.choice([[1,0], [0,1]])
            
            #update the start cell/point by the above move
            Start = [x + y for x, y in zip(Start, move)]
            
            #if the movement take us out of our gridworld, we reverse the change in the start point
            if Start[0] < 0 or Start[1] < 0 or Start[0] > length-1 or Start[1] > width-1:

                Start = [x - y for x, y in zip(Start, move)]

            else:
                
                #create a path history
                Path.append(Start)

        Goal = Start

        return Goal,Path
    

    GoalPath = random_path(start, path_lenght,length, width)

    goal = GoalPath[0]
    path = GoalPath[1]

    #now we must eliminate the path cells from the Grid_Cells to choose hole cells from remaining cells

    FreeCells = [x for x in Grid_Cells if x not in path]

    Holes = random.sample(FreeCells, holes_number)

    #Also, we can visualize our gridworld in a simple way

    def mark_holes(holes):
        marked_data = [["Hole" if [row, col] in holes else [row, col] for col in range(width)] for row in range(length)]
        return marked_data
    
    marked_matrix = mark_holes(Holes)

    print(tabulate(marked_matrix, tablefmt="grid"))

    
    return length, width, start, goal, Holes, path,Grid_Cells

In [3]:
#environment = generate_grid_world(50, 40,1300,400,39)
environment = generate_grid_world(5, 4,4,4,39)

environment

+--------+--------+--------+--------+
| Hole   | [0, 1] | [0, 2] | [0, 3] |
+--------+--------+--------+--------+
| [1, 0] | [1, 1] | [1, 2] | [1, 3] |
+--------+--------+--------+--------+
| Hole   | [2, 1] | [2, 2] | [2, 3] |
+--------+--------+--------+--------+
| Hole   | [3, 1] | Hole   | [3, 3] |
+--------+--------+--------+--------+
| [4, 0] | [4, 1] | [4, 2] | [4, 3] |
+--------+--------+--------+--------+


(5,
 4,
 [1, 2],
 [4, 3],
 [[2, 0], [3, 2], [3, 0], [0, 0]],
 [[1, 2], [1, 3], [2, 3], [3, 3], [4, 3]],
 [[0, 0],
  [0, 1],
  [0, 2],
  [0, 3],
  [1, 0],
  [1, 1],
  [1, 2],
  [1, 3],
  [2, 0],
  [2, 1],
  [2, 2],
  [2, 3],
  [3, 0],
  [3, 1],
  [3, 2],
  [3, 3],
  [4, 0],
  [4, 1],
  [4, 2],
  [4, 3]])

In [4]:
def probability_distribution(grid_size,randomness):
    #random.seed(40)
    
    #by this function we generate probabilities which their sum is equal to 1
    def generate_probabilities(n):

        numbers = [random.random() for _ in range(n)]
        total_sum = sum(numbers)
        scaled_numbers = [num / total_sum for num in numbers]
        
        return scaled_numbers
    
    cells_prob = {}
    if randomness == 'stochastic':
        for cell in range(grid_size):
            
            #we set the number of probs to 4 due to 4 possible action for each cell (go to its neighbors)
            probs = generate_probabilities(4)

            cells_prob[cell] = probs
    elif randomness == 'equal probable':

        for cell in range(grid_size):

            cells_prob[cell] = [0.25,0.25,0.25,0.25]
    
    elif randomness == 'deterministic':
        for cell in range(grid_size):

            cells_prob[cell] = [0.03,0.06,0.01,0.9] #[0,0,0,1] ##[0.15,.15,0.1,0.6]


    #Note that we consider the correspondence between probabilities and actions as below:
    #probs = [p1, p2, p3, p4] ---> [[1,0],[-1,0],[0,1],[0,-1]]

    return cells_prob

def neighbor_cells(cell):

    grid_cells = environment[6]
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]

    Neighbors = []
    Actions_Neighbors = []
    for action in Actions:

        neighbor = [x + y for x, y in zip(cell, action)]
        #if neighbor not in environment[4]:
        Neighbors.append(neighbor)
        Actions_Neighbors.append(action)

    return Neighbors, Actions_Neighbors

def arbitrary_policy(randomness):

        #random.seed(randomness)
        
    policy = {}
    policy_action = {}
    for state in environment[6]:

        if state not in environment[4]:

            neighbors = neighbor_cells(state)[0]
            Actions_Neighbors = neighbor_cells(state)[1]

            allowed_positions = []

            for neighbor in neighbors:
                
                if neighbor in environment[6] and neighbor not in environment[4]:
                    
                    allowed_positions.append(neighbor)
            
            if len(allowed_positions) > 0:
                
                next_state = random.choice(allowed_positions)
                row = next_state[0] - state[0]
                col = next_state[1] - state[1]
                PolicyAction = [row, col]

                policy['{}'.format(state)] = next_state
                policy_action['{}'.format(state)] = PolicyAction



    return policy, policy_action

def state_reward(next_state):

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 10
    
    elif next_state not in environment[6]:

        r = -2
    
    else:

        r = -1
    
    return r

def reverse_dictionary(dict):
    reverse_dict = {}
    for key in list(dict.keys()):
        val = dict[key]
        reverse_dict[val] = key
    return reverse_dict


state_indice_dict = {}
counter = 0
for state in environment[6]:

    state = str(state)
    state_indice_dict[state] = counter
    counter = counter + 1

def generate_trajectory(policy,randomness,environment_stochasticity):

    policy_action = policy[1]
    probs = probability_distribution(environment[0]*environment[1],environment_stochasticity)
    start = environment[2]
    terminate = start
    trajectory = []
    pure_trajectory = [start]
    c = 0
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]
        action = policy_action[str(terminate)]
        Actions.remove(action)
        sorted_actions = Actions + [action]
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()

        selected_action = random.choices(sorted_actions, actions_prob)[0]
        current_state = terminate
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        pure_trajectory.append(next_state)
        
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:
            next_state = terminate
        
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:
            next_state = start  

        terminate = next_state
        trajectory.append((current_state))
        c = c+1
    
    trajectory.append((environment[3]))
    pure_trajectory.append(environment[3])

    return trajectory,pure_trajectory

def extract_features(state):

    goal = environment[3]
    max_length = environment[0]
    max_width = environment[1]

    w1 = (goal[0] - state[0]) / max_width
    w2 = (goal[1] - state[1]) / max_length

    return abs(w1), abs(w2)

## Episodic Semi-gradient Sarsa for Estimating $\hat{q} \approx q_{*}$

We define the $\hat{q}$ as th following:

$ \hat{q}(S,A,length, width) = Cos(|w_{1}^{s'}| + |w_{2}^{s'}|)$

$transition(S,A) = S'$

In [5]:
def state_action_nextstate(Q,current_state,epsilon,environment_stochasticity):

    grid_size = environment[0]*environment[1]

    probs = probability_distribution(grid_size,environment_stochasticity)
    #print(probs)

    if type(current_state) == str:

        state = ast.literal_eval(current_state)
    else:
        state = current_state
    #Choose action using policy derived from Q===================================
    value_action_state = reverse_dictionary(Q[str(state)])
    Max_val = max(list(value_action_state.keys()))
    best_action = value_action_state[Max_val]
    best_action = ast.literal_eval(best_action)

    #============================================================================
    #Epsilon Greedy
    if random.uniform(0, 1) > epsilon:

        selected_action = best_action
    
    else:
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]
        Actions.remove(best_action)
        epsilon_action = random.choice(Actions)

        selected_action = epsilon_action
    #============================================================================
    
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]
    Actions.remove(selected_action)
    sorted_actions = Actions + [selected_action]
    state_indice = state_indice_dict[str(state)]
    #print(probs)
    #print(state_indice)
    actions_prob = probs[state_indice]
    actions_prob.sort()
    #due to stochasticity of the environment
    Final_action = random.choices(sorted_actions, actions_prob)[0]
    #print(type(state), type(Final_action))
    
    next_state = [x + y for x, y in zip(state, Final_action)]

    if next_state not in environment[6] or next_state in environment[4]:

        next_state = current_state
    
    value_action_state = reverse_dictionary(Q[str(next_state)])
    #max Q(s',s)
    Max_q_val = max(list(value_action_state.keys()))
    best_action = value_action_state[Max_q_val]
    best_action = ast.literal_eval(best_action)

    return Final_action, next_state, Max_q_val

def semi_gradient_sarsa(num_trials, gamma, alpha, environment_stochasticity,epsilon):

    W = {}
    for state in environment[6]:

        if state not in environment[4]:

            W[str(state)] = {}

            Features = extract_features(state)
            for action in  ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:
            
                W[str(state)][action] = {}

                for element in [0,1]:
                    
                    W[str(state)][action][element] = Features[element] + random.uniform(1e-9, 1e-8)
                    #W[str(state)][action][element] = Features[1] + random.uniform(1e-9, 1e-8)


    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    

    for trial in tqdm(range(num_trials)):

        current_state = environment[2] #start

        while current_state != environment[3]:

            epsilon_policy = state_action_nextstate(Q,current_state,epsilon,environment_stochasticity)
            action = epsilon_policy[0]
            next_state = epsilon_policy[1]

            if next_state != environment[3]:

                next_epsilon_policy = state_action_nextstate(Q,next_state,epsilon,environment_stochasticity)

                next_next_state = next_epsilon_policy[1]
                next_next_action = next_epsilon_policy[0]

                r = state_reward(next_state)
                #print(str(next_state),str(action))

                gradient_w1 = -np.sin(abs(W[str(next_state)][str(action)][0]) + abs(W[str(next_state)][str(action)][1])) * (W[str(next_state)][str(action)][0]/abs(W[str(next_state)][str(action)][0]))
                gradient_w2 = -np.sin(abs(W[str(next_state)][str(action)][1]) + abs(W[str(next_state)][str(action)][0])) * (W[str(next_state)][str(action)][1]/abs(W[str(next_state)][str(action)][1]))

                W[str(current_state)][str(action)][0] = W[str(current_state)][str(action)][0] +\
                        alpha * (r + gamma * np.cos(abs(W[str(next_next_state)][str(next_next_action)][0]) +\
                                                    abs(W[str(next_next_state)][str(next_next_action)][1]))\
                                                            - np.cos(abs(W[str(next_state)][str(action)][0])\
                                                                    + abs(W[str(next_state)][str(action)][1]))) * gradient_w1
                
                W[str(current_state)][str(action)][1] = W[str(current_state)][str(action)][1] +\
                        alpha * (r + gamma * np.cos(abs(W[str(next_next_state)][str(next_next_action)][0]) +\
                                                    abs(W[str(next_next_state)][str(next_next_action)][1]))\
                                                            - np.cos(abs(W[str(next_state)][str(action)][0])\
                                                                    + abs(W[str(next_state)][str(action)][1]))) * gradient_w2
                
                Q[str(current_state)][str(action)]= np.cos(abs(W[str(current_state)][str(action)][0]) +\
                                                    abs(W[str(current_state)][str(action)][1]))
            
            
                current_state = next_state

            else:

                r = state_reward(next_state)

                W[str(current_state)][str(action)][0] = W[str(current_state)][str(action)][0] +\
                        alpha * (r - np.cos(abs(W[str(next_state)][str(action)][0])\
                                                                    + abs(W[str(next_state)][str(action)][1]))) * gradient_w1
                
                W[str(current_state)][str(action)][1] = W[str(current_state)][str(action)][1] +\
                        alpha * (r - np.cos(abs(W[str(next_state)][str(action)][0])\
                                                                    + abs(W[str(next_state)][str(action)][1]))) * gradient_w2
                
                Q[str(current_state)][str(action)]= np.cos(abs(W[str(current_state)][str(action)][0]) +\
                                                    abs(W[str(current_state)][str(action)][1]))
                
                current_state = environment[3]
                
    return Q


In [11]:
semi_gradient_sarsa(10000, 0.9, 0.3, 'deterministic',0.1)

100%|██████████| 10000/10000 [01:10<00:00, 141.56it/s]


{'[0, 1]': {'[1, 0]': 0.9781749331689938,
  '[-1, 0]': -1.0,
  '[0, 1]': -0.9999839916661791,
  '[0, -1]': -1.0},
 '[0, 2]': {'[1, 0]': -0.9891755156230866,
  '[-1, 0]': -1.0,
  '[0, 1]': -0.988363105536931,
  '[0, -1]': -0.9303564697766723},
 '[0, 3]': {'[1, 0]': 0.8725937141109141,
  '[-1, 0]': -1.0,
  '[0, 1]': -1.0,
  '[0, -1]': -0.987659574310174},
 '[1, 0]': {'[1, 0]': -1.0,
  '[-1, 0]': -1.0,
  '[0, 1]': 0.24324264122732556,
  '[0, -1]': -1.0},
 '[1, 1]': {'[1, 0]': -0.8464676801958303,
  '[-1, 0]': 0.8238174283486567,
  '[0, 1]': -0.4814183171652369,
  '[0, -1]': 0.9908926585675624},
 '[1, 2]': {'[1, 0]': -0.4715663205872317,
  '[-1, 0]': 0.3351335627755793,
  '[0, 1]': 0.5520809144764132,
  '[0, -1]': -0.39550628558123435},
 '[1, 3]': {'[1, 0]': -0.8472215532300971,
  '[-1, 0]': -0.9666420716245767,
  '[0, 1]': -1.0,
  '[0, -1]': -0.7248687770387721},
 '[2, 1]': {'[1, 0]': 0.055297060047729656,
  '[-1, 0]': -0.8842754314499628,
  '[0, 1]': -0.8960945080699185,
  '[0, -1]': -1.

## Episodic Semi-gradient n-step Sarsa for Estimating $\hat{q} \approx q_{*}$ or $q_{\pi}$

In [6]:
def semi_gradient_n_step_sarsa(num_trials, n, gamma, alpha, environment_stochasticity,epsilon):
    
    grid_size = environment[0]*environment[1]

    probs = probability_distribution(grid_size,environment_stochasticity)

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    W = {}
    for state in environment[6]:

        if state not in environment[4]:

            W[str(state)] = {}

            Features = extract_features(state)
            for action in  ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:
            
                W[str(state)][action] = {}

                for element in [0,1]:
                    
                    W[str(state)][action][element] = Features[element] + random.uniform(1e-9, 1e-8)
       

    state_t  = environment[2]

    for trial in tqdm(range(num_trials)):
        
        T = float('inf')
        tau = 0
        t = -1
        trajectory = []
        state_t  = environment[2]
        while tau != T - 1:

            t = t + 1

            if t < T:
                trajectory.append(state_t)
                next_state = state_action_nextstate(Q,state_t,epsilon,environment_stochasticity)[1]
                #last_state = state_t
                state_t = next_state
                

                if next_state == environment[3]:
                    T = t + 1
                
                #else:
                #    next_action = 

            
            tau = t - n + 1

            if tau >= 0:

                G = 0
                
                state_tau = trajectory[tau]
                state_i = state_tau
                action_tau = state_action_nextstate(Q,state_t,epsilon,environment_stochasticity)[0]

                action_ii = 0
                for i in range(tau+1, min(tau+n,T)+1):

                    #actionTN = action_ii

                    state_ii = state_action_nextstate(Q,state_i,epsilon,environment_stochasticity)[1]
                    action_ii = state_action_nextstate(Q,state_i,epsilon,environment_stochasticity)[0]

                    #stateTN = state_i #s_(t+n) this is store for the next part
                     

                    #n_state_action.append((state_ii, action_ii))

                    #state_i = state_ii
                    r = state_reward(state_ii)

                    G = G + (gamma ** (i-tau-1)) * r

                    if i < min(tau+n,T):

                        state_i = state_ii

                if tau + n < T:

                    stateTN = state_action_nextstate(Q,state_ii,epsilon,environment_stochasticity)[1]
                    actionTN = state_action_nextstate(Q,state_ii,epsilon,environment_stochasticity)[0]

                    Q[str(stateTN)][str(actionTN)]= np.cos(abs(W[str(stateTN)][str(actionTN)][0]) +\
                                                    abs(W[str(stateTN)][str(actionTN)][1]))
            

                    G = G + (gamma ** n) * Q[str(stateTN)][str(actionTN)]

                if state_tau not in environment[4] and state_tau != environment[3]:

                    gradient_w1 = -np.sin(abs(W[str(state_tau)][str(action_tau)][0]) + abs(W[str(state_tau)][str(action_tau)][1])) * (W[str(state_tau)][str(action_tau)][0]/abs(W[str(state_tau)][str(action_tau)][0]))
                    gradient_w2 = -np.sin(abs(W[str(state_tau)][str(action_tau)][1]) + abs(W[str(state_tau)][str(action_tau)][0])) * (W[str(state_tau)][str(action_tau)][1]/abs(W[str(state_tau)][str(action_tau)][1]))


                    W[str(state_tau)][str(action_tau)][0] = W[str(state_tau)][str(action_tau)][0] +\
                        alpha * (G - np.cos(abs(W[str(state_tau)][str(action_tau)][0])\
                                                                    + abs(W[str(state_tau)][str(action_tau)][1]))) * gradient_w1
                
                    W[str(state_tau)][str(action_tau)][1] = W[str(state_tau)][str(action_tau)][1] +\
                            alpha * (G - np.cos(abs(W[str(state_tau)][str(action_tau)][0])\
                                                                        + abs(W[str(state_tau)][str(action_tau)][1]))) * gradient_w2

                    Q[str(state_tau)][str(action_tau)]= np.cos(abs(W[str(state_tau)][str(action_tau)][0]) +\
                                    abs(W[str(state_tau)][str(action_tau)][1]))
                

    del Q[str(environment[3])]
    return Q


In [8]:
semi_gradient_n_step_sarsa(1000, 3 , 0.9, 0.6, 'deterministic',0.1)

100%|██████████| 1000/1000 [00:45<00:00, 22.22it/s]


{'[0, 1]': {'[1, 0]': -0.034443928007809196,
  '[-1, 0]': -0.24909415084215505,
  '[0, 1]': -0.2805626049969402,
  '[0, -1]': -0.3111611637393039},
 '[0, 2]': {'[1, 0]': -0.7562896506966093,
  '[-1, 0]': -0.6035575958093825,
  '[0, 1]': -0.5978991772482377,
  '[0, -1]': -0.49361281547407093},
 '[0, 3]': {'[1, 0]': -0.3643275266999025,
  '[-1, 0]': -0.1661413778654941,
  '[0, 1]': -0.5322585666758011,
  '[0, -1]': -0.4934189041612724},
 '[1, 0]': {'[1, 0]': -0.7060047938234573,
  '[-1, 0]': -0.22256469365579887,
  '[0, 1]': -0.3030567626188142,
  '[0, -1]': -0.5003478612925795},
 '[1, 1]': {'[1, 0]': 0.2612936265093631,
  '[-1, 0]': 0.07196785804429534,
  '[0, 1]': -0.5576511613089304,
  '[0, -1]': -0.6047674091253425},
 '[1, 2]': {'[1, 0]': -0.9816590878242809,
  '[-1, 0]': -0.32819606538293233,
  '[0, 1]': -0.017935060714822402,
  '[0, -1]': -0.46912195419634517},
 '[1, 3]': {'[1, 0]': -0.2751904504453935,
  '[-1, 0]': -0.6336432984814608,
  '[0, 1]': -0.9051393605497231,
  '[0, -1]':