In [1]:
import numpy as np
import random
import math
from tabulate import tabulate
from tqdm import tqdm
import ast

In [2]:
def generate_grid_world(length, width,path_lenght,holes_number,Random_State):
    
    random.seed(Random_State)
    #store all cells in a list
    Grid_Cells = []
    for row in range(length):
        for col in range(width):
            Grid_Cells.append([row,col])


    #specify the number of holes in the gridworld
    
    #specify the start point as a random cell
    start = [random.randint(0, length), random.randint(0, width)]

    #create a path from start point
    """instead of defining start and goal points,
      we define just a start point and a random path with a random lenght to
       another point and name it as goal point"""
    
    def random_path(Start, Path_Lenght,length, width):
        
        Path = []
        Path.append(Start)
        for i in range(Path_Lenght):
            
            #there are two moves that take us on a random cell named Goal [1,0], [0,1]
            
            move = random.choice([[1,0], [0,1]])
            
            #update the start cell/point by the above move
            Start = [x + y for x, y in zip(Start, move)]
            
            #if the movement take us out of our gridworld, we reverse the change in the start point
            if Start[0] < 0 or Start[1] < 0 or Start[0] > length-1 or Start[1] > width-1:

                Start = [x - y for x, y in zip(Start, move)]

            else:
                
                #create a path history
                Path.append(Start)

        Goal = Start

        return Goal,Path
    

    GoalPath = random_path(start, path_lenght,length, width)

    goal = GoalPath[0]
    path = GoalPath[1]

    #now we must eliminate the path cells from the Grid_Cells to choose hole cells from remaining cells

    FreeCells = [x for x in Grid_Cells if x not in path]

    Holes = random.sample(FreeCells, holes_number)

    #Also, we can visualize our gridworld in a simple way

    def mark_holes(holes):
        marked_data = [["Hole" if [row, col] in holes else [row, col] for col in range(width)] for row in range(length)]
        return marked_data
    
    marked_matrix = mark_holes(Holes)

    print(tabulate(marked_matrix, tablefmt="grid"))

    
    return length, width, start, goal, Holes, path,Grid_Cells

In [3]:
#environment = generate_grid_world(50, 40,1300,400,39)
environment = generate_grid_world(5, 4,4,4,39)

environment

+--------+--------+--------+--------+
| Hole   | [0, 1] | [0, 2] | [0, 3] |
+--------+--------+--------+--------+
| [1, 0] | [1, 1] | [1, 2] | [1, 3] |
+--------+--------+--------+--------+
| Hole   | [2, 1] | [2, 2] | [2, 3] |
+--------+--------+--------+--------+
| Hole   | [3, 1] | Hole   | [3, 3] |
+--------+--------+--------+--------+
| [4, 0] | [4, 1] | [4, 2] | [4, 3] |
+--------+--------+--------+--------+


(5,
 4,
 [1, 2],
 [4, 3],
 [[2, 0], [3, 2], [3, 0], [0, 0]],
 [[1, 2], [1, 3], [2, 3], [3, 3], [4, 3]],
 [[0, 0],
  [0, 1],
  [0, 2],
  [0, 3],
  [1, 0],
  [1, 1],
  [1, 2],
  [1, 3],
  [2, 0],
  [2, 1],
  [2, 2],
  [2, 3],
  [3, 0],
  [3, 1],
  [3, 2],
  [3, 3],
  [4, 0],
  [4, 1],
  [4, 2],
  [4, 3]])

In [4]:
def probability_distribution(grid_size,randomness):
    #random.seed(40)
    
    #by this function we generate probabilities which their sum is equal to 1
    def generate_probabilities(n):

        numbers = [random.random() for _ in range(n)]
        total_sum = sum(numbers)
        scaled_numbers = [num / total_sum for num in numbers]
        
        return scaled_numbers
    
    cells_prob = {}
    if randomness == 'stochastic':
        for cell in range(grid_size):
            
            #we set the number of probs to 4 due to 4 possible action for each cell (go to its neighbors)
            probs = generate_probabilities(4)

            cells_prob[cell] = probs
    elif randomness == 'equal probable':

        for cell in range(grid_size):

            cells_prob[cell] = [0.25,0.25,0.25,0.25]
    
    elif randomness == 'deterministic':
        for cell in range(grid_size):

            cells_prob[cell] = [0.03,0.06,0.01,0.9] #[0,0,0,1] ##[0.15,.15,0.1,0.6]


    #Note that we consider the correspondence between probabilities and actions as below:
    #probs = [p1, p2, p3, p4] ---> [[1,0],[-1,0],[0,1],[0,-1]]

    return cells_prob

def neighbor_cells(cell):

    grid_cells = environment[6]
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]

    Neighbors = []
    Actions_Neighbors = []
    for action in Actions:

        neighbor = [x + y for x, y in zip(cell, action)]
        #if neighbor not in environment[4]:
        Neighbors.append(neighbor)
        Actions_Neighbors.append(action)

    return Neighbors, Actions_Neighbors

def arbitrary_policy(randomness):

        #random.seed(randomness)
        
    policy = {}
    policy_action = {}
    for state in environment[6]:

        if state not in environment[4]:

            neighbors = neighbor_cells(state)[0]
            Actions_Neighbors = neighbor_cells(state)[1]

            allowed_positions = []

            for neighbor in neighbors:
                
                if neighbor in environment[6] and neighbor not in environment[4]:
                    
                    allowed_positions.append(neighbor)
            
            if len(allowed_positions) > 0:
                
                next_state = random.choice(allowed_positions)
                row = next_state[0] - state[0]
                col = next_state[1] - state[1]
                PolicyAction = [row, col]

                policy['{}'.format(state)] = next_state
                policy_action['{}'.format(state)] = PolicyAction



    return policy, policy_action

def state_reward(next_state):

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 10
    
    elif next_state not in environment[6]:

        r = -2
    
    else:

        r = -1
    
    return r

def reverse_dictionary(dict):
    reverse_dict = {}
    for key in list(dict.keys()):
        val = dict[key]
        reverse_dict[val] = key
    return reverse_dict


state_indice_dict = {}
counter = 0
for state in environment[6]:

    state = str(state)
    state_indice_dict[state] = counter
    counter = counter + 1

def generate_trajectory(policy,randomness,environment_stochasticity):

    policy_action = policy[1]
    probs = probability_distribution(environment[0]*environment[1],environment_stochasticity)
    start = environment[2]
    terminate = start
    trajectory = []
    trajectory_actions = []
    pure_trajectory = [start]
    c = 0
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]
        action = policy_action[str(terminate)]
        Actions.remove(action)
        sorted_actions = Actions + [action]
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()

        selected_action = random.choices(sorted_actions, actions_prob)[0]

        trajectory_actions.append(selected_action)
        current_state = terminate
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        pure_trajectory.append(next_state)
        
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:
            next_state = terminate
        
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:
            next_state = start  

        terminate = next_state
        trajectory.append((current_state))
        c = c+1
    
    trajectory.append((environment[3]))
    pure_trajectory.append(environment[3])

    return trajectory,trajectory_actions

def extract_features(state):

    goal = environment[3]
    max_length = environment[0]
    max_width = environment[1]

    w1 = (goal[0] - state[0]) / max_width
    w2 = (goal[1] - state[1]) / max_length

    return abs(w1), abs(w2)

In [5]:
def H_theta(environment):
    
    policy = {}
    policy_action = {}

    for state in environment[6]:

        if state != environment[3] and state not in environment[4]:

            Neighbors = neighbor_cells(state)

            Distances = {}

            for neighbor in Neighbors[0]:

                if neighbor not in environment[4] and neighbor in environment[6]:

                    distance = np.cos(extract_features(neighbor)[0]+extract_features(neighbor)[1])
                    Distances[distance] = neighbor
            
            #closest to the terminate state
            if list(Distances.keys()) != []:

                best_neighbor = Distances[max(list(Distances.keys()))]
            
            else:
                best_neighbor = state

            policy[str(state)] = best_neighbor

            row = best_neighbor[0] - state[0]
            col = best_neighbor[1] - state[1]
            PolicyAction = [row,col]
            policy_action[str(state)] = PolicyAction
    

    return policy, policy_action

In [8]:
H_theta(environment)

({'[0, 1]': [1, 1],
  '[0, 2]': [1, 2],
  '[0, 3]': [1, 3],
  '[1, 0]': [1, 1],
  '[1, 1]': [2, 1],
  '[1, 2]': [2, 2],
  '[1, 3]': [2, 3],
  '[2, 1]': [3, 1],
  '[2, 2]': [2, 3],
  '[2, 3]': [3, 3],
  '[3, 1]': [4, 1],
  '[3, 3]': [4, 3],
  '[4, 0]': [4, 1],
  '[4, 1]': [4, 2],
  '[4, 2]': [4, 3]},
 {'[0, 1]': [1, 0],
  '[0, 2]': [1, 0],
  '[0, 3]': [1, 0],
  '[1, 0]': [0, 1],
  '[1, 1]': [1, 0],
  '[1, 2]': [1, 0],
  '[1, 3]': [1, 0],
  '[2, 1]': [1, 0],
  '[2, 2]': [0, 1],
  '[2, 3]': [1, 0],
  '[3, 1]': [1, 0],
  '[3, 3]': [1, 0],
  '[4, 0]': [0, 1],
  '[4, 1]': [0, 1],
  '[4, 2]': [0, 1]})

## REINFORCCE: Monte-Carlo Policy-Gradient Control (episodic) for $\pi_{*}$

In [6]:
def pi_theta(environment,Theta):

    policy = {}
    policy_action = {}

    Actions = [[1, 0],[-1, 0],[0, 1],[0, -1]]

    for state in environment[6]:

        if state != environment[3] and state not in environment[4]:

            Distances = {}

            for action in Actions:

                distance = np.cos(abs(Theta[str(state)][str(action)][0]) + abs(Theta[str(state)][str(action)][1]))
                #print(distance)

                Distances[distance] = action
            
            #closest to the terminate state
            if list(Distances.keys()) != []:

                best_action = Distances[max(list(Distances.keys()))]
                next_state = [x + y for x, y in zip(state, best_action)]
                
            
            else:
                next_state = state
                best_action = random.choices(Actions)
    
            policy[str(state)] = next_state
            policy_action[str(state)] = best_action
    

    return policy, policy_action

In [55]:
Theta = {}
for state in environment[6]:

    if state not in environment[4]:

        Theta[str(state)] = {} 

        for action in [[1, 0],[-1, 0],[0, 1],[0, -1]]:

            next_state = [x + y for x, y in zip(state, action)]
            Features = extract_features(next_state)
            
            Theta[str(state)][str(action)] = [Features[0],Features[1]]

pi_theta(environment,Theta)

({'[0, 1]': [1, 1],
  '[0, 2]': [1, 2],
  '[0, 3]': [1, 3],
  '[1, 0]': [2, 0],
  '[1, 1]': [2, 1],
  '[1, 2]': [2, 2],
  '[1, 3]': [2, 3],
  '[2, 1]': [3, 1],
  '[2, 2]': [3, 2],
  '[2, 3]': [3, 3],
  '[3, 1]': [4, 1],
  '[3, 3]': [4, 3],
  '[4, 0]': [4, 1],
  '[4, 1]': [4, 2],
  '[4, 2]': [4, 3]},
 {'[0, 1]': [1, 0],
  '[0, 2]': [1, 0],
  '[0, 3]': [1, 0],
  '[1, 0]': [1, 0],
  '[1, 1]': [1, 0],
  '[1, 2]': [1, 0],
  '[1, 3]': [1, 0],
  '[2, 1]': [1, 0],
  '[2, 2]': [1, 0],
  '[2, 3]': [1, 0],
  '[3, 1]': [1, 0],
  '[3, 3]': [1, 0],
  '[4, 0]': [0, 1],
  '[4, 1]': [0, 1],
  '[4, 2]': [0, 1]})

In [13]:
def monte_carlo_policy_gradient(num_trials, gamma, alpha, environment_stochasticity):
    
    Theta = {}
    for state in environment[6]:

        if state not in environment[4]:

            Theta[str(state)] = {} 

            for action in [[1, 0],[-1, 0],[0, 1],[0, -1]]:

                next_state = [x + y for x, y in zip(state, action)]
                Features = extract_features(next_state)
                
                Theta[str(state)][str(action)] = [Features[0],Features[1]] #Features[element] + random.uniform(1e-9, 1e-8)

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    
    Best_Policy = {}
    for state in environment[6]:

        if state not in environment[4]:

            Best_Policy[str(state)] = state

    
    for trial in tqdm(range(num_trials)):

        policy = pi_theta(environment,Theta)
        

        TRAJECTORY = generate_trajectory(policy,trial,environment_stochasticity)
        
        

        trajectory = TRAJECTORY[0]
        print(len(trajectory))
        #print(trajectory)
        actions = TRAJECTORY[1]
        if len(trajectory) > 10000000000:
            break

        else:

            print(len(trajectory))
            print(policy)

            G = 0

            for step_indx in range(len(trajectory[:-1])):

                step = trajectory[step_indx]
                
                next_step = trajectory[step_indx+1]

                done_action = actions[step_indx]

                for k in range(step_indx+1,len(trajectory)):

                    
                    step_k = trajectory[k]

                    #next_step_k = trajectory[k+1]
                
                    r = state_reward(step_k)

                    G = G + gamma ** (k - step_indx - 1) * r
            
                softmax_denominator = 0.0001
                for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                    softmax_denominator = softmax_denominator +\
                        -np.sin(Theta[str(step)][action][0] + Theta[str(step)][action][1]) *\
                    math.exp(np.cos(Theta[str(step)][action][0] + Theta[str(step)][action][1]))

                #print('softmax_denominators',softmax_denominator)
                gradient = (-np.sin(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]) *\
                math.exp(np.cos(Theta[str(step)][str(done_action)][0] + Theta[str(step)][str(done_action)][1]))) - softmax_denominator

                #print('gradient',gradient)
                    
                t1 = Theta[str(step)][str(done_action)][0] +\
                    alpha * (gamma ** step_indx) * G * gradient
                
                t2 = Theta[str(step)][str(done_action)][1] +\
                    alpha * (gamma ** step_indx) * G * gradient

                Theta[str(step)][str(done_action)] = [t1,t2]

                Q[str(step)][str(done_action)] = np.cos(abs(t1)+abs(t2))
    
    for state in environment[6]:

        if str(state) in list(Q.keys()):

            value_action_state = reverse_dictionary(Q[str(state)])
            Max_val = max(list(value_action_state.keys()))
            best_action = value_action_state[Max_val]
            best_action = ast.literal_eval(best_action)
            next_state = [x + y for x, y in zip(state, best_action)]

            Best_Policy[str(state)] = next_state


    return Q, Best_Policy,policy

In [14]:
#pi_0 = pi_theta(environment)
monte_carlo_policy_gradient(500, 0.9,0.1, 'deterministic')

  0%|          | 0/500 [00:00<?, ?it/s]

47
47
({'[0, 1]': [1, 1], '[0, 2]': [1, 2], '[0, 3]': [1, 3], '[1, 0]': [2, 0], '[1, 1]': [2, 1], '[1, 2]': [2, 2], '[1, 3]': [2, 3], '[2, 1]': [3, 1], '[2, 2]': [3, 2], '[2, 3]': [3, 3], '[3, 1]': [4, 1], '[3, 3]': [4, 3], '[4, 0]': [4, 1], '[4, 1]': [4, 2], '[4, 2]': [4, 3]}, {'[0, 1]': [1, 0], '[0, 2]': [1, 0], '[0, 3]': [1, 0], '[1, 0]': [1, 0], '[1, 1]': [1, 0], '[1, 2]': [1, 0], '[1, 3]': [1, 0], '[2, 1]': [1, 0], '[2, 2]': [1, 0], '[2, 3]': [1, 0], '[3, 1]': [1, 0], '[3, 3]': [1, 0], '[4, 0]': [0, 1], '[4, 1]': [0, 1], '[4, 2]': [0, 1]})
7
7
({'[0, 1]': [1, 1], '[0, 2]': [1, 2], '[0, 3]': [1, 3], '[1, 0]': [2, 0], '[1, 1]': [2, 1], '[1, 2]': [2, 2], '[1, 3]': [2, 3], '[2, 1]': [3, 1], '[2, 2]': [2, 1], '[2, 3]': [2, 2], '[3, 1]': [4, 1], '[3, 3]': [3, 2], '[4, 0]': [4, 1], '[4, 1]': [4, 2], '[4, 2]': [4, 3]}, {'[0, 1]': [1, 0], '[0, 2]': [1, 0], '[0, 3]': [1, 0], '[1, 0]': [1, 0], '[1, 1]': [1, 0], '[1, 2]': [1, 0], '[1, 3]': [1, 0], '[2, 1]': [1, 0], '[2, 2]': [0, -1], '[2, 3]'

  0%|          | 2/500 [00:33<2:18:20, 16.67s/it]


KeyboardInterrupt: 