In [11]:
import numpy as np
import random
from tabulate import tabulate
from tqdm import tqdm
import ast

In [12]:
def generate_grid_world(length, width,path_lenght,holes_number,Random_State):
    
    random.seed(Random_State)
    #store all cells in a list
    Grid_Cells = []
    for row in range(length):
        for col in range(width):
            Grid_Cells.append([row,col])


    #specify the number of holes in the gridworld
    
    #specify the start point as a random cell
    start = [random.randint(0, length), random.randint(0, width)]

    #create a path from start point
    """instead of defining start and goal points,
      we define just a start point and a random path with a random lenght to
       another point and name it as goal point"""
    
    def random_path(Start, Path_Lenght,length, width):
        
        Path = []
        Path.append(Start)
        for i in range(Path_Lenght):
            
            #there are two moves that take us on a random cell named Goal [1,0], [0,1]
            
            move = random.choice([[1,0], [0,1]])
            
            #update the start cell/point by the above move
            Start = [x + y for x, y in zip(Start, move)]
            
            #if the movement take us out of our gridworld, we reverse the change in the start point
            if Start[0] < 0 or Start[1] < 0 or Start[0] > length-1 or Start[1] > width-1:

                Start = [x - y for x, y in zip(Start, move)]

            else:
                
                #create a path history
                Path.append(Start)

        Goal = Start

        return Goal,Path
    

    GoalPath = random_path(start, path_lenght,length, width)

    goal = GoalPath[0]
    path = GoalPath[1]

    #now we must eliminate the path cells from the Grid_Cells to choose hole cells from remaining cells

    FreeCells = [x for x in Grid_Cells if x not in path]

    Holes = random.sample(FreeCells, holes_number)

    #Also, we can visualize our gridworld in a simple way

    def mark_holes(holes):
        marked_data = [["Hole" if [row, col] in holes else [row, col] for col in range(width)] for row in range(length)]
        return marked_data
    
    marked_matrix = mark_holes(Holes)

    print(tabulate(marked_matrix, tablefmt="grid"))

    
    return length, width, start, goal, Holes, path,Grid_Cells

In [13]:
environment = generate_grid_world(5, 4,4,4,39)
environment

+--------+--------+--------+--------+
| Hole   | [0, 1] | [0, 2] | [0, 3] |
+--------+--------+--------+--------+
| [1, 0] | [1, 1] | [1, 2] | [1, 3] |
+--------+--------+--------+--------+
| Hole   | [2, 1] | [2, 2] | [2, 3] |
+--------+--------+--------+--------+
| Hole   | [3, 1] | Hole   | [3, 3] |
+--------+--------+--------+--------+
| [4, 0] | [4, 1] | [4, 2] | [4, 3] |
+--------+--------+--------+--------+


(5,
 4,
 [1, 2],
 [4, 3],
 [[2, 0], [3, 2], [3, 0], [0, 0]],
 [[1, 2], [1, 3], [2, 3], [3, 3], [4, 3]],
 [[0, 0],
  [0, 1],
  [0, 2],
  [0, 3],
  [1, 0],
  [1, 1],
  [1, 2],
  [1, 3],
  [2, 0],
  [2, 1],
  [2, 2],
  [2, 3],
  [3, 0],
  [3, 1],
  [3, 2],
  [3, 3],
  [4, 0],
  [4, 1],
  [4, 2],
  [4, 3]])

In [14]:
def probability_distribution(grid_size,randomness):
    #random.seed(40)
    
    #by this function we generate probabilities which their sum is equal to 1
    def generate_probabilities(n):

        numbers = [random.random() for _ in range(n)]
        total_sum = sum(numbers)
        scaled_numbers = [num / total_sum for num in numbers]
        
        return scaled_numbers
    
    cells_prob = {}
    if randomness == 'stochastic':
        for cell in range(grid_size):
            
            #we set the number of probs to 4 due to 4 possible action for each cell (go to its neighbors)
            probs = generate_probabilities(4)

            cells_prob[cell] = probs
    elif randomness == 'equal probable':

        for cell in range(grid_size):

            cells_prob[cell] = [0.25,0.25,0.25,0.25]
    
    elif randomness == 'deterministic':
        for cell in range(grid_size):

            cells_prob[cell] = [0.03,0.06,0.01,0.9] #[0,0,0,1] ##[0.15,.15,0.1,0.6]


    #Note that we consider the correspondence between probabilities and actions as below:
    #probs = [p1, p2, p3, p4] ---> [[1,0],[-1,0],[0,1],[0,-1]]

    return cells_prob

def neighbor_cells(cell):

    grid_cells = environment[6]
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]

    Neighbors = []
    Actions_Neighbors = []
    for action in Actions:

        neighbor = [x + y for x, y in zip(cell, action)]
        #if neighbor not in environment[4]:
        Neighbors.append(neighbor)
        Actions_Neighbors.append(action)

    return Neighbors, Actions_Neighbors

#Note
"""As we want to use monte carlo method for estimating the state values
   it has been assumed that we have not any knowledge about the environment.
   Therefore, we should consider the transitions into the holes cells
   (against the case of policy iteration)"""

def arbitrary_policy(randomness):
    #random.seed(randomness)
    
    policy = {}
    policy_action = {}
    for state in environment[6]:

        if state not in environment[4]:

            neighbors = neighbor_cells(state)[0]
            Actions_Neighbors = neighbor_cells(state)[1]

            allowed_positions = []

            for neighbor in neighbors:
                
                if neighbor in environment[6] and neighbor not in environment[4]:
                    
                    allowed_positions.append(neighbor)
        
            next_state = random.choice(allowed_positions)

            row = next_state[0] - state[0]
            col = next_state[1] - state[1]
            PolicyAction = [row, col]

            policy['{}'.format(state)] = next_state
            policy_action['{}'.format(state)] = PolicyAction


    return policy, policy_action

state_indice_dict = {}
counter = 0
for state in environment[6]:

    state = str(state)
    state_indice_dict[state] = counter
    counter = counter + 1

def generate_trajectory(policy,randomness):

    policy_action = policy[1]

    probs = probability_distribution(environment[0]*environment[1],'stochastic')
    
    start = environment[2]

    terminate = start

    trajectory = [start]
    c = 0
    test = []
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]

        action = policy_action[str(terminate)]
        Actions.remove(action)
        #sorted_actions = [action]
        sorted_actions = Actions + [action]
        #print(sorted_actions)
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()
        #print(actions_prob)
        #print(actions_prob)


        selected_action = random.choices(sorted_actions, actions_prob)[0]
        

        """if c==0:
           print(sorted_actions)
           print(actions_prob)
           print(selected_action)
           test.append(selected_action)"""
        
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:

            next_state = terminate
        
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:

            next_state = start

        
        terminate = next_state

        trajectory.append(terminate)
        c = c+1

    return trajectory #,test

In [15]:
policy_0 = arbitrary_policy(41)
policy_0

({'[0, 1]': [0, 2],
  '[0, 2]': [0, 1],
  '[0, 3]': [0, 2],
  '[1, 0]': [1, 1],
  '[1, 1]': [2, 1],
  '[1, 2]': [1, 3],
  '[1, 3]': [1, 2],
  '[2, 1]': [3, 1],
  '[2, 2]': [2, 3],
  '[2, 3]': [3, 3],
  '[3, 1]': [4, 1],
  '[3, 3]': [2, 3],
  '[4, 0]': [4, 1],
  '[4, 1]': [4, 2],
  '[4, 2]': [4, 3],
  '[4, 3]': [4, 2]},
 {'[0, 1]': [0, 1],
  '[0, 2]': [0, -1],
  '[0, 3]': [0, -1],
  '[1, 0]': [0, 1],
  '[1, 1]': [1, 0],
  '[1, 2]': [0, 1],
  '[1, 3]': [0, -1],
  '[2, 1]': [1, 0],
  '[2, 2]': [0, 1],
  '[2, 3]': [1, 0],
  '[3, 1]': [1, 0],
  '[3, 3]': [-1, 0],
  '[4, 0]': [0, 1],
  '[4, 1]': [0, 1],
  '[4, 2]': [0, 1],
  '[4, 3]': [0, -1]})

## First-visit MC prediction, for estimationg $V \approx v_{\pi}$

In [20]:
def state_reward(policy,state):

    policy_state = policy[0]
    
    next_state = policy_state[str(state)]

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 100
    
    elif next_state not in environment[6]:

        r = -2
    
    else:

        r = -1
    
    return r

#Note that here we want to evaluate just a fixed policy
# and so we are not trying to optimize it 
def monte_carlo_prediction(num_trials, policy, gamma):

    #V = np.zeros((environment[6],1))

    #store returns of each trajectory
    Returns = {} #np.zeros((environment[6],1))
    #Lens = []
    #Loop for ever (for each episode)
    for trial in tqdm(range(num_trials)):
        
        #generate an episode
        trajectory = generate_trajectory(policy,trial)
        #Lens.append(trajectory)

        #limit the lenght of trajectory

        #total reward
        G = 0

        trajectory.reverse()
        
        
        returns = {}

        for state in environment[6]:
            
            if state not in environment[4] and state != environment[3]:

                returns[str(state)] = 0

        first_visit = []
        for step in trajectory[1:]:

            if step not in first_visit:

                first_visit.append(step)

                r = state_reward(policy,step)

                G = gamma * G + r

                returns[str(step)] = returns[str(step)] + G
        
        #Returns[trial] = returns
    
    V = {}
    for step in list(returns.keys()):

        V[step] = returns[step]/num_trials
    

    return V,returns #, Lens

In [21]:
MC_prediction = monte_carlo_prediction(100000,policy_0,0.9)

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [06:10<00:00, 269.98it/s]


## Value Function

In [22]:
MC_prediction[0]

{'[0, 1]': 0.00021067249012910008,
 '[0, 2]': 0.00017960524111619008,
 '[0, 3]': 0.0001516447170045711,
 '[1, 0]': 0.0005495390000000001,
 '[1, 1]': 0.0006217100000000001,
 '[1, 2]': 0.0004845851000000001,
 '[1, 3]': 0.00042612659000000006,
 '[2, 1]': 0.0007019000000000001,
 '[2, 2]': 0.0003261625379000001,
 '[2, 3]': 0.0002835462841100001,
 '[3, 1]': 0.000791,
 '[3, 3]': 0.00024519165569900005,
 '[4, 0]': 0.0003735139310000001,
 '[4, 1]': 0.00089,
 '[4, 2]': 0.001}

In [23]:
MC_prediction[1]

{'[0, 1]': 21.067249012910008,
 '[0, 2]': 17.96052411161901,
 '[0, 3]': 15.16447170045711,
 '[1, 0]': 54.95390000000001,
 '[1, 1]': 62.171000000000014,
 '[1, 2]': 48.45851000000001,
 '[1, 3]': 42.61265900000001,
 '[2, 1]': 70.19000000000001,
 '[2, 2]': 32.61625379000001,
 '[2, 3]': 28.354628411000007,
 '[3, 1]': 79.10000000000001,
 '[3, 3]': 24.519165569900007,
 '[4, 0]': 37.35139310000001,
 '[4, 1]': 89.0,
 '[4, 2]': 100.0}

## On-policy first-visit MC control (for $\epsilon$-soft policies), estimates ${\pi} \approx {\pi}_{*} $

In [24]:
def state_action_reward(policy,state):

    """if type(policy) == tuple:
        
        policy_state = policy[0]
        next_state = policy_state[str(state)]
    
    else:"""
    policy_action = policy[str(state)]
    next_state = [x + y for x, y in zip(state, policy_action)]
    

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 100
    
    elif next_state not in environment[6]:

        r = -2
    
    else:

        r = -1
    
    return r
    

def derive_action(current_state, next_state):

    row = next_state[0] - current_state[0]
    col = next_state[1] - current_state[1]
    action = [row, col]

    return action



def generate_trajectory_probability_based(policy,randomness,epsilon,traj_len,action_prob_type):

    probs = probability_distribution(environment[0]*environment[1],action_prob_type)
  
    start = environment[2]
    terminate = start
    trajectory = [start]
    c = 0
    test = []
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1, 0],[-1, 0],[0, 1],[0, -1]]

        #we have two probabilities for epsilon-greedy action selection
        #It's a kind of exploration-exploitation balancing
        
        #probability for exploration on not best action values
        low_prob = epsilon/len(Actions)
        high_prob = 1 - epsilon #+ (epsilon/len(Actions))

        #this random action selection is for balancing exploration-exploitation trade-off

        exex_probs = [low_prob,low_prob,low_prob,high_prob]
        if type(policy) == tuple:
            policy = policy[1]
        
        best_action_value = policy[str(terminate)]
        #print(type(best_action_value))
        Actions_copy = Actions.copy()
        #print(Actions_copy)
        Actions_copy.remove(best_action_value)
        exex_actions = Actions_copy + [best_action_value]
        #print(exex_actions)
        #print(exex_probs)
        
        action = random.choices(exex_actions, exex_probs)[0]

        #second part of action selection
        Actions.remove(action)
        sorted_actions = Actions + [action]
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()

        #print(sorted_actions)
        #print(actions_prob)
        #this random action selection is due to the randomness of the environment
        selected_action = random.choices(sorted_actions, actions_prob)[0]
        #print(selected_action)
        #print('=====')
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:
            next_state = terminate
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:
            next_state = start
        terminate = next_state
        trajectory.append(terminate)
        c = c+1

        if c >traj_len:
            break
            #c = traj_len + 1
    
    if c > traj_len:
        
        return False
        
    else:
        return trajectory
    

def OnPolicy_MC_prediction(num_trials, policy, gamma, epsilon,traj_len, action_prob_type):
    
    def reverse_dictionary(dict):
        reverse_dict = {}
        for key in list(dict.keys()):
            val = dict[key]
            reverse_dict[val] = key
        return reverse_dict

    Q = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            Q[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                #next_state = [x + y for x, y in zip(state, ast.literal_eval(action))]

                #if (next_state in environment[6]) and next_state not in environment[4]:
                    
                Q[str(state)][action] = random.uniform(1e-9, 1e-8)
    
    counter = {}
    for state in environment[6]:

        if state not in environment[4]:
            
            counter[str(state)] = {}

            for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                #next_state = [x + y for x, y in zip(state, ast.literal_eval(action))]

                #if (next_state in environment[6]) and next_state not in environment[4]:
                    
                counter[str(state)][action] = random.uniform(1e-9, 1e-8)
    
    done_trials = 0
    Policies = [policy]
    cp = 0
    for trial in tqdm(range(1,num_trials)):
        #print(policy['[3,3]'])
        policy = Policies[cp]
        trajectory = generate_trajectory_probability_based(policy, trial, epsilon,traj_len, action_prob_type)
        #print(len(trajectory))

        #if len(trajectory) < 100:
        #print(trajectory)
        
        if trajectory:
            #print(len(trajectory))

            done_trials +=1 
        

            G = 0
            returns = {}
            first_visit = []

            for state in environment[6]:

                if state not in environment[4]:# and state != environment[3]:

                    returns[str(state)] = {}

            for state in environment[6]:
                
                if state not in environment[4]:# and state != environment[3]:

                    for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:

                        #next_state = [x + y for x, y in zip(state, ast.literal_eval(action))]

                        #if (next_state in environment[6]) and next_state not in environment[4]:

                        returns[str(state)][action] = random.uniform(1e-9, 1e-8)

                
            #print(returns)

            for i in range(len(trajectory[1:])):
                step = trajectory[1:][i]

                if step not in first_visit:
                    
                    """state_str = str(step)
                    if state_str not in returns:
                        returns[state_str] = {}
                        for action in ["[1,0]", "[-1,0]", "[0,1]", "[0,-1]"]:
                            returns[state_str][action] = 0"""  # Initialize all actions with value 0
                            
                    first_visit.append(step)
                    #action = derive_action(trajectory[1:][i + 1], trajectory[1:][i])
                    last_step = str(trajectory[1:][i])
                    if type(policy) == tuple:
                        policy = policy[1]
                        
                    action = policy[last_step]
                    #if action == [0,0]:
                    r = state_action_reward(policy, step)
                    G = gamma * G + r
                    #print(G)
                    #action_str = str(action)
                    #print(action_str)
                    #print(returns[str(step)])
                    returns[str(step)][str(action)] += G
                    #print(returns[str(step)][action_str])

            


            for state in list(returns.keys()):
                for action in ["[1, 0]","[-1, 0]","[0, 1]","[0, -1]"]:
                    #print(state,action)
                    #print('q',Q[state][action])
                    #print(returns[state][action])
                    #if returns[state]["[-1, 0]"] != 0:
                    #    print(returns[state]["[-1, 0]"])
                    #Q[state][action] = returns[state][action] / trial

                    if abs(returns[state][action]) > 1e-3:

                        counter[state][action] = counter[state][action] + 1

                        Q[state][action] = Q[state][action] + returns[state][action]

                        Q[state][action] = Q[state][action] / round(counter[state][action])
                        #print('f')
                    
                    #else:

                    #    Q[state][action] = Q[state][action] + returns[state][action]

            policy = {}
            for state in list(Q.keys()):
                #print('d')
                if Q[state] != {}:
                    value_action_state = reverse_dictionary(Q[state])
                    #print('value_action_state:',value_action_state)
                    #print(state)
                    #print(value_action_state)
                    Max_val = max(list(value_action_state.keys()))
                    best_action = value_action_state[Max_val]
                    policy[state] = ast.literal_eval(best_action)
            #print(policy)
            #if policy != policy_0:
            #    print('f')

            Policies.append(policy)
            cp = cp + 1
            if cp == 100:
                print(cp)
    
        
    return policy, Q, done_trials, Policies

## Deterministic

In [25]:
policy_0 = arbitrary_policy(41)
first_try = OnPolicy_MC_prediction(500000, policy_0, 0.99, 0.01,500, 'deterministic')

 30%|██▉       | 148392/499999 [21:57<49:07, 119.28it/s]  

100


100%|██████████| 499999/499999 [1:01:52<00:00, 134.67it/s]


In [9]:
print('number of done trials:',first_try[2])
print('======')
print('The last/best policy:')
print(first_try[0])
print('======')
print('The last Q-value:')
print(first_try[1])

number of done trials: 133695
The last/best policy:
{'[0, 1]': [1, 0], '[0, 2]': [0, -1], '[0, 3]': [1, 0], '[1, 0]': [0, 1], '[1, 1]': [0, -1], '[1, 2]': [0, 1], '[1, 3]': [0, -1], '[2, 1]': [0, 1], '[2, 2]': [0, -1], '[2, 3]': [0, 1], '[3, 1]': [1, 0], '[3, 3]': [1, 0], '[4, 0]': [-1, 0], '[4, 1]': [-1, 0], '[4, 2]': [0, 1], '[4, 3]': [0, -1]}
The last Q-value:
{'[0, 1]': {'[1, 0]': -1.8283883849486395e-05, '[-1, 0]': -10.561792490924729, '[0, 1]': -10.580512440120565, '[0, -1]': -13.444754677956233}, '[0, 2]': {'[1, 0]': -3.272345016064863, '[-1, 0]': -6.880996994491299, '[0, 1]': -4.930398993402942, '[0, -1]': -1.0091425198079127e-05}, '[0, 3]': {'[1, 0]': -1.994637377482096e-05, '[-1, 0]': -3.9700999933616004, '[0, 1]': -3.9700999894111524, '[0, -1]': -10.550257254364263}, '[1, 0]': {'[1, 0]': -9.754061204193054, '[-1, 0]': -5.445497494295858, '[0, 1]': -3.2437003371962826e-05, '[0, -1]': -4.940398983769299}, '[1, 1]': {'[1, 0]': -4.900994995343974, '[-1, 0]': -3.9501999898233535,

## Optimized Policy

In [26]:
first_try[0]

{'[0, 1]': [0, 1],
 '[0, 2]': [1, 0],
 '[0, 3]': [0, -1],
 '[1, 0]': [0, 1],
 '[1, 1]': [1, 0],
 '[1, 2]': [-1, 0],
 '[1, 3]': [0, 1],
 '[2, 1]': [1, 0],
 '[2, 2]': [-1, 0],
 '[2, 3]': [-1, 0],
 '[3, 1]': [-1, 0],
 '[3, 3]': [1, 0],
 '[4, 0]': [1, 0],
 '[4, 1]': [0, 1],
 '[4, 2]': [0, 1],
 '[4, 3]': [-1, 0]}

## optimized Action-Value Function

In [27]:
first_try[1]

{'[0, 1]': {'[1, 0]': -2.979999987139254,
  '[-1, 0]': -5.910697986174125,
  '[0, 1]': -3.573587644419302e-05,
  '[0, -1]': -6.900995004712646},
 '[0, 2]': {'[1, 0]': -7.909637128694817e-06,
  '[-1, 0]': -1.9999999919938294,
  '[0, 1]': -5.85198505216081,
  '[0, -1]': -3.9501999848769587},
 '[0, 3]': {'[1, 0]': -4.900995006840813,
  '[-1, 0]': -5.9106979888099715,
  '[0, 1]': -4.940398984044857,
  '[0, -1]': -2.8524644595298792e-05},
 '[1, 0]': {'[1, 0]': -11.62837523523017,
  '[-1, 0]': -9.7637641968131,
  '[0, 1]': -6.039999989894305e-05,
  '[0, -1]': -10.628375246559365},
 '[1, 1]': {'[1, 0]': -5.3833626427161314e-05,
  '[-1, 0]': -12.57829191040981,
  '[0, 1]': -6.793465194072248,
  '[0, -1]': -6.832085048349945},
 '[1, 2]': {'[1, 0]': -3.9403989864352393,
  '[-1, 0]': -1.5728423714962523e-05,
  '[0, 1]': -6.851591017026077,
  '[0, -1]': -3.9403989821862577},
 '[1, 3]': {'[1, 0]': -8.739883627714896,
  '[-1, 0]': -6.851591012512707,
  '[0, 1]': -6.046618060246079e-05,
  '[0, -1]': 