In [37]:
import numpy as np
import random
from tabulate import tabulate
from tqdm import tqdm
import ast

In [2]:
def generate_grid_world(length, width,path_lenght,holes_number,Random_State):
    
    random.seed(Random_State)
    #store all cells in a list
    Grid_Cells = []
    for row in range(length):
        for col in range(width):
            Grid_Cells.append([row,col])


    #specify the number of holes in the gridworld
    
    #specify the start point as a random cell
    start = [random.randint(0, length), random.randint(0, width)]

    #create a path from start point
    """instead of defining start and goal points,
      we define just a start point and a random path with a random lenght to
       another point and name it as goal point"""
    
    def random_path(Start, Path_Lenght,length, width):
        
        Path = []
        Path.append(Start)
        for i in range(Path_Lenght):
            
            #there are two moves that take us on a random cell named Goal [1,0], [0,1]
            
            move = random.choice([[1,0], [0,1]])
            
            #update the start cell/point by the above move
            Start = [x + y for x, y in zip(Start, move)]
            
            #if the movement take us out of our gridworld, we reverse the change in the start point
            if Start[0] < 0 or Start[1] < 0 or Start[0] > length-1 or Start[1] > width-1:

                Start = [x - y for x, y in zip(Start, move)]

            else:
                
                #create a path history
                Path.append(Start)

        Goal = Start

        return Goal,Path
    

    GoalPath = random_path(start, path_lenght,length, width)

    goal = GoalPath[0]
    path = GoalPath[1]

    #now we must eliminate the path cells from the Grid_Cells to choose hole cells from remaining cells

    FreeCells = [x for x in Grid_Cells if x not in path]

    Holes = random.sample(FreeCells, holes_number)

    #Also, we can visualize our gridworld in a simple way

    def mark_holes(holes):
        marked_data = [["Hole" if [row, col] in holes else [row, col] for col in range(width)] for row in range(length)]
        return marked_data
    
    marked_matrix = mark_holes(Holes)

    print(tabulate(marked_matrix, tablefmt="grid"))

    
    return length, width, start, goal, Holes, path,Grid_Cells

In [3]:
environment = generate_grid_world(5, 4,4,4,39)
environment

+--------+--------+--------+--------+
| Hole   | [0, 1] | [0, 2] | [0, 3] |
+--------+--------+--------+--------+
| [1, 0] | [1, 1] | [1, 2] | [1, 3] |
+--------+--------+--------+--------+
| Hole   | [2, 1] | [2, 2] | [2, 3] |
+--------+--------+--------+--------+
| Hole   | [3, 1] | Hole   | [3, 3] |
+--------+--------+--------+--------+
| [4, 0] | [4, 1] | [4, 2] | [4, 3] |
+--------+--------+--------+--------+


(5,
 4,
 [1, 2],
 [4, 3],
 [[2, 0], [3, 2], [3, 0], [0, 0]],
 [[1, 2], [1, 3], [2, 3], [3, 3], [4, 3]],
 [[0, 0],
  [0, 1],
  [0, 2],
  [0, 3],
  [1, 0],
  [1, 1],
  [1, 2],
  [1, 3],
  [2, 0],
  [2, 1],
  [2, 2],
  [2, 3],
  [3, 0],
  [3, 1],
  [3, 2],
  [3, 3],
  [4, 0],
  [4, 1],
  [4, 2],
  [4, 3]])

In [4]:
def probability_distribution(grid_size,randomness):
    #random.seed(40)
    
    #by this function we generate probabilities which their sum is equal to 1
    def generate_probabilities(n):

        numbers = [random.random() for _ in range(n)]
        total_sum = sum(numbers)
        scaled_numbers = [num / total_sum for num in numbers]
        
        return scaled_numbers
    
    cells_prob = {}
    if randomness == 'stochastic':
        for cell in range(grid_size):
            
            #we set the number of probs to 4 due to 4 possible action for each cell (go to its neighbors)
            probs = generate_probabilities(4)

            cells_prob[cell] = probs
    elif randomness == 'equal probable':

        for cell in range(grid_size):

            cells_prob[cell] = [0.25,0.25,0.25,0.25]
    
    elif randomness == 'deterministic':
        for cell in range(grid_size):

            cells_prob[cell] = [0.2,0.2,0.1,0.5] #[0,0,0,1] ##[0.15,.15,0.1,0.6]


    #Note that we consider the correspondence between probabilities and actions as below:
    #probs = [p1, p2, p3, p4] ---> [[1,0],[-1,0],[0,1],[0,-1]]

    return cells_prob

def neighbor_cells(cell):

    grid_cells = environment[6]
    Actions = [[1,0],[-1,0],[0,1],[0,-1]]

    Neighbors = []
    Actions_Neighbors = []
    for action in Actions:

        neighbor = [x + y for x, y in zip(cell, action)]
        #if neighbor not in environment[4]:
        Neighbors.append(neighbor)
        Actions_Neighbors.append(action)

    return Neighbors, Actions_Neighbors

#Note
"""As we want to use monte carlo method for estimating the state values
   it has been assumed that we have not any knowledge about the environment.
   Therefore, we should consider the transitions into the holes cells
   (against the case of policy iteration)"""

def arbitrary_policy(randomness):
    #random.seed(randomness)
    
    policy = {}
    policy_action = {}
    for state in environment[6]:

        if state not in environment[4]:

            neighbors = neighbor_cells(state)[0]
            Actions_Neighbors = neighbor_cells(state)[1]

            allowed_positions = []

            for neighbor in neighbors:
                
                if neighbor in environment[6]:
                    
                    allowed_positions.append(neighbor)
        
            next_state = random.choice(allowed_positions)

            row = next_state[0] - state[0]
            col = next_state[1] - state[1]
            PolicyAction = [row, col]

            policy['{}'.format(state)] = next_state
            policy_action['{}'.format(state)] = PolicyAction


    return policy, policy_action

state_indice_dict = {}
counter = 0
for state in environment[6]:

    state = str(state)
    state_indice_dict[state] = counter
    counter = counter + 1

def generate_trajectory(policy,randomness):

    policy_action = policy[1]

    probs = probability_distribution(environment[0]*environment[1],'stochastic')
    
    start = environment[2]

    terminate = start

    trajectory = [start]
    c = 0
    test = []
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]

        action = policy_action[str(terminate)]
        Actions.remove(action)
        #sorted_actions = [action]
        sorted_actions = Actions + [action]
        #print(sorted_actions)
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()
        #print(actions_prob)
        #print(actions_prob)


        selected_action = random.choices(sorted_actions, actions_prob)[0]
        

        """if c==0:
           print(sorted_actions)
           print(actions_prob)
           print(selected_action)
           test.append(selected_action)"""
        
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:

            next_state = terminate
        
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:

            next_state = start

        
        terminate = next_state

        trajectory.append(terminate)
        c = c+1

    return trajectory #,test

In [5]:
policy_0 = arbitrary_policy(41)
trjcty = generate_trajectory(policy_0,1)
trjcty

[[1, 2],
 [1, 3],
 [0, 3],
 [0, 3],
 [0, 3],
 [0, 3],
 [0, 2],
 [0, 1],
 [0, 2],
 [0, 1],
 [1, 2],
 [1, 1],
 [1, 0],
 [1, 2],
 [1, 3],
 [0, 3],
 [0, 3],
 [0, 3],
 [1, 3],
 [1, 2],
 [2, 2],
 [1, 2],
 [2, 2],
 [2, 3],
 [3, 3],
 [1, 2],
 [2, 2],
 [2, 3],
 [1, 3],
 [1, 2],
 [1, 1],
 [2, 1],
 [2, 2],
 [2, 1],
 [1, 2],
 [1, 1],
 [1, 0],
 [1, 1],
 [0, 1],
 [0, 2],
 [0, 1],
 [0, 2],
 [0, 3],
 [1, 3],
 [1, 2],
 [1, 1],
 [0, 1],
 [0, 2],
 [0, 1],
 [0, 1],
 [1, 2],
 [1, 1],
 [0, 1],
 [1, 2],
 [2, 2],
 [1, 2],
 [2, 2],
 [1, 2],
 [1, 1],
 [1, 2],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 2],
 [1, 3],
 [0, 3],
 [0, 3],
 [0, 2],
 [0, 1],
 [0, 1],
 [0, 2],
 [0, 3],
 [0, 3],
 [0, 3],
 [0, 2],
 [0, 3],
 [1, 3],
 [1, 3],
 [1, 2],
 [1, 3],
 [1, 2],
 [2, 2],
 [1, 2],
 [2, 2],
 [1, 2],
 [1, 1],
 [1, 2],
 [1, 1],
 [1, 2],
 [1, 1],
 [0, 1],
 [1, 2],
 [2, 2],
 [2, 1],
 [2, 2],
 [2, 1],
 [1, 2],
 [1, 3],
 [1, 2],
 [1, 3],
 [0, 3],
 [0, 2],
 [0, 3],
 [0, 2],
 [0, 1],
 [0, 1],
 [0, 2],
 [0, 3],
 

In [6]:
len(trjcty)

1056

In [7]:
policy_0 = arbitrary_policy(41)

a = []
test = []
for i in tqdm(range(1000)):
    trjcty = generate_trajectory(policy_0,i)
    test.append(trjcty[1])
    #print(i)
    
    #a.append(trjcty)

    #if len(trjcty) != 763:
    #    print(len(trjcty))
    
    #if i > 1 and a[i] != a[i-1]:

        #print(i)

#c = set(a)
#print(c)

100%|██████████| 1000/1000 [01:03<00:00, 15.80it/s]


## First-visit MC prediction, for estimationg $V \approx v_{\pi}$

In [2]:
def state_reward(policy,state):

    policy_state = policy[0]
    
    next_state = policy_state[str(state)]

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 10
    
    else:

        r = -1
    
    return r

#Note that here we want to evaluate just a fixed policy
# and so we are not trying to optimize it 
def monte_carlo_prediction(num_trials, policy, gamma):

    #V = np.zeros((environment[6],1))

    #store returns of each trajectory
    Returns = {} #np.zeros((environment[6],1))
    Lens = []
    #Loop for ever (for each episode)
    for trial in tqdm(range(num_trials)):
        
        #generate an episode
        trajectory = generate_trajectory(policy,trial)
        Lens.append(trajectory)

        #limit the lenght of trajectory

        #total reward
        G = 0

        trajectory.reverse()
        
        
        returns = {}

        for state in environment[6]:
            
            if state not in environment[4] and state != environment[3]:

                returns[str(state)] = 0

        first_visit = []
        for step in trajectory[1:]:

            if step not in first_visit:

                first_visit.append(step)

                r = state_reward(policy)

                G = gamma * G + r

                returns[str(step)] = returns[str(step)] + G
        
        #Returns[trial] = returns
    
    V = {}
    for step in list(returns.keys()):

        V[step] = returns[step]/num_trials
    

    return V,returns #, Lens

In [45]:
x = monte_carlo_prediction(100000,policy_0,0.9)

100%|██████████| 100000/100000 [06:25<00:00, 259.52it/s]


In [47]:
x[0]

{'[0, 1]': -8.12579511e-05,
 '[0, 2]': -8.313215599e-05,
 '[0, 3]': -8.481894039099999e-05,
 '[1, 0]': -5.6953279000000005e-05,
 '[1, 1]': -5.2170310000000005e-05,
 '[1, 2]': -3.439e-05,
 '[1, 3]': -2.7099999999999998e-05,
 '[2, 1]': -4.6855900000000007e-05,
 '[2, 2]': -4.0951e-05,
 '[2, 3]': -1.8999999999999998e-05,
 '[3, 1]': -8.63370463519e-05,
 '[3, 3]': -1e-05,
 '[4, 0]': -8.770334171670999e-05,
 '[4, 1]': -8.8933007545039e-05,
 '[4, 2]': -0.0001100397067905351}

In [46]:
x[1]

{'[0, 1]': -8.12579511,
 '[0, 2]': -8.313215599,
 '[0, 3]': -8.481894039099998,
 '[1, 0]': -5.6953279000000006,
 '[1, 1]': -5.217031,
 '[1, 2]': -3.439,
 '[1, 3]': -2.71,
 '[2, 1]': -4.68559,
 '[2, 2]': -4.0951,
 '[2, 3]': -1.9,
 '[3, 1]': -8.63370463519,
 '[3, 3]': -1.0,
 '[4, 0]': -8.770334171671,
 '[4, 1]': -8.8933007545039,
 '[4, 2]': -11.00397067905351}

## On-policy first-visit MC control (for $\epsilon$-soft policies), estimates ${\pi} \approx {\pi}_{*} $

In [31]:
def state_action_reward(policy,state):

    policy_state = policy[0]
    
    next_state = policy_state[str(state)]

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 10
    
    else:

        r = -1
    
    return r
    

def derive_action(current_state, next_state):

    row = next_state[0] - current_state[0]
    col = next_state[1] - current_state[1]
    action = [row, col]

    return action

def generate_trajectory_probability_based(policy,randomness,epsilon):

    
    probs = probability_distribution(environment[0]*environment[1],'stochastic')  
    start = environment[2]
    terminate = start
    trajectory = [start]
    c = 0
    test = []
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]

        #we have two probabilities for epsilon-greedy action selection
        #It's a kind of exploration-exploitation balancing
        
        #probability for exploration on not best action values
        low_prob = epsilon/len(Actions)
        high_prob = 1 - epsilon + (epsilon/len(Actions))

        #this random action selection is for balancing exploration-exploitation trade-off

        exex_probs = [low_prob,low_prob,low_prob,high_prob]
        if type(policy) == tuple:
            policy = policy[1]
        
        best_action_value = policy[str(terminate)]
        print(best_action_value)
        Actions_copy = Actions.copy()
        print(Actions_copy)
        Actions_copy.remove(best_action_value)
        exex_actions = Actions_copy + [best_action_value]
        
        action = random.choices(exex_actions, exex_probs)[0]

        #second part of action selection
        Actions.remove(action)
        sorted_actions = Actions + [action]
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()
        #this random action selection is due to the randomness of the environment
        selected_action = random.choices(sorted_actions, actions_prob)[0]
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:
            next_state = terminate
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:
            next_state = start
        terminate = next_state
        trajectory.append(terminate)
        c = c+1

    return trajectory

"""In this function, trials (averaging) and improving policy are sone at the same time.
   At each trial, the q function (here a dictionary) computed and a better policy created.
   The next trial generate a new trajectory, based on new better policy that obtained the last trial."""
def OnPolicy_MC_prediction(num_trials, policy, gamma, epsilon):

    def reverse_dictionary(dict):

        reverse_dict = {}
        
        counter = 0
        for key in list(dict.keys()):

            val = dict[key]

            reverse_dict[val] = key
        
        return reverse_dict

    #store returns of each trajectory
    Returns = {} #np.zeros((environment[6],1))
    Lens = []
    
    #Loop for ever (for each episode)
    for trial in tqdm(range(num_trials)):
        
        #generate an episode
        trajectory = generate_trajectory_probability_based(policy,trial,epsilon)
        Lens.append(trajectory)

        #limit the lenght of trajectory

        #total reward
        G = 0

        trajectory.reverse()
        
        returns = {}
        for i in range(len(trajectory[1:-1])):

            step = trajectory[1:][i]

            returns[str(step)] = {}

            for action in ["[1,0]","[-1,0]","[0,1]","[0,-1]"]:

                returns[str(step)][action] = 0


        for state in environment[6]:
            
            if state not in environment[4] and state != environment[3]:

                for action in ["[1,0]","[-1,0]","[0,1]","[0,-1]"]:

                    returns[str(step)][action] = 0

        first_visit = []
        for i in range(len(trajectory[1:-1])):

            step = trajectory[1:-1][i]

            if step not in first_visit:

                first_visit.append(step)

                #if step != environment[2]:

                action = derive_action(trajectory[1:-1][i+1],trajectory[1:-1][i])
                #else:


                r = state_action_reward(policy,step)

                G = gamma * G + r

                #returns[(str(step),str(action))] = returns[str(step),str(action)] + G
                print(str(step),str(action))
                returns[str(step)][str(action)] = returns[str(step)][str(action)] + G

        #Returns[trial] = returns
    
        Q = {}

        for state in environment[6]:

                Q[str(state)] = {}
        #pair=(step,action)
        for state in list(returns.keys()):

            for action in ["[1,0]","[-1,0]","[0,1]","[0,-1]"]:

                Q[str(state)][action] = returns[str(state)][action]/trial

        policy = {}
        for state in list(Q.keys()):

            value_action_state = reverse_dictionary(Q[state])
            Max_val = max(list(value_action_state.keys()))
            best_action = value_action_state[Max_val]

            policy[state] = best_action
        

    return policy,Q


In [69]:
def state_action_reward(policy,state):

    if type(policy) == tuple:
        
        policy_state = policy[0]
    
    else:
        policy_state = policy

    
    
    next_state = policy_state[str(state)]

    if next_state in environment[4]:

        r = -3
    
    elif next_state == environment[3]:

        r = 10
    
    else:

        r = -1
    
    return r
    

def derive_action(current_state, next_state):

    row = next_state[0] - current_state[0]
    col = next_state[1] - current_state[1]
    action = [row, col]

    return action

def generate_trajectory_probability_based(policy,randomness,epsilon):

    
    probs = probability_distribution(environment[0]*environment[1],'stochastic')  
    start = environment[2]
    terminate = start
    trajectory = [start]
    c = 0
    test = []
    while terminate != environment[3]:
        random.seed(randomness+c)
        Actions = [[1,0],[-1,0],[0,1],[0,-1]]

        #we have two probabilities for epsilon-greedy action selection
        #It's a kind of exploration-exploitation balancing
        
        #probability for exploration on not best action values
        low_prob = epsilon/len(Actions)
        high_prob = 1 - epsilon + (epsilon/len(Actions))

        #this random action selection is for balancing exploration-exploitation trade-off

        exex_probs = [low_prob,low_prob,low_prob,high_prob]
        if type(policy) == tuple:
            policy = policy[1]
        
        best_action_value = policy[str(terminate)]
        #print(type(best_action_value))
        Actions_copy = Actions.copy()
        #print(Actions_copy)
        Actions_copy.remove(best_action_value)
        exex_actions = Actions_copy + [best_action_value]
        
        action = random.choices(exex_actions, exex_probs)[0]

        #second part of action selection
        Actions.remove(action)
        sorted_actions = Actions + [action]
        state_indice = state_indice_dict[str(terminate)]
        actions_prob = probs[state_indice]
        actions_prob.sort()
        #this random action selection is due to the randomness of the environment
        selected_action = random.choices(sorted_actions, actions_prob)[0]
        next_state = [x + y for x, y in zip(terminate, selected_action)]
        #if the agent goes out of the gridworld, it stays in its current state
        if next_state not in environment[6]:
            next_state = terminate
        #if it drops into the holes, it goes to the start points
        elif next_state in environment[4]:
            next_state = start
        terminate = next_state
        trajectory.append(terminate)
        c = c+1

    return trajectory
def OnPolicy_MC_prediction(num_trials, policy, gamma, epsilon):
    
    def reverse_dictionary(dict):
        reverse_dict = {}
        for key in list(dict.keys()):
            val = dict[key]
            reverse_dict[val] = key
        return reverse_dict

    Q = {}
    for state in environment[6]:
        Q[str(state)] = {}

        for action in ["[1,0]", "[-1,0]", "[0,1]", "[0,-1]"]:
                
            Q[str(state)][action] = 0

    for trial in tqdm(range(1,num_trials)):
        trajectory = generate_trajectory_probability_based(policy, trial, epsilon)
        

        G = 0
        returns = {}
        first_visit = []
        for i in range(len(trajectory[1:-1])):
            step = trajectory[1:-1][i]

            if step not in first_visit:
                
                state_str = str(step)
                if state_str not in returns:
                    returns[state_str] = {}
                    for action in ["[1,0]", "[-1,0]", "[0,1]", "[0,-1]"]:
                        returns[state_str][action] = 0  # Initialize all actions with value 0
                        
                first_visit.append(step)
                action = derive_action(trajectory[1:][i + 1], trajectory[1:][i])
                r = state_action_reward(policy, step)
                G = gamma * G + r
                action_str = str(action)
                returns[str(step)][action_str] += G

        


        for state in list(returns.keys()):
            for action in ["[1,0]", "[-1,0]", "[0,1]", "[0,-1]"]:
                Q[state][action] = returns[state][action] / trial

        policy = {}
        for state in list(Q.keys()):
            if Q[state] != {}:
                value_action_state = reverse_dictionary(Q[state])
                #print(state)
                #print(value_action_state)
                Max_val = max(list(value_action_state.keys()))
                best_action = value_action_state[Max_val]
                policy[state] = ast.literal_eval(best_action)
    
        
    return policy, Q

In [70]:
policy_0 = arbitrary_policy(41)
first_try = OnPolicy_MC_prediction(100, policy_0, 0.9, 0.1)

  0%|          | 0/99 [00:00<?, ?it/s]


KeyError: '[0, -1]'

In [66]:
first_try[1]

{'[0, 0]': {'[1,0]': 0, '[-1,0]': 0, '[0,1]': 0, '[0,-1]': 0},
 '[0, 1]': {'[1,0]': 0.0,
  '[-1,0]': 0.0,
  '[0,1]': 0.0,
  '[0,-1]': -0.047329191919191924},
 '[0, 2]': {'[1,0]': 0.0,
  '[-1,0]': 0.0,
  '[0,1]': 0.0,
  '[0,-1]': -0.05752856464646465},
 '[0, 3]': {'[1,0]': 0.0,
  '[-1,0]': 0.0,
  '[0,1]': 0.0,
  '[0,-1]': -0.061876718282828286},
 '[1, 0]': {'[1,0]': 0.0,
  '[-1,0]': 0.0,
  '[0,1]': 0.0,
  '[0,-1]': -0.05269728282828283},
 '[1, 1]': {'[1,0]': 0.0,
  '[-1,0]': 0.0,
  '[0,1]': 0.0,
  '[0,-1]': -0.04136464646464647},
 '[1, 2]': {'[1,0]': 0.0,
  '[-1,0]': 0.0,
  '[0,1]': 0.0,
  '[0,-1]': -0.03473737373737374},
 '[1, 3]': {'[1,0]': 0.0,
  '[-1,0]': 0.0,
  '[0,1]': 0.0,
  '[0,-1]': -0.06579005655555556},
 '[2, 0]': {'[1,0]': 0, '[-1,0]': 0, '[0,1]': 0, '[0,-1]': 0},
 '[2, 1]': {'[1,0]': 0.0,
  '[-1,0]': 0.0,
  '[0,1]': 0.0,
  '[0,-1]': -0.027373737373737373},
 '[2, 2]': {'[1,0]': 0.0,
  '[-1,0]': 0.0,
  '[0,1]': 0.0,
  '[0,-1]': -0.010101010101010102},
 '[2, 3]': {'[1,0]': 0.0

In [41]:
for i in range(1,4):
    print(i)

1
2
3


In [21]:
policy_0 = arbitrary_policy(41)

first_try = OnPolicy_MC_prediction(10,policy_0,0.9,0.1)

  0%|          | 0/10 [00:00<?, ?it/s]

[3, 3] [1, 0]





KeyError: '[1, 0]'

In [15]:
m = generate_trajectory_probability_based(policy_0,42,0.1)
m.reverse()
m

[[4, 3],
 [4, 2],
 [4, 1],
 [4, 0],
 [4, 1],
 [4, 2],
 [4, 1],
 [3, 1],
 [4, 1],
 [3, 1],
 [2, 1],
 [1, 1],
 [1, 2],
 [0, 1],
 [1, 1],
 [0, 1],
 [1, 1],
 [2, 1],
 [1, 1],
 [0, 1],
 [1, 1],
 [0, 1],
 [0, 2],
 [0, 1],
 [1, 1],
 [2, 1],
 [2, 2],
 [2, 1],
 [2, 2],
 [2, 1],
 [1, 1],
 [1, 2],
 [2, 1],
 [1, 1],
 [2, 1],
 [1, 1],
 [2, 1],
 [1, 1],
 [2, 1],
 [2, 2],
 [1, 2],
 [1, 1],
 [2, 1],
 [1, 1],
 [2, 1],
 [2, 2],
 [1, 2],
 [1, 3],
 [1, 3],
 [1, 3],
 [1, 2],
 [1, 3],
 [0, 3],
 [1, 3],
 [2, 3],
 [2, 3],
 [2, 2],
 [1, 2],
 [1, 1],
 [0, 1],
 [0, 1],
 [1, 1],
 [0, 1],
 [0, 2],
 [1, 2],
 [2, 2],
 [2, 3],
 [2, 2],
 [2, 3],
 [2, 2],
 [1, 2],
 [2, 1],
 [2, 2],
 [1, 2],
 [0, 2],
 [0, 2],
 [0, 3],
 [0, 2],
 [1, 2],
 [1, 3],
 [0, 3],
 [0, 3],
 [0, 2],
 [1, 2],
 [1, 1],
 [2, 1],
 [2, 2],
 [1, 2],
 [1, 1],
 [2, 1],
 [1, 1],
 [1, 2],
 [1, 0],
 [1, 1],
 [1, 2],
 [1, 3],
 [0, 3],
 [0, 2],
 [0, 2],
 [0, 3],
 [0, 2],
 [1, 2],
 [1, 0],
 [1, 0],
 [1, 1],
 [2, 1],
 [2, 2],
 [1, 2],
 [2, 1],
 [1, 1],
 [1, 2],
 

In [33]:
a = (1,2)
if type(a) == tuple:
    print(1)

1


In [44]:
a = {}

for i in range(4):

    a[str(i)] = {}

    for c in ['e','q']:
       a[str(i)][c] = -1

a['1']['e'] = a['1']['e'] + 4
a


{'0': {'e': -1, 'q': -1},
 '1': {'e': 3, 'q': -1},
 '2': {'e': -1, 'q': -1},
 '3': {'e': -1, 'q': -1}}

In [40]:
policy_0

({'[0, 1]': [0, 2],
  '[0, 2]': [1, 2],
  '[0, 3]': [0, 2],
  '[1, 0]': [1, 1],
  '[1, 1]': [0, 1],
  '[1, 2]': [1, 1],
  '[1, 3]': [1, 2],
  '[2, 1]': [3, 1],
  '[2, 2]': [3, 2],
  '[2, 3]': [1, 3],
  '[3, 1]': [4, 1],
  '[3, 3]': [3, 2],
  '[4, 0]': [4, 1],
  '[4, 1]': [4, 2],
  '[4, 2]': [4, 3],
  '[4, 3]': [4, 2]},
 {'[0, 1]': [0, 1],
  '[0, 2]': [1, 0],
  '[0, 3]': [0, -1],
  '[1, 0]': [0, 1],
  '[1, 1]': [-1, 0],
  '[1, 2]': [0, -1],
  '[1, 3]': [0, -1],
  '[2, 1]': [1, 0],
  '[2, 2]': [1, 0],
  '[2, 3]': [-1, 0],
  '[3, 1]': [1, 0],
  '[3, 3]': [0, -1],
  '[4, 0]': [0, 1],
  '[4, 1]': [0, 1],
  '[4, 2]': [0, 1],
  '[4, 3]': [0, -1]})