<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/Lab_15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Implement any algorithm from the lecture to solve the grid game (e.g., Q-Learning)

In [1]:
# Import necessary libraries
import numpy as np

In [2]:
# Environment Class
class Grid:
    def __init__(self, width, height, start):
        self.width = width
        self.height = height
        self.i = start[0]
        self.j = start[1]

    def set_grid(self, rewards, actions):
        # rewards should be a dict of: (i, j): r (row, col): reward
        # actions should be a dict of: (i, j): A (row, col): list of possible actions
        self.rewards = rewards
        self.actions = actions

    def set_state(self, s):
        self.i = s[0]
        self.j = s[1]

    def current_state(self):
        return (self.i, self.j)

    def is_terminal(self, s):
        return s not in self.actions

    def move(self, action):
        # check if legal move first
        if action in self.actions[(self.i, self.j)]:
            if action == 'U':
                self.i -= 1
            elif action == 'D':
                self.i += 1
            elif action == 'R':
                self.j += 1
            elif action == 'L':
                self.j -= 1
        # return a reward (if any)
        return self.rewards.get((self.i, self.j), 0)

    def game_over(self):
        # returns true if game is over, else false
        # true if we are in a state where no actions are possible
        return (self.i, self.j) not in self.actions

    def all_states(self):
        # returns either a position that has possible next actions
        # or a position that yields a reward
        return set(self.actions.keys()) | set(self.rewards.keys())

In [3]:
### CREATE GRID OBJECT INSTANCE WITH ACTIONS/REWARDS
# Step 1: Initialize the Grid object
grid = Grid(4, 4, (0, 0))

# Step 2: Define rewards and actions
rewards = {(3, 3): 1, (2, 3): -1}  # example rewards at positions (3, 3) and (2, 3)
actions = {
    (0, 0): ['D', 'R'],
    (0, 1): ['L', 'R'],
    (0, 2): ['L', 'D', 'R'],
    # ... (define actions for other cells)
    (3, 2): ['U', 'R'],
    (3, 3): []  # no actions possible at terminal state
}

# Step 3: Set the grid with rewards and actions
grid.set_grid(rewards, actions)

In [4]:
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')
### INITIALIZE NECCESSARY DATA STRUCTS (e.g., Q table)
import random

# Define the grid dimensions and start position
grid_width = 4
grid_height = 4
start_position = (0, 0)

# Initialize the Grid
grid = Grid(grid_width, grid_height, start_position)

# Define all possible actions
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# Initialize the Q-table
# For each state, assign a random small value for each action
Q = {}
states = grid.all_states()
for state in states:
    Q[state] = {}
    for action in ALL_POSSIBLE_ACTIONS:
        Q[state][action] = random.uniform(0, 0.1)  # Small random values

# Example of how to access Q-value for a specific state-action pair
# Q[(1, 2)]['R']  # Q-value for state (1, 2) and action 'R'

AttributeError: ignored

In [None]:
### DEFINE HELPER FUNCTIONS, IF NEEDED (e.g., random_action)
import random

def random_action(a, eps=0.1):
    """
    Choose an action based on epsilon-greedy strategy.
    - a: current best action based on max Q-value.
    - eps: probability of choosing a random action.
    """
    p = np.random.random()
    if p < (1 - eps):
        return a
    else:
        return np.random.choice(ALL_POSSIBLE_ACTIONS)

def max_dict(d):
    """
    Returns the key and value with the highest value from dictionary.
    """
    max_key = None
    max_val = float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_val = v
            max_key = k
    return max_key, max_val

def print_values(V, g):
    """
    Print the values of the grid.
    - V: dictionary of state to value
    - g: grid object
    """
    for i in range(g.width):
        print("---------------------------")
        for j in range(g.height):
            v = V.get((i, j), 0)
            if v >= 0:
                print(" %.2f|" % v, end="")
            else:
                print("%.2f|" % v, end="")  # negative sign takes up an extra space
        print("")

def print_policy(P, g):
    """
    Print the policy for each grid state.
    - P: dictionary of state to action
    - g: grid object
    """
    for i in range(g.width):
        print("---------------------------")
        for j in range(g.height):
            a = P.get((i, j), ' ')
            print("  %s  |" % a, end="")
        print("")

In [None]:
### SET ALGORITHM HYPERPARAMETERS
### HERE

In [None]:
s = (0, 0) # start state
grid.set_state(s)

### IMPLEMENT TRAINING LOOP
### HERE

In [None]:
### PRINT FINAL POLICY (actions learnt for each state)
### HERE