In [1]:
import numpy as np

In [2]:
# Grid world laid out as rows of cells.
# 0 marks an ordinary cell; a nonzero entry is the reward for entering that cell.
grid = [
    [0, 0, 1],
    [0, -1, 0],
]

In [3]:
# Available moves; an action's position in this list is its index on the last axis of Q.
actions = ['up', 'down', 'left', 'right']

In [41]:
# Action-value table indexed as Q[row, col, action]; every estimate starts at zero.
Q = np.zeros(shape=(2, 3, 4), dtype=float)

In [42]:
V = {} # dictionary mapping (row, col, action) -> list of returns observed for that pair

In [6]:
# Discount factor applied to future rewards when accumulating a return.
gamma = 0.9

In [60]:
def get_next_state(state, action, n_rows=2, n_cols=3):
    """Return the cell reached by taking ``action`` from ``state``.

    A move that would leave the grid is clipped, so the agent stays on the
    boundary cell. An unrecognised action leaves the position unchanged.

    Args:
        state: (row, col) pair; any two-item sequence is accepted.
        action: one of 'up', 'down', 'left', 'right'.
        n_rows: number of rows in the grid (defaults to the 2-row grid).
        n_cols: number of columns in the grid (defaults to the 3-column grid).

    Returns:
        The resulting (row, col) as a tuple.
    """
    r, c = state  # r is row, c is column number
    # The actions are mutually exclusive, so at most one branch applies.
    if action == 'up':
        r = max(0, r - 1)
    elif action == 'down':
        r = min(n_rows - 1, r + 1)
    elif action == 'left':
        c = max(0, c - 1)
    elif action == 'right':
        c = min(n_cols - 1, c + 1)
    return (r, c)

In [10]:
# Quick check: moving down from (0, 1) should land on (1, 1).
get_next_state((0,1), 'down')

(1, 1)

In [81]:
episodes = 100

np.random.seed(42)

# Start from a clean slate inside the cell: re-running it then reproduces the
# same table instead of piling duplicate returns onto an earlier run.
Q = np.zeros((2, 3, 4))  # rows x cols x actions
V = {}  # (row, col, action) -> list of returns observed for that pair

for i in range(episodes):

    # Roll out one episode from the top-left cell, picking actions uniformly
    # at random, until a cell with a nonzero grid value is reached.
    episode = []
    state = (0, 0)
    while grid[state[0]][state[1]] == 0:
        idx = np.random.randint(4)
        action = actions[idx]
        next_state = get_next_state(state, action)
        reward = grid[next_state[0]][next_state[1]]  # reward is the value of the cell entered
        episode.append((state, action, reward))
        state = next_state

    # Walk the episode backwards so G is the discounted return from each step
    # onwards; every visit of a (state, action) pair contributes one sample.
    G = 0
    for state, action, reward in reversed(episode):
        G = reward + gamma * G
        V.setdefault(tuple(state) + (action,), []).append(G)

# Q is never read while sampling, so average each pair's returns once at the
# end rather than re-averaging the whole (growing) list on every visit.
for (r, c, action), returns in V.items():
    Q[r, c, actions.index(action)] = np.mean(returns)
print("Monte Carlo Q-table:\n", Q)

Monte Carlo Q-table:
 [[[-0.24072987 -0.55452494 -0.2560369  -0.03740747]
  [-0.03842835 -1.         -0.22582771  1.        ]
  [ 0.          0.          0.          0.        ]]

 [[-0.2981654  -0.55510095 -0.47618509 -1.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]]


In [82]:
episodes = 1000
eta = 0.1  # Learning rate: how fast Q updates
np.random.seed(42)
Q = np.zeros((2, 3, 4))  # rows x cols x actions

for i in range(episodes):

    # Sample one trajectory from (0, 0) with uniformly random actions; it ends
    # as soon as the agent stands on a cell whose grid value is nonzero.
    episode = []
    state = (0, 0)
    while True:
        row, col = state
        if grid[row][col] != 0:
            break
        idx = np.random.randint(4)
        action = actions[idx]
        next_state = get_next_state(state, action)
        reward = grid[next_state[0]][next_state[1]]
        episode.append((state, action, reward))
        state = next_state

    # Replay the trajectory from the end, nudging each visited entry a
    # fraction eta of the way toward the discounted return G observed from it.
    G = 0
    for state, action, reward in episode[::-1]:
        G = reward + gamma * G
        idx = actions.index(action)
        row, col = state
        Q[row, col, idx] += eta * (G - Q[row, col, idx])
print("Monte Carlo Q-table Using Utility:\n", Q)

Monte Carlo Q-table Using Utility:
 [[[-0.29363462 -0.56999525 -0.36810418 -0.34573226]
  [ 0.13477342 -1.         -0.31915079  1.        ]
  [ 0.          0.          0.          0.        ]]

 [[-0.29647478 -0.4546927  -0.48827501 -1.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]]


In [83]:
# SARSA where both the current and the next action are drawn uniformly at random.
grid = [[0, 0, 1],
        [0, -1, 0]]
actions = ['up', 'down', 'left', 'right']
episodes = 1000
eta = 0.1    # Learning rate: how fast Q updates
gamma = 0.9  # discount rate
np.random.seed(42)
Q = np.zeros((2, 3, 4))

for i in range(episodes):

    state = (0, 0)  # initial state
    action_idx = np.random.randint(4)  # initial action
    while grid[state[0]][state[1]] == 0:

        action = actions[action_idx]
        next_state = get_next_state(state, action)
        reward = grid[next_state[0]][next_state[1]]
        # The next action is drawn before the update so it can be used in the target.
        next_action_idx = np.random.randint(4)

        row, col = state
        next_row, next_col = next_state
        td_target = reward + gamma * Q[next_row, next_col, next_action_idx]
        td_error = td_target - Q[row, col, action_idx]
        Q[row, col, action_idx] += eta * td_error

        # Advance: the action just drawn is the one actually taken next.
        state, action_idx = next_state, next_action_idx

print("SARSA Q-table:\n", Q)

SARSA Q-table:
 [[[-0.2179384  -0.49029948 -0.1516475  -0.23227311]
  [ 0.00148482 -1.         -0.16555867  1.        ]
  [ 0.          0.          0.          0.        ]]

 [[-0.14464207 -0.69375646 -0.50408981 -1.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]]


In [84]:
import numpy as np

# Environment
grid = [
    [0, 0, 1],
    [0, -1, 0],
]
actions = ['up', 'down', 'left', 'right']

# Hyper-parameters
episodes = 1000
alpha = 0.1    # Learning rate: how fast Q updates
gamma = 0.9
epsilon = 0.1  # Exploration: 10% random actions

# Action-value table, indexed [row, col, action]
Q = np.zeros(shape=(2, 3, 4))

def choose_action(state, Q, epsilon):
    """Pick an action index for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is returned;
    otherwise the index of the largest Q-value for the state is returned
    (``np.argmax`` resolves ties in favour of the lowest index).

    Args:
        state: (row, col) pair indexing the first two axes of ``Q``.
        Q: array of action values indexed [row, col, action].
        epsilon: exploration probability in [0, 1].

    Returns:
        The chosen action index as a plain ``int``.
    """
    # Take the action count from the table itself, so the exploratory draw
    # always stays within Q's action axis.
    n_actions = Q.shape[-1]
    if np.random.rand() < epsilon:
        return int(np.random.randint(n_actions))  # Random action
    return int(np.argmax(Q[state[0], state[1]]))  # Best action

for _ in range(episodes):
    # Every episode starts in the top-left cell with an epsilon-greedy first action.
    state = [0, 0]
    action_idx = choose_action(state, Q, epsilon)  # Initial action
    while grid[state[0]][state[1]] == 0:
        next_state = get_next_state(state, actions[action_idx])
        reward = grid[next_state[0]][next_state[1]]
        next_action_idx = choose_action(next_state, Q, epsilon)  # Next action

        # SARSA update: move the current estimate toward
        # reward + gamma * Q(next_state, next_action).
        current = Q[state[0], state[1], action_idx]
        target = reward + gamma * Q[next_state[0], next_state[1], next_action_idx]
        Q[state[0], state[1], action_idx] = current + alpha * (target - current)

        # Follow the policy: the action just chosen is the one taken next.
        state, action_idx = next_state, next_action_idx

print("SARSA Q-table with epsilon-greedy algorithm:\n", Q)

SARSA Q-table with epsilon-greedy algorithm:
 [[[ 0.61123594  0.58454454  0.60276723  0.74699543]
  [ 0.80070854 -0.9774716   0.62482031  1.        ]
  [ 0.          0.          0.          0.        ]]

 [[ 0.7157109   0.10045814  0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]]
