# CliffWorld

Problem 1: A programming problem on tabular RL methods (30 points)
In this problem, you will implement Q-learning and SARSA on a Cliff Walking problem and compare
their performance.

Problem statement: Cliffworld! This is a gridworld with a cliff. The description can be found
in Sutton and Barto’s Example 6.6. It is a useful example for gaining intuition into how the
learning approaches of Q-learning and SARSA differ from each other.

![image.png](attachment:image.png)

In [None]:
def getTransitions(width = 6,height = 4):
    """Build the deterministic transition table of a cliff-walking grid.

    States are numbered row by row starting at the bottom-left corner, so the
    cell in column ``x`` of row ``y`` has index ``y * width + x``. A 4 x 3
    grid, for example, is laid out as::

        8  9 10 11
        4  5  6  7
        0  1  2  3

    Returns:
        A list with one entry per state. Each entry lists the successor state
        for the four actions in the order up, down, left, right. A move that
        would leave the grid keeps the agent where it is. Bottom-row cells
        strictly between the two corners are cliff cells: every action taken
        from them leads to state 0. The bottom-right corner maps to itself
        under every action.
    """
    index_of = {
        (col, row): row * width + col
        for row in range(height)
        for col in range(width)
    }

    table = []
    for row in range(height):
        for col in range(width):
            # Cliff cells send the agent back to the start whatever it does.
            if row == 0 and 1 <= col < width - 1:
                table.append([0, 0, 0, 0])
                continue
            here = index_of[(col, row)]
            # Neighbours in action order: up, down, left, right.
            neighbours = (
                (col, row + 1),
                (col, row - 1),
                (col - 1, row),
                (col + 1, row),
            )
            # Off-grid neighbours fall back to the current cell.
            table.append([index_of.get(cell, here) for cell in neighbours])

    # The bottom-right corner is absorbing.
    goal = index_of[(width - 1, 0)]
    table[goal] = [goal] * 4
    return table

In [None]:
import numpy as np
import random 
import matplotlib.pyplot as plt

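Because `getTransitions` generates the transition table, we don't have to write out all the states by hand. Its default is a half-width world (6×4), which should still show the same qualitative results, but the cell below builds the full 12×4 grid (48 states).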

In [None]:
# Each row of the table lists successor states in UDLR order:
# Up, Down, Left, Right.
transitions = getTransitions(width=12, height=4)

In [None]:
# Reward Function
def r(s):
    """Return the reward for entering state ``s``.

    States 1 through 10 yield -100; every other state yields -1.
    """
    penalised_states = range(1, 11)
    return -100 if s in penalised_states else -1


In [None]:
def moving_average(data, num_points=3):
    """Return the simple moving average of ``data``.

    Entry ``k`` of the result is the mean of ``data[k:k + num_points]``, for
    every window that fits entirely inside ``data``. With ``num_points=1`` the
    result therefore has the same values as ``data``.

    Args:
        data: Sequence of numbers that supports ``len`` and slicing.
        num_points: Window length; must be at least 1.

    Returns:
        A list of ``len(data) - num_points + 1`` averages, or an empty list
        when ``data`` is shorter than the window.

    Raises:
        ValueError: If ``num_points`` is smaller than 1.
    """
    if num_points < 1:
        raise ValueError("num_points must be at least 1")
    # Start at 0 and include the final full window, so that no leading or
    # trailing windows are dropped.
    return [
        sum(data[start:start + num_points]) / num_points
        for start in range(len(data) - num_points + 1)
    ]

In [None]:
def egreedy_policy(q_values, state, epsilon=0.1):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is returned;
    otherwise the index of the largest entry of ``q_values[state]`` is
    returned (``np.argmax`` picks the first one on ties).

    Args:
        q_values: Table indexed as ``q_values[state][action]``.
        state: Row of ``q_values`` to act from.
        epsilon: Probability of choosing a random action.

    Returns:
        The chosen action index.
    """
    action_values = q_values[state]
    if np.random.random() < epsilon:
        # Sample over however many actions the table actually has rather than
        # assuming exactly four.
        return np.random.choice(len(action_values))
    return np.argmax(action_values)

In [None]:
def Q_Learning(num_episodes, num_states = 48, num_actions = 4, gamma = 1, alpha = 0.5,
               epsilon = 0.0, start_state = 0, terminal_state = 11):
    """Run tabular Q-learning and return the learned table and episode returns.

    The environment is taken from module-level names: ``transitions`` gives
    the successor as ``transitions[state][action]``, ``r`` gives the reward
    for the state that was entered, and ``egreedy_policy`` selects actions.

    Args:
        num_episodes: Number of episodes to run.
        num_states: Number of rows in the Q-table.
        num_actions: Number of columns in the Q-table.
        gamma: Discount factor.
        alpha: Step size of the update.
        epsilon: Exploration probability passed to ``egreedy_policy``. The
            default of 0.0 keeps the purely greedy behaviour.
        start_state: State in which every episode begins.
        terminal_state: State that ends an episode when reached.

    Returns:
        A tuple ``(q_val, episode_rewards)``: the ``num_states`` x
        ``num_actions`` array of action values, and a list holding the summed
        reward of each episode.
    """
    q_val = np.zeros((num_states, num_actions))
    episode_rewards = []
    for _ in range(num_episodes):
        episode_reward = 0
        state = start_state  # Initialize S
        while state != terminal_state:
            # Choose A from S with the behaviour policy.
            action = egreedy_policy(q_val, state, epsilon = epsilon)
            # Take A, observe R and S'.
            next_state = transitions[state][action]
            reward = r(next_state)
            # Off-policy target: bootstrap from the best action in S'.
            td_target = reward + gamma*np.max(q_val[next_state])
            q_val[state][action] = q_val[state][action] + alpha * (td_target - q_val[state][action])
            episode_reward += reward
            state = next_state
        episode_rewards.append(episode_reward)
    return q_val, episode_rewards

In [None]:
def sarsa(num_episodes, num_states = 48, num_actions = 4, gamma = 1, alpha = 0.5,
          epsilon = 0.01, start_state = 0, terminal_state = 11):
    """Run tabular SARSA and return the learned table and episode returns.

    The environment is taken from module-level names: ``transitions`` gives
    the successor as ``transitions[state][action]``, ``r`` gives the reward
    for the state that was entered, and ``egreedy_policy`` selects actions.

    Args:
        num_episodes: Number of episodes to run.
        num_states: Number of rows in the Q-table.
        num_actions: Number of columns in the Q-table.
        gamma: Discount factor.
        alpha: Step size of the update.
        epsilon: Exploration probability passed to ``egreedy_policy``. The
            same value is used for the first action of an episode and for
            every later one, so all actions come from a single policy.
        start_state: State in which every episode begins.
        terminal_state: State that ends an episode when reached.

    Returns:
        A tuple ``(q_val, episode_rewards)``: the ``num_states`` x
        ``num_actions`` array of action values, and a list holding the summed
        reward of each episode.
    """
    q_val = np.zeros((num_states, num_actions))
    episode_rewards = []
    for _ in range(num_episodes):
        episode_reward = 0
        state = start_state  # Initialize S
        # Choose A from S with the behaviour policy.
        action = egreedy_policy(q_val, state, epsilon = epsilon)

        while state != terminal_state:
            # Take A, observe R and S'.
            next_state = transitions[state][action]
            reward = r(next_state)
            # Choose A' from S' with the same policy.
            next_action = egreedy_policy(q_val, next_state, epsilon = epsilon)
            # On-policy target: bootstrap from the action that will be taken.
            td_target = reward + gamma*q_val[next_state][next_action]
            q_val[state][action] = q_val[state][action] + alpha*(td_target - q_val[state][action])
            episode_reward += reward
            state = next_state
            action = next_action

        episode_rewards.append(episode_reward)
    return q_val, episode_rewards

In [None]:
# NOTE(review): these four values are not passed to the calls below, which
# therefore run with the defaults declared by Q_Learning and sarsa.
num_states = 48
num_actions = 4
gamma = 1
alpha = 1

q_val, Q_rewards = Q_Learning(500)
SARSA_val, SARSA_rewards = sarsa(500)

# Arrow glyph for each action index (up, down, left, right).
dir = ['^', 'v', '<', '>']

# Print the greedy action of every cell, top row first, for each method.
for title, table in (("------QLearn------", q_val), ("------SARSA------", SARSA_val)):
    print(title)
    for row in range(3, -1, -1):
        arrows = [dir[np.argmax(table[row * 12 + col])] for col in range(12)]
        # Every glyph is followed by a space, including the last on the line.
        print(*arrows, end=' \n')

In [None]:
# Smooth both reward curves with the same window and draw them on one figure.
avg_points = 1
Q_average = moving_average(Q_rewards, avg_points)
SARSA_average = moving_average(SARSA_rewards, avg_points)

for label, curve in (("Q Learning", Q_average), ("SARSA", SARSA_average)):
    plt.plot(curve, label=label)
plt.legend()
plt.show()