In [1]:
from statistics import mode
import gym
import numpy as np
import sys
from collections import defaultdict

# Shared environment: the action-selection and training functions below read
# this module-level `env` directly instead of receiving it as a parameter.
env = gym.make('Taxi-v3')

# Hyperparameters, read as globals by the functions below.
num_episodes = 10000  # episodes per training call in the cells below
gamma = 1.0           # discount applied to the bootstrapped next-state value
epsilon = 0.1         # probability of picking a uniformly random action
alpha = 0.1           # step size of each Q-value update

In [13]:
# NOTE(review): this cell carries execution count 13, so it was run out of
# sequence. Read top to bottom it closes `env` before the training cells
# below use it -- confirm whether it belongs at the end of the notebook.
env.close()

In [None]:
def choose_action(state, Q):
    """Select an action for `state` with an epsilon-greedy rule.

    With probability `epsilon` (module-level) a uniformly random action index
    in `[0, env.action_space.n)` is drawn; otherwise the index of the largest
    entry of `Q[state]` is returned (`np.argmax` keeps the first maximum).

    Args:
        state: Key used to look up the action values in `Q`.
        Q: Mapping from state to an array of per-action values.

    Returns:
        The chosen action index.
    """
    # The random draw happens first on every call, even when acting greedily.
    explore = np.random.rand() < epsilon
    if not explore:
        return np.argmax(Q[state])
    return np.random.randint(env.action_space.n)

def sarsa_update_value(state, action, reward, next_state, next_action, done, Q):
    """Apply one SARSA update to `Q[state][action]` in place.

    The update moves the stored value towards a target by step size `alpha`.
    The target is `reward` alone when `done` is true, otherwise
    `reward + gamma * Q[next_state][next_action]`.

    Args:
        state, action: The transition's starting state and the action taken.
        reward: Reward received for the transition.
        next_state, next_action: Successor state and the action chosen there;
            only read when `done` is false.
        done: Whether the transition ended the episode.
        Q: Mapping from state to an array of per-action values (mutated).
    """
    q_row = Q[state]
    current = q_row[action]
    if done:
        # No bootstrap term: Q[next_state] is not touched on terminal steps.
        target = reward
    else:
        target = reward + gamma * Q[next_state][next_action]
    q_row[action] = current + alpha * (target - current)

def run_sarsa(num_episodes, render = False):
    """Train a tabular SARSA agent on the module-level `env`.

    A fresh Q-table is created on every call, so runs are independent.

    Args:
        num_episodes: Number of episodes to run.
        render: When true, `env.render()` is called before every step of the
            final episode.

    Returns:
        A list with the summed reward of each episode, in order. Empty when
        `num_episodes` is 0.
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    total_reward = []
    for i in range(num_episodes):
        show = render and i == num_episodes - 1
        rewards = 0
        state, _info = env.reset()
        action = choose_action(state, Q)
        while True:
            if show:
                env.render()
            # step() reports two separate end-of-episode flags; either one
            # means the environment must be reset before stepping again.
            next_state, reward, terminated, truncated, _info = env.step(action)
            next_action = choose_action(next_state, Q)
            # Only `terminated` drops the bootstrap term: a truncated episode
            # stopped early, so the successor state's value still counts.
            sarsa_update_value(state, action, reward, next_state, next_action, terminated, Q)
            rewards += reward
            if terminated or truncated:
                break
            state, action = next_state, next_action
        total_reward.append(rewards)
        print(f'\repisode: {i + 1}/{num_episodes}', end = '')
        sys.stdout.flush()
    # max() raises on an empty list, so skip the summary for zero episodes.
    if total_reward:
        print(f'\nMax Reward: {max(total_reward)}')
    return total_reward
   
# Bind the result so the cell does not echo the whole per-episode reward
# list (one entry per episode) as its output.
sarsa_training_rewards = run_sarsa(num_episodes, True)

In [None]:
def q_update_value(state, action, reward, next_state, done, Q):
    """Apply one Q-learning update to `Q[state][action]` in place.

    The stored value is moved by step size `alpha` towards `reward` when
    `done` is true, or towards `reward + gamma * max(Q[next_state])`
    otherwise.

    Args:
        state, action: The transition's starting state and the action taken.
        reward: Reward received for the transition.
        next_state: Successor state; only read when `done` is false.
        done: Whether the transition ended the episode.
        Q: Mapping from state to an array of per-action values (mutated).
    """
    values = Q[state]
    td_target = reward
    if not done:
        # Bootstrap from the best action value of the successor state.
        td_target = reward + gamma * np.max(Q[next_state])
    values[action] += alpha * (td_target - values[action])
def run_q_learning(num_episodes, render = False):
    """Train a tabular Q-learning agent on the module-level `env`.

    A fresh Q-table is created on every call, so runs are independent.

    Args:
        num_episodes: Number of episodes to run.
        render: When true, `env.render()` is called before every step of the
            final episode.

    Returns:
        A list with the summed reward of each episode, in order. Empty when
        `num_episodes` is 0.
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    total_reward = []
    for i in range(num_episodes):
        show = render and i == num_episodes - 1
        rewards = 0
        state, _info = env.reset()
        while True:
            if show:
                env.render()
            action = choose_action(state, Q)
            # step() reports two separate end-of-episode flags; either one
            # means the environment must be reset before stepping again.
            next_state, reward, terminated, truncated, _info = env.step(action)
            # Only `terminated` drops the bootstrap term: a truncated episode
            # stopped early, so the successor state's value still counts.
            q_update_value(state, action, reward, next_state, terminated, Q)
            rewards += reward
            if terminated or truncated:
                break
            state = next_state
        total_reward.append(rewards)
        print(f'\repisode: {i + 1}/{num_episodes}', end = '')
        sys.stdout.flush()
    # max() raises on an empty list, so skip the summary for zero episodes.
    if total_reward:
        print(f'\nMax Reward: {max(total_reward)}')
    return total_reward
    
# Bind the result so the cell does not echo the whole per-episode reward
# list (one entry per episode) as its output.
q_learning_training_rewards = run_q_learning(num_episodes, True)

In [None]:
def expected_sarsa_update_value(state, action, reward, next_state, done, Q):
    """Apply one Expected SARSA update to `Q[state][action]` in place.

    When `done` is true the target is `reward`. Otherwise the target is
    `reward` plus `gamma` times the expectation of `Q[next_state]` under the
    epsilon-greedy distribution: every action gets `epsilon / n` and the
    argmax action gets an extra `1 - epsilon`.

    Args:
        state, action: The transition's starting state and the action taken.
        reward: Reward received for the transition.
        next_state: Successor state; only read when `done` is false.
        done: Whether the transition ended the episode.
        Q: Mapping from state to an array of per-action values (mutated).
    """
    if done:
        target = reward
    else:
        n_actions = env.action_space.n
        next_values = Q[next_state]
        # Uniform exploration mass, then the greedy bonus on the best action.
        probs = np.ones(n_actions) * epsilon / n_actions
        probs[np.argmax(next_values)] += 1 - epsilon
        target = reward + gamma * np.dot(probs, next_values)
    q_row = Q[state]
    current = q_row[action]
    q_row[action] = current + alpha * (target - current)
def run_expected_sarsa(num_episodes, render = False):
    """Train a tabular Expected SARSA agent on the module-level `env`.

    A fresh Q-table is created on every call, so runs are independent.

    Args:
        num_episodes: Number of episodes to run.
        render: When true, `env.render()` is called before every step of the
            final episode.

    Returns:
        A list with the summed reward of each episode, in order. Empty when
        `num_episodes` is 0.
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    total_reward = []
    for i in range(num_episodes):
        show = render and i == num_episodes - 1
        rewards = 0
        state, _info = env.reset()
        while True:
            if show:
                env.render()
            action = choose_action(state, Q)
            # step() takes the action only; passing the Q-table as a second
            # argument raised TypeError. It reports two separate
            # end-of-episode flags, and either one requires a reset.
            next_state, reward, terminated, truncated, _info = env.step(action)
            # Only `terminated` drops the bootstrap term: a truncated episode
            # stopped early, so the successor state's value still counts.
            expected_sarsa_update_value(state, action, reward, next_state, terminated, Q)
            rewards += reward
            if terminated or truncated:
                break
            state = next_state
        total_reward.append(rewards)
        print(f'\repisode: {i + 1}/{num_episodes}', end = '')
        sys.stdout.flush()
    # max() raises on an empty list, so skip the summary for zero episodes.
    if total_reward:
        print(f'\nMax Reward: {max(total_reward)}')
    return total_reward
    
# Bind the result so the cell does not echo the whole per-episode reward
# list (one entry per episode) as its output.
expected_sarsa_training_rewards = run_expected_sarsa(num_episodes, True)

In [None]:
import matplotlib.pyplot as plt
# Learning-curve comparison: average each algorithm's per-episode reward over
# several independent training runs and plot the three curves together.
n_runs = 100
# One constant sizes both the accumulators and the training runs, so every
# trained episode is plotted and no training work is silently discarded.
episodes_per_run = 500

sarsa_reward = np.zeros(episodes_per_run)
expected_sarsa_reward = np.zeros(episodes_per_run)
q_reward = np.zeros(episodes_per_run)
for _ in range(n_runs):
    # Each call builds a fresh Q-table, so the runs are independent samples.
    sarsa_reward += np.asarray(run_sarsa(episodes_per_run), dtype=float)
    expected_sarsa_reward += np.asarray(run_expected_sarsa(episodes_per_run), dtype=float)
    q_reward += np.asarray(run_q_learning(episodes_per_run), dtype=float)

# Turn the per-episode sums into per-episode means.
sarsa_reward /= n_runs
expected_sarsa_reward /= n_runs
q_reward /= n_runs

fig, ax = plt.subplots()
ax.plot(sarsa_reward, label='sarsa reward')
ax.plot(expected_sarsa_reward, label='expected sarsa reward')
ax.plot(q_reward, label='q learning reward')
ax.set_xlabel('episode')
ax.set_ylabel(f'mean total reward over {n_runs} runs')
ax.legend()
plt.show()

In [None]:
import gym
# Random-action rollout of LunarLander rendered in a window.
# NOTE(review): this rebinds the module-level `env` that the Taxi training
# functions above read as a global; re-running those cells after this one
# would use the lander environment -- confirm that is acceptable.
lander_seed = 42
lander_steps = 1000

env = gym.make("LunarLander-v2", render_mode="human")
try:
    env.action_space.seed(lander_seed)

    observation, info = env.reset(seed=lander_seed)

    for _ in range(lander_steps):
        observation, reward, terminated, truncated, info = env.step(env.action_space.sample())

        # Either flag ends the episode; start a new one and keep going.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Always release the environment, even if a step raises.
    env.close()