# Week 8, Day 2: Q-Learning and SARSA

## Learning Objectives
- Understand Q-Learning algorithm
- Learn SARSA algorithm
- Master temporal difference learning
- Practice implementing value-based methods

## Topics Covered
1. Q-Learning
2. SARSA
3. Exploration Strategies
4. Algorithm Comparison

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym
from collections import defaultdict
import random

## 1. Q-Learning Implementation

In [None]:
class QLearningAgent:
    """Tabular Q-Learning agent with epsilon-greedy action selection.

    Q-values are stored in a ``defaultdict`` keyed by state; a state's row of
    ``n_actions`` zeros is created the first time that state is looked up, so
    states must be hashable.
    """

    def __init__(self, n_states, n_actions, learning_rate=0.1, discount_factor=0.95, epsilon=0.1):
        """Create an agent with an empty (lazily filled) Q-table.

        Args:
            n_states: Number of states. Accepted for interface compatibility
                but not used: the Q-table grows on demand.
            n_actions: Number of discrete actions; also the length of each
                Q-table row.
            learning_rate: Step size applied to the TD error.
            discount_factor: Discount applied to the next state's value.
            epsilon: Probability of picking a uniformly random action.
        """
        self.q_table = defaultdict(lambda: np.zeros(n_actions))
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        self.n_actions = n_actions

    def get_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value (first index on ties).
        The result is always a plain Python ``int``.
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        # np.argmax yields np.int64; convert so both branches return the same type.
        return int(np.argmax(self.q_table[state]))

    def learn(self, state, action, reward, next_state, done=False):
        """Apply one Q-Learning (off-policy TD) update in place.

        Args:
            state: State in which ``action`` was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: State reached after the transition.
            done: Set to True when ``next_state`` is terminal, in which case
                the target is the reward alone (no bootstrapping). Defaults
                to False, which reproduces the plain update.
        """
        # Off-policy target: value of the greedy action in the next state.
        next_value = 0.0 if done else np.max(self.q_table[next_state])
        td_target = reward + self.gamma * next_value
        td_error = td_target - self.q_table[state][action]
        self.q_table[state][action] += self.lr * td_error

def train_q_learning():
    """Train a Q-Learning agent on FrozenLake-v1 and plot its learning curve.

    Runs 1000 episodes, records the total reward of each one, then plots the
    100-episode rolling mean. Works with both gym step/reset APIs: the older
    one (``reset() -> obs``, ``step() -> 4-tuple``) and the one introduced in
    gym 0.26 (``reset() -> (obs, info)``, ``step() -> 5-tuple``).

    Returns:
        The trained ``QLearningAgent``.
    """
    # Initialize environment
    env = gym.make('FrozenLake-v1')
    agent = QLearningAgent(env.observation_space.n, env.action_space.n)

    # Training parameters
    episodes = 1000
    rewards_history = []

    # Training loop
    try:
        for _ in range(episodes):
            reset_result = env.reset()
            # Newer gym returns (observation, info); a tuple cannot be used as
            # a Q-table key because info is a dict, so keep the observation only.
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
            total_reward = 0
            done = False

            while not done:
                action = agent.get_action(state)
                step_result = env.step(action)
                if len(step_result) == 5:
                    # Newer API splits "done" into terminated / truncated.
                    next_state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    next_state, reward, done, _ = step_result

                agent.learn(state, action, reward, next_state)
                state = next_state
                total_reward += reward

            rewards_history.append(total_reward)
    finally:
        # Release environment resources even if training raises.
        env.close()

    # Plot results
    plt.figure(figsize=(10, 5))
    plt.plot(pd.Series(rewards_history).rolling(100).mean())
    plt.title('Q-Learning: Average Reward over Episodes')
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')
    plt.show()

    return agent

# Run training (which also shows the learning-curve plot) and keep the returned agent.
q_learning_agent = train_q_learning()

## 2. SARSA Implementation

In [None]:
class SARSAAgent:
    """Tabular SARSA agent with epsilon-greedy action selection.

    Q-values are stored in a ``defaultdict`` keyed by state; a state's row of
    ``n_actions`` zeros is created the first time that state is looked up, so
    states must be hashable.
    """

    def __init__(self, n_states, n_actions, learning_rate=0.1, discount_factor=0.95, epsilon=0.1):
        """Create an agent with an empty (lazily filled) Q-table.

        Args:
            n_states: Number of states. Accepted for interface compatibility
                but not used: the Q-table grows on demand.
            n_actions: Number of discrete actions; also the length of each
                Q-table row.
            learning_rate: Step size applied to the TD error.
            discount_factor: Discount applied to the next state-action value.
            epsilon: Probability of picking a uniformly random action.
        """
        self.q_table = defaultdict(lambda: np.zeros(n_actions))
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        self.n_actions = n_actions

    def get_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value (first index on ties).
        The result is always a plain Python ``int``.
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        # np.argmax yields np.int64; convert so both branches return the same type.
        return int(np.argmax(self.q_table[state]))

    def learn(self, state, action, reward, next_state, next_action, done=False):
        """Apply one SARSA (on-policy TD) update in place.

        Args:
            state: State in which ``action`` was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: State reached after the transition.
            next_action: Action the agent has chosen to take in ``next_state``.
            done: Set to True when ``next_state`` is terminal, in which case
                the target is the reward alone (no bootstrapping). Defaults
                to False, which reproduces the plain update.
        """
        # On-policy target: value of the action actually selected next.
        next_value = 0.0 if done else self.q_table[next_state][next_action]
        td_target = reward + self.gamma * next_value
        td_error = td_target - self.q_table[state][action]
        self.q_table[state][action] += self.lr * td_error

def train_sarsa():
    """Train a SARSA agent on FrozenLake-v1 and plot its learning curve.

    Runs 1000 episodes, records the total reward of each one, then plots the
    100-episode rolling mean. Works with both gym step/reset APIs: the older
    one (``reset() -> obs``, ``step() -> 4-tuple``) and the one introduced in
    gym 0.26 (``reset() -> (obs, info)``, ``step() -> 5-tuple``).

    Returns:
        The trained ``SARSAAgent``.
    """
    # Initialize environment
    env = gym.make('FrozenLake-v1')
    agent = SARSAAgent(env.observation_space.n, env.action_space.n)

    # Training parameters
    episodes = 1000
    rewards_history = []

    # Training loop
    try:
        for _ in range(episodes):
            reset_result = env.reset()
            # Newer gym returns (observation, info); a tuple cannot be used as
            # a Q-table key because info is a dict, so keep the observation only.
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
            # SARSA picks the first action before entering the step loop.
            action = agent.get_action(state)
            total_reward = 0
            done = False

            while not done:
                step_result = env.step(action)
                if len(step_result) == 5:
                    # Newer API splits "done" into terminated / truncated.
                    next_state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    next_state, reward, done, _ = step_result
                next_action = agent.get_action(next_state)

                agent.learn(state, action, reward, next_state, next_action)

                state = next_state
                action = next_action
                total_reward += reward

            rewards_history.append(total_reward)
    finally:
        # Release environment resources even if training raises.
        env.close()

    # Plot results
    plt.figure(figsize=(10, 5))
    plt.plot(pd.Series(rewards_history).rolling(100).mean())
    plt.title('SARSA: Average Reward over Episodes')
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')
    plt.show()

    return agent

# Run training (which also shows the learning-curve plot) and keep the returned agent.
sarsa_agent = train_sarsa()

## 3. Algorithm Comparison

In [None]:
def _reset_env(env):
    """Reset ``env`` and return only the initial observation (old or new gym API)."""
    result = env.reset()
    # gym >= 0.26 returns (observation, info); older versions return the observation.
    return result[0] if isinstance(result, tuple) else result


def _step_env(env, action):
    """Step ``env`` and return ``(next_state, reward, done)`` for either gym API."""
    result = env.step(action)
    if len(result) == 5:
        # Newer API splits "done" into terminated / truncated.
        next_state, reward, terminated, truncated, _ = result
        return next_state, reward, terminated or truncated
    next_state, reward, done, _ = result
    return next_state, reward, done


def compare_algorithms():
    """Train Q-Learning and SARSA on FrozenLake-v1 and plot both learning curves.

    For each of 10 independent runs, a fresh agent of each kind is trained for
    1000 episodes on the same environment instance. Per-episode rewards are
    averaged across runs and smoothed with a 100-episode rolling mean before
    plotting. Works with both the pre-0.26 and the 0.26+ gym step/reset APIs.
    """
    # Initialize environment
    env = gym.make('FrozenLake-v1')

    # Training parameters
    episodes = 1000
    n_runs = 10

    # Storage for results: one row per run, one column per episode
    q_learning_rewards = np.zeros((n_runs, episodes))
    sarsa_rewards = np.zeros((n_runs, episodes))

    # Run multiple training sessions
    try:
        for run in range(n_runs):
            # Q-Learning
            agent = QLearningAgent(env.observation_space.n, env.action_space.n)
            for episode in range(episodes):
                state = _reset_env(env)
                total_reward = 0
                done = False

                while not done:
                    action = agent.get_action(state)
                    next_state, reward, done = _step_env(env, action)
                    agent.learn(state, action, reward, next_state)
                    state = next_state
                    total_reward += reward

                q_learning_rewards[run, episode] = total_reward

            # SARSA
            agent = SARSAAgent(env.observation_space.n, env.action_space.n)
            for episode in range(episodes):
                state = _reset_env(env)
                action = agent.get_action(state)
                total_reward = 0
                done = False

                while not done:
                    next_state, reward, done = _step_env(env, action)
                    next_action = agent.get_action(next_state)
                    agent.learn(state, action, reward, next_state, next_action)
                    state = next_state
                    action = next_action
                    total_reward += reward

                sarsa_rewards[run, episode] = total_reward
    finally:
        # Release environment resources even if training raises.
        env.close()

    # Plot comparison
    plt.figure(figsize=(12, 6))

    plt.plot(pd.Series(q_learning_rewards.mean(axis=0)).rolling(100).mean(),
             label='Q-Learning')
    plt.plot(pd.Series(sarsa_rewards.mean(axis=0)).rolling(100).mean(),
             label='SARSA')

    plt.title('Q-Learning vs SARSA: Average Reward over Episodes')
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')
    plt.legend()
    plt.show()

# Run the comparison and show the combined plot.
compare_algorithms()

## Practical Exercises

In [None]:
# Exercise 1: Q-Learning Implementation

def q_learning_exercise():
    """Print the task description and steps for the Q-Learning exercise."""
    instructions = (
        "Task: Implement Q-Learning algorithm",
        "1. Initialize Q-table",
        "2. Implement epsilon-greedy",
        "3. Implement Q-update",
        "4. Train and evaluate",
    )
    for line in instructions:
        print(line)

    # Your code here

# Print the exercise instructions.
q_learning_exercise()

In [None]:
# Exercise 2: SARSA Implementation

def sarsa_exercise():
    """Print the task description and steps for the SARSA exercise."""
    instructions = (
        "Task: Implement SARSA algorithm",
        "1. Initialize Q-table",
        "2. Implement action selection",
        "3. Implement SARSA update",
        "4. Train and evaluate",
    )
    for line in instructions:
        print(line)

    # Your code here

# Print the exercise instructions.
sarsa_exercise()

## MCQ Quiz

1. What is Q-Learning?
   - a) Policy gradient method
   - b) Off-policy TD learning
   - c) Model-based method
   - d) Supervised learning

2. What is SARSA?
   - a) Model-based method
   - b) On-policy TD learning
   - c) Supervised learning
   - d) Policy gradient

3. What is temporal difference learning?
   - a) Supervised learning
   - b) Bootstrapping method
   - c) Policy gradient
   - d) Model-based learning

4. What is epsilon-greedy?
   - a) Learning rate
   - b) Exploration strategy
   - c) Reward function
   - d) Value function

5. What is off-policy learning?
   - a) Online learning
   - b) Learning about a target policy from data generated by a different behavior policy
   - c) Model-based learning
   - d) Policy gradient

6. What is the Q-value?
   - a) Reward
   - b) State-action value
   - c) Policy
   - d) Model

7. What is the learning rate?
   - a) Exploration rate
   - b) Update step size
   - c) Discount factor
   - d) Reward scale

8. What is bootstrapping?
   - a) Exploration
   - b) Updating value estimates from other estimates
   - c) Policy update
   - d) Model learning

9. What is the difference between Q-Learning and SARSA?
   - a) Learning rate
   - b) Off-policy vs. on-policy update target
   - c) Model type
   - d) Reward scale

10. What is the exploration-exploitation tradeoff?
    - a) Learning rate
    - b) Action selection balance
    - c) Policy type
    - d) Value function

Answers: 1-b, 2-b, 3-b, 4-b, 5-b, 6-b, 7-b, 8-b, 9-b, 10-b