# Week 8, Day 5: Advanced Reinforcement Learning Topics

## Learning Objectives
- Understand advanced RL concepts
- Learn multi-agent RL
- Master hierarchical RL
- Practice implementing advanced techniques

## Topics Covered
1. Multi-Agent RL
2. Hierarchical RL
3. Meta-Learning
4. Exploration Strategies

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
from collections import deque
import random

## 1. Multi-Agent RL

In [None]:
class MultiAgentDQN:
    """Independent DQN learners that share one replay buffer and one epsilon.

    Every agent owns its own Q-network (see ``_build_model``), but all agents
    read from the same ``memory`` and explore with the same ``epsilon``.

    A stored transition is the tuple
    ``(state, actions, rewards, next_state, done)`` where ``state``,
    ``actions``, ``rewards`` and ``next_state`` are indexable by agent id and
    ``done`` is a single flag shared by all agents.
    """

    def __init__(self, state_size, action_size, n_agents):
        """Create ``n_agents`` Q-networks for the given state/action sizes."""
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0  # current exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995  # multiplicative decay applied per replay()
        self.learning_rate = 0.001
        self.agents = [self._build_model() for _ in range(n_agents)]

    def _build_model(self):
        """Return a compiled two-hidden-layer MLP mapping a state to Q-values."""
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(24, input_dim=self.state_size, activation='relu'),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by recent Keras releases.
        model.compile(
            loss='mse',
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
        )
        return model

    def _as_batch(self, state):
        """Return ``state`` as a ``(1, state_size)`` array for ``predict``."""
        return np.reshape(state, (1, self.state_size))

    def _stack(self, per_sample_states):
        """Stack per-sample states into a ``(batch, state_size)`` float array."""
        return np.array(
            [np.reshape(s, self.state_size) for s in per_sample_states],
            dtype=float,
        )

    def remember(self, state, actions, rewards, next_state, done):
        """Append one joint transition to the shared replay buffer."""
        self.memory.append((state, actions, rewards, next_state, done))

    def act(self, states):
        """Choose one epsilon-greedy action per entry of ``states``.

        Args:
            states: iterable whose i-th element is agent i's observation,
                either flat (``state_size`` values) or already batched as
                ``(1, state_size)``.

        Returns:
            List of integer actions, one per observation.
        """
        actions = []
        for i, state in enumerate(states):
            if random.random() <= self.epsilon:
                actions.append(random.randrange(self.action_size))
            else:
                q_values = self.agents[i].predict(self._as_batch(state), verbose=0)
                actions.append(int(np.argmax(q_values[0])))
        return actions

    def replay(self, batch_size):
        """Train every agent on one minibatch and decay epsilon.

        Does nothing while fewer than ``batch_size`` transitions are stored.
        Targets are computed for the whole minibatch with two ``predict``
        calls per agent instead of two calls per transition.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        # 1.0 where the episode continues, 0.0 where it ended, so the
        # bootstrap term vanishes on terminal transitions.
        continuing = np.array([0.0 if t[4] else 1.0 for t in minibatch])
        rows = np.arange(batch_size)

        for agent_id in range(self.n_agents):
            agent = self.agents[agent_id]
            states = self._stack(t[0][agent_id] for t in minibatch)
            next_states = self._stack(t[3][agent_id] for t in minibatch)
            taken = np.array([t[1][agent_id] for t in minibatch], dtype=int)
            rewards = np.array([t[2][agent_id] for t in minibatch], dtype=float)

            next_q = agent.predict(next_states, verbose=0)
            # Copy so the in-place write below never touches a shared buffer.
            targets = np.array(agent.predict(states, verbose=0))
            # Only the Q-value of the action actually taken is moved toward
            # the TD target; the other outputs keep their predictions.
            targets[rows, taken] = (
                rewards + self.gamma * continuing * np.max(next_q, axis=1)
            )

            agent.fit(states, targets, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

## 2. Hierarchical RL

In [None]:
class HierarchicalRL:
    """Two-level policy: a meta-controller picks options, options pick actions.

    The class holds one meta-controller network (softmax over ``n_options``),
    one policy network per option (softmax over ``action_size``) and one
    termination network per option (sigmoid output). It only provides the
    sampling/termination queries; no training loop is defined here.
    """

    def __init__(self, state_size, action_size, n_options=4):
        """Build the meta-controller and the per-option networks."""
        self.state_size = state_size
        self.action_size = action_size
        self.n_options = n_options

        # Meta-controller (selects options)
        self.meta_controller = self._build_meta_controller()

        # Option policies
        self.option_policies = [self._build_option_policy() for _ in range(n_options)]

        # Option termination conditions
        self.termination_nets = [self._build_termination_net() for _ in range(n_options)]

    def _build_mlp(self, n_hidden_layers, n_outputs, output_activation, loss):
        """Return a compiled MLP with 32-unit ReLU hidden layers."""
        layers = [
            tf.keras.layers.Dense(32, input_dim=self.state_size, activation='relu')
        ]
        layers.extend(
            tf.keras.layers.Dense(32, activation='relu')
            for _ in range(n_hidden_layers - 1)
        )
        layers.append(tf.keras.layers.Dense(n_outputs, activation=output_activation))
        model = tf.keras.Sequential(layers)
        model.compile(optimizer='adam', loss=loss)
        return model

    def _build_meta_controller(self):
        """Return the network that outputs a distribution over options."""
        return self._build_mlp(2, self.n_options, 'softmax', 'categorical_crossentropy')

    def _build_option_policy(self):
        """Return a network that outputs a distribution over primitive actions."""
        return self._build_mlp(2, self.action_size, 'softmax', 'categorical_crossentropy')

    def _build_termination_net(self):
        """Return a network that outputs a single termination probability."""
        return self._build_mlp(1, 1, 'sigmoid', 'binary_crossentropy')

    @staticmethod
    def _sample_index(probs):
        """Sample an index from ``probs`` after renormalising in float64.

        Softmax outputs are float32 and their sum can miss 1.0 by more than
        ``np.random.choice`` tolerates, which makes it raise ``ValueError``.
        """
        p = np.asarray(probs, dtype=np.float64)
        p = p / p.sum()
        return int(np.random.choice(len(p), p=p))

    def _as_batch(self, state):
        """Return ``state`` reshaped to ``(1, state_size)``."""
        return np.reshape(state, [1, self.state_size])

    def select_option(self, state):
        """Sample an option index from the meta-controller's distribution."""
        option_probs = self.meta_controller.predict(self._as_batch(state), verbose=0)[0]
        return self._sample_index(option_probs)

    def select_action(self, state, option):
        """Sample a primitive action from the policy of ``option``."""
        action_probs = self.option_policies[option].predict(
            self._as_batch(state), verbose=0
        )[0]
        return self._sample_index(action_probs)

    def should_terminate(self, state, option):
        """Return True when ``option``'s termination output exceeds 0.5."""
        termination_prob = self.termination_nets[option].predict(
            self._as_batch(state), verbose=0
        )[0, 0]
        return bool(termination_prob > 0.5)

## 3. Meta-Learning

In [None]:
class MAMLAgent:
    """Simplified meta-learning agent built around one softmax policy network.

    ``adapt_to_task`` fine-tunes a copy of the meta-model on one task with
    SGD. ``meta_update`` averages per-task gradients evaluated at the current
    meta-parameters and applies them with the meta-optimizer.

    NOTE: ``meta_update`` does not differentiate through (or even run) the
    inner adaptation step, so this is multi-task gradient averaging rather
    than full MAML.
    """

    def __init__(self, state_size, action_size):
        """Build the meta-model for the given state and action sizes."""
        self.state_size = state_size
        self.action_size = action_size
        self.meta_lr = 0.01  # step size of the meta-optimizer (Adam)
        self.task_lr = 0.1  # step size of the per-task SGD adaptation
        self.model = self._build_model()

    def _build_model(self):
        """Return the compiled meta-model (two hidden layers, softmax output)."""
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(32, input_dim=self.state_size, activation='relu'),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='softmax')
        ])
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by recent Keras releases.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.meta_lr),
                      loss='categorical_crossentropy')
        return model

    def adapt_to_task(self, task_data):
        """Return a copy of the meta-model fine-tuned for one epoch on a task.

        Args:
            task_data: ``(states, actions)`` pair passed to ``fit`` as inputs
                and targets.

        Returns:
            A new Keras model; the meta-model itself is left unchanged.
        """
        # clone_model copies the architecture only, so the weights are
        # transferred explicitly.
        task_model = tf.keras.models.clone_model(self.model)
        task_model.set_weights(self.model.get_weights())
        task_model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=self.task_lr),
                           loss='categorical_crossentropy')

        states, actions = task_data
        task_model.fit(states, actions, epochs=1, verbose=0)

        return task_model

    def meta_update(self, tasks_data):
        """Apply one meta-optimizer step using gradients averaged over tasks.

        Args:
            tasks_data: iterable of ``(states, actions)`` pairs, one per task.
                An empty iterable is a no-op.
        """
        tasks_data = list(tasks_data)
        if not tasks_data:
            return

        variables = self.model.trainable_variables
        per_task_gradients = []

        for states, actions in tasks_data:
            with tf.GradientTape() as tape:
                predictions = self.model(states)
                # Reduce to a scalar mean so the gradient magnitude does not
                # grow with the number of samples in the task.
                loss = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(actions, predictions)
                )
            per_task_gradients.append(tape.gradient(loss, variables))

        # zip(*...) regroups the gradients per variable across tasks.
        avg_gradients = [
            tf.reduce_mean(tf.stack(grads), axis=0)
            for grads in zip(*per_task_gradients)
        ]

        self.model.optimizer.apply_gradients(zip(avg_gradients, variables))

## 4. Exploration Strategies

In [None]:
class NoveltySearch:
    """Archive-based novelty score: mean distance to the k nearest stored states."""

    def __init__(self, state_size, k_neighbors=10):
        """Create an empty archive.

        Args:
            state_size: dimensionality of a state (stored for reference).
            k_neighbors: number of nearest archived states averaged by
                ``compute_novelty``.

        Raises:
            ValueError: if ``k_neighbors`` is smaller than 1, which would
                make every novelty score NaN.
        """
        if k_neighbors < 1:
            raise ValueError("k_neighbors must be at least 1")
        self.state_size = state_size
        self.k_neighbors = k_neighbors
        self.archive = []

    def compute_novelty(self, state):
        """Return the mean Euclidean distance from ``state`` to its k nearest
        archived states.

        While the archive holds fewer than ``k_neighbors`` states the result
        is ``inf``, so early states always count as novel.
        """
        if len(self.archive) < self.k_neighbors:
            return float('inf')

        query = np.asarray(state, dtype=float).ravel()
        # One row per archived state; flattening keeps the Euclidean
        # distance identical for states of any shape.
        archived = np.asarray(self.archive, dtype=float).reshape(len(self.archive), -1)
        distances = np.linalg.norm(archived - query, axis=1)

        # partition places the k smallest distances first without fully
        # sorting the array.
        k = self.k_neighbors
        nearest = np.partition(distances, k - 1)[:k]
        return float(np.mean(nearest))

    def update_archive(self, state, novelty_threshold=1.0):
        """Score ``state`` and archive it when it exceeds ``novelty_threshold``.

        Returns:
            The novelty computed before any insertion.
        """
        novelty = self.compute_novelty(state)
        if novelty > novelty_threshold:
            # Store a copy so later in-place changes by the caller cannot
            # alter the archive.
            self.archive.append(np.array(state))

        return novelty

class IntrinsicMotivation:
    """Curiosity bonus from a next-state predictor and a slowly tracking copy.

    ``predictor`` is trained to map a state to the next state. ``target`` has
    the same architecture, starts from its own random initialisation and is
    moved toward ``predictor`` by a fraction ``tau`` on every ``update``.
    """

    def __init__(self, state_size, tau=0.05):
        """Build both networks.

        Args:
            state_size: dimensionality of a state.
            tau: fraction of the predictor weights blended into the target
                network on each ``update`` (0.05 keeps 95% of the old target).
        """
        self.state_size = state_size
        self.tau = tau
        self.predictor = self._build_predictor()
        self.target = self._build_predictor()

    def _build_predictor(self):
        """Return a compiled MLP mapping a state to a state-sized output."""
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(32, input_dim=self.state_size, activation='relu'),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(self.state_size)
        ])
        model.compile(optimizer='adam', loss='mse')
        return model

    def _as_batch(self, states):
        """Return ``states`` as a ``(batch, state_size)`` array."""
        return np.reshape(states, (-1, self.state_size))

    def compute_curiosity(self, state, next_state):
        """Return prediction error plus predictor/target disagreement.

        Both terms are mean squared differences: predictor output versus
        ``next_state``, and predictor output versus target-network output.
        """
        state = self._as_batch(state)
        next_state = self._as_batch(next_state)

        prediction = self.predictor.predict(state, verbose=0)
        target_prediction = self.target.predict(state, verbose=0)

        prediction_error = np.mean((prediction - next_state) ** 2)
        disagreement = np.mean((prediction - target_prediction) ** 2)

        return float(prediction_error + disagreement)

    def update(self, state, next_state):
        """Fit the predictor on the transition, then soft-update the target."""
        self.predictor.fit(
            self._as_batch(state), self._as_batch(next_state), verbose=0
        )
        # Slow (Polyak) update of the target network toward the predictor.
        target_weights = self.target.get_weights()
        predictor_weights = self.predictor.get_weights()
        new_weights = [(1.0 - self.tau) * t + self.tau * p
                       for t, p in zip(target_weights, predictor_weights)]
        self.target.set_weights(new_weights)

## Practical Exercises

In [None]:
# Exercise 1: Multi-Agent System

def multi_agent_exercise():
    """Print the task statement and steps for Exercise 1 (multi-agent system)."""
    instructions = (
        "Task: Implement multi-agent coordination",
        "1. Create agent network",
        "2. Implement communication",
        "3. Train agents",
        "4. Evaluate performance",
    )
    for instruction in instructions:
        print(instruction)

    # Your code here

multi_agent_exercise()

In [None]:
# Exercise 2: Exploration Strategy

def exploration_exercise():
    """Print the task statement and steps for Exercise 2 (exploration strategy)."""
    instructions = (
        "Task: Implement advanced exploration",
        "1. Design exploration metric",
        "2. Implement search strategy",
        "3. Test exploration",
        "4. Analyze results",
    )
    for instruction in instructions:
        print(instruction)

    # Your code here

exploration_exercise()

## MCQ Quiz

1. What is multi-agent RL?
   - a) Single agent learning
   - b) Multiple agents learning in a shared environment
   - c) Model-based learning
   - d) Supervised learning

2. What is hierarchical RL?
   - a) Flat policy
   - b) Nested policies
   - c) Single policy
   - d) Random policy

3. What is meta-learning?
   - a) Single task learning
   - b) Learning to learn
   - c) Supervised learning
   - d) Model-based learning

4. What is novelty search?
   - a) Random search
   - b) Behavioral diversity
   - c) Policy search
   - d) Value search

5. What is intrinsic motivation?
   - a) External rewards
   - b) Internal rewards
   - c) Fixed rewards
   - d) Random rewards

6. What is option learning?
   - a) Single action
   - b) Temporal abstraction
   - c) Random policy
   - d) Value function

7. What is curriculum learning?
   - a) Random tasks
   - b) Progressive tasks
   - c) Single task
   - d) Fixed task

8. What is transfer learning in RL?
   - a) Single task
   - b) Knowledge reuse
   - c) Random learning
   - d) Fixed policy

9. What does "exploration vs. exploitation" refer to?
   - a) Random actions
   - b) Learning tradeoff
   - c) Fixed policy
   - d) Value function

10. What is model-based RL?
    - a) Model-free learning
    - b) Environment modeling
    - c) Random policy
    - d) Fixed policy

Answers: 1-b, 2-b, 3-b, 4-b, 5-b, 6-b, 7-b, 8-b, 9-b, 10-b