# Week 8, Day 6: Reinforcement Learning Hackathon Challenge

## Challenge Overview
Build an end-to-end RL solution using concepts learned throughout Week 8:
- Basic RL Concepts
- Q-Learning and SARSA
- Deep Q-Networks
- Policy Gradients
- Advanced RL Topics

## Problem: Multi-Task Learning Agent
Create a system that can learn and perform multiple tasks:
1. Navigation and Pathfinding
2. Resource Collection
3. Strategic Decision Making
4. Multi-Agent Coordination

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
from collections import deque
import random

## Part 1: Environment Setup

In [None]:
class MultiTaskEnvironment(gym.Env):
    """Placeholder multi-task environment with purely random dynamics.

    Observations are 10 floats in ``[0, 1]`` and there are 4 discrete
    actions. In this starter version the transition noise, the reward and
    the termination signal are all drawn at random and do not depend on the
    chosen action; ``_calculate_reward`` is the hook meant to be replaced by
    task-specific logic.

    The class follows the classic Gym API: ``reset()`` returns only the
    observation and ``step()`` returns ``(obs, reward, done, info)``.
    """

    def __init__(self):
        super().__init__()

        # Define action and observation space
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space = gym.spaces.Box(
            low=0, high=1,
            shape=(10,),
            dtype=np.float32
        )

        # Initialize environment state
        self.reset()

    def reset(self):
        """Start a new episode and return the initial observation.

        The state is sampled uniformly from ``[0, 1)`` and cast to the dtype
        declared by ``observation_space`` so observations match the space.
        """
        shape = self.observation_space.shape
        self.state = np.random.random(shape).astype(self.observation_space.dtype)
        return self.state

    def step(self, action):
        """Advance the environment by one step.

        Args:
            action: Action chosen by the agent; only forwarded to
                ``_calculate_reward``.

        Returns:
            Tuple ``(next_state, reward, done, info)`` where ``next_state`` is
            the previous state plus Gaussian noise (std 0.1) clipped to
            ``[0, 1]``, ``done`` is ``True`` with probability 0.1 and ``info``
            is an empty dict.
        """
        # Random walk, kept inside the bounds and dtype of the observation space.
        noise = np.random.normal(0, 0.1, self.observation_space.shape)
        next_state = np.clip(self.state + noise, 0, 1).astype(
            self.observation_space.dtype
        )

        # Calculate reward
        reward = self._calculate_reward(action)

        # Update state
        self.state = next_state

        # Episodes end at random; convert numpy.bool_ to a plain bool.
        done = bool(np.random.random() < 0.1)

        return next_state, reward, done, {}

    def _calculate_reward(self, action):
        """Return the reward for ``action``.

        Placeholder: currently a uniform random number in ``[0, 1)`` that
        ignores both the action and the state.
        """
        return np.random.random()

# Smoke test: build the environment and drive it with random actions for at
# most ten steps, stopping early if the episode terminates.
env = MultiTaskEnvironment()
state = env.reset()

steps_left = 10
while steps_left > 0:
    steps_left -= 1
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    print(f"Action: {action}, Reward: {reward:.2f}, Done: {done}")
    if done:
        break

## Part 2: Agent Implementation

In [None]:
class MultiTaskAgent:
    """Agent that keeps one learner per task on top of a shared replay buffer.

    * ``'navigation'`` is handled by a DQN with epsilon-greedy exploration.
    * ``'resource'`` is handled by a softmax policy network.
    * any other task name (e.g. ``'strategic'``) is handled by an
      actor-critic pair.

    Transitions are read from ``self.memory`` as
    ``(state, action, reward, next_state, done)`` tuples. States may be flat
    vectors or arrays shaped ``(1, state_size)``; they are reshaped before
    being fed to the networks.
    """

    def __init__(self, state_size, action_size, batch_size=32):
        """Create the networks, replay buffer and hyperparameters.

        Args:
            state_size: Length of one observation vector.
            action_size: Number of discrete actions.
            batch_size: Number of transitions sampled per training step;
                training is skipped until the buffer holds this many.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size

        # Learning parameters. These are set before the networks are built
        # because the builders read self.learning_rate when compiling.
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001

        # Experience replay buffer
        self.memory = deque(maxlen=2000)

        # DQN for navigation
        self.navigation_net = self._build_dqn()

        # Policy network for resource collection
        self.resource_net = self._build_policy_net()

        # Actor-Critic for strategic decisions
        self.actor = self._build_actor()
        self.critic = self._build_critic()

    # ------------------------------------------------------------------
    # Network construction
    # ------------------------------------------------------------------
    def _build_mlp(self, output_units, output_activation, loss):
        """Build and compile a 24-24 ReLU MLP with the given output head."""
        model = tf.keras.Sequential([
            tf.keras.Input(shape=(self.state_size,)),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(output_units, activation=output_activation),
        ])
        model.compile(
            loss=loss,
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
        )
        return model

    def _build_dqn(self):
        """Q-network: one linear output per action, trained with MSE."""
        return self._build_mlp(self.action_size, 'linear', 'mse')

    def _build_policy_net(self):
        """Policy network: softmax over actions, cross-entropy loss."""
        return self._build_mlp(self.action_size, 'softmax',
                               'categorical_crossentropy')

    def _build_actor(self):
        """Actor: softmax over actions, cross-entropy loss."""
        return self._build_mlp(self.action_size, 'softmax',
                               'categorical_crossentropy')

    def _build_critic(self):
        """Critic: single linear state-value output, trained with MSE."""
        return self._build_mlp(1, 'linear', 'mse')

    # ------------------------------------------------------------------
    # Acting
    # ------------------------------------------------------------------
    def act(self, state, task):
        """Choose an action for ``state`` using the learner tied to ``task``.

        Any task other than ``'navigation'`` or ``'resource'`` is routed to
        the actor network.
        """
        if task == 'navigation':
            return self._act_dqn(state)
        elif task == 'resource':
            return self._act_policy(state)
        else:  # strategic
            return self._act_actor(state)

    def _as_row(self, state):
        """Return ``state`` as a ``(1, state_size)`` batch of one."""
        return np.reshape(state, (1, self.state_size))

    def _sample_action(self, network, state):
        """Sample an action from the softmax distribution ``network`` outputs."""
        policy = network.predict(self._as_row(state), verbose=0)[0]
        return np.random.choice(self.action_size, p=policy)

    def _act_dqn(self, state):
        """Epsilon-greedy action from the Q-network."""
        if random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.navigation_net.predict(self._as_row(state), verbose=0)
        return int(np.argmax(q_values[0]))

    def _act_policy(self, state):
        """Stochastic action from the resource policy network."""
        return self._sample_action(self.resource_net, state)

    def _act_actor(self, state):
        """Stochastic action from the actor network."""
        return self._sample_action(self.actor, state)

    # ------------------------------------------------------------------
    # Batch helpers (pure NumPy)
    # ------------------------------------------------------------------
    def _sample_batch(self):
        """Draw ``batch_size`` transitions uniformly from the replay buffer."""
        return random.sample(self.memory, self.batch_size)

    def _unpack_batch(self, minibatch):
        """Split transitions into arrays with one row per transition.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where states
            have shape ``(n, state_size)``, actions are integers and
            rewards/dones are 1-D float arrays (``dones`` is 1.0 where the
            episode ended).
        """
        n = len(minibatch)
        states = np.asarray(
            [t[0] for t in minibatch], dtype=np.float32
        ).reshape(n, self.state_size)
        actions = np.asarray([t[1] for t in minibatch], dtype=np.int64)
        rewards = np.asarray([t[2] for t in minibatch], dtype=np.float32)
        next_states = np.asarray(
            [t[3] for t in minibatch], dtype=np.float32
        ).reshape(n, self.state_size)
        dones = np.asarray([t[4] for t in minibatch], dtype=np.float32)
        return states, actions, rewards, next_states, dones

    def _one_hot(self, actions):
        """One-hot encode integer actions as ``(n, action_size)`` floats."""
        return np.eye(self.action_size, dtype=np.float32)[actions]

    def _td_targets(self, rewards, next_values, dones):
        """One-step TD targets ``r + gamma * V(s') * (1 - done)``.

        ``next_values`` is flattened first so a ``(n, 1)`` network output
        does not broadcast against the ``(n,)`` rewards into ``(n, n)``.
        """
        return rewards + self.gamma * np.reshape(next_values, -1) * (1.0 - dones)

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def train(self, task):
        """Run one training step for ``task``.

        Does nothing until the buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.memory) < self.batch_size:
            return

        if task == 'navigation':
            self._train_dqn()
        elif task == 'resource':
            self._train_policy()
        else:  # strategic
            self._train_actor_critic()

    def _train_dqn(self):
        """One batched Q-learning update followed by epsilon decay.

        The whole mini-batch is predicted and fitted in one call each,
        rather than one network call per transition.
        """
        states, actions, rewards, next_states, dones = self._unpack_batch(
            self._sample_batch()
        )

        next_q = self.navigation_net.predict(next_states, verbose=0)
        targets = self._td_targets(rewards, np.max(next_q, axis=1), dones)

        # Only the Q-value of the action actually taken is moved to the target.
        q_targets = np.array(self.navigation_net.predict(states, verbose=0))
        q_targets[np.arange(len(actions)), actions] = targets
        self.navigation_net.fit(states, q_targets, batch_size=len(states),
                                epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def _train_policy(self):
        """Reward-weighted cross-entropy update of the resource policy.

        Each sampled action is the (one-hot) label and its immediate reward
        is the sample weight.
        """
        states, actions, rewards, _, _ = self._unpack_batch(
            self._sample_batch()
        )
        self.resource_net.fit(states, self._one_hot(actions),
                              sample_weight=rewards, batch_size=len(states),
                              epochs=1, verbose=0)

    def _train_actor_critic(self):
        """One actor-critic update using one-step TD advantages."""
        states, actions, rewards, next_states, dones = self._unpack_batch(
            self._sample_batch()
        )

        # Train critic towards the TD targets.
        values = np.reshape(self.critic.predict(states, verbose=0), -1)
        next_values = self.critic.predict(next_states, verbose=0)
        targets = self._td_targets(rewards, next_values, dones)
        self.critic.fit(states, targets, batch_size=len(states),
                        epochs=1, verbose=0)

        # Train actor: advantage-weighted cross-entropy on the taken actions.
        advantages = targets - values
        self.actor.fit(states, self._one_hot(actions),
                       sample_weight=advantages, batch_size=len(states),
                       epochs=1, verbose=0)

## Part 3: Training Loop

In [None]:
def train_agent(episodes=1000, tasks=('navigation', 'resource', 'strategic')):
    """Train a ``MultiTaskAgent`` on randomly interleaved tasks.

    A fresh environment and agent are created. Each episode picks one task
    at random, runs until the environment reports ``done``, stores every
    transition in the agent's replay buffer and calls ``agent.train(task)``
    after each step.

    Args:
        episodes: Number of episodes to run.
        tasks: Task names to sample from. The agent routes any name other
            than ``'navigation'`` or ``'resource'`` to its actor-critic.

    Returns:
        Dict mapping each task name to the list of total episode rewards
        obtained on that task, in the order the episodes were played.

    Raises:
        ValueError: If ``tasks`` is empty.
    """
    tasks = list(tasks)
    if not tasks:
        raise ValueError("tasks must contain at least one task name")

    env = MultiTaskEnvironment()
    state_size = env.observation_space.shape[0]
    agent = MultiTaskAgent(state_size, env.action_space.n)

    scores = {task: [] for task in tasks}

    for episode in range(episodes):
        # Randomly select task
        task = random.choice(tasks)
        # States are kept as (1, state_size) rows, ready for the networks.
        state = np.reshape(env.reset(), [1, state_size])
        total_reward = 0

        done = False
        while not done:
            # Select and perform action
            action = agent.act(state, task)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            # Store experience
            agent.memory.append((state, action, reward, next_state, done))

            # Update state and reward
            state = next_state
            total_reward += reward

            # Train agent
            agent.train(task)

        scores[task].append(total_reward)
        print(f"Episode: {episode + 1}, Task: {task}, Score: {total_reward}")

    return scores

# Run the training loop and keep the per-task episode scores.
scores = train_agent()

# Draw one curve per task: the 100-episode rolling mean of its scores.
plt.figure(figsize=(12, 6))
for task, task_scores in scores.items():
    smoothed = pd.Series(task_scores).rolling(100).mean()
    plt.plot(smoothed, label=task)
plt.title('Training Progress by Task')
plt.xlabel('Episode')
plt.ylabel('Average Score')
plt.legend()
plt.show()

## Evaluation Criteria

Your solution will be evaluated based on:

1. Navigation (25%)
   - Path optimization
   - Obstacle avoidance
   - Learning efficiency

2. Resource Collection (25%)
   - Collection strategy
   - Resource prioritization
   - Efficiency metrics

3. Strategic Decisions (25%)
   - Decision quality
   - Adaptation ability
   - Long-term planning

4. Multi-Agent Coordination (25%)
   - Communication
   - Cooperation
   - Team performance

## Submission Guidelines
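1. Complete all four tasks in this notebook (the starter code wires up only navigation, resource collection, and strategic decisions; multi-agent coordination is left for you to add)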
2. Document your approach and decisions
3. Include visualizations and analysis
4. Provide suggestions for improvement