In [5]:
!pip install gym==0.26.2 numpy stable-baselines3 shimmy>=2.0

# Verify CUDA availability
import torch
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dopamine-rl 4.1.2 requires gym<=0.25.2, but you have gym 0.26.2 which is incompatible.[0m[31m
[0mCUDA Available: True
CUDA Device: Tesla T4


In [6]:
import gym
from gym import spaces
import numpy as np

class ResourceAllocationEnv(gym.Env):
    """Single-resource task queue in which an agent serves one slot per step.

    The state is an integer vector of length ``N``; entry ``i`` is the remaining
    processing requirement of the task in slot ``i`` (0 means the slot is empty).
    On every step the agent picks a slot, ``C`` units of work are removed from it,
    and finishing a task yields a reward of 1. Afterwards, with probability ``p``,
    a new task with a requirement drawn uniformly from ``1..M`` is placed in a
    randomly chosen empty slot (the arrival is dropped when no slot is empty).

    The class uses the legacy API that the rest of this notebook calls:
    ``reset()`` returns only the observation and ``step()`` returns the 4-tuple
    ``(obs, reward, done, info)``.
    NOTE(review): the RL library's gym compatibility layer presumably keys off
    this legacy signature -- verify before adding ``seed``/``options`` to reset().
    """

    def __init__(self, N=5, M=10, p=0.5, C=1, max_steps=1000):
        """
        Initialize the resource allocation environment.

        Args:
            N (int): Number of slots for tasks
            M (int): Maximum processing requirement for a task
            p (float): Probability of a new task arriving
            C (int): Resource capacity allocated per action
            max_steps (int): Maximum steps per episode

        Raises:
            ValueError: If N, M or C is not positive, or p is outside [0, 1].
        """
        super(ResourceAllocationEnv, self).__init__()
        # Fail fast on configurations that would otherwise break or stall later
        # (e.g. C <= 0 means no task can ever be completed).
        if N < 1:
            raise ValueError(f"N must be >= 1, got {N}")
        if M < 1:
            raise ValueError(f"M must be >= 1, got {M}")
        if C < 1:
            raise ValueError(f"C must be >= 1, got {C}")
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"p must be within [0, 1], got {p}")

        self.N = N  # Number of slots
        self.M = M  # Maximum task requirement
        self.p = p  # Probability of new task arrival
        self.C = C  # Resource allocation per action
        self.max_steps = max_steps
        self.current_step = 0

        # Define observation and action spaces
        self.observation_space = spaces.MultiDiscrete([M + 1] * N)  # 0 to M for each slot
        self.action_space = spaces.Discrete(N)  # Choose one of N slots
        self.state = np.zeros(N, dtype=np.int32)  # Initial state: all slots empty

    def reset(self):
        """Reset the environment to the all-empty state.

        Returns:
            np.ndarray: A copy of the initial state, so that later in-place
            updates made by ``step()`` cannot alter an observation the caller
            is still holding.
        """
        self.state = np.zeros(self.N, dtype=np.int32)
        self.current_step = 0
        return self.state.copy()

    def step(self, action):
        """
        Take a step in the environment based on the agent's action.

        Args:
            action (int): Slot to allocate resources to (0 to N-1)

        Returns:
            state (np.array): Copy of the new state
            reward (int): 1 if the chosen task was completed this step, else 0
            done (bool): Whether the episode reached ``max_steps``
            info (dict): Always empty
        """
        assert self.action_space.contains(action), "Invalid action"
        reward = 0

        # Allocate resources to the chosen slot if it has a task
        if self.state[action] > 0:
            remaining = self.state[action] - self.C
            if remaining <= 0:
                # Clamp at zero: an overshoot still just empties the slot.
                self.state[action] = 0
                reward += 1  # Reward for completing a task
            else:
                self.state[action] = remaining

        # New task arrival with probability p; it may land in the slot that was
        # emptied a moment ago, because emptiness is checked after allocation.
        if np.random.rand() < self.p:
            empty_slots = np.where(self.state == 0)[0]
            if len(empty_slots) > 0:
                slot = np.random.choice(empty_slots)
                self.state[slot] = np.random.randint(1, self.M + 1)

        self.current_step += 1
        done = (self.current_step >= self.max_steps)
        return self.state.copy(), reward, done, {}

    def render(self, mode='human'):
        """Print the step counter, the raw state and the number of occupied slots.

        ``mode`` is accepted for API compatibility but is not used.
        """
        print(f"Step {self.current_step}: State: {self.state}, Queue: {np.sum(self.state > 0)} tasks")

In [7]:
# Build the single-resource environment used for the DQN experiment:
# 5 slots, task sizes up to 10, 50% arrival chance per step, 1 unit of work
# per action, and episodes that end after 1000 steps.
env = ResourceAllocationEnv(N=5, M=10, p=0.5, C=1, max_steps=1000)

In [8]:
from stable_baselines3 import DQN

# Initialize DQN with CUDA if available.
# `torch` is not imported in this cell; it comes from the setup cell at the top
# of the notebook, so that cell must have been run first.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# The raw environment object is passed in directly; the library wraps it itself
# (the captured output reports a `Monitor` wrapper and a DummyVecEnv).
model = DQN(
    policy="MlpPolicy",  # Multi-layer perceptron policy
    env=env,
    learning_rate=1e-3,
    buffer_size=10000,  # replay buffer capacity, in transitions
    batch_size=32,
    verbose=1,  # print the training log tables
    device=device  # Use CUDA if available
)

Using device: cuda
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [9]:
# Train for 100,000 environment steps. Episodes only end when the step counter
# reaches max_steps=1000, so this corresponds to 100 episodes.
# `learn` is the last expression of the cell, so the notebook echoes the model repr.
print("Training the DQN agent...")
model.learn(total_timesteps=100000)

Training the DQN agent...
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 177      |
|    exploration_rate | 0.62     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 650      |
|    time_elapsed     | 6        |
|    total_timesteps  | 4000     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.000233 |
|    n_updates        | 974      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 172      |
|    exploration_rate | 0.24     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 687      |
|    time_elapsed     | 11       |
|    total_timesteps  | 8000     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.000

<stable_baselines3.dqn.dqn.DQN at 0x7b6590eb7290>

In [10]:
from stable_baselines3.common.evaluation import evaluate_policy

# Roll out the trained policy for 10 full episodes and report the mean and
# standard deviation of the per-episode reward.
# NOTE(review): `env` here is the raw environment object created earlier, not the
# wrapped instance the model built internally -- confirm that is intended.
print("\nEvaluating the trained agent...")
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward over 10 episodes: {mean_reward:.2f} ± {std_reward:.2f}")


Evaluating the trained agent...




Mean reward over 10 episodes: 158.80 ± 7.69


In [11]:
# Manual 100-step rollout on the raw environment, printing the state after each step.
# This uses the environment's own API directly: reset() returns just the
# observation and step() returns (state, reward, done, info).
print("\nTesting the trained agent for 100 steps:")
state = env.reset()
total_reward = 0
for step in range(100):
    action, _ = model.predict(state, deterministic=True)  # Use deterministic policy
    state, reward, done, _ = env.step(action)
    total_reward += reward
    env.render()  # prints the post-step state
    # With max_steps=1000 this branch cannot trigger inside a 100-step loop;
    # it only matters if the environment is built with a smaller max_steps.
    if done:
        print("Episode finished early due to max steps reached.")
        break
print(f"Total reward over 100 steps: {total_reward}")


Testing the trained agent for 100 steps:
Step 1: State: [0 0 0 0 0], Queue: 0 tasks
Step 2: State: [0 0 0 0 0], Queue: 0 tasks
Step 3: State: [0 0 0 0 0], Queue: 0 tasks
Step 4: State: [0 0 0 0 0], Queue: 0 tasks
Step 5: State: [0 0 1 0 0], Queue: 1 tasks
Step 6: State: [3 0 0 0 0], Queue: 1 tasks
Step 7: State: [2 0 0 0 4], Queue: 2 tasks
Step 8: State: [1 0 0 0 4], Queue: 2 tasks
Step 9: State: [0 0 0 1 4], Queue: 2 tasks
Step 10: State: [0 0 6 1 3], Queue: 3 tasks
Step 11: State: [0 0 6 1 2], Queue: 3 tasks
Step 12: State: [0 0 6 1 1], Queue: 3 tasks
Step 13: State: [0 3 6 1 0], Queue: 3 tasks
Step 14: State: [0 3 6 2 0], Queue: 3 tasks
Step 15: State: [3 3 6 1 0], Queue: 4 tasks
Step 16: State: [3 2 6 1 0], Queue: 4 tasks
Step 17: State: [3 1 6 1 8], Queue: 5 tasks
Step 18: State: [ 3 10  6  1  8], Queue: 5 tasks
Step 19: State: [ 2 10  6  1  8], Queue: 5 tasks
Step 20: State: [ 1 10  6  1  8], Queue: 5 tasks
Step 21: State: [ 2 10  6  1  8], Queue: 5 tasks
Step 22: State: [ 1 10 

In [19]:
!pip install gym==0.26.2 numpy stable-baselines3 shimmy>=2.0 matplotlib ipywidgets

# Verify CUDA availability
import torch
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA not available. Using CPU.")

CUDA Available: True
CUDA Device: Tesla T4


In [21]:
import gym
from gym import spaces
import numpy as np

class AdvancedResourceAllocationEnv(gym.Env):
    """Two-resource (CPU + memory) task queue served one slot per step.

    Internally the state is an ``(N, 2)`` integer array whose row ``i`` holds the
    remaining ``[CPU, Mem]`` requirement of the task in slot ``i`` (a row of
    zeros means the slot is empty). Observations are that array flattened to
    length ``2 * N``.

    The class uses the legacy API that the rest of this notebook calls:
    ``reset()`` returns only the observation and ``step()`` returns the 4-tuple
    ``(obs, reward, done, info)``.
    """

    def __init__(self, N=5, M=10, p=0.5, cpu_capacity=2, mem_capacity=3, max_steps=1000):
        """
        Initialize the advanced resource allocation environment with CPU and memory.

        Args:
            N (int): Number of slots for tasks
            M (int): Maximum resource requirement for a task (CPU or memory)
            p (float): Probability of a new task arriving
            cpu_capacity (int): CPU resources allocated per action
            mem_capacity (int): Memory resources allocated per action
            max_steps (int): Maximum steps per episode

        Raises:
            ValueError: If N, M, cpu_capacity or mem_capacity is not positive,
                or p is outside [0, 1].
        """
        super(AdvancedResourceAllocationEnv, self).__init__()
        # Fail fast on configurations that would otherwise break or stall later
        # (e.g. a zero capacity means tasks needing that resource never finish).
        if N < 1:
            raise ValueError(f"N must be >= 1, got {N}")
        if M < 1:
            raise ValueError(f"M must be >= 1, got {M}")
        if cpu_capacity < 1:
            raise ValueError(f"cpu_capacity must be >= 1, got {cpu_capacity}")
        if mem_capacity < 1:
            raise ValueError(f"mem_capacity must be >= 1, got {mem_capacity}")
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"p must be within [0, 1], got {p}")

        self.N = N
        self.M = M
        self.p = p
        self.cpu_capacity = cpu_capacity
        self.mem_capacity = mem_capacity
        self.max_steps = max_steps
        self.current_step = 0

        # State: [CPU, Mem] for each slot, each component in 0..M
        self.observation_space = spaces.MultiDiscrete([M + 1] * N * 2)
        self.action_space = spaces.Discrete(N)
        self.state = np.zeros((N, 2), dtype=np.int32)  # [CPU, Mem] for each slot

    def reset(self):
        """Reset to the all-empty state and return it flattened (a fresh copy)."""
        self.state = np.zeros((self.N, 2), dtype=np.int32)
        self.current_step = 0
        return self.state.flatten()

    def step(self, action):
        """Take a step by allocating CPU and memory to a chosen slot.

        Args:
            action (int): Slot to serve (0 to N-1)

        Returns:
            state (np.array): Flattened copy of the new state
            reward (int): 1 if the served task was completed this step, else 0
            done (bool): Whether the episode reached ``max_steps``
            info (dict): ``'cpu_used'`` / ``'mem_used'`` -- the configured
                capacities if the chosen slot held a task when the action was
                taken, otherwise 0
        """
        assert self.action_space.contains(action), "Invalid action"
        reward = 0

        # Record whether resources are actually spent BEFORE touching the state.
        # Checking afterwards would report 0 for a task that was just completed,
        # and non-zero for an idle action whose slot then received a new arrival.
        had_task = bool(np.any(self.state[action] > 0))

        # Allocate resources if the slot has a task
        if had_task:
            self.state[action, 0] = max(0, self.state[action, 0] - self.cpu_capacity)
            self.state[action, 1] = max(0, self.state[action, 1] - self.mem_capacity)
            # Both components are clamped at zero above, so "all zero" means done.
            if np.all(self.state[action] == 0):
                reward += 1  # Reward for completing a task

        # New task arrival with probability p; it may land in the slot that was
        # emptied a moment ago, because emptiness is checked after allocation.
        if np.random.rand() < self.p:
            empty_slots = np.where(np.all(self.state == 0, axis=1))[0]
            if len(empty_slots) > 0:
                slot = np.random.choice(empty_slots)
                self.state[slot] = [np.random.randint(1, self.M + 1), np.random.randint(1, self.M + 1)]

        self.current_step += 1
        done = (self.current_step >= self.max_steps)
        info = {
            'cpu_used': self.cpu_capacity if had_task else 0,
            'mem_used': self.mem_capacity if had_task else 0,
        }
        return self.state.flatten(), reward, done, info

    def render(self, mode='human'):
        """Return (not print) a one-line summary of the current state for logging.

        ``mode`` is accepted for API compatibility but is not used.
        """
        return f"Step {self.current_step}: CPU Loads: {self.state[:, 0]}, Memory Loads: {self.state[:, 1]}, Tasks: {np.sum(np.any(self.state > 0, axis=1))}"

In [22]:
# Build the CPU + memory environment used to train PPO. This rebinds the global
# `env`, replacing the single-resource environment from the first experiment.
env = AdvancedResourceAllocationEnv(N=5, M=10, p=0.5, cpu_capacity=2, mem_capacity=3, max_steps=1000)

In [23]:
from stable_baselines3 import PPO

# Initialize PPO with CUDA if available.
# `torch` is not imported in this cell; it comes from the setup cell above.
# NOTE(review): for a small MLP policy the GPU may not be faster than the CPU --
# confirm against the library's guidance before keeping device="cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# This rebinds the global `model`, replacing the DQN agent trained earlier.
model = PPO(
    policy="MlpPolicy",
    env=env,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    verbose=0,  # silent training; only the surrounding print() calls report progress
    device=device
)

Using device: cuda




In [24]:
# Train PPO for 100,000 environment steps. The model was created with verbose=0,
# so these two messages are the only progress output.
print("Training the agent... This may take a few minutes.")
model.learn(total_timesteps=100000)
print("Training complete!")

Training the agent... This may take a few minutes.
Training complete!


In [25]:
from stable_baselines3.common.evaluation import evaluate_policy

# Roll out the trained PPO policy for 10 full episodes and report the mean and
# standard deviation of the per-episode reward (one point per completed task).
print("\nChecking how well the agent learned...")
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"The agent scored an average of {mean_reward:.2f} points (± {std_reward:.2f}) over 10 runs!")


Checking how well the agent learned...
The agent scored an average of 224.50 points (± 5.24) over 10 runs!


In [26]:
import matplotlib.pyplot as plt
from IPython.display import clear_output, display
import time
import ipywidgets as widgets
from ipywidgets import interact_manual

# Initialize environment and sliders.
# A fresh environment is created for the demo; its p / cpu_capacity / mem_capacity
# are overwritten from the slider values each time the simulation is run.
env = AdvancedResourceAllocationEnv(N=5, M=10, p=0.5, cpu_capacity=2, mem_capacity=3, max_steps=1000)
# Slider defaults match the settings the agent was trained with (p=0.5, CPU=2, Mem=3);
# other values exercise the policy on dynamics it did not see during training.
p_slider = widgets.FloatSlider(value=0.5, min=0.1, max=1.0, step=0.1, description='Task Arrival (p):')
cpu_slider = widgets.IntSlider(value=2, min=1, max=5, step=1, description='CPU Capacity:')
mem_slider = widgets.IntSlider(value=3, min=1, max=5, step=1, description='Memory Capacity:')

# Store history for trends. These module-level lists are rebound (reset to empty)
# at the start of every simulation run.
cpu_usage_history = []
mem_usage_history = []
reward_history = []

def _draw_load_chart(ax, slots, loads, action, completed, new_tasks, bar_color, title, ylabel, y_max):
    """Draw one per-slot load bar chart and annotate the Working/Done!/New! events."""
    bars = ax.bar(slots, loads, color=bar_color, edgecolor='black')
    bars[action].set_color('salmon')
    ax.text(slots[action], loads[action] + 0.5, 'Working', ha='center', va='bottom', fontsize=10, weight='bold')
    if completed is not None:
        # Drawn after 'Working', so a completed slot ends up green rather than salmon.
        bars[completed].set_color('limegreen')
        ax.text(slots[completed], 0.5, 'Done!', color='white', ha='center', va='bottom', fontsize=10, weight='bold')
    for i in new_tasks:
        ax.text(slots[i], loads[i] + 0.5, 'New!', color='orange', ha='center', va='bottom', fontsize=10, weight='bold')
    ax.set_title(title)
    ax.set_ylim(0, y_max)
    ax.set_ylabel(ylabel)


def run_simulation(p, cpu_capacity, mem_capacity):
    """Animate up to 100 steps of the trained agent on the global environment.

    The global ``env`` is reconfigured with the given settings and reset, then the
    global ``model`` picks an action each step. After every step the output area is
    cleared and redrawn with CPU/memory bar charts, a trend plot and a text log.

    Args:
        p (float): Task arrival probability to set on the environment.
        cpu_capacity (int): CPU units allocated per action.
        mem_capacity (int): Memory units allocated per action.

    Side effects:
        Mutates ``env``; rebinds the module-level ``cpu_usage_history``,
        ``mem_usage_history`` and ``reward_history`` lists; sleeps 0.5 s per step.
    """
    global cpu_usage_history, mem_usage_history, reward_history
    env.p = p
    env.cpu_capacity = cpu_capacity
    env.mem_capacity = mem_capacity
    state = env.reset()
    # Work on the (N, 2) view once per step instead of reshaping at every use.
    prev_grid = state.reshape(env.N, 2).copy()
    total_reward = 0
    cpu_usage_history = []
    mem_usage_history = []
    reward_history = []
    slots = [f"Slot {i+1}" for i in range(env.N)]

    for step in range(100):
        action, _ = model.predict(state, deterministic=True)
        # predict() yields a NumPy value; a plain int is safer for indexing
        # lists and bar containers below.
        action = int(action)
        state, reward, done, info = env.step(action)
        grid = state.reshape(env.N, 2)
        total_reward += reward
        cpu_usage_history.append(info['cpu_used'])
        mem_usage_history.append(info['mem_used'])
        reward_history.append(total_reward)

        # Detect events
        completed = action if reward > 0 else None
        # A slot counts as a new arrival when it is occupied now and was empty
        # before. The slot just completed also counts as "empty before": it was
        # zeroed during this step, so any load in it now is a fresh task.
        new_tasks = [
            i for i in range(env.N)
            if np.any(grid[i] > 0) and (i == completed or np.all(prev_grid[i] == 0))
        ]

        # Create dual bar charts
        clear_output(wait=True)
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5), gridspec_kw={'width_ratios': [1, 1, 1.5]})

        _draw_load_chart(ax1, slots, grid[:, 0], action, completed, new_tasks,
                         'skyblue', "CPU Loads", "CPU Load", env.M + 1.5)
        _draw_load_chart(ax2, slots, grid[:, 1], action, completed, new_tasks,
                         'lightcoral', "Memory Loads", "Memory Load", env.M + 1.5)

        # Trends Plot
        steps = range(len(reward_history))
        ax3.plot(steps, reward_history, label='Total Points', color='green')
        ax3.plot(steps, cpu_usage_history, label='CPU Usage', color='blue', linestyle='--')
        ax3.plot(steps, mem_usage_history, label='Memory Usage', color='red', linestyle='--')
        ax3.set_title("Performance Trends")
        ax3.set_xlabel("Steps")
        ax3.set_ylabel("Values")
        ax3.legend()

        plt.suptitle(f"Move {step + 1} - Total Points: {total_reward}", fontsize=14)
        plt.tight_layout()
        plt.show()
        # One figure is created per step; release it so up to 100 figures do not
        # accumulate in memory on backends that keep them open after show().
        plt.close(fig)

        # User-friendly logs
        print(f"Move {step + 1}: The agent worked on slot {action + 1}.")
        if completed is not None:
            print(f"Great job! The agent finished a task in slot {action + 1} and earned a point!")
        for i in new_tasks:
            print(f"A new task popped up in slot {i + 1} with CPU load {grid[i, 0]} and Memory load {grid[i, 1]}.")
        if not new_tasks and completed is None:
            print("Nothing new happened this time.")
        print(f"Total points so far: {total_reward}\n")

        time.sleep(0.5)  # Animation delay
        prev_grid = grid.copy()
        if done:
            print("Simulation finished early!")
            break

    print(f"Simulation over! Final score: {total_reward} points")

# Run interactive simulation.
# interact_manual renders the three sliders and only calls run_simulation with
# their current values on demand, not on every slider change.
interact_manual(run_simulation, p=p_slider, cpu_capacity=cpu_slider, mem_capacity=mem_slider)



interactive(children=(FloatSlider(value=0.5, description='Task Arrival (p):', max=1.0, min=0.1), IntSlider(val…