In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [21]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from torch import nn
from torch.optim.lr_scheduler import CosineAnnealingLR


def linear_schedule(initial_value: float, final_value: float):
    """Build a schedule that interpolates linearly between two values.

    Args:
        initial_value: Value produced when ``progress_remaining`` is 1.
        final_value: Value produced when ``progress_remaining`` is 0.

    Returns:
        A callable taking ``progress_remaining`` and returning the
        interpolated value.
    """
    span = initial_value - final_value

    def schedule(progress_remaining: float) -> float:
        # progress_remaining == 1 -> initial_value, == 0 -> final_value.
        return final_value + span * progress_remaining

    return schedule

class CustomNetwork(BaseFeaturesExtractor):
    """Feature extractor holding two independent MLPs over the flat observation.

    ``policy_net`` (3 linear layers) backs ``forward``; ``value_net``
    (4 linear layers) backs ``value_head``. Both end in ``Tanh`` and emit
    ``features_dim`` values.

    NOTE(review): nothing in this file calls ``value_head``; if the training
    framework only invokes ``forward`` on a features extractor, ``value_net``
    is never trained -- confirm before relying on it.
    """

    def __init__(self, observation_space: spaces.Space, features_dim: int = 128):
        super().__init__(observation_space, features_dim)
        input_dim = np.prod(observation_space.shape)
        hidden = 256

        # Policy branch is built first, then the value branch, so parameter
        # creation order is policy -> value.
        policy_layers = [
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, features_dim),
            nn.Tanh(),
        ]
        self.policy_net = nn.Sequential(*policy_layers)

        # The value branch has one extra hidden layer compared to the policy branch.
        value_layers = [
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, features_dim),
            nn.Tanh(),
        ]
        self.value_net = nn.Sequential(*value_layers)

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Return the policy-branch features for ``observations``."""
        features = self.policy_net(observations)
        return features

    def value_head(self, observations: torch.Tensor) -> torch.Tensor:
        """Return the value-branch features for ``observations``."""
        features = self.value_net(observations)
        return features

class ServerAllocationEnv(gym.Env):
    """Server autoscaling environment with a request queue.

    Each step the agent removes one server, keeps the count, or adds one
    server (actions 0, 1, 2). The demand for the step is enqueued, up to
    ``active_servers * server_capacity`` requests are processed, and a reward
    is computed from utilisation, queue length and server count.

    Observation (float32, every component clipped to [0, 1]):
        0. ``active_servers / max_servers``
        1. queue length divided by the termination threshold ``QUEUE_LIMIT``
        2. demand that will arrive on the *next* step, divided by ``DEMAND_SCALE``
        3. positive part of (last realised demand - mean of earlier demands),
           divided by ``DEMAND_SCALE``

    The episode terminates when the queue exceeds ``QUEUE_LIMIT`` and is
    truncated after ``EPISODE_LENGTH`` steps.

    Args:
        max_servers: Upper bound on the number of active servers.
        demand_pattern: ``'poisson'``, ``'sinusoidal'``; any other value gives
            a constant demand of 10.
    """

    # Gymnasium looks up the "render_modes" key (the dotted "render.modes"
    # spelling is the legacy gym name).
    metadata = {'render_modes': ['human']}

    QUEUE_LIMIT = 200      # queue length above which the episode terminates
    EPISODE_LENGTH = 1000  # step count at which the episode is truncated
    DEMAND_SCALE = 20      # normaliser for demand-based observation features

    def __init__(self, max_servers=10, demand_pattern='poisson'):
        super().__init__()

        # Server configuration
        self.max_servers = max_servers
        self.server_capacity = 10
        self.active_servers = 1

        # Demand configuration
        self.demand_pattern = demand_pattern
        self.queue = []
        self.time = 0
        self.demand_history = []  # realised demands only, one entry per step
        # Demand that the next call to step() will enqueue; sampled ahead of
        # time so the observation can expose it without side effects.
        self._next_demand = None

        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 0], dtype=np.float32),
            high=np.array([1, 1, 1, 1], dtype=np.float32),
            shape=(4,), dtype=np.float32
        )

        self.action_space = spaces.Discrete(3)

    def reset(self, seed=None, options=None):
        """Reset to one server, an empty queue and time 0.

        Returns:
            ``(observation, info)`` with an empty ``info`` dict.
        """
        super().reset(seed=seed)
        self.active_servers = 1
        self.queue = []
        self.time = 0
        self.demand_history = []
        self._next_demand = self._sample_demand()
        return self._get_state(), {}

    def _get_state(self):
        """Build the observation vector without mutating any environment state."""
        trend = 0.0
        if len(self.demand_history) > 1:
            trend = (
                self.demand_history[-1] - np.mean(self.demand_history[:-1])
            ) / self.DEMAND_SCALE

        upcoming = 0 if self._next_demand is None else self._next_demand

        obs = np.array([
            self.active_servers / self.max_servers,
            len(self.queue) / self.QUEUE_LIMIT,
            upcoming / self.DEMAND_SCALE,
            trend,
        ], dtype=np.float32)
        # Keep every component inside the declared Box; this also floors
        # negative trends at 0.
        return np.clip(obs, self.observation_space.low, self.observation_space.high)

    def _sample_demand(self):
        """Draw one demand value for the current ``self.time`` without recording it."""
        if self.demand_pattern == 'poisson':
            return self.np_random.poisson(lam=10)
        if self.demand_pattern == 'sinusoidal':
            return 10 + 10 * np.sin(self.time * 0.1)
        return 10

    def _generate_demand(self):
        """Draw a demand value and append it to ``demand_history``.

        Kept for backward compatibility; ``step`` records the pre-sampled
        demand itself so that building an observation never adds history.
        """
        demand = self._sample_demand()
        self.demand_history.append(demand)
        return demand

    def step(self, action):
        """Apply ``action``, enqueue this step's demand and process the queue.

        Args:
            action: 0 removes a server, 1 keeps the count, 2 adds a server.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
        """
        if self._next_demand is None:
            # step() called before reset(): sample the demand lazily.
            self._next_demand = self._sample_demand()

        server_change = int(np.asarray(action).item()) - 1
        # Cast back to a plain int so the attribute does not become a NumPy scalar.
        self.active_servers = int(np.clip(
            self.active_servers + server_change, 1, self.max_servers
        ))

        # Realise exactly the demand that the previous observation announced.
        demand = self._next_demand
        self.demand_history.append(demand)
        self.queue.extend([1] * int(demand))
        processed = min(len(self.queue), self.active_servers * self.server_capacity)
        self.queue = self.queue[processed:]

        reward = self._calculate_reward(processed)
        self.time += 1

        # Sample the following step's demand after advancing time, so the
        # sinusoidal pattern is evaluated at the step it will arrive on.
        self._next_demand = self._sample_demand()

        done = len(self.queue) > self.QUEUE_LIMIT
        truncated = self.time >= self.EPISODE_LENGTH

        return self._get_state(), reward, done, truncated, {}

    def _calculate_reward(self, processed):
        """Score a step from utilisation, remaining queue and server count.

        Args:
            processed: Number of requests served this step.

        Returns:
            The reward as a Python float.
        """
        target_utilization = 0.7
        actual_utilization = processed / (self.active_servers * self.server_capacity + 1e-8)

        # Quadratic penalty on the queue left after processing
        queue_penalty = (len(self.queue) / 100) ** 2

        # Gaussian-shaped bonus peaking when utilisation equals the target
        utilization_bonus = np.exp(-((actual_utilization - target_utilization) ** 2) / 0.1)

        # Scaling factors
        reward = (
            2.0 * utilization_bonus
            - 1.5 * queue_penalty
            - 0.1 * (self.active_servers / self.max_servers)
            + 0.2 * (processed / self.server_capacity)
        )

        return float(reward)

    def render(self, mode='human'):
        """Print the current time, server count and queue length."""
        print(f"Time: {self.time} | Servers: {self.active_servers} | Queue: {len(self.queue)}")

if __name__ == "__main__":
    # Training configuration
    SEED = 42                  # fixed seed so runs are repeatable
    N_PHASES = 3
    STEPS_PER_PHASE = 200_000

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    env = ServerAllocationEnv(max_servers=15)

    model = PPO(
        "MlpPolicy",
        env,
        policy_kwargs={
            "features_extractor_class": CustomNetwork,
            "net_arch": dict(pi=[128], vf=[128])
        },
        verbose=1,
        device=device,
        learning_rate=linear_schedule(3e-4, 1e-5),
        n_steps=2048,
        batch_size=256,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.98,
        clip_range=0.3,
        ent_coef=0.02,
        vf_coef=1.5,
        max_grad_norm=0.8,
        target_kl=0.03,
        seed=SEED
    )

    # Train in a single learn() call: by default every learn() call resets the
    # timestep counter, which restarted the linear learning-rate schedule at
    # its initial value at the start of each phase. One call over the whole
    # budget lets the rate decay once from 3e-4 to 1e-5.
    # For intermediate checkpoints, pass a checkpoint callback to learn()
    # instead of splitting the run into several learn() calls.
    model.learn(total_timesteps=N_PHASES * STEPS_PER_PHASE)

    # Final save
    # model.save("server_allocation_final")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 448      |
| time/              |          |
|    fps             | 787      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
Early stopping at step 5 due to reaching max kl: 0.05
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 783         |
| time/                   |             |
|    fps                  | 760         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.043691866 |
|    clip_fraction        | 0.0904      |
|    cli