In [1]:
import numpy as np
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [2]:
# Define the cellular system environment (simplified for the example)
class CellularEnv(gym.Env):
    """Simplified cellular resource-allocation environment.

    The agent picks an allocation matrix of shape ``(num_users, num_rb)`` with
    entries in ``[0, 1]``. Each step draws a fresh random gain per resource
    block; the per-user rate is the allocation-weighted sum of those gains.
    The observation is the vector of per-user rates and the reward is their sum.

    Uses the classic gym API: ``reset()`` returns the observation only and
    ``step()`` returns ``(obs, reward, done, info)``.

    Randomness comes from NumPy's global RNG (``np.random``), so seeding
    ``np.random`` makes the environment reproducible.

    Args:
        num_users: Number of users (rows of the allocation matrix).
        num_rb: Number of resource blocks (columns of the allocation matrix).
    """

    def __init__(self, num_users=10, num_rb=5):
        super(CellularEnv, self).__init__()
        self.num_users = num_users
        self.num_rb = num_rb

        # Action space: allocation matrix (num_users x num_rb), entries in [0, 1].
        self.action_space = gym.spaces.Box(
            low=0, high=1, shape=(self.num_users, self.num_rb), dtype=np.float32
        )

        # Observation space: one non-negative rate per user.
        self.observation_space = gym.spaces.Box(
            low=0, high=np.inf, shape=(self.num_users,), dtype=np.float32
        )

        # Defined up front so the attribute exists even before the first reset().
        self.state = np.zeros(self.num_users, dtype=np.float32)

    def reset(self):
        """Draw a random initial per-user state and return it as the observation."""
        # Cast to float32 so the observation matches observation_space.dtype.
        self.state = np.random.rand(self.num_users).astype(np.float32)
        return self.state

    def step(self, action):
        """Apply an allocation and return ``(obs, reward, done, info)``.

        Args:
            action: Array-like with ``num_users * num_rb`` elements; it is
                reshaped to ``(num_users, num_rb)`` and clipped to the bounds
                of ``action_space``.

        Returns:
            obs: float32 array of per-user rates, shape ``(num_users,)``.
            reward: Python float, the sum of the per-user rates.
            done: Always ``False`` (the task is continuing).
            info: Empty dict.

        Raises:
            ValueError: If ``action`` cannot be reshaped to
                ``(num_users, num_rb)``.
        """
        allocation_matrix = np.asarray(action, dtype=np.float32).reshape(
            self.num_users, self.num_rb
        )
        # Keep the allocation inside the declared action bounds so an
        # out-of-range action cannot produce negative or inflated rates.
        allocation_matrix = np.clip(
            allocation_matrix, self.action_space.low, self.action_space.high
        )

        # Simplified dynamics: a fresh random gain per resource block each step.
        rb_gains = np.random.rand(self.num_rb).astype(np.float32)
        rates = allocation_matrix @ rb_gains

        # Reward: total rate across users, as a plain Python float.
        reward = float(np.sum(rates))

        # Keep self.state in sync with the observation that is returned.
        self.state = rates

        # The episode never terminates on its own.
        done = False

        return self.state, reward, done, {}

    def render(self, mode='human'):
        """No-op: this environment has nothing to render."""
        pass

In [3]:
# Fixed seed so Restart & Run All reproduces the same training run.
# PPO's `seed` argument seeds Python, NumPy and PyTorch; CellularEnv draws
# from NumPy's global RNG, so it is covered as well.
SEED = 42

# Instantiate the environment.
# The factory builds the env itself instead of closing over the name `env`,
# which is rebound to the vectorized wrapper on this same line.
env = DummyVecEnv([lambda: CellularEnv(num_users=10, num_rb=5)])

# Create the PPO agent
ppo_agent = PPO("MlpPolicy", env, verbose=1, seed=SEED)

# Train the agent
ppo_agent.learn(total_timesteps=10000)

# Save the agent
ppo_agent.save("ppo_cellular_env")

# Test the trained agent
obs = env.reset()
for i in range(100):
    action, _states = ppo_agent.predict(obs)
    # `reward` and `done` are arrays with one entry per sub-environment.
    # A VecEnv resets a finished sub-environment automatically and returns the
    # first observation of the new episode, so no manual reset is needed here.
    obs, reward, done, info = env.step(action)
    print(f"Step {i+1}: Action: {action}, Reward: {reward}")



Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1012 |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 689         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.024505755 |
|    clip_fraction        | 0.306       |
|    clip_range           | 0.2         |
|    entropy_loss         | -71         |
|    explained_variance   | -7.9e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.4e+03     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0474     |
|    std                  | 1           |
|    value_loss           | 1.57e+04    |
-----------------