V6 Update: in readme.md

In [None]:
# Check for TPU availability and set it up
import os

# Check if TPU is available
try:
    import torch_xla
    import torch_xla.core.xla_model as xm
    print("PyTorch XLA already installed")
    TPU_AVAILABLE = True
except ImportError:
    TPU_AVAILABLE = False
    print("PyTorch XLA not found, will attempt to install")

# Install necessary packages including PyTorch/XLA
!pip install pygame-ce pymunk stable-baselines3 stable-baselines3[extra] shimmy>=2.0 optuna
!pip install -q cloud-tpu-client

if not TPU_AVAILABLE:
    # Check what version of PyTorch we need
    import torch
    if torch.__version__.startswith('2'):
        # For PyTorch 2.x
        !pip install -q torch_xla[tpu]>=2.0
    else:
        # For PyTorch 1.x
        !pip install -q torch_xla

    # Restart runtime (required after installing PyTorch/XLA)
    print("TPU support installed. Please restart the runtime now.")
    import IPython
    IPython.display.display(IPython.display.HTML(
        "<script>google.colab.kernel.invokeFunction('notebook.Runtime.restartRuntime', [], {})</script>"
    ))
else:
    # Initialize TPU if available
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
    print(f"XLA device detected: {device}")

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# List what is currently under /content/
!ls /content/

In [None]:
# Recursively delete the capture, game_history and logs directories under /content
# (presumably output left over from earlier runs -- confirm before running;
# `rm` reports an error for any directory that does not exist).
!rm -r /content/capture
!rm -r /content/game_history
!rm -r /content/logs

# Classes

## Import

In [None]:
from game.balancing_ball_game import BalancingBallGame
from game.gym_env import BalancingBallEnv

## Test

In [None]:
# from balancing_ball_game import BalancingBallGame

def run_standalone_game(render_mode="human", difficulty="medium", capture_per_second=3, window_x=1000, window_y=600, level=3):
    """Build a BalancingBallGame and run it in standalone mode.

    Args:
        render_mode: Render mode forwarded to BalancingBallGame.
        difficulty: Difficulty name forwarded to BalancingBallGame.
        capture_per_second: Capture rate forwarded to BalancingBallGame
            (previously this argument was ignored and 3 was always used).
        window_x: Window width forwarded to BalancingBallGame.
        window_y: Window height forwarded to BalancingBallGame.
        level: Game level forwarded to BalancingBallGame.

    The platform is always a "circle" with proportion 0.333 and the game is
    created with fps=30; these are fixed by this helper.
    """
    game = BalancingBallGame(
        render_mode=render_mode,
        difficulty=difficulty,
        window_x=window_x,
        window_y=window_y,
        platform_shape="circle",
        platform_proportion=0.333,
        level=level,
        fps=30,
        # Honour the caller's value instead of a hard-coded constant
        capture_per_second=capture_per_second,
    )

    game.run_standalone()

def test_gym_env(episodes=3, difficulty="medium"):
    """Test the OpenAI Gym environment with continuous actions"""
    import time
    # from gym_env import BalancingBallEnv

    fps = 30
    env = BalancingBallEnv(
        render_mode="rgb_array_and_human_in_colab",
        difficulty=difficulty,
        fps=fps,
        level=3,  # Use level 3 for adversarial training
        num_players=2,
    )

    for episode in range(episodes):
        observation, info = env.reset()
        total_reward = 0
        step = 0
        done = False

        while not done:
            # Sample continuous actions for both players
            action = env.action_space.sample()  # Returns array of shape (2,) with values in [-1, 1]

            # Take step
            observation, reward, terminated, truncated, info = env.step(action)

            done = terminated or truncated
            total_reward += reward
            step += 1

            # Render
            env.render()

            # Print some info
            if step % 100 == 0:
                print(f"Step {step}: Action: {action}, Reward: {reward:.2f}, Individual Rewards: {info.get('individual_rewards', [])}")

        winner = info.get('winner', None)
        winner_text = f"Winner: Player {winner + 1}" if winner is not None else "Draw"
        print(f"Episode {episode+1}: Steps: {step}, Total Reward: {total_reward:.2f}, {winner_text}")

    env.close()

## Train

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym
import sys
import optuna

from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy, ActorCriticCnnPolicy  # MLP policy instead of CNN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy

class Train:
    """PPO training and evaluation harness around vectorized BalancingBallEnv instances.

    The constructor builds a training VecEnv (`self.env`), a single-env
    evaluation VecEnv (`self.eval_env`) and a PPO model (`self.model`), either
    freshly created or loaded from `load_model`.

    NOTE(review): when a new model is created, the constructor arguments
    `learning_rate`, `gamma`, `gae_lambda`, `ent_coef`, `vf_coef` and
    `policy_kwargs` are not used; the hard-coded values inside `__init__` are
    applied instead. They remain in the signature for compatibility.
    """

    def __init__(self,
                 learning_rate=0.0003,
                 n_steps=2048,
                 batch_size=64,
                 n_epochs=10,
                 gamma=0.99,
                 gae_lambda=0.95,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 policy_kwargs=None,
                 n_envs=4,
                 difficulty="medium",
                 level=3,  # Default to level 3 for adversarial training
                 load_model=None,
                 log_dir="./logs/",
                 model_dir="./models/",
                 obs_type="game_screen",
                 num_players=2,  # Number of players for adversarial training
                ):
        """Create output directories, environments and the PPO model.

        Args:
            learning_rate, gamma, gae_lambda, ent_coef, vf_coef, policy_kwargs:
                Accepted but currently overridden by hard-coded values (see class note).
            n_steps, batch_size, n_epochs, max_grad_norm: Forwarded to PPO when a
                new model is created.
            n_envs: Number of parallel training environments.
            difficulty: Difficulty name forwarded to BalancingBallEnv.
            level: Game level forwarded to BalancingBallEnv.
            load_model: If truthy, path passed to `PPO.load` instead of creating
                a new model.
            log_dir: Directory (created if missing) used for tensorboard and
                evaluation logs.
            model_dir: Directory (created if missing) where checkpoints, the best
                model and the final model are written.
            obs_type: "game_screen" selects ActorCriticCnnPolicy; any other value
                selects ActorCriticPolicy.
            num_players: Number of players forwarded to BalancingBallEnv.
        """
        # Create directories
        os.makedirs(log_dir, exist_ok=True)
        os.makedirs(model_dir, exist_ok=True)
        self.log_dir = log_dir
        self.model_dir = model_dir
        self.n_envs = n_envs
        self.obs_type = obs_type
        self.level = level  # read by make_env, so it must be set before the envs are built
        self.num_players = num_players

        # Setup environments
        self.env = make_vec_env(
            self.make_env(render_mode="rgb_array", difficulty=difficulty, obs_type=obs_type, num_players=num_players),
            n_envs=n_envs
        )

        # Setup evaluation environment
        self.eval_env = make_vec_env(
            self.make_env(render_mode="rgb_array", difficulty=difficulty, obs_type=obs_type, num_players=num_players),
            n_envs=1
        )

        # Create the PPO model
        if load_model:
            print(f"Loading model from {load_model}")
            self.model = PPO.load(
                load_model,
                env=self.env,
                tensorboard_log=log_dir,
            )
        else:
            # Tuned hyperparameters, aimed at adversarial training
            hyper_param = {
                'learning_rate': 0.0001,  # lower learning rate for stability
                'gamma': 0.995,  # higher discount factor to value long-term reward
                'clip_range': 0.15,  # narrower clip range for stability
                'gae_lambda': 0.98,  # higher GAE lambda
                'ent_coef': 0.02,  # higher entropy coefficient for more exploration
                'vf_coef': 0.5,
            }

            use_cnn = obs_type == "game_screen"
            policy = ActorCriticCnnPolicy if use_cnn else ActorCriticPolicy

            policy_kwargs = {
                "net_arch": [512, 512, 256],  # deeper network for more complex strategies
                "activation_fn": torch.nn.ReLU,
            }
            if use_cnn:
                # Only the CNN extractor takes `features_dim`; the default flatten
                # extractor used by ActorCriticPolicy accepts no such argument, so
                # passing it for non-image observations fails at model creation.
                policy_kwargs["features_extractor_kwargs"] = {"features_dim": 512}

            print("obs type: ", self.obs_type)
            print("policy: ", policy)
            print("num_players: ", self.num_players)

            # PPO for continuous action space with adversarial training
            self.model = PPO(
                policy=policy,
                env=self.env,
                learning_rate=hyper_param["learning_rate"],
                n_steps=n_steps,
                batch_size=batch_size,
                n_epochs=n_epochs,
                gamma=hyper_param["gamma"],
                clip_range=hyper_param["clip_range"],
                gae_lambda=hyper_param["gae_lambda"],
                ent_coef=hyper_param["ent_coef"],
                vf_coef=hyper_param["vf_coef"],
                max_grad_norm=max_grad_norm,
                tensorboard_log=log_dir,
                policy_kwargs=policy_kwargs,
                verbose=1,
            )

    def make_env(self, render_mode="rgb_array", difficulty="medium", obs_type="game_screen", num_players=2):
        """Return a zero-argument factory that builds one BalancingBallEnv.

        The factory is what `make_vec_env` calls for each sub-environment. The
        game level comes from `self.level`; the other settings come from the
        arguments given here.
        """
        def _init():
            env = BalancingBallEnv(
                render_mode=render_mode,
                difficulty=difficulty,
                level=self.level,
                obs_type=obs_type,
                num_players=num_players
            )
            return env
        return _init

    def train_ppo(self,
                  total_timesteps=1000000,
                  save_freq=10000,
                  eval_freq=10000,
                  eval_episodes=5,
                 ):
        """Train `self.model` with periodic checkpoints and evaluation.

        Args:
            total_timesteps: Total number of steps to train for.
            save_freq: How often to save checkpoints (in timesteps).
            eval_freq: How often to evaluate the model (in timesteps); a value
                <= 0 is passed through unchanged.
            eval_episodes: Number of episodes per evaluation.

        Returns:
            The trained model (`self.model`). The final model is also saved under
            `self.model_dir`.
        """
        # Callback frequencies count calls (one per vectorized step), so divide
        # by n_envs. Clamp to at least 1: a floored value of 0 would make the
        # checkpoint callback divide by zero and would silently turn evaluation off.
        save_every = max(1, save_freq // self.n_envs)
        eval_every = max(1, eval_freq // self.n_envs) if eval_freq > 0 else eval_freq

        # Setup callbacks
        checkpoint_callback = CheckpointCallback(
            save_freq=save_every,
            save_path=self.model_dir,
            name_prefix="ppo_balancing_ball_" + str(self.obs_type),
        )

        eval_callback = EvalCallback(
            self.eval_env,
            best_model_save_path=self.model_dir,
            log_path=self.log_dir,
            eval_freq=eval_every,
            n_eval_episodes=eval_episodes,
            deterministic=True,
            render=False
        )

        # Train the model
        print("Starting training...")
        self.model.learn(
            total_timesteps=total_timesteps,
            callback=[checkpoint_callback, eval_callback],
        )

        # Save the final model
        self.model.save(f"{self.model_dir}/ppo_balancing_ball_final_" + str(self.obs_type))

        print("Training completed!")
        return self.model

    def evaluate(self, model_path, n_episodes=10, difficulty="medium"):
        """Load a saved model, evaluate it on `self.env` and print the mean reward.

        Args:
            model_path: Path to the saved model.
            n_episodes: Number of episodes to evaluate on.
            difficulty: Currently unused; the environment built in `__init__`
                determines the difficulty.

        Side effects:
            Closes `self.env` afterwards, so this trainer cannot train or
            evaluate again on that environment.
        """
        # Load the model
        model = PPO.load(model_path)

        # Evaluate
        mean_reward, std_reward = evaluate_policy(
            model,
            self.env,
            n_eval_episodes=n_episodes,
            deterministic=True,
            render=True
        )

        print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

        self.env.close()


# if args.mode == "train":
#     train_ppo(
#         total_timesteps=args.timesteps,
#         difficulty=args.difficulty,
#         n_envs=args.n_envs,
#         load_model=args.load_model,
#         eval_episodes=args.eval_episodes,
#     )
# else:
#     if args.load_model is None:
#         print("Error: Must provide --load_model for evaluation")
#     else:
#         evaluate(
#             model_path=args.load_model,
#             n_episodes=args.eval_episodes,
#             difficulty=args.difficulty
#         )

## Optuna

In [None]:
class Optuna_optimize:
    """Optuna-driven hyperparameter search for PPO on a single BalancingBallEnv VecEnv.

    One vectorized environment (`self.env`, n_envs=1, level 3, medium
    difficulty) is created in the constructor and shared by every trial. It is
    closed and removed when `optuna_parameter_tuning` finishes, so an instance
    supports one tuning run.
    """

    def __init__(self, obs_type="game_screen", num_players=2):
        """Store the observation type / player count and build the shared environment."""
        self.obs_type = obs_type
        self.num_players = num_players
        self.env = make_vec_env(
            self.make_env(render_mode="rgb_array", difficulty="medium", obs_type=self.obs_type, num_players=num_players),
            n_envs=1
        )

    def make_env(self, render_mode="rgb_array", difficulty="medium", obs_type="game_screen", num_players=2):
        """Return a zero-argument factory that builds one level-3 BalancingBallEnv."""
        def _init():
            env = BalancingBallEnv(
                render_mode=render_mode,
                difficulty=difficulty,
                level=3,  # Level 3 for adversarial training
                obs_type=obs_type,
                num_players=num_players
            )
            return env
        return _init

    def optuna_parameter_tuning(self, n_trials):
        """Run an Optuna study over `self.objective` and report the best trial.

        Args:
            n_trials: Number of trials to run.

        Returns:
            The study's best trial (previously nothing was returned, although
            the caller stores and prints the result).

        Side effects:
            Prints the best parameters/value and the head of the trials
            dataframe; always closes and deletes `self.env`.
        """
        print("You are using optuna for automatic parameter tuning, it will create a new model")

        # NOTE(review): `objective` never calls trial.report()/should_prune(),
        # so this pruner has no intermediate values to act on.
        pruner = optuna.pruners.HyperbandPruner(
            min_resource=100,        # minimum resource
            max_resource='auto',   # maximum resource ('auto' or an integer)
            reduction_factor=3     # reduction factor (eta)
        )

        # Create the study with the pruner attached
        study = optuna.create_study(direction='maximize', pruner=pruner)

        # Run the optimization
        try:
            study.optimize(self.objective, n_trials=n_trials)

            # Report the results
            best_trial = study.best_trial
            print("最佳試驗的超參數：", best_trial.params)
            print("最佳試驗的平均回報：", best_trial.value)

            df = study.trials_dataframe()
            print(df.head())
            return best_trial
        finally:
            self.env.close()
            del self.env


    def objective(self, trial):
        """Train a PPO model with trial-suggested hyperparameters and return its mean reward.

        Args:
            trial: Optuna trial used to sample learning_rate, gamma, clip_range,
                gae_lambda, ent_coef and vf_coef.

        Returns:
            Mean reward over 10 evaluation episodes on `self.env` after 50000
            training timesteps.
        """
        import gc

        # 1. Suggest hyperparameters - adjusted for continuous action space
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
        gamma = trial.suggest_float('gamma', 0.95, 0.999)
        clip_range = trial.suggest_float('clip_range', 0.1, 0.3)
        gae_lambda = trial.suggest_float('gae_lambda', 0.8, 0.99)
        ent_coef = trial.suggest_float('ent_coef', 0.005, 0.02)  # Lower for continuous actions
        vf_coef = trial.suggest_float('vf_coef', 0.1, 1)
        # features_dim = trial.suggest_categorical('features_dim', [128, 256, 512])

        policy_kwargs = {
            # "features_extractor_kwargs": {"features_dim": features_dim},
            "net_arch": [256, 256],  # Architecture for continuous actions
        }

        # 2. Fixed (non-tuned) PPO settings
        n_steps = 2048
        batch_size = 64
        n_epochs = 10
        max_grad_norm = 0.5

        policy = ActorCriticCnnPolicy if self.obs_type == "game_screen" else ActorCriticPolicy
        print("obs type: ", self.obs_type)
        print("policy: ", policy)

        # 3. Build the model - PPO for continuous action space
        model = PPO(
                policy=policy,
                env=self.env,
                learning_rate=learning_rate,
                n_steps=n_steps,
                batch_size=batch_size,
                n_epochs=n_epochs,
                gamma=gamma,
                clip_range=clip_range,
                gae_lambda=gae_lambda,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                tensorboard_log=None,
                policy_kwargs=policy_kwargs,
                verbose=0,
            )

        try:
            # 4. Train the model
            model.learn(total_timesteps=50000)  # Increased timesteps for adversarial training
            # 5. Evaluate the model
            mean_reward = evaluate_policy(model, self.env, n_eval_episodes=10)[0]
        finally:
            # Always cleanup
            del model
            gc.collect()

            # TPU_AVAILABLE is set by the notebook's setup cell; treat it as
            # False when that cell has not been run instead of raising NameError.
            if globals().get("TPU_AVAILABLE", False):
                import torch_xla.core.xla_model as xm
                xm.mark_step()

        return mean_reward

# Training

In [None]:
import gc

# Memory-optimized training setup
def get_tpu_memory_info():
    """Placeholder for TPU memory reporting: currently does nothing and returns None."""
    return None

# Call the memory-info helper (as defined in this cell it is a no-op placeholder)
get_tpu_memory_info()

# Run-level settings; n_envs, batch_size and n_steps are passed to Train in the direct-training branch
n_envs = 1
batch_size = 64
n_steps = 2048

# Choose whether to do hyperparameter optimization or direct training
do_optimization = True

if do_optimization: # obs_type options: "game_screen" or "state_based"
    # Hyperparameter search: 10 Optuna trials on state-based observations
    optuna_optimizer = Optuna_optimize(obs_type="state_based", num_players=2)
    n_trials = 10
    best_trial = optuna_optimizer.optuna_parameter_tuning(n_trials=n_trials)
    # Prints whatever optuna_parameter_tuning returned
    print(f"best_trial found: {best_trial}")
else:
    # Create trainer for adversarial training
    training = Train(
        n_steps=n_steps,
        batch_size=batch_size,
        difficulty="medium",
        n_envs=n_envs,
        level=3,  # Level 3 for adversarial training
        load_model=None,  # Start fresh for adversarial training
        obs_type='game_screen',
        num_players=2,  # Two players for adversarial training
    )

    # Run training with continuous action space
    total_timesteps = 1000000  # More timesteps for adversarial training

    # Checkpoint and evaluate every 10000 timesteps, 5 episodes per evaluation
    model = training.train_ppo(
        total_timesteps=total_timesteps,
        eval_episodes=5,
        save_freq=10000,
        eval_freq=10000
    )

    print("Adversarial training completed!")

In [None]:
# Copy the best model to a stable location (timestamped file name under Google Drive)
# NOTE(review): the destination is under /content/drive, but the drive.mount call
# earlier in this notebook is commented out -- confirm Drive is mounted and that
# the RL_Models folder exists, otherwise `cp` fails.
# NOTE(review): presumably best_model.zip is only written by the direct-training
# path (EvalCallback), not by the Optuna path -- verify before relying on it.
!cp /content/models/best_model.zip /content/drive/MyDrive/RL_Models/best_model_$(date +%Y%m%d_%H%M%S).zip

# Optional: Monitor TPU usage (lists processes that have /dev/accel0 open)
if TPU_AVAILABLE:
    !sudo lsof -w /dev/accel0

In [None]:
# Load a saved model and evaluate it
model_path = "/content/models/best_model.zip"

if os.path.exists(model_path):
    print(f"Loading model from {model_path} for evaluation")

    # Create trainer with the saved model: passing load_model makes Train load
    # the checkpoint instead of building (and discarding) a fresh PPO network.
    # batch_size is no longer taken from the training cell's global, so this
    # cell also runs when that cell was skipped.
    # NOTE(review): Train defaults to obs_type="game_screen"; pass the obs_type
    # the checkpoint was trained with if it differs.
    eval_trainer = Train(
        n_steps=1024,
        difficulty="medium",
        n_envs=1,  # Use 1 env for evaluation
        load_model=model_path,
    )

    # Evaluate the model
    eval_trainer.evaluate(
        model_path=model_path,
        n_episodes=5,
        difficulty="medium"
    )
else:
    print(f"Model not found at {model_path}")

# --

In [None]:
# Run the game standalone at level 3 (adversarial setup) using the Colab render mode
run_standalone_game(render_mode="rgb_array_and_human_in_colab", difficulty="medium", window_x=1000, window_y=600, level=3)
# Alternative check: drive the gym environment with random actions instead
# test_gym_env(difficulty="medium")

In [None]:
# Example of creating the environment with continuous action space for adversarial training
env = BalancingBallEnv(
    render_mode="rgb_array",
    difficulty="medium",
    fps=30,
    obs_type="game_screen",
    image_size=(84, 84),
    level=3,  # Level 3 for adversarial training
    num_players=2,  # Two players
)

# Reset environment to get initial observation
obs, info = env.reset()

# Print observation and action space info.
# The "Should be ..." values below are the author's expectations, not checked here.
print(f"Observation shape: {obs.shape}")  # Should be (84, 84, 3) for grayscale with 3 stacked frames
print(f"Action space: {env.action_space}")  # Should be Box(low=-1, high=1, shape=(2,))
print(f"Action space shape: {env.action_space.shape}")  # Should be (2,) for two players

# Test a random continuous action
action = env.action_space.sample()
print(f"Sample action: {action}")  # Should be array of 2 values between -1 and 1

# Take a step
obs, reward, terminated, truncated, info = env.step(action)
print(f"Step result - Reward: {reward}, Individual rewards: {info.get('individual_rewards', [])}")

# Display a sample observation (index 0 along the last axis only)
import matplotlib.pyplot as plt
plt.figure(figsize=(4, 4))
plt.imshow(obs[:,:,0], cmap='gray')
plt.title("Adversarial Training - Grayscale Observation")
plt.axis('off')
plt.show()

# Release the environment's resources
env.close()