V4.2 update notes: see readme.md

In [None]:
# Check for TPU availability and set it up
import os

# Check if TPU is available
try:
    import torch_xla
    import torch_xla.core.xla_model as xm
    print("PyTorch XLA already installed")
    TPU_AVAILABLE = True
except ImportError:
    TPU_AVAILABLE = False
    print("PyTorch XLA not found, will attempt to install")

# Install necessary packages including PyTorch/XLA
!pip install pygame-ce pymunk stable-baselines3 stable-baselines3[extra] shimmy>=2.0 optuna
!pip install -q cloud-tpu-client

if not TPU_AVAILABLE:
    # Check what version of PyTorch we need
    import torch
    if torch.__version__.startswith('2'):
        # For PyTorch 2.x
        !pip install -q torch_xla[tpu]>=2.0
    else:
        # For PyTorch 1.x
        !pip install -q torch_xla

    # Restart runtime (required after installing PyTorch/XLA)
    print("TPU support installed. Please restart the runtime now.")
    import IPython
    IPython.display.display(IPython.display.HTML(
        "<script>google.colab.kernel.invokeFunction('notebook.Runtime.restartRuntime', [], {})</script>"
    ))
else:
    # Initialize TPU if available
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
    print(f"XLA device detected: {device}")

PyTorch XLA not found, will attempt to install
TPU support installed. Please restart the runtime now.


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# List the Colab working directory to see which artefact folders already exist
!ls /content/

'=2.0'	 capture   game_history   logs	 models   sample_data


In [None]:
!rm -r /content/capture
!rm -r /content/game_history
!rm -r /content/logs

# Classes

## Recorder

In [None]:
import json
import os
import datetime

class Recorder:
    """JSON-backed log of finished games, stored in one file per calendar month."""

    def __init__(self, task: str = "game_history_record"):
        """
        Open the JSON file backing this recorder, creating it if necessary.

        Args:
            task: Kind of record to keep. Only "game_history_record" is
                implemented; "temp_memory" was listed as a task but has no
                storage location defined, so it is rejected explicitly.

        Raises:
            ValueError: If ``task`` is not a supported task name.
        """
        # CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
        CURRENT_DIR = ""
        if task == "game_history_record":
            collection_name = self.get_newest_record_name()
            self.json_file_path = CURRENT_DIR + "./game_history/" + collection_name + ".json"
        else:
            # Without this guard json_file_path would stay unset and the
            # constructor would fail below with an opaque AttributeError.
            raise ValueError(
                f"Unsupported recorder task {task!r}; expected 'game_history_record'"
            )

        # Ensure directory exists
        os.makedirs(os.path.dirname(self.json_file_path), exist_ok=True)

        if os.path.exists(self.json_file_path):
            print("Loading the json memory file")
            self.memory = self.load(self.json_file_path)
        else:
            print("The json memory file does not exist. Creating new file.")
            self.memory = {"game_records": []}  # Direct dictionary instead of json.loads
            with open(self.json_file_path, "w") as f:
                json.dump(self.memory, f)

    def get(self):
        """Return the in-memory record dictionary (``{"game_records": [...]}``)."""
        print("Getting the json memory")
        return self.memory

    def add_no_limit(self, data: float):
        """
        Append one game record and persist the whole log to disk.

        Args:
            data: Value stored verbatim under the "game_total_duration" key.
                It must be JSON-serialisable; BalancingBallGame.step passes a
                dict holding the duration and the score.
        """
        self.memory["game_records"].append({
            "game_total_duration": data,
            "timestamp": str(datetime.datetime.now())
        })

        self.save(self.json_file_path)

    def save(self, file_path):
        """Write the in-memory records to ``file_path``; failures are printed, not raised."""
        try:
            with open(file_path, 'w') as f:
                json.dump(self.memory, f)
        except Exception as e:
            print(f"Error saving memory to {file_path}: {e}")

    def load(self, file_path):
        """
        Read records from ``file_path``.

        Returns:
            The parsed JSON content, or an empty ``{"game_records": []}``
            structure when the file cannot be read or parsed (the error is
            printed).
        """
        try:
            with open(file_path, 'r') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading memory from {file_path}: {e}")
            return {"game_records": []}

    def get_newest_record_name(self) -> str:
        """
        Return the name of the current month's record collection.

        The name has the form ``record_YYYY-MM``, e.g. "record_2022-01".
        """

        this_month = datetime.datetime.now().strftime("%Y-%m")
        return "record_" + this_month

## Shapes & Objects

In [None]:
import pymunk
from typing import Tuple, Optional

class Shape:
    """Couples a pymunk body with its collision shape and remembers its spawn state."""

    def __init__(
                self,
                position: Tuple[float, float] = (300, 100),
                velocity: Tuple[float, float] = (0, 0),
                body: Optional[pymunk.Body] = None,
                shape: Optional[pymunk.Shape] = None,
            ):
        """
        Initialize a physical shape with associated body.

        Args:
            position: Initial position (x, y) of the body
            velocity: Initial velocity (vx, vy) of the body
            body: The pymunk Body to attach to this shape. When omitted a new
                default ``pymunk.Body()`` is created, so the documented
                default no longer fails on the first attribute access.
            shape: The pymunk Shape for collision detection
        """
        if body is None:
            body = pymunk.Body()

        self.body = body
        # Spawn state, restored by reset()
        self.default_position = position
        self.default_velocity = velocity
        self.default_angular_velocity = 0

        self.body.position = position
        self.body.velocity = velocity

        self.shape = shape

    def reset(self):
        """Reset the body to its default position, velocity and angular velocity."""
        self.body.position = self.default_position
        self.body.velocity = self.default_velocity
        self.body.angular_velocity = self.default_angular_velocity


In [None]:
import pymunk

# from shapes.shape import Shape
from typing import Tuple, Optional

class Circle(Shape):
    """Circular physics object: a Shape whose collision shape is a pymunk.Circle."""

    def __init__(
                self,
                position: Tuple[float, float] = (300, 100),
                velocity: Tuple[float, float] = (0, 0),
                body: Optional[pymunk.Body] = None,
                shape_radio: float = 20,
                shape_mass: float = 1,
                shape_friction: float = 0.1,
            ):
        """
        Build the circle and attach it to ``body``.

        Args:
            position: Starting (x, y) of the circle
            velocity: Starting (vx, vy) of the circle
            body: pymunk Body the circle is attached to
            shape_radio: Circle radius in pixels
            shape_mass: Mass assigned to the circle shape
            shape_friction: Friction coefficient of the circle shape
        """
        super().__init__(position=position, velocity=velocity, body=body)

        disc = pymunk.Circle(self.body, shape_radio)
        disc.mass = shape_mass
        disc.friction = shape_friction
        disc.elasticity = 0.8  # Some bounce keeps the simulation lively

        self.shape_radio = shape_radio
        self.shape = disc


## Game class

In [None]:
import pymunk
import pygame
import random
import time
import numpy as np
import os
from typing import Dict, Tuple, Optional

import numpy as np
from IPython.display import display, Image, clear_output
import matplotlib.pyplot as plt
from io import BytesIO
import base64
import IPython.display as ipd
# from shapes.circle import Circle
# from record import Recorder

class BalancingBallGame:
    """
    A physics-based balancing ball game that can run standalone or be used as a Gym environment.
    """

    # Game constants


    # Visual settings for indie style
    BACKGROUND_COLOR = (41, 50, 65)  # Dark blue background
    BALL_COLOR = (255, 213, 79)  # Bright yellow ball
    PLATFORM_COLOR = (235, 64, 52)  # Red platform
    PARTICLE_COLORS = [(252, 186, 3), (252, 127, 3), (252, 3, 3)]  # Fire-like particles



    def __init__(self,
                 render_mode: str = "human",
                 sound_enabled: bool = True,
                 difficulty: str = "medium",
                 window_x: int = 1000,
                 window_y: int = 600,
                 max_step: int = 30000,
                 player_ball_speed: int = 5,
                 reward_staying_alive: float = 0.1,
                 reward_ball_centered: float = 0.2,
                 penalty_falling: float = -10.0,
                 fps: int = 120,
                 platform_shape: str = "circle",
                 platform_proportion: float = 0.4,
                 capture_per_second: Optional[int] = None,
                ):
        """
        Initialize the balancing ball game.

        Args:
            render_mode: "human" for visible window, "rgb_array" for gym env, "headless" for no rendering
            sound_enabled: Whether to enable sound effects
            difficulty: Game difficulty level ("easy", "medium", "hard")
            window_x: Window width in pixels
            window_y: Window height in pixels
            max_step: 1 step = 1/fps, if fps = 120, 1 step = 1/120
            player_ball_speed: Angular velocity added to the ball on every step
            reward_staying_alive: Stored on the instance
            reward_ball_centered: Stored on the instance
            penalty_falling: Stored on the instance
            fps: frame per second
            platform_shape: "circle" or "rectangle"
            platform_proportion: platform_length = window_x * platform_proportion
            capture_per_second: frame interval used for saving screenshots, None means no capture
        """
        # Game parameters
        self.max_step = max_step
        self.reward_staying_alive = reward_staying_alive
        self.reward_ball_centered = reward_ball_centered
        self.penalty_falling = penalty_falling
        self.fps = fps
        self.window_x = window_x
        self.window_y = window_y
        # Was a bare `self.player_ball_speed` expression, which raised
        # AttributeError before the game could even be constructed.
        self.player_ball_speed = player_ball_speed

        self.recorder = Recorder("game_history_record")
        self.render_mode = render_mode
        self.sound_enabled = sound_enabled
        self.difficulty = difficulty

        # Defaults for state that _setup_pygame()/_load_sounds() normally
        # create, so that modes which skip pygame can still step.
        self.frame_count = 0
        self.sound_bounce = None
        self.sound_fall = None

        platform_length = int(window_x * platform_proportion)
        self._get_x_axis_max_reward_rate(platform_length)

        # Initialize physics space
        self.space = pymunk.Space()
        self.space.gravity = (0, 1000)
        self.space.damping = 0.9

        # Create game bodies
        self.dynamic_body = pymunk.Body()  # Ball body
        self.kinematic_body = pymunk.Body(body_type=pymunk.Body.KINEMATIC)  # Platform body
        self.kinematic_body.position = (self.window_x / 2, (self.window_y / 3) * 2)
        self.default_kinematic_position = self.kinematic_body.position

        # Create game objects
        self._create_ball()
        self._create_platform(platform_shape=platform_shape, platform_length=platform_length)

        # Add all objects to space
        self.space.add(self.dynamic_body, self.kinematic_body,
                       self.circle.shape, self.platform)

        # Game state tracking
        self.steps = 0
        self.start_time = time.time()
        self.game_over = False
        self.score = 0
        self.particles = []

        # Initialize Pygame if needed
        if self.render_mode in ["human", "rgb_array", "rgb_array_and_human", "rgb_array_and_human_in_colab"]:
            self._setup_pygame()
        else:
            print("render_mode is not human or rgb_array, so no pygame setup.")

        # Set difficulty parameters
        self._apply_difficulty()
        self.capture_per_second = capture_per_second

        # Create folders for captures if needed
        # CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
        CURRENT_DIR = "."
        os.makedirs(os.path.dirname(CURRENT_DIR + "/capture/"), exist_ok=True)

    def _setup_pygame(self):
        """Initialise pygame and create the drawing surface for the active render mode."""
        pygame.init()
        self.frame_count = 0

        if self.sound_enabled:
            self._load_sounds()

        mode = self.render_mode
        surface_size = (self.window_x, self.window_y)
        font_size = int(self.window_x / 34)

        if mode == "human":
            # Real window on the display
            self.screen = pygame.display.set_mode(surface_size)
            pygame.display.set_caption("Balancing Ball - Indie Game")
            self.font = pygame.font.Font(None, font_size)

        elif mode == "rgb_array":
            # Off-screen surface only
            self.screen = pygame.Surface(surface_size)

        elif mode == "rgb_array_and_human": # todo
            print("rgb_array_and_human mode is not supported yet.")

        elif mode == "rgb_array_and_human_in_colab": # todo
            from pymunk.pygame_util import DrawOptions

            # Hidden surface; frames are pushed to the notebook as images
            self.screen = pygame.Surface(surface_size)
            self.draw_options = DrawOptions(self.screen)

            # Placeholder output element that render() updates later
            html_display = ipd.HTML('''
                <div id="pygame-output" style="width:100%;">
                    <img id="pygame-img" style="width:100%;">
                </div>
            ''')
            self.display_handle = display(html_display, display_id='pygame_display')

            # Throttle notebook updates to 15 per second
            self.last_update_time = time.time()
            self.update_interval = 1.0 / 15
            self.font = pygame.font.Font(None, font_size)

        else:
            print("Invalid render mode. Using headless mode.")

        self.clock = pygame.time.Clock()

        # Create custom draw options for indie style

    def _load_sounds(self):
        """Load game sound effects"""
        try:
            pygame.mixer.init()
            self.sound_bounce = pygame.mixer.Sound("assets/bounce.wav") if os.path.exists("assets/bounce.wav") else None
            self.sound_fall = pygame.mixer.Sound("assets/fall.wav") if os.path.exists("assets/fall.wav") else None
        except Exception:
            print("Sound loading error")
            self.sound_enabled = False
            pass

    def _create_ball(self):
        """Build the player ball on the dynamic body and remember where it spawned."""
        radius = int(self.window_x / 67)
        spawn_point = (self.window_x / 2, self.window_y / 3)

        self.ball_radius = radius
        self.circle = Circle(
            body=self.dynamic_body,
            position=spawn_point,
            velocity=(0, 0),
            shape_radio=radius,
            shape_friction=100,
        )
        # Kept so reset() can put the ball back
        self.default_ball_position = self.dynamic_body.position

    def _create_platform(self,
                         platform_shape: str = "circle",
                         platform_length: int = 200
                        ):
        """
        Create the platform with physics properties
        platform_shape: circle, rectangle
        platform_length: Length of a rectangle or Diameter of a circle
        """
        if platform_shape == "circle":
            self.platform_length = platform_length / 2 # radius
            self.platform = pymunk.Circle(self.kinematic_body, self.platform_length)
            self.platform.mass = 1  # 质量对 Kinematic 物体无意义，但需要避免除以零错误
            self.platform.friction = 0.7
        elif platform_shape == "rectangle":
            self.platform_length = platform_length
            vs = [(-self.platform_length/2, -10),
                (self.platform_length/2, -10),
                (self.platform_length/2, 10),
                (-self.platform_length/2, 10)]

            self.platform = pymunk.Poly(self.kinematic_body, vs)
        self.platform.friction = 0.7
        self.platform_rotation = 0
        self.kinematic_body.angular_velocity = random.randrange(-1, 2, 2)

    def _apply_difficulty(self):
        """Apply difficulty settings to the game"""
        if self.difficulty == "easy":
            self.max_platform_speed = 1.5
            self.ball_elasticity = 0.5
        elif self.difficulty == "medium":
            self.max_platform_speed = 2.5
            self.ball_elasticity = 0.7
        else:  # hard
            self.max_platform_speed = 3.5
            self.ball_elasticity = 0.9

        self.circle.shape.elasticity = self.ball_elasticity

    def reset(self) -> np.ndarray:
        """Reset the game state and return the initial observation"""
        # Reset physics objects
        self.dynamic_body.position = self.default_ball_position
        self.dynamic_body.velocity = (0, 0)
        self.dynamic_body.angular_velocity = 0

        self.kinematic_body.position = self.default_kinematic_position
        self.kinematic_body.angular_velocity = random.randrange(-1, 2, 2)

        # Reset game state
        self.steps = 0
        self.start_time = time.time()
        self.game_over = False
        self.score = 0
        self.particles = []

        # Return initial observation
        return self._get_observation()

    def step(self, action: float) -> Tuple[np.ndarray, float, bool, Dict]:
        """
        Take a step in the game using the given action.

        Args:
            action: Float value between -1.0 and 1.0 controlling platform rotation

        Returns:
            observation: Game state observation
            reward: Reward for this step
            terminated: Whether episode is done
            info: Additional information
        """
        # Apply action to platform rotation
        action_value = (0 - self.player_ball_speed) if action == 0 else self.player_ball_speed

        self.dynamic_body.angular_velocity += action_value

        # Step the physics simulation
        self.space.step(1/self.fps)

        # Update particle effects
        self._update_particles()

        # Check game state
        self.steps += 1
        terminated = False
        reward = self.reward_staying_alive

        # Calculate reward for keeping ball centered on platform
        ball_x = self.dynamic_body.position[0]

        # Check if ball falls off screen
        if (self.dynamic_body.position[1] > self.kinematic_body.position[1] or
            self.dynamic_body.position[0] < 0 or
            self.dynamic_body.position[0] > self.window_x or
            self.steps >= self.max_step
            ):

            print("Score: ", self.score)
            terminated = True
            reward = self.penalty_falling if self.steps < self.max_step else 0
            self.game_over = True

            result = {
                "game_total_duration": f"{time.time() - self.start_time:.2f}",
                "score": self.score,
            }
            self.recorder.add_no_limit(result)

            if self.sound_enabled and self.sound_fall:
                self.sound_fall.play()

        step_reward = self._reward_calculator(ball_x)
        self.score += step_reward
        # print("ball_x: ", ball_x, ", self.score: ", self.score)
        return self._get_observation(), step_reward, terminated

    def _get_observation(self) -> np.ndarray:
        """Convert game state to observation for RL agent"""
        # update particles and draw them
        screen_data = self.render() # 获取数据

        if self.capture_per_second is not None and self.frame_count % self.capture_per_second == 0:  # Every second at 60 FPS
            pygame.image.save(self.screen, f"capture/frame_{self.frame_count/60}.png")

        self.frame_count += 1
        try:
            screen_data = np.transpose(screen_data, (1, 0, 2))  # 转置以符合 (height, width, channels)
            return screen_data
        except ValueError:
            return screen_data


    def _update_particles(self):
        """Update particle effects for indie visual style"""
        # Create new particles when ball hits platform
        if abs(self.dynamic_body.position[1] - (self.kinematic_body.position[1] - 20)) < 5 and abs(self.dynamic_body.velocity[1]) > 100:
            for _ in range(5):
                self.particles.append({
                    'x': self.dynamic_body.position[0],
                    'y': self.dynamic_body.position[1] + self.ball_radius,
                    'vx': random.uniform(-2, 2),
                    'vy': random.uniform(1, 3),
                    'life': 30,
                    'size': random.uniform(2, 5),
                    'color': random.choice(self.PARTICLE_COLORS)
                })

            if self.sound_enabled and self.sound_bounce:
                self.sound_bounce.play()

        # Update existing particles
        for particle in self.particles[:]:
            particle['x'] += particle['vx']
            particle['y'] += particle['vy']
            particle['life'] -= 1
            if particle['life'] <= 0:
                self.particles.remove(particle)

    def render(self) -> Optional[np.ndarray]:
        """Render the current game state"""
        if self.render_mode == "headless":
            return None

        # Clear screen with background color
        self.screen.fill(self.BACKGROUND_COLOR)

        # Custom drawing (for indie style)
        self._draw_indie_style()


        # Update display if in human mode
        if self.render_mode == "human":
            # Draw game information
            self._draw_game_info()
            pygame.display.flip()
            self.clock.tick(self.fps)
            return None

        elif self.render_mode == "rgb_array":
            # Return RGB array for gym environment
            return pygame.surfarray.array3d(self.screen)

        elif self.render_mode == "rgb_array_and_human": # todo
            print("rgb_array_and_human mode is not supported yet.")

        elif self.render_mode == "rgb_array_and_human_in_colab":
            self.space.debug_draw(self.draw_options)
            current_time = time.time()
            if current_time - self.last_update_time >= self.update_interval:
                # Convert Pygame surface to an image that can be displayed in Colab
                buffer = BytesIO()
                pygame.image.save(self.screen, buffer, 'PNG')
                buffer.seek(0)
                img_data = base64.b64encode(buffer.read()).decode('utf-8')

                # Update the HTML image
                self.display_handle.update(ipd.HTML(f'''
                    <div id="pygame-output" style="width:100%;">
                        <img id="pygame-img" src="data:image/png;base64,{img_data}" style="width:100%;">
                    </div>
                '''))

                self.last_update_time = current_time
            return pygame.surfarray.array3d(self.screen)
        else:
            pass

    def _draw_indie_style(self):
        """
        Draw the platform, its rotation indicator, the ball and the particles.

        The platform is drawn to match its physics shape: a polygon for a
        rectangular (``pymunk.Poly``) platform, a circle otherwise.
        """
        platform_pos = (int(self.kinematic_body.position[0]), int(self.kinematic_body.position[1]))
        outline_color = (255, 255, 255)

        if isinstance(self.platform, pymunk.Poly):
            # Rectangle platform: transform the local vertices into world
            # space so the drawing follows the body's position and rotation.
            platform_points = []
            for vertex in self.platform.get_vertices():
                x, y = self.kinematic_body.local_to_world(vertex)
                platform_points.append((int(x), int(y)))

            pygame.draw.polygon(self.screen, self.PLATFORM_COLOR, platform_points)
            pygame.draw.polygon(self.screen, outline_color, platform_points, 2)
            # platform_length is the full side length here
            indicator_radius = self.platform_length / 2
        else:
            # Circle platform: platform_length already holds the radius
            pygame.draw.circle(self.screen, self.PLATFORM_COLOR, platform_pos, self.platform_length)
            pygame.draw.circle(self.screen, outline_color, platform_pos, self.platform_length, 2)
            indicator_radius = self.platform_length

        # Draw rotation direction indicator
        self._draw_rotation_indicator(platform_pos, indicator_radius, self.kinematic_body.angular_velocity)

        # Draw ball with a white outline
        ball_pos = (int(self.dynamic_body.position[0]), int(self.dynamic_body.position[1]))
        pygame.draw.circle(self.screen, self.BALL_COLOR, ball_pos, self.ball_radius)
        pygame.draw.circle(self.screen, outline_color, ball_pos, self.ball_radius, 2)

        # Draw particles
        for particle in self.particles:
            pygame.draw.circle(
                self.screen,
                particle['color'],
                (int(particle['x']), int(particle['y'])),
                int(particle['size'])
            )

    def _draw_rotation_indicator(self, position, radius, angular_velocity):
        """Draw an indicator showing the platform's rotation direction and speed"""
        # Only draw the indicator if there's some rotation
        if abs(angular_velocity) < 0.1:
            return

        # Calculate indicator properties based on angular velocity
        indicator_color = (50, 255, 150) if angular_velocity > 0 else (255, 150, 50)
        num_arrows = min(3, max(1, int(abs(angular_velocity))))
        indicator_radius = radius - 20  # Place indicator inside the platform

        # Draw arrow indicators along the platform's circumference
        start_angle = self.kinematic_body.angle

        for i in range(num_arrows):
            # Calculate arrow position
            arrow_angle = start_angle + i * (2 * np.pi / num_arrows)

            # Calculate arrow start and end points
            base_x = position[0] + int(np.cos(arrow_angle) * indicator_radius)
            base_y = position[1] + int(np.sin(arrow_angle) * indicator_radius)

            # Determine arrow direction based on angular velocity
            if angular_velocity > 0:  # Clockwise
                arrow_end_angle = arrow_angle + 0.3
            else:  # Counter-clockwise
                arrow_end_angle = arrow_angle - 0.3

            tip_x = position[0] + int(np.cos(arrow_end_angle) * (indicator_radius + 15))
            tip_y = position[1] + int(np.sin(arrow_end_angle) * (indicator_radius + 15))

            # Draw arrow line
            pygame.draw.line(self.screen, indicator_color, (base_x, base_y), (tip_x, tip_y), 3)

            # Draw arrowhead
            arrowhead_size = 7
            pygame.draw.circle(self.screen, indicator_color, (tip_x, tip_y), arrowhead_size)

    def _draw_game_info(self):
        """Overlay elapsed time (top left), score (top right) and, when the game is over, a banner."""
        white = (255, 255, 255)
        backdrop = (0, 0, 0, 128)

        # Render the two HUD labels
        time_text = f"Time: {time.time() - self.start_time:.1f}"
        score_text = f"Score: {self.score}"
        time_label = self.font.render(time_text, True, white)
        score_label = self.font.render(score_text, True, white)

        # Dark boxes behind the labels
        score_left = self.window_x - score_label.get_width()
        pygame.draw.rect(
            self.screen, backdrop,
            (5, 5, time_label.get_width() + 10, time_label.get_height() + 5))
        pygame.draw.rect(
            self.screen, backdrop,
            (score_left - 15, 5, score_label.get_width() + 10, score_label.get_height() + 5))

        # The labels themselves
        self.screen.blit(time_label, (10, 10))
        self.screen.blit(score_label, (score_left - 10, 10))

        if not self.game_over:
            return

        # Game over: dim the whole screen and centre the banner
        banner = self.font.render("GAME OVER - Press R to restart", True, white)

        dimmer = pygame.Surface((self.window_x, self.window_y), pygame.SRCALPHA)
        dimmer.fill((0, 0, 0, 128))
        self.screen.blit(dimmer, (0, 0))

        banner_x = self.window_x/2 - banner.get_width()/2
        banner_y = self.window_y/2 - banner.get_height()/2
        self.screen.blit(banner, (banner_x, banner_y))

    def _get_x_axis_max_reward_rate(self, platform_length):
        """
        ((self.platform_length / 2) - 5) for calculate the distance to the
        center of game window coordinates. The closer you are, the higher the reward.

        When the ball is to be 10 points away from the center coordinates,
        it should be 1 - ((self.platform_length - 10) * self.x_axis_max_reward_rate)
        """
        self.reward_width = (platform_length / 2) - 5
        self.x_axis_max_reward_rate = 2 / self.reward_width
        print("self.x_axis_max_reward_rate: ", self.x_axis_max_reward_rate)

    # def _reward_calculator(self, ball_x):
    #     # score & reward
    #     if self.steps < 2000:
    #         step_reward = self.steps * 0.01
    #     elif self.steps < 5000:
    #         step_reward = self.steps * 0.03
    #     else:
    #         step_reward = self.steps * 0.05

    #     rw = abs(ball_x - self.window_x/2)
    #     if rw < self.reward_width:
    #         x_axis_reward_rate = 1 + ((self.reward_width - abs(ball_x - self.window_x/2)) * self.x_axis_max_reward_rate)
    #         step_reward = self.steps * 0.01 * x_axis_reward_rate  # Simplified reward calculation
    #         return step_reward
    #     else:
    #         return 0

    def _reward_calculator(self, ball_x):
        # score & reward
        step_reward = 1/100

        rw = abs(ball_x - self.window_x/2)
        if rw < self.reward_width:
            x_axis_reward_rate = 1 + ((self.reward_width - abs(ball_x - self.window_x/2)) * self.x_axis_max_reward_rate)
            step_reward = self.steps * 0.01 * x_axis_reward_rate  # Simplified reward calculation

            if self.steps % 500 == 0:
                step_reward += self.steps/100

            return step_reward
        else:
            return 0

    def close(self):
        """Close the game and clean up resources"""
        if self.render_mode in ["human", "rgb_array"]:
            pygame.quit()

    def run_standalone(self):
        """Run the game in standalone mode with keyboard controls"""
        if self.render_mode not in ["human"]:
            raise ValueError("Standalone mode requires render_mode='human'")

        running = True
        while running:
            # Handle events
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    running = False
                elif event.type == pygame.KEYDOWN:
                    if event.key == pygame.K_r and self.game_over:
                        self.reset()

            # Process keyboard controls
            keys = pygame.key.get_pressed()
            action = 0
            if keys[pygame.K_LEFT]:
                action = 0 - self.player_ball_speed
            if keys[pygame.K_RIGHT]:
                action = self.player_ball_speed

            # Take game step
            if not self.game_over:
                self.step(action)

            # Render
            self.render()

        self.close()

pygame-ce 2.5.3 (SDL 2.30.12, Python 3.11.12)


## Levels

## GYM env

In [None]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces

# from balancing_ball_game import BalancingBallGame

class BalancingBallEnv(gym.Env):
    """
    Gymnasium wrapper around BalancingBallGame.

    Observations are the last ``stack_size`` rendered frames concatenated
    along the channel axis; actions are discrete (0: left, 1: right).
    """
    metadata = {'render_modes': ['human', 'rgb_array']}

    def __init__(self, render_mode="rgb_array", difficulty="medium", fps=30):
        """
        Create the wrapped game and declare the action/observation spaces.

        Args:
            render_mode: Render mode forwarded to the game
            difficulty: Difficulty forwarded to the game
            fps: Frames per second forwarded to the game
        """
        super().__init__()

        # Two discrete actions - 0: left, 1: right
        self.action_space = spaces.Discrete(2)

        # Geometry of the (small) training window
        self.window_x = 300
        self.window_y = 180
        self.platform_shape = "circle"
        self.platform_proportion = 0.333

        # Frame stacking state
        self.stack_size = 3
        self.observation_stack = []
        self.render_mode = render_mode

        self.game = BalancingBallGame(
            render_mode=render_mode,
            sound_enabled=(render_mode == "human"),
            difficulty=difficulty,
            window_x=self.window_x,
            window_y=self.window_y,
            fps=fps,
            platform_shape=self.platform_shape,
            platform_proportion=self.platform_proportion,
        )

        # RGB image observations, stack_size frames deep along the channel axis
        stacked_shape = (self.window_y, self.window_x, 3 * self.stack_size)
        self.observation_space = spaces.Box(
            low=0, high=255,
            shape=stacked_shape,
            dtype=np.uint8
        )

        # Half the platform length minus a 5 px margin, and the matching
        # per-pixel reward rate
        self.platform_reward_length = (self.game.platform_length / 2) - 5
        self.x_axis_max_reward_rate = 0.5 / self.platform_reward_length

        self.step_start_timestamp = None

    def _stacked_observation(self):
        """Concatenate the buffered frames along the last (channel) axis."""
        return np.concatenate(self.observation_stack, axis=-1)

    def step(self, action):
        """
        Take a step in the environment.

        Returns:
            (observation, reward, terminated, truncated, info) as Gymnasium
            expects; ``truncated`` is always False and ``info`` is empty.
        """
        # TODO: make the model act once and hold that action for a fixed
        # number of frames, collecting and stacking those frames as the input
        # for the next prediction (e.g. with a 6-frame cycle one action lasts
        # six frames and those six frames are stacked for the next decision).
        obs, step_reward, terminated = self.game.step(action)

        frames = self.observation_stack
        frames.append(obs)
        if len(frames) > self.stack_size:
            # Drop the oldest frame
            del frames[0]

        # Not full yet: pad the front with the current frame
        while len(frames) < self.stack_size:
            frames.insert(0, obs)

        return self._stacked_observation(), step_reward, terminated, False, {}

    def reset(self, seed=None, options=None):
        """Reset the game and return ``(stacked_observation, info)``."""
        super().reset(seed=seed)  # Seeds the environment the Gymnasium way

        observation = self.game.reset()

        # Start with a stack made entirely of the first frame
        self.observation_stack = [observation for _ in range(self.stack_size)]

        info = {}
        return self._stacked_observation(), info

    def render(self):
        """Render the environment by delegating to the game."""
        return self.game.render()

    def close(self):
        """Clean up resources held by the game."""
        self.game.close()

## Test

In [None]:
import argparse

# from balancing_ball_game import BalancingBallGame

def run_standalone_game(render_mode="human", difficulty="medium", capture_per_second=3):
    """Run the game in standalone mode with visual display.

    Builds a ``BalancingBallGame`` with a 1000x600 window, a circular platform
    (proportion 0.333) at 30 fps, then hands control to ``run_standalone()``.

    Args:
        render_mode: Render mode forwarded to ``BalancingBallGame``.
        difficulty: Difficulty level forwarded to ``BalancingBallGame``.
        capture_per_second: Forwarded to ``BalancingBallGame``. Previously this
            argument was accepted but a hard-coded 3 was passed instead.
    """
    game = BalancingBallGame(
        render_mode=render_mode,
        difficulty=difficulty,
        window_x=1000,
        window_y=600,
        platform_shape="circle",
        platform_proportion=0.333,
        fps=30,
        capture_per_second=capture_per_second,
    )

    game.run_standalone()

def test_gym_env(episodes=3, difficulty="medium"):
    """Test the OpenAI Gym environment"""
    import time
    # from gym_env import BalancingBallEnv

    fps = 30
    env = BalancingBallEnv(
        render_mode="human",
        difficulty=difficulty,
        fps=fps,
    )

    for episode in range(episodes):
        observation, info = env.reset()
        total_reward = 0
        step = 0
        done = False

        while not done:
            # Sample a random action (for testing only)
            action = env.action_space.sample()

            # Take step
            observation, reward, terminated, truncated, _ = env.step(action)

            done = terminated or truncated
            total_reward += reward
            step += 1

            # Render
            env.render()

        print(f"Episode {episode+1}: Steps: {step}, Total Reward: {total_reward:.2f}")

    env.close()

## Train

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym
import sys
import optuna

from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticCnnPolicy
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy

# Add the game directory to the system path
# sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "game_base_files_test"))

# from gym_env import BalancingBallEnv

# support render_mode: human, rgb_array, rgb_array_and_human, rgb_array_and_human_in_colab

class Train:
    """Build, train, evaluate and tune a PPO agent on ``BalancingBallEnv``.

    Constructing a ``Train`` creates the vectorized training environment, a
    single-environment evaluation environment and the PPO model (either a new
    one or one loaded from disk).
    """

    def __init__(self,
                 learning_rate=0.0003,
                 n_steps=2048,
                 batch_size=64,
                 n_epochs=10,
                 gamma=0.99,
                 gae_lambda=0.95,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 policy_kwargs=None,
                 n_envs=4,
                 difficulty="medium",
                 load_model=None,
                 log_dir="./logs/",
                 model_dir="./models/",
                ):
        """Create the environments and the PPO model.

        Args:
            learning_rate, n_steps, batch_size, n_epochs, gamma, gae_lambda,
            ent_coef, vf_coef, max_grad_norm: PPO hyperparameters, passed to the
                ``PPO`` constructor when a new model is created.
            policy_kwargs: Policy keyword arguments for a new model. When None,
                a features extractor with ``features_dim`` 512 is used.
            n_envs: Number of training environments.
            difficulty: Difficulty level used for the training and evaluation
                environments (and for hyperparameter-tuning trials).
            load_model: Optional path of a saved model. When set, the model is
                restored with ``PPO.load`` and the hyperparameter arguments
                above are not passed to it.
            log_dir: Directory for tensorboard/evaluation logs (created if missing).
            model_dir: Directory for saved models (created if missing).
        """
        # Create directories
        os.makedirs(log_dir, exist_ok=True)
        os.makedirs(model_dir, exist_ok=True)
        self.log_dir = log_dir
        self.model_dir = model_dir
        self.n_envs = n_envs
        self.difficulty = difficulty

        # Setup environments
        # support render_mode: human, rgb_array, rgb_array_and_human, rgb_array_and_human_in_colab
        env = make_vec_env(
            self.make_env(render_mode="rgb_array", difficulty=difficulty),
            n_envs=n_envs
        )
        # Apply VecTransposeImage to correctly handle image observations
        self.env = VecTransposeImage(env)

        # Setup evaluation environment
        eval_env = make_vec_env(
            self.make_env(render_mode="rgb_array", difficulty=difficulty),
            n_envs=1
        )
        self.eval_env = VecTransposeImage(eval_env)

        # Define policy kwargs if not provided
        if policy_kwargs is None:
            policy_kwargs = {
                "features_extractor_kwargs": {"features_dim": 512},
            }

        # Create the PPO model
        if load_model:
            print(f"Loading model from {load_model}")
            self.model = PPO.load(
                load_model,
                env=self.env,
                tensorboard_log=log_dir,
            )
        else:
            self.model = PPO(
                policy=ActorCriticCnnPolicy,
                env=self.env,
                learning_rate=learning_rate,
                n_steps=n_steps,
                batch_size=batch_size,
                n_epochs=n_epochs,
                gamma=gamma,
                gae_lambda=gae_lambda,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                tensorboard_log=log_dir,
                policy_kwargs=policy_kwargs,
                verbose=1,
            )

    def make_env(self, render_mode="rgb_array", difficulty="medium"):
        """
        Create and return an environment function to be used with VecEnv

        Args:
            render_mode: Render mode forwarded to ``BalancingBallEnv``.
            difficulty: Difficulty level forwarded to ``BalancingBallEnv``.

        Returns:
            A zero-argument callable that builds a new ``BalancingBallEnv``.
        """
        def _init():
            env = BalancingBallEnv(render_mode=render_mode, difficulty=difficulty)
            return env
        return _init

    def train_ppo(self,
                  total_timesteps=1000000,
                  save_freq=10000,
                  eval_freq=10000,
                  eval_episodes=5,
                 ):
        """
        Train a PPO agent to play the Balancing Ball game

        Checkpoints and the best evaluated model are written to
        ``self.model_dir``; evaluation logs go to ``self.log_dir``. The final
        model is saved as ``ppo_balancing_ball_final`` in ``self.model_dir``.

        Args:
            total_timesteps: Total number of steps to train for
            save_freq: How often to save checkpoints (in timesteps)
            eval_freq: How often to evaluate the model (in timesteps)
            eval_episodes: Number of episodes to evaluate on

        Returns:
            The trained model (``self.model``).
        """

        # Frequencies are given in timesteps, so divide by n_envs for the
        # callbacks. Clamp to at least 1: a frequency smaller than n_envs would
        # otherwise floor to 0.
        checkpoint_callback = CheckpointCallback(
            save_freq=max(save_freq // self.n_envs, 1),
            save_path=self.model_dir,
            name_prefix="ppo_balancing_ball"
        )

        eval_callback = EvalCallback(
            self.eval_env,
            best_model_save_path=self.model_dir,
            log_path=self.log_dir,
            eval_freq=max(eval_freq // self.n_envs, 1),
            n_eval_episodes=eval_episodes,
            deterministic=True,
            render=False
        )

        # Train the model
        print("Starting training...")
        self.model.learn(
            total_timesteps=total_timesteps,
            callback=[checkpoint_callback, eval_callback],
        )

        # Save the final model
        self.model.save(os.path.join(self.model_dir, "ppo_balancing_ball_final"))

        print("Training completed!")
        return self.model

    def evaluate(self, model_path, n_episodes=10, difficulty="medium"):
        """
        Evaluate a trained model

        The model is evaluated on a dedicated single environment built for the
        requested ``difficulty``. That environment is closed afterwards; the
        trainer's own environments are left open so the trainer stays usable.

        Args:
            model_path: Path to the saved model
            n_episodes: Number of episodes to evaluate on
            difficulty: Game difficulty level

        Returns:
            ``(mean_reward, std_reward)`` as returned by ``evaluate_policy``.
        """
        # Load the model
        model = PPO.load(model_path)

        # Build an environment for the requested difficulty instead of reusing
        # (and then closing) the shared training environment.
        eval_env = VecTransposeImage(
            make_vec_env(
                self.make_env(render_mode="rgb_array", difficulty=difficulty),
                n_envs=1
            )
        )

        try:
            mean_reward, std_reward = evaluate_policy(
                model,
                eval_env,
                n_eval_episodes=n_episodes,
                deterministic=True,
                render=True
            )
        finally:
            eval_env.close()

        print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

        return mean_reward, std_reward

    def optuna_parameter_tuning(self, n_trials):
        """Search PPO hyperparameters with Optuna.

        Each trial trains a fresh model (see ``objective``); ``self.model`` is
        not modified.

        Args:
            n_trials: Number of Optuna trials to run.

        Returns:
            The best trial of the study (exposes ``.params`` and ``.value``).
        """
        print("You are using optuna for automatic parameter tuning, it will create a new model")

        # NOTE(review): objective() never calls trial.report(), so no intermediate
        # values reach this pruner - confirm whether pruning is actually intended.
        pruner = optuna.pruners.HyperbandPruner(
            min_resource=100,        # minimum resource
            max_resource='auto',   # maximum resource ('auto' or an integer)
            reduction_factor=3     # reduction factor (eta)
        )

        # Create the study object and attach the pruner
        study = optuna.create_study(direction='maximize', pruner=pruner)

        # Run the optimization
        study.optimize(self.objective, n_trials=n_trials)

        # Report the results
        best_trial = study.best_trial
        print("最佳試驗的超參數：", best_trial.params)
        print("最佳試驗的平均回報：", best_trial.value)

        df = study.trials_dataframe()
        print(df.head())

        # Callers read `.params` from the result, so hand the best trial back.
        return best_trial

    def objective(self, trial):
        """Optuna objective: train a fresh PPO model and return its mean reward.

        Args:
            trial: Optuna trial used to sample the hyperparameters.

        Returns:
            Mean reward over 10 evaluation episodes after 10000 training timesteps.
        """

        # 1. Suggest hyperparameters
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
        gamma = trial.suggest_float('gamma', 0.9, 0.999)
        clip_range = trial.suggest_float('clip_range', 0.1, 0.3)
        # GAE lambda is a bias/variance trade-off factor in [0, 1]; the previous
        # upper bound of 2 sampled invalid values.
        gae_lambda = trial.suggest_float('gae_lambda', 0.5, 1.0)
        ent_coef = trial.suggest_float('ent_coef', 0.005, 0.05)
        vf_coef = trial.suggest_float('vf_coef', 0.1, 1)
        features_dim = trial.suggest_categorical('features_dim', [32, 64, 128, 256, 512])
        policy_kwargs = {
            "features_extractor_kwargs": {"features_dim": features_dim},
        }

        # Hyperparameters that are held fixed across trials
        n_steps = 2048
        batch_size = 64
        n_epochs = 10
        max_grad_norm = 0.5

        # 2. Create the environment (same difficulty the trainer was configured with)
        env = make_vec_env(
            self.make_env(render_mode="rgb_array", difficulty=self.difficulty),
            n_envs=1
        )

        try:
            # 3. Create the model
            model = PPO(
                policy=ActorCriticCnnPolicy,
                env=env,
                learning_rate=learning_rate,
                n_steps=n_steps,
                batch_size=batch_size,
                n_epochs=n_epochs,
                gamma=gamma,
                clip_range=clip_range,
                gae_lambda=gae_lambda,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                tensorboard_log=self.log_dir,
                policy_kwargs=policy_kwargs,
                verbose=0,
            )

            # 4. Train the model
            model.learn(total_timesteps=10000)

            # 5. Evaluate the model
            mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
        finally:
            # 6. Close the environment, even when training or evaluation fails
            env.close()

        # 7. Return the mean reward
        return mean_reward

# import argparse

# parser = argparse.ArgumentParser(description="Train or evaluate PPO agent for Balancing Ball")
# parser.add_argument("--mode", type=str, default="train", choices=["train", "eval"],
#                     help="Mode: 'train' to train model, 'eval' to evaluate")
# parser.add_argument("--timesteps", type=int, default=1000000,
#                     help="Total timesteps for training")
# parser.add_argument("--difficulty", type=str, default="medium",
#                     choices=["easy", "medium", "hard"],
#                     help="Game difficulty")
# parser.add_argument("--load_model", type=str, default=None,
#                     help="Path to model to load for continued training or evaluation")
# parser.add_argument("--n_envs", type=int, default=4,
#                     help="Number of parallel environments for training")
# parser.add_argument("--eval_episodes", type=int, default=5,
#                     help="Number of episodes for evaluation")

# args = parser.parse_args()

# if args.mode == "train":
#     train_ppo(
#         total_timesteps=args.timesteps,
#         difficulty=args.difficulty,
#         n_envs=args.n_envs,
#         load_model=args.load_model,
#         eval_episodes=args.eval_episodes,
#     )
# else:
#     if args.load_model is None:
#         print("Error: Must provide --load_model for evaluation")
#     else:
#         evaluate(
#             model_path=args.load_model,
#             n_episodes=args.eval_episodes,
#             difficulty=args.difficulty
#         )

# Training

In [None]:
import gc

# Memory-optimized TPU training setup
def get_tpu_memory_info():
    """Print memory information from the TPU device, if one is available.

    Diagnostic only. Does nothing when ``TPU_AVAILABLE`` is false. Otherwise a
    child Python process queries ``torch_xla`` for the default device's memory
    info and its output is printed. A failing query is reported rather than
    printed as an empty result, and only ordinary exceptions are caught, so
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate.
    """
    if not TPU_AVAILABLE:
        return

    import subprocess
    import sys

    query = (
        'import torch_xla; '
        'print(torch_xla._XLAC._xla_get_memory_info('
        'torch_xla._XLAC._xla_get_default_device()))'
    )

    try:
        # Use the kernel's own interpreter so the child sees the same packages.
        result = subprocess.run(
            [sys.executable or 'python3', '-c', query],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception as exc:
        print(f"Could not get detailed TPU memory info: {exc}")
        return

    if result.returncode != 0:
        # The child failed (e.g. import error); show why instead of empty output.
        print(f"Could not get detailed TPU memory info: {result.stderr.strip()}")
        return

    print(f"TPU Memory Info: {result.stdout}")

# Display memory information
get_tpu_memory_info()

n_envs = 1
batch_size = 64
n_steps = 1024
features_dim = 256

# Policy kwargs with memory-efficient parameters
policy_kwargs = {
    "features_extractor_kwargs": {"features_dim": features_dim},
    "net_arch": [64, 64]
}

# Create trainer with optimized parameters for TPU memory
training = Train(
    n_steps=n_steps,
    batch_size=batch_size,
    difficulty="medium",
    n_envs=n_envs,
    load_model=None,
    policy_kwargs=policy_kwargs
)

# Choose whether to do hyperparameter optimization or direct training
do_optimization = True

if do_optimization:
    # Force TPU memory cleanup before starting
    if TPU_AVAILABLE:
        gc.collect()
        xm.mark_step()

    n_trials = 10
    best_trial = training.optuna_parameter_tuning(n_trials=n_trials)
    # optuna_parameter_tuning prints the best parameters itself. Guard against a
    # None result: the recorded run of this cell crashed here with
    # "AttributeError: 'NoneType' object has no attribute 'params'".
    if best_trial is not None:
        print(f"Best parameters found: {best_trial.params}")
else:
    # Run training with memory-optimized settings
    # Use fewer total timesteps for TPU to avoid memory issues
    total_timesteps = 50000

    model = training.train_ppo(
        total_timesteps=total_timesteps,
        eval_episodes=3,  # Fewer eval episodes on TPU
        save_freq=5000,
        eval_freq=5000
    )

    # Force memory cleanup after training
    if TPU_AVAILABLE:
        del model
        gc.collect()
        xm.mark_step()

The json memory file does not exist. Creating new file.
self.x_axis_max_reward_rate:  0.0449438202247191
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Using cuda device


[I 2025-05-12 08:13:43,648] A new study created in memory with name: no-name-5d310101-4195-4b87-b831-be4932a8f112


You are using optuna for automatic parameter tuning, it will create a new model
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Score:  11.263104835559039
Score:  9.647824077317328
Score:  9.622548383576824
Score:  9.633348517892653
Score:  9.03234756678395
Score:  9.14713794621651
Score:  12.129454383666218
Score:  9.968061915868603
Score:  9.022558333414
Score:  9.898184459862577
Score:  11.268554368128436
Score:  10.378489003550966
Score:  7.991167535428429
Score:  9.935074578892182
Score:  9.249628164461809
Score:  11.264525632767171
Score:  9.769785220875953
Score:  12.094760668304943
Score:  10.299269307834457
Score:  9.084706280447536
Score:  9.291278712724445
Score:  7.414232092028512
Score:  8.374070687198516
Score:  9.762787307484125
Score:  10.401097375438106
Score:  10.820344614345188
Score:  9.24843443267512
Score:  13.698671137660723
Score:  9.886627694955928
Score:  11.522349363513193
Score:  9.187992866878254
Score:  9.329266069644238
Score

[I 2025-05-12 08:16:38,400] Trial 0 finished with value: 24.222833 and parameters: {'learning_rate': 0.0009658098068020226, 'gamma': 0.9702335826016597, 'clip_range': 0.20249713527030894, 'gae_lambda': 1.2349541159790536, 'ent_coef': 0.04689464867612561, 'vf_coef': 0.26001291356702166, 'features_dim': 64}. Best is trial 0 with value: 24.222833.


Score:  35.89533087026333
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Score:  12.819456080763286
Score:  10.508612752211912
Score:  8.663565115627756
Score:  9.11713158933281
Score:  12.863521067550023
Score:  8.45823277577527
Score:  9.174039724484595
Score:  9.011470130664275
Score:  12.227910733230381
Score:  9.693641383724572
Score:  9.948986399895555
Score:  9.187956536445707
Score:  9.232639466655296
Score:  17.230359564813806
Score:  11.238044494989598
Score:  10.663407937009955
Score:  9.019817991681585
Score:  7.355525434433136
Score:  9.878733582360494
Score:  8.50460565285637
Score:  8.520016169213223
Score:  9.822136030272295
Score:  8.966862373224734
Score:  10.417491198559755
Score:  11.556676608468207
Score:  9.996221022745738
Score:  10.690522778054214
Score:  10.692778395596875
Score:  7.865478755420718
Score:  9.956863147885857
Score:  9.311304196165464
Score:  9.262005354406911
Score:  10.169536869793417
Score:  11.195193516094884
Sc

[I 2025-05-12 08:19:33,760] Trial 1 finished with value: 21.304708500000004 and parameters: {'learning_rate': 0.00012011484933666821, 'gamma': 0.9127820522537138, 'clip_range': 0.1365668959481123, 'gae_lambda': 1.1281392718108996, 'ent_coef': 0.023903457482917858, 'vf_coef': 0.10142866020613063, 'features_dim': 128}. Best is trial 0 with value: 24.222833.


Score:  6.714086311401905
Score:  6.714086311401905
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Score:  10.58993276846231
Score:  8.545650699605247
Score:  12.003370665035817
Score:  9.851696391259022
Score:  8.028672548620616
Score:  9.337992999681532
Score:  17.13675542087706
Score:  10.395180361268947
Score:  10.830298116257385
Score:  9.188276295322476
Score:  9.010190314252236
Score:  14.555110750807112
Score:  10.43308900529339
Score:  10.6913593679443
Score:  9.93320159579189
Score:  9.183182229525281
Score:  9.808476768468847
Score:  11.470482105665557
Score:  8.61201827830468
Score:  9.144857952773163
Score:  10.491304488846067
Score:  11.95020828159633
Score:  11.463009578593942
Score:  10.462914926225523
Score:  9.941994769486989
Score:  9.262456718486057
Score:  10.310016530362388
Score:  9.766974473876944
Score:  9.836219255447672
Score:  9.999906400354154
Score:  9.750949259738015
Score:  12.22833307581311
Score:  10.11502852879867
Score:

[I 2025-05-12 08:22:31,063] Trial 2 finished with value: 15.4688969 and parameters: {'learning_rate': 0.00023531126871497102, 'gamma': 0.9076485143377995, 'clip_range': 0.13771263889425167, 'gae_lambda': 1.7362208980524643, 'ent_coef': 0.0281629208353441, 'vf_coef': 0.8004633191031324, 'features_dim': 128}. Best is trial 0 with value: 24.222833.


Score:  35.89533087026333
Score:  6.714086311401905
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Score:  8.403491680609735
Score:  8.591782821475176
Score:  12.045536814085429
Score:  11.537130244769743
Score:  7.896461256035934
Score:  18.236499852385617
Score:  11.280496579749018
Score:  11.083285450559478
Score:  9.688488096959563
Score:  8.35561955455311
Score:  9.104770538699727
Score:  9.791343081758887
Score:  9.10832719872834
Score:  9.282516637328444
Score:  11.548306821693231
Score:  8.540095132785922
Score:  11.422907641411356
Score:  7.901380589864368
Score:  9.265201650512344
Score:  9.205027794043808
Score:  10.54896280233502
Score:  11.550166405958596
Score:  12.99850892480011
Score:  8.463805081108426
Score:  10.46582541015463
Score:  7.7427320201695675
Score:  14.398872304520236
Score:  10.03871956326535
Score:  10.033575365489282
Score:  9.09000068967239
Score:  11.935650942649893
Score:  12.050249914394358
Score:  9.154458638515827
Sc

[I 2025-05-12 08:25:28,777] Trial 3 finished with value: 27.4462909 and parameters: {'learning_rate': 0.0003927558743241921, 'gamma': 0.9510980035023114, 'clip_range': 0.14221297235184943, 'gae_lambda': 1.385208372795532, 'ent_coef': 0.03630056835036703, 'vf_coef': 0.45553387828732683, 'features_dim': 64}. Best is trial 3 with value: 27.4462909.


Score:  35.89533087026333
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Score:  9.660315009782538
Score:  9.093245036407467
Score:  8.445113359795544
Score:  8.58666357304478
Score:  10.689388022645499
Score:  9.957194944306226
Score:  9.841167070367597
Score:  8.953154258521485
Score:  10.595871421688143
Score:  7.80885002973819
Score:  12.95310105503993
Score:  8.619234466681108
Score:  12.561031594224364
Score:  9.045878063469734
Score:  10.668366981058748
Score:  8.966980856466158
Score:  9.1903311514226
Score:  8.491713850043588
Score:  10.639747160417969
Score:  9.982164656130939
Score:  9.040128883515347
Score:  10.626116257413454
Score:  9.8964066377854
Score:  8.581189127100885
Score:  7.79922807048953
Score:  11.951533235866687
Score:  12.183729960898301
Score:  9.341017561464746
Score:  8.367779868831072
Score:  10.632372095516628
Score:  9.169762684629207
Score:  13.554539189446785
Score:  9.632644716154449
Score:  9.660076865044248
Score:  7

[I 2025-05-12 08:28:25,908] Trial 4 finished with value: 21.3047085 and parameters: {'learning_rate': 0.0007079843389534981, 'gamma': 0.9274712662480489, 'clip_range': 0.1484416254268359, 'gae_lambda': 1.6547888812281273, 'ent_coef': 0.03184867928642882, 'vf_coef': 0.8921940284929839, 'features_dim': 128}. Best is trial 3 with value: 27.4462909.


Score:  35.89533087026333
Score:  6.714086311401905
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Score:  9.314006845754372
Score:  9.203701467332628
Score:  13.646594425617605
Score:  10.081735879882492
Score:  9.040566525354365
Score:  10.82370040147831
Score:  8.959664009310128
Score:  8.551741033260246
Score:  7.967659762189302
Score:  9.250396640171852
Score:  9.080222822280987
Score:  8.453158299845702
Score:  9.806724307874791
Score:  9.73266897879449
Score:  11.560329408450603
Score:  11.099308026714162
Score:  9.094281431013727
Score:  10.533017974928985
Score:  8.686208686296283
Score:  10.697955483325428
Score:  12.102572342450488
Score:  11.202392914685687
Score:  12.71631109768782
Score:  9.972788091336621
Score:  9.184242486198087
Score:  10.50068227161954
Score:  11.143936441304644
Score:  10.422561976921843
Score:  8.501351067967862
Score:  9.324666004159479
Score:  9.004490843714755
Score:  9.213868156555298
Score:  10.694293705647631
Sc

[I 2025-05-12 08:31:24,477] Trial 5 finished with value: 24.2226994 and parameters: {'learning_rate': 1.7084398115891685e-05, 'gamma': 0.9689826598147425, 'clip_range': 0.2308059635915498, 'gae_lambda': 0.8951704412478843, 'ent_coef': 0.021664005990764634, 'vf_coef': 0.9619915693316471, 'features_dim': 128}. Best is trial 3 with value: 27.4462909.


Score:  35.89533087026333
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Score:  12.761511242992915
Score:  9.33051119555375
Score:  10.535341891649141
Score:  10.03277141222642
Score:  9.273672307954959
Score:  12.852949133634677
Score:  9.221798590757654
Score:  12.030217653801573
Score:  8.617978591609411
Score:  10.792910607544208
Score:  9.251006605176507
Score:  10.048996898939809
Score:  11.144614010881353
Score:  12.223943054347915
Score:  9.197495503475373
Score:  9.198777743087403
Score:  15.523862537463112
Score:  11.362943806823118
Score:  11.27071532053912
Score:  12.660507048899905
Score:  10.547866685312432
Score:  9.715223368577407
Score:  11.427461016149335
Score:  8.945637799856975
Score:  9.995600627274595
Score:  9.159609918677782
Score:  11.839729843422736
Score:  9.973121587548047
Score:  8.945430908072293
Score:  10.599146094906095
Score:  7.981894939374316
Score:  7.844126497286116
Score:  9.97926642702815
Score:  11.30448605913561

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  return np.nan if var_y == 0 else float(1 - np.var(y_true - y_pred) / var_y)


Score:  35.89533087026333
Score:  35.89533087026333
Score:  6.714086311401905
Score:  35.89533087026333
Score:  35.89533087026333
Score:  6.714086311401905
Score:  35.89533087026333
Score:  6.714086311401905
Score:  35.89533087026333
Score:  6.714086311401905
Score:  35.89533087026333
Score:  35.89533087026333
Score:  35.89533087026333
Score:  6.714086311401905
Score:  6.714086311401905
Score:  6.714086311401905
Score:  35.89533087026333
Score:  35.89533087026333
Score:  6.714086311401905
Score:  6.714086311401905
Score:  35.89533087026333
Score:  6.714086311401905
Score:  6.714086311401905
Score:  6.714086311401905
Score:  6.714086311401905
Score:  35.89533087026333
Score:  6.714086311401905
Score:  35.89533087026333
Score:  6.714086311401905
Score:  6.714086311401905
Score:  35.89533087026333
Score:  6.714086311401905
Score:  6.714086311401905
Score:  35.89533087026333
Score:  35.89533087026333
Score:  35.89533087026333
Score:  35.89533087026333
Score:  6.714086311401905
Score:  6.71

[I 2025-05-12 08:34:23,475] Trial 6 finished with value: 30.059082 and parameters: {'learning_rate': 5.486951503714077e-05, 'gamma': 0.9928562527165621, 'clip_range': 0.1331419128798429, 'gae_lambda': 1.7736564758969893, 'ent_coef': 0.04467590295722115, 'vf_coef': 0.28933176105065056, 'features_dim': 256}. Best is trial 6 with value: 30.059082.


Score:  35.89533087026333
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Score:  13.870354916688425
Score:  10.539249213414225
Score:  8.645229542346161
Score:  9.780337646176301
Score:  9.976180675009685
Score:  11.313562448123276
Score:  8.051601350151122
Score:  10.86194246202602
Score:  9.746871610896898
Score:  8.300003622160105
Score:  12.01177925484373
Score:  12.487068672422488
Score:  8.719849668019917
Score:  9.00291234897197
Score:  10.448030022227007
Score:  8.51789113109359
Score:  13.741764599422613
Score:  9.970000223784034
Score:  11.408639359817071
Score:  15.525286362311952
Score:  9.982819941009751
Score:  8.577033471178433
Score:  10.478519490069695
Score:  7.743215051606086
Score:  8.641734359729298
Score:  9.094332424390329
Score:  9.84620673382556
Score:  9.06996005870925
Score:  8.577121467503822
Score:  9.958472251008194
Score:  8.735002014200692
Score:  9.081611541677239
Score:  7.4376762540009365
Score:  8.46172086260782
Score: 

[I 2025-05-12 08:37:23,599] Trial 7 finished with value: 27.140957500000003 and parameters: {'learning_rate': 6.486519514610248e-05, 'gamma': 0.99678211145833, 'clip_range': 0.28764284045232635, 'gae_lambda': 0.8522970826158942, 'ent_coef': 0.02872307049958714, 'vf_coef': 0.9650430237181122, 'features_dim': 128}. Best is trial 6 with value: 30.059082.


Score:  35.89533087026333
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Score:  9.168232336567401
Score:  7.870277379199648
Score:  9.075492255008065
Score:  10.53071414565449
Score:  9.144492399307097
Score:  8.630489595705958
Score:  9.847325907904796
Score:  7.914186588434603
Score:  10.681385852087603
Score:  13.005682878202714
Score:  9.69589341041952
Score:  7.781969197512261
Score:  11.561930775402905
Score:  9.78672719613586
Score:  9.855135122833012
Score:  7.878087631469127
Score:  9.096049768952179
Score:  12.325614716105065
Score:  10.039963087968578
Score:  9.790842034772867
Score:  9.131134037089955
Score:  12.177636650021814
Score:  8.631137706627603
Score:  10.007160277934195
Score:  11.413700124291799
Score:  9.82972170252379
Score:  9.151950467284419
Score:  13.524447708498437
Score:  9.950415000772566
Score:  11.152009151403119
Score:  10.723648226958618
Score:  9.783968855432638
Score:  8.983739026852474
Score:  8.994542609232148
Scor

[I 2025-05-12 08:40:24,538] Trial 8 finished with value: 27.140957500000003 and parameters: {'learning_rate': 0.0001407945175771097, 'gamma': 0.9185311062449576, 'clip_range': 0.10329737350557056, 'gae_lambda': 0.89958629296331, 'ent_coef': 0.019467231173835545, 'vf_coef': 0.13198674577969202, 'features_dim': 64}. Best is trial 6 with value: 30.059082.


Score:  35.89533087026333
Loading the json memory file
self.x_axis_max_reward_rate:  0.0449438202247191
Score:  9.368742196002398
Score:  9.28609537960954
Score:  10.494809733554133
Score:  13.127861298629263
Score:  9.125050158616848
Score:  9.345452043623867
Score:  9.984431228735508
Score:  8.994918663422778
Score:  11.83262516344042
Score:  9.03808398432341
Score:  12.40597203812212
Score:  14.446702842088639
Score:  10.076578582098811
Score:  9.989956265107814
Score:  9.161891766164912
Score:  8.662431813537898
Score:  8.605605330710834
Score:  10.088554318604665
Score:  10.020120207232688
Score:  10.361656703512107
Score:  9.90406516561064
Score:  11.152506948521086
Score:  10.810401950754343
Score:  12.062934839680064
Score:  9.043350767929894
Score:  14.43913349447292
Score:  11.531493462126678
Score:  11.226661001804967
Score:  11.261381172430484
Score:  9.854507521293474
Score:  9.729711247607113
Score:  9.183752983462565
Score:  9.666621141103416
Score:  8.55650710102836
Sco

[I 2025-05-12 08:43:27,073] Trial 9 finished with value: 18.3865835 and parameters: {'learning_rate': 4.0955224759590364e-05, 'gamma': 0.945506125646682, 'clip_range': 0.17118860056412796, 'gae_lambda': 1.5200001675981019, 'ent_coef': 0.02755911768357827, 'vf_coef': 0.2703490765874229, 'features_dim': 512}. Best is trial 6 with value: 30.059082.


Score:  35.89533087026333
最佳試驗的超參數： {'learning_rate': 5.486951503714077e-05, 'gamma': 0.9928562527165621, 'clip_range': 0.1331419128798429, 'gae_lambda': 1.7736564758969893, 'ent_coef': 0.04467590295722115, 'vf_coef': 0.28933176105065056, 'features_dim': 256}
最佳試驗的平均回報： 30.059082
   number      value             datetime_start          datetime_complete  \
0       0  24.222833 2025-05-12 08:13:43.649919 2025-05-12 08:16:38.400583   
1       1  21.304709 2025-05-12 08:16:38.401665 2025-05-12 08:19:33.760175   
2       2  15.468897 2025-05-12 08:19:33.761518 2025-05-12 08:22:31.063406   
3       3  27.446291 2025-05-12 08:22:31.064579 2025-05-12 08:25:28.776555   
4       4  21.304709 2025-05-12 08:25:28.777840 2025-05-12 08:28:25.907862   

                duration  params_clip_range  params_ent_coef  \
0 0 days 00:02:54.750664           0.202497         0.046895   
1 0 days 00:02:55.358510           0.136567         0.023903   
2 0 days 00:02:57.301888           0.137713         0.0281

AttributeError: 'NoneType' object has no attribute 'params'

In [None]:
# Copy the best model to a stable location
# The $(date ...) suffix gives every copy a unique, timestamped file name.
# NOTE(review): the destination is under /content/drive, and the Drive mount
# cell near the top of this notebook is commented out - confirm Drive is
# mounted and the RL_Models folder exists before running this cell.
!cp /content/models/best_model.zip /content/drive/MyDrive/RL_Models/best_model_$(date +%Y%m%d_%H%M%S).zip

# Optional: Monitor TPU usage
# lsof lists the processes that currently have /dev/accel0 open
# (presumably the TPU device node - TODO confirm).
if TPU_AVAILABLE:
    !sudo lsof -w /dev/accel0

In [None]:
# Evaluate the best saved model, if one exists on disk
model_path = "/content/models/best_model.zip"

if not os.path.exists(model_path):
    print(f"Model not found at {model_path}")
else:
    print(f"Loading model from {model_path} for evaluation")

    # Build a single-environment trainer around the saved model
    eval_trainer = Train(
        load_model=model_path,
        difficulty="medium",
        n_envs=1,  # Use 1 env for evaluation
        n_steps=1024,
        batch_size=batch_size,
    )

    # Run 5 evaluation episodes on medium difficulty
    eval_trainer.evaluate(model_path, n_episodes=5, difficulty="medium")

# --

In [None]:
# run_standalone_game(difficulty="medium")
# test_gym_env(difficulty="medium")