In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Categorical
import gymnasium as gym
import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from gymnasium import spaces
import wandb

In [2]:
import sys
sys.path.append("/home/martina/codi2/4year/tfg")  # add parent folder of general.py

from general import prepare, Glioblastoma, Glioblastoma2, testing


# PPO:

In [5]:
class GlobalAwarePPOActorCritic(nn.Module):
    def __init__(self, env, learning_rate=3e-4, device='cpu'):
        super().__init__()
        self.device = device
        self.n_outputs = env.action_space.n
        self.learning_rate = learning_rate
        
        # CNN for patch processing
        input_channels = 1
        patch_shape = env.observation_space['patch'].shape  # (60, 60)
        
        self.patch_features = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=3, stride=2, padding=1),
            nn.ELU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ELU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
            nn.ELU(),
        )
        
        # Calculate flattened patch features size
        with torch.no_grad():
            dummy_patch = torch.zeros(1, input_channels, *patch_shape)
            patch_features_out = self.patch_features(dummy_patch)
            patch_flatten = patch_features_out.view(1, -1).size(1)
        
        # Position embedding
        position_size = env.observation_space['position'].shape[0]  # 2
        self.position_embedding = nn.Sequential(
            nn.Linear(position_size, 16),
            nn.ELU(),
            nn.Linear(16, 32),
            nn.ELU()
        )
        
        # Combined features
        combined_features_size = patch_flatten + 32
        
        # Actor and Critic
        self.actor = nn.Sequential(
            nn.Linear(combined_features_size, 256),
            nn.ELU(),
            nn.Linear(256, 128),
            nn.ELU(),
            nn.Linear(128, self.n_outputs),
            nn.Softmax(dim=-1)
        )
        
        self.critic = nn.Sequential(
            nn.Linear(combined_features_size, 256),
            nn.ELU(),
            nn.Linear(256, 128),
            nn.ELU(),
            nn.Linear(128, 1)
        )
        
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        
        if self.device == 'cuda':
            self.to(self.device)
    
    def forward(self, x):
        if isinstance(x, dict):
            # Single observation - convert to batch of size 1
            patch = x['patch']
            position = x['position']
            
            if isinstance(patch, np.ndarray):
                if patch.ndim == 2:
                    patch = patch[np.newaxis, np.newaxis, :, :]  # (1, 1, 60, 60)
                patch = torch.FloatTensor(patch).to(self.device)
            
            if isinstance(position, np.ndarray):
                position = torch.FloatTensor(position).to(self.device).unsqueeze(0)  # Add batch dim
            
        elif isinstance(x, list):
            # Batch of observations
            patch_batch = []
            position_batch = []
            
            for obs in x:
                patch_batch.append(obs['patch'])
                position_batch.append(obs['position'])
            
            patch_array = np.array(patch_batch)
            if patch_array.ndim == 3:
                patch_array = patch_array[:, np.newaxis, :, :]
            
            patch = torch.FloatTensor(patch_array).to(self.device)
            position = torch.FloatTensor(np.array(position_batch)).to(self.device)
        
        # Process through networks (both paths now have batch dimension)
        patch_features = self.patch_features(patch)
        patch_flat = patch_features.view(patch.size(0), -1)
        position_embedded = self.position_embedding(position)
        combined = torch.cat([patch_flat, position_embedded], dim=-1)
        
        action_probs = self.actor(combined)
        state_values = self.critic(combined)
        
        return action_probs, state_values



# Fixed environment with global awareness
class GlobalAwareGlioblastoma(Glioblastoma):
    def __init__(self, image_path, mask_path, grid_size=4, tumor_threshold=0.0001, rewards=[1.0, -2.0, -0.5], action_space=spaces.Discrete(3), render_mode="human"):
        super().__init__(image_path, mask_path, grid_size, tumor_threshold, rewards, action_space, render_mode)
        
        self.image = np.load(image_path).astype(np.float32)
        self.mask = np.load(mask_path).astype(np.uint8)
        
        img_min, img_max = self.image.min(), self.image.max()
        if img_max > 1.0:
            self.image = (self.image - img_min) / (img_max - img_min + 1e-8)

        self.grid_size = grid_size
        self.block_size = self.image.shape[0] // grid_size
        
        self.action_space = action_space
        self.tumor_threshold = tumor_threshold
        self.rewards = rewards
        self.render_mode = render_mode

        # Dict observation space with position info
        self.observation_space = spaces.Dict({
            'patch': spaces.Box(low=0, high=1, shape=(self.block_size, self.block_size), dtype=np.float32),
            'position': spaces.Box(low=0, high=grid_size-1, shape=(2,), dtype=np.int32)
        })

        self.agent_pos = [0, 0]
        self.current_step = 0
        self.max_steps = 20

    def reset(self, seed=None, options=None):
        self.agent_pos = [0, 0]
        self.current_step = 0
        obs = self._get_obs()
        info = {}
        return obs, info

    def step(self, action):
        self.current_step += 1
        prev_pos = self.agent_pos.copy()
        
        # Movement logic (same as before)
        if self.action_space.n == 3:
            if action == 1 and self.agent_pos[0] < self.grid_size - 1:
                self.agent_pos[0] += 1
            elif action == 2 and self.agent_pos[1] < self.grid_size - 1:
                self.agent_pos[1] += 1
        elif self.action_space.n == 5:
            if action == 1 and self.agent_pos[0] < self.grid_size - 1:
                self.agent_pos[0] += 1
            elif action == 2 and self.agent_pos[1] < self.grid_size - 1:
                self.agent_pos[1] += 1
            elif action == 3 and self.agent_pos[0] > 0:
                self.agent_pos[0] -= 1
            elif action == 4 and self.agent_pos[1] > 0:
                self.agent_pos[1] -= 1
        
        reward = self._get_reward(action, prev_pos)
        obs = self._get_obs()
        terminated = self.current_step >= self.max_steps
        truncated = False
        info = {}

        return obs, reward, terminated, truncated, info

    def _get_obs(self):
        r0 = self.agent_pos[0] * self.block_size
        c0 = self.agent_pos[1] * self.block_size
        patch = self.image[r0:r0+self.block_size, c0:c0+self.block_size].astype(np.float32)
        
        return {
            'patch': patch,
            'position': np.array(self.agent_pos, dtype=np.int32)
        }

    def _get_reward(self, action, prev_pos):
        r0 = self.agent_pos[0] * self.block_size
        c0 = self.agent_pos[1] * self.block_size
        patch_mask = self.mask[r0:r0+self.block_size, c0:c0+self.block_size]
        
        tumor_count_curr = np.sum(np.isin(patch_mask, [1, 4]))
        total = self.block_size * self.block_size
        inside = (tumor_count_curr / total) >= self.tumor_threshold
        
        if inside:
            return self.rewards[0]
        else:
            if action == 0 or prev_pos == self.agent_pos:
                return self.rewards[1]
            else:
                return self.rewards[2]

    def render(self, show=True):
        if self.render_mode != "human": # would be rgb_array or ansi
            return  # Only render in human mode

        # Create RGB visualization image
        # not necessary since it's grayscale, but i want to draw the mask and position
        vis_img = np.stack([self.image] * 3, axis=-1).astype(np.float32)

        # Overlay tumor mask in red [..., 0] 
        tumor_overlay = np.zeros_like(vis_img) # do all blank but here we have 3 channels, mask is 2D
        tumor_overlay[..., 0] = (self.mask > 0).astype(float) # red channel. set to float to avoid issues when blending in vis_img

        # transparency overlay (crec que es el mateix valor que tinc a l'altra notebook)
        alpha = 0.4
        vis_img = (1 - alpha) * vis_img + alpha * tumor_overlay

        if show:
            # Plotting
            fig, ax = plt.subplots(figsize=(3, 3))
            ax.imshow(vis_img, cmap='gray', origin='upper')

            # Draw grid lines
            # alpha for transparency again
            for i in range(1, self.grid_size):
                ax.axhline(i * self.block_size, color='white', lw=1, alpha=0.5)
                ax.axvline(i * self.block_size, color='white', lw=1, alpha=0.5)

            # Draw agent position
            r0 = self.agent_pos[0] * self.block_size
            c0 = self.agent_pos[1] * self.block_size
            rect = patches.Rectangle(
                (c0, r0), # (x,y) bottom left corner
                self.block_size, # width
                self.block_size, # height
                linewidth=2,
                edgecolor='yellow',
                facecolor='none'
            )
            ax.add_patch(rect)

            ax.set_title(f"Agent at {self.agent_pos} | Step {self.current_step}/{self.max_steps}")
            ax.axis('off')
            plt.show()
            return None
        else: #just return without showing but draw the agent position
            rgb_array = (vis_img * 255).astype(np.uint8)
        
            # Draw grid lines directly on the array
            for i in range(1, self.grid_size):
                # Horizontal line
                y = i * self.block_size
                rgb_array[y-1:y+1, :] = [255, 255, 255]  # White line
                
                # Vertical line  
                x = i * self.block_size
                rgb_array[:, x-1:x+1] = [255, 255, 255]  # White line
            
            # Draw agent position as a yellow rectangle
            r0 = self.agent_pos[0] * self.block_size
            c0 = self.agent_pos[1] * self.block_size
            
            # Draw rectangle borders (yellow)
            rgb_array[r0:r0+2, c0:c0+self.block_size] = [255, 255, 0]  # Top border
            rgb_array[r0+self.block_size-2:r0+self.block_size, c0:c0+self.block_size] = [255, 255, 0]  # Bottom border
            rgb_array[r0:r0+self.block_size, c0:c0+2] = [255, 255, 0]  # Left border
            rgb_array[r0:r0+self.block_size, c0+self.block_size-2:c0+self.block_size] = [255, 255, 0]  # Right border
            
            # Add step counter text to the image
            from PIL import Image, ImageDraw, ImageFont
            pil_img = Image.fromarray(rgb_array)
            draw = ImageDraw.Draw(pil_img)
            
            # Use default font (you can also load a specific font)
            try:
                font = ImageFont.truetype("arial.ttf", 16)
            except:
                font = ImageFont.load_default()
            
            # Draw step counter in top-left corner
            step_text = f"Step: {self.current_step}/{self.max_steps}"
            draw.text((5, 5), step_text, fill=(255, 255, 0), font=font)  # Yellow text
            
            # Convert back to numpy array
            rgb_array = np.array(pil_img)
            return rgb_array
        
    def current_patch_overlap_with_lesion(self):
        row, col = self.agent_pos
        patch_h = self.block_size
        patch_w = self.block_size
        
        y0 = row * patch_h
        y1 = y0 + patch_h
        x0 = col * patch_w
        x1 = x0 + patch_w
        patch_mask = self.mask[y0:y1, x0:x1]
        overlap = np.sum(patch_mask > 0)
        return overlap


class PPOAgent:
    def __init__(self, env_config, model, train_pairs, env_class,
                 gamma=0.99, gae_lambda=0.95,
                 clip_epsilon=0.2, ppo_epochs=4, batch_size=64,
                 save_name="PPO_Agent"):
        
        self.env_config = env_config
        self.env_class = env_class
        self.model = model
        self.device = model.device
        
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_epsilon = clip_epsilon
        self.ppo_epochs = ppo_epochs
        self.batch_size = batch_size
        self.save_name = save_name
        
        self.training_rewards = []
        self.mean_training_rewards = []
        self.actor_losses = []
        self.critic_losses = []
        self.entropies = []
        
        self.train_pairs = train_pairs


# Also need to fix the PPOAgent to handle dict observations
class GlobalAwarePPOAgent(PPOAgent):
    def __init__(self, env_config, model, train_pairs, env_class,
                 gamma=0.99, gae_lambda=0.95,
                 clip_epsilon=0.2, ppo_epochs=4, batch_size=64,
                 save_name="GlobalAware_PPO"):
        
        self.env_config = env_config
        self.env_class = env_class
        self.model = model
        self.device = model.device
        
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_epsilon = clip_epsilon
        self.ppo_epochs = ppo_epochs
        self.batch_size = batch_size
        self.save_name = save_name
        
        self.training_rewards = []
        self.mean_training_rewards = []
        self.actor_losses = []
        self.critic_losses = []
        self.entropies = []
        
        self.train_pairs = train_pairs
        
    def compute_gae(self, rewards, values, dones, next_value):
        gae = 0
        returns = []
        advantages = []
        
        values = values + [next_value]
        
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + self.gamma * values[step + 1] * (1 - dones[step]) - values[step]
            gae = delta + self.gamma * self.gae_lambda * (1 - dones[step]) * gae
            advantages.insert(0, gae)
            returns.insert(0, gae + values[step])
            
        return returns, advantages
    
    def collect_trajectories(self, num_steps=2048):
        all_states = []
        all_actions = []
        all_rewards = []
        all_dones = []
        all_values = []
        all_log_probs = []
        
        img_path, mask_path = random.choice(self.train_pairs)
        env = self.env_class(img_path, mask_path, **self.env_config)
        state, _ = env.reset()
        
        episode_reward = 0
        episode_rewards = []
        
        for step in range(num_steps):
            with torch.no_grad():
                action_probs, value = self.model(state)
                dist = Categorical(action_probs)
                action = dist.sample()
                log_prob = dist.log_prob(action)
                value = value.squeeze()
            
            next_state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated
            
            # Store the state dict properly
            all_states.append({
                'patch': state['patch'].copy(),
                'position': state['position'].copy()
            })
            all_actions.append(action.item())
            all_rewards.append(reward)
            all_dones.append(done)
            all_values.append(value.item())
            all_log_probs.append(log_prob.item())
            
            episode_reward += reward
            state = next_state
            
            if done:
                episode_rewards.append(episode_reward)
                img_path, mask_path = random.choice(self.train_pairs)
                env = self.env_class(img_path, mask_path, **self.env_config)
                state, _ = env.reset()
                episode_reward = 0
        
        with torch.no_grad():
            _, next_value = self.model(state)
            next_value = next_value.squeeze().item()
        
        return (all_states, all_actions, all_rewards, all_dones, 
                all_values, all_log_probs, next_value, episode_rewards)
    
    def update(self, states, actions, returns, advantages, old_log_probs):
        # Prepare batch data
        patch_batch = []
        position_batch = []
        
        for state in states:
            patch_batch.append(state['patch'])
            position_batch.append(state['position'])
        
        # Convert to tensors
        patch_array = np.array(patch_batch)
        if patch_array.ndim == 3:
            patch_array = patch_array[:, np.newaxis, :, :]
        
        batch_data = {
            'patch': torch.FloatTensor(patch_array).to(self.device),
            'position': torch.FloatTensor(np.array(position_batch)).to(self.device)
        }
        
        actions_tensor = torch.LongTensor(actions).to(self.device)
        returns_tensor = torch.FloatTensor(returns).to(self.device)
        advantages_tensor = torch.FloatTensor(advantages).to(self.device)
        old_log_probs_tensor = torch.FloatTensor(old_log_probs).to(self.device)
        
        # Normalize advantages
        advantages_tensor = (advantages_tensor - advantages_tensor.mean()) / (advantages_tensor.std() + 1e-8)
        
        batch_size = len(states)
        indices = np.arange(batch_size)
        
        for _ in range(self.ppo_epochs):
            np.random.shuffle(indices)
            
            for start in range(0, batch_size, self.batch_size):
                end = start + self.batch_size
                batch_indices = indices[start:end]
                
                batch_states = {
                    'patch': batch_data['patch'][batch_indices],
                    'position': batch_data['position'][batch_indices]
                }
                batch_actions = actions_tensor[batch_indices]
                batch_returns = returns_tensor[batch_indices]
                batch_advantages = advantages_tensor[batch_indices]
                batch_old_log_probs = old_log_probs_tensor[batch_indices]
                
                # Get current policy and value
                action_probs, values = self.model(batch_states)
                dist = Categorical(action_probs)
                new_log_probs = dist.log_prob(batch_actions)
                entropy = dist.entropy().mean()
                
                values = values.squeeze()
                
                # Calculate ratios
                ratios = torch.exp(new_log_probs - batch_old_log_probs)
                
                # Policy loss
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * batch_advantages
                policy_loss = -torch.min(surr1, surr2).mean()
                
                # Value loss
                value_loss = 0.5 * (values - batch_returns).pow(2).mean()
                
                # Total loss
                loss = policy_loss + 0.5 * value_loss - 0.01 * entropy
                
                # Backpropagate
                self.model.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
                self.model.optimizer.step()
                
                self.actor_losses.append(policy_loss.item())
                self.critic_losses.append(value_loss.item())
                self.entropies.append(entropy.item())
    
    def train(self, max_episodes=1000, num_steps=512):
        print("Starting Global-Aware PPO training...")
        
        episode = 0
        best_mean_reward = -float('inf')
        
        while episode < max_episodes:
            (states, actions, rewards, dones, values, 
             old_log_probs, next_value, episode_rewards) = self.collect_trajectories(num_steps)
            
            if not states:
                continue
            
            returns, advantages = self.compute_gae(rewards, values, dones, next_value)
            self.update(states, actions, returns, advantages, old_log_probs)
            
            self.training_rewards.extend(episode_rewards)
            
            if len(self.training_rewards) >= 100:
                mean_reward = np.mean(self.training_rewards[-100:])
            else:
                mean_reward = np.mean(self.training_rewards)
                
            self.mean_training_rewards.append(mean_reward)
            
            if episode_rewards:
                avg_episode_reward = np.mean(episode_rewards)
                print(f"Episode {episode} | "
                      f"Avg Reward: {avg_episode_reward:.2f} | "
                      f"Mean Reward (100): {mean_reward:.2f} | "
                      f"Actor Loss: {np.mean(self.actor_losses[-10:] or [0]):.4f}")
                
                # save in wandb
                wandb.log({
                    "episode": episode,
                    "avg_episode_reward": avg_episode_reward,
                    "mean_reward_100": mean_reward,
                    "actor_loss": np.mean(self.actor_losses[-10:] or [0]),
                    "critic_loss": np.mean(self.critic_losses[-10:] or [0]),
                    "entropy": np.mean(self.entropies[-10:] or [0])
                })
            
            episode += len(episode_rewards)
            
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                torch.save(self.model.state_dict(), f"{self.save_name}_best.pth")
                print("New best model saved!")
            
            if episode % 100 == 0:
                torch.save(self.model.state_dict(), f"{self.save_name}_checkpoint.pth")
        
        torch.save(self.model.state_dict(), f"{self.save_name}_final.pth")
        print("Training completed!")


# Testing

In [5]:
import os
import numpy as np
import torch
from torch.distributions import Categorical
import imageio
from PIL import Image
import matplotlib.pyplot as plt

def testing(agent, test_pairs, agent_type, num_episodes=None, env_config=None, save_gifs=True, gif_folder="TEST_GIFS"):

    if num_episodes is None:
        num_episodes = len(test_pairs)
    
    # Create GIF folder if needed
    if save_gifs and not os.path.exists(gif_folder):
        os.makedirs(gif_folder)
    
    # Set model to evaluation mode
    if agent_type.lower() == "dqn":
        agent.dnnetwork.eval()
    elif agent_type.lower() == "ppo":
        agent.model.eval()
    
    results = {
        'success_rate': [],
        'final_position_accuracy': [],
        'average_reward': [],
        'steps_to_find_tumor': [],
        'total_tumor_reward': [],
        'tumor_sizes_pixels': [],
        'tumor_sizes_percentage': [],
        'episode_details': []
    }
    
    grid_size = env_config.get('grid_size', 4)
    rewards = env_config.get('rewards', [5.0, -1.0, -0.2])
    action_space = env_config.get('action_space', None)
    
    for i in range(min(num_episodes, len(test_pairs))):
        img_path, mask_path = test_pairs[i]
        
        # Create environment
        if hasattr(agent, 'env_class'):
            env = agent.env_class(img_path, mask_path, grid_size=grid_size, rewards=rewards, action_space=action_space)
        else:
            env = Glioblastoma(img_path, mask_path, grid_size=grid_size, rewards=rewards, action_space=action_space)
        
        state, _ = env.reset()
        total_reward = 0
        found_tumor = False
        tumor_positions_visited = set()
        steps_to_find = env.max_steps
        tumor_rewards = 0
        
        # For action distribution tracking
        action_counts = np.zeros(env.action_space.n)
        
        # For GIF creation
        frames = []
        
        # Get tumor size information for this episode
        tumor_size_pixels = count_tumor_pixels(env)
        total_pixels = env.image.shape[0] * env.image.shape[1]
        tumor_size_percentage = (tumor_size_pixels / total_pixels) * 100
        
        results['tumor_sizes_pixels'].append(tumor_size_pixels)
        results['tumor_sizes_percentage'].append(tumor_size_percentage)
        
        for step in range(env.max_steps):
            with torch.no_grad():
                if agent_type.lower() == "dqn":
                    action = agent.dnnetwork.get_action(state, epsilon=0.00)
                    action_idx = action
                elif agent_type.lower() == "ppo":
                    action_probs, _ = agent.model(state)
                    dist = Categorical(action_probs)
                    action = dist.sample()
                    action_idx = action.item()
            
            action_counts[action_idx] += 1
            
            next_state, reward, terminated, truncated, _ = env.step(action_idx)
            state = next_state
            total_reward += reward
            
            # Track tumor-related metrics
            current_overlap = env.current_patch_overlap_with_lesion()
            if current_overlap > 0:
                tumor_positions_visited.add(tuple(env.agent_pos))
                if not found_tumor:
                    found_tumor = True
                    steps_to_find = step + 1
                
                # Count positive rewards (when on tumor)
                if reward > 0:
                    tumor_rewards += 1
            
            # Capture frame for GIF
            if save_gifs:
                frame = env.render(show=False)
                if frame is not None:
                    frames.append(frame)
            
            if terminated or truncated:
                break
        
        # Save GIF
        gif_path = None
        if save_gifs and frames:
            gif_path = os.path.join(gif_folder, f"episode_{i}_{os.path.basename(img_path).split('.')[0]}.gif")
            # Convert frames to PIL Images and save as GIF
            pil_frames = [Image.fromarray(frame) for frame in frames]
            pil_frames[0].save(
                gif_path,
                save_all=True,
                append_images=pil_frames[1:],
                duration=500,  # milliseconds per frame
                loop=0
            )
            if i % 10 == 0:
                print(f"Saved GIF for episode {i} at {gif_path}")
        
        # Calculate metrics for this episode
        final_overlap = env.current_patch_overlap_with_lesion()
        
        # Success: ended on tumor region
        success = final_overlap > 0
        results['success_rate'].append(success)
        
        # Final position accuracy
        results['final_position_accuracy'].append(final_overlap > 0)
        
        # Average reward
        results['average_reward'].append(total_reward)
        
        # Steps to find tumor
        results['steps_to_find_tumor'].append(steps_to_find)
                
        # Total positive rewards from tumor
        results['total_tumor_reward'].append(tumor_rewards)
        
        # Store detailed episode information
        episode_detail = {
            'image_path': img_path,
            'success': success,
            'final_on_tumor': final_overlap > 0,
            'total_reward': total_reward,
            'steps_to_find_tumor': steps_to_find,
            'tumor_rewards': tumor_rewards,
            'tumor_size_pixels': tumor_size_pixels,
            'tumor_size_percentage': tumor_size_percentage,
            'action_distribution': action_counts / np.sum(action_counts),  # Normalized
            'action_counts_raw': action_counts,  # Keep raw counts for aggregation
            'gif_path': gif_path
        }
        results['episode_details'].append(episode_detail)
    
    # Calculate separate action distributions
    successful_episodes = [ep for ep in results['episode_details'] if ep['final_on_tumor']]
    unsuccessful_episodes = [ep for ep in results['episode_details'] if not ep['final_on_tumor']]
    
    action_dist_success = calculate_separate_action_distribution(successful_episodes)
    action_dist_failure = calculate_separate_action_distribution(unsuccessful_episodes)
    
    # Calculate overall metrics with new tumor size statistics
    overall_results = {
        'success_rate': np.mean(results['success_rate']),
        'average_reward': np.mean(results['average_reward']),
        'avg_steps_to_find_tumor': np.mean(results['steps_to_find_tumor']),
        'avg_tumor_rewards': np.mean(results['total_tumor_reward']),
        'biggest_tumor_pixels': np.max(results['tumor_sizes_pixels']),
        'smallest_tumor_pixels': np.min(results['tumor_sizes_pixels']),
        'biggest_tumor_percentage': np.max(results['tumor_sizes_percentage']),
        'smallest_tumor_percentage': np.min(results['tumor_sizes_percentage']),
        'avg_tumor_size_pixels': np.mean(results['tumor_sizes_pixels']),
        'avg_tumor_size_percentage': np.mean(results['tumor_sizes_percentage']),
        'action_distribution': calculate_overall_action_distribution(results['episode_details']),
        'action_distribution_success': action_dist_success,
        'action_distribution_failure': action_dist_failure,
        'episode_details': results['episode_details']
    }
    
    # Print summary
    print("\n" + "="*60)
    print(f"TEST RESULTS ({agent_type.upper()} Agent)")
    print("="*60)
    print(f"Success Rate: {overall_results['success_rate']*100:.2f}%")
    print(f"Average Episode Reward: {overall_results['average_reward']:.2f}")
    print(f"Average Steps to Find Tumor: {overall_results['avg_steps_to_find_tumor']:.2f}")
    print(f"Average Tumor Rewards per Episode: {overall_results['avg_tumor_rewards']:.2f}")
    print(f"Tumor Size Statistics:")
    print(f"  Biggest Tumor: {overall_results['biggest_tumor_pixels']:.0f} pixels ({overall_results['biggest_tumor_percentage']:.2f}%)")
    print(f"  Smallest Tumor: {overall_results['smallest_tumor_pixels']:.0f} pixels ({overall_results['smallest_tumor_percentage']:.2f}%)")
    print(f"  Average Tumor: {overall_results['avg_tumor_size_pixels']:.0f} pixels ({overall_results['avg_tumor_size_percentage']:.2f}%)")
    print(f"Overall Action Distribution: {overall_results['action_distribution']}")
    print(f"  Successful Episodes: {overall_results['action_distribution_success']}")
    print(f"  Unsuccessful Episodes: {overall_results['action_distribution_failure']}")
    
    # Print individual episode results
    print(f"\nDetailed Results for {len(results['episode_details'])} episodes:")
    print("-" * 80)
    for i, detail in enumerate(results['episode_details']):
        print(f"Episode {i}: {os.path.basename(detail['image_path'])}")
        print(f"  Success: {detail['success']}, Final on Tumor: {detail['final_on_tumor']}")
        print(f"  Total Reward: {detail['total_reward']:.2f}, Steps to Find: {detail['steps_to_find_tumor']}")
        print(f"  Tumor Size: {detail['tumor_size_pixels']} pixels ({detail['tumor_size_percentage']:.2f}%)")
        print(f"  Action Distribution: {detail['action_distribution']}")
        if detail['gif_path']:
            print(f"  GIF saved: {detail['gif_path']}")
        print()
    
    return overall_results


def count_tumor_pixels(env):
    """Count total number of tumor pixels in the mask"""
    if hasattr(env, 'mask'):
        return np.sum(env.mask > 0)
    elif hasattr(env, 'original_mask'):
        return np.sum(env.original_mask > 0)
    else:
        # Fallback: try to access the mask through available attributes
        try:
            mask = env.lesion_mask if hasattr(env, 'lesion_mask') else None
            if mask is not None:
                return np.sum(mask > 0)
        except:
            pass
    return 0

def calculate_overall_action_distribution(episode_details):
    """Calculate overall action distribution across all episodes"""
    total_actions = np.zeros_like(episode_details[0]['action_distribution'])
    
    for detail in episode_details:
        # Multiply by steps to get actual count, then normalize
        action_dist = detail['action_distribution']
        # Since action_distribution is already normalized per episode, we'll average them
        total_actions += action_dist
    
    # Normalize to get overall distribution
    overall_dist = total_actions / len(episode_details)
    return overall_dist

def calculate_separate_action_distribution(episode_list):
    """Calculate action distribution for a specific list of episodes"""
    if len(episode_list) == 0:
        return np.array([])  # Return empty array if no episodes
    
    total_actions = np.zeros_like(episode_list[0]['action_distribution'])
    
    for episode in episode_list:
        total_actions += episode['action_distribution']
    
    # Normalize to get distribution
    distribution = total_actions / len(episode_list)
    return distribution

# TRIALS:

In [4]:
train_pairs = prepare()

sucess = {}

✅ Found 30 pairs out of 30 listed in CSV.


In [7]:
CURRENT_CONFIG = {
    'grid_size': 4,
    'rewards': [5.0, -1.0, -0.2], 
    'action_space': gym.spaces.Discrete(3)
}

LR = 3e-4
MAX_EPISODES = 1000
NUM_STEPS = 512  # Start with smaller rollout for testing
BATCH_SIZE = 64

env = GlobalAwareGlioblastoma(*train_pairs[0], **CURRENT_CONFIG)
model = GlobalAwarePPOActorCritic(env, learning_rate=LR, device='cpu')
agent = GlobalAwarePPOAgent(
    env_config=CURRENT_CONFIG,
    model=model,
    train_pairs=train_pairs,
    env_class=GlobalAwareGlioblastoma,  # Use the new environment class
    gamma=0.99,
    clip_epsilon=0.2,
    ppo_epochs=4,
    batch_size=BATCH_SIZE,
    save_name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)


wandb.init(project="TFG_Glioblastoma_PPO", 
           name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions",
           config={
               "learning_rate": LR,
               "max_episodes": MAX_EPISODES,
               "num_steps": NUM_STEPS,
               "batch_size": BATCH_SIZE,
               "configuration": CURRENT_CONFIG
           })

# Start training
agent.train(max_episodes=MAX_EPISODES, num_steps=NUM_STEPS)
wandb.finish()


[34m[1mwandb[0m: Currently logged in as: [33mmartinacarrettab[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Starting Global-Aware PPO training...
Episode 0 | Avg Reward: -3.95 | Mean Reward (100): -3.95 | Actor Loss: -0.0228
New best model saved!
Episode 25 | Avg Reward: -4.96 | Mean Reward (100): -4.46 | Actor Loss: -0.0212
Episode 50 | Avg Reward: -8.46 | Mean Reward (100): -5.79 | Actor Loss: -0.0046
Episode 75 | Avg Reward: -5.17 | Mean Reward (100): -5.64 | Actor Loss: -0.0123
Episode 100 | Avg Reward: 0.05 | Mean Reward (100): -4.64 | Actor Loss: -0.0079
Episode 125 | Avg Reward: 1.07 | Mean Reward (100): -3.13 | Actor Loss: 0.0156
New best model saved!
Episode 150 | Avg Reward: 1.68 | Mean Reward (100): -0.59 | Actor Loss: -0.0068
New best model saved!
Episode 175 | Avg Reward: 3.01 | Mean Reward (100): 1.45 | Actor Loss: -0.0035
New best model saved!
Episode 200 | Avg Reward: 2.56 | Mean Reward (100): 2.08 | Actor Loss: -0.0331
New best model saved!
Episode 225 | Avg Reward: 14.27 | Mean Reward (100): 5.38 | Actor Loss: -0.0056
New best model saved!
Episode 250 | Avg Reward: 8.58 | M

0,1
actor_loss,▂▂▄▃▄▆▄▄▁▄█▇▄▇▅▄▆▂▃▆▄▃▃▅▆▂▅▂▅▃▅▃▃▂▂▆▄▃▆█
avg_episode_reward,▂▁▁▁▂▂▂▃▂▄▃▄▇▂▃▅▆▄▅▆▃▅▄▅▆▆▆▆▅▄▆██▄▇▅▄▆▂█
critic_loss,▂▁▁▁▂▂▂▂▂▅▅▄▆▃▃▆▅▅▆▆▄▅▅▅▅▇▇▆▆▆▇██▆█▆▆▆▃▆
entropy,████▇▇▇▇▆▅▄▄▄▄▃▂▂▃▂▃▃▂▃▂▂▂▁▁▂▂▂▁▁▁▁▂▃▂▃▂
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mean_reward_100,▁▁▁▁▁▁▂▂▂▃▃▄▅▅▅▅▅▆▆▆▅▆▅▅▆▆▇▇▇▆▆▇▇██▇▆▇▅▆

0,1
actor_loss,0.03654
avg_episode_reward,44.304
critic_loss,179.40695
entropy,0.37175
episode,975.0
mean_reward_100,22.892


In [8]:
overall_results = testing(
    agent=agent,
    test_pairs=prepare(mode="test"),
    agent_type="ppo",
    num_episodes=100,
    env_config=CURRENT_CONFIG,
    save_gifs=True,
    gif_folder=f"GIFs_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)

sucess[f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"] = overall_results['success_rate']

✅ Found 100 pairs out of 100 listed in CSV.
Saved GIF for episode 0 at GIFs_batch64_rewards[5.0, -1.0, -0.2]_3actions/episode_0_002_58.gif
Saved GIF for episode 10 at GIFs_batch64_rewards[5.0, -1.0, -0.2]_3actions/episode_10_013_86.gif
Saved GIF for episode 20 at GIFs_batch64_rewards[5.0, -1.0, -0.2]_3actions/episode_20_024_49.gif
Saved GIF for episode 30 at GIFs_batch64_rewards[5.0, -1.0, -0.2]_3actions/episode_30_038_84.gif
Saved GIF for episode 40 at GIFs_batch64_rewards[5.0, -1.0, -0.2]_3actions/episode_40_052_98.gif
Saved GIF for episode 50 at GIFs_batch64_rewards[5.0, -1.0, -0.2]_3actions/episode_50_104_74.gif
Saved GIF for episode 60 at GIFs_batch64_rewards[5.0, -1.0, -0.2]_3actions/episode_60_176_99.gif
Saved GIF for episode 70 at GIFs_batch64_rewards[5.0, -1.0, -0.2]_3actions/episode_70_204_52.gif
Saved GIF for episode 80 at GIFs_batch64_rewards[5.0, -1.0, -0.2]_3actions/episode_80_260_62.gif
Saved GIF for episode 90 at GIFs_batch64_rewards[5.0, -1.0, -0.2]_3actions/episode_90

In [9]:
CURRENT_CONFIG = {
    'grid_size': 4,
    'rewards': [5.0, -1.0, -0.2], 
    'action_space': gym.spaces.Discrete(3)
}

LR = 3e-4
MAX_EPISODES = 1000
NUM_STEPS = 512  # Start with smaller rollout for testing
BATCH_SIZE = 32

env = GlobalAwareGlioblastoma(*train_pairs[0], **CURRENT_CONFIG)
model = GlobalAwarePPOActorCritic(env, learning_rate=LR, device='cpu')
agent = GlobalAwarePPOAgent(
    env_config=CURRENT_CONFIG,
    model=model,
    train_pairs=train_pairs,
    env_class=GlobalAwareGlioblastoma,  # Use the new environment class
    gamma=0.99,
    clip_epsilon=0.2,
    ppo_epochs=4,
    batch_size=BATCH_SIZE,
    save_name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)


wandb.init(project="TFG_Glioblastoma_PPO",
           name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions",
           config={
               "learning_rate": LR,
               "max_episodes": MAX_EPISODES,
               "num_steps": NUM_STEPS,
               "batch_size": BATCH_SIZE,
               "configuration": CURRENT_CONFIG
           })

# Start training
agent.train(max_episodes=MAX_EPISODES, num_steps=NUM_STEPS)
wandb.finish()


Starting Global-Aware PPO training...
Episode 0 | Avg Reward: -4.82 | Mean Reward (100): -4.82 | Actor Loss: -0.0617
New best model saved!
Episode 25 | Avg Reward: -1.55 | Mean Reward (100): -3.18 | Actor Loss: 0.0289
New best model saved!
Episode 50 | Avg Reward: -5.09 | Mean Reward (100): -3.82 | Actor Loss: -0.0207
Episode 75 | Avg Reward: -8.98 | Mean Reward (100): -5.11 | Actor Loss: 0.0223
Episode 100 | Avg Reward: -8.11 | Mean Reward (100): -5.93 | Actor Loss: 0.0050
Episode 125 | Avg Reward: -6.16 | Mean Reward (100): -7.08 | Actor Loss: 0.0579
Episode 150 | Avg Reward: -6.43 | Mean Reward (100): -7.42 | Actor Loss: 0.0434
Episode 175 | Avg Reward: 5.73 | Mean Reward (100): -3.74 | Actor Loss: -0.1008
Episode 200 | Avg Reward: 15.76 | Mean Reward (100): 2.22 | Actor Loss: 0.0072
New best model saved!
Episode 225 | Avg Reward: 35.76 | Mean Reward (100): 12.70 | Actor Loss: -0.0476
New best model saved!
Episode 250 | Avg Reward: 17.25 | Mean Reward (100): 18.62 | Actor Loss: 0.03

0,1
actor_loss,▃▇▅▆▆█▇▁▆▃▇▅▃▆▅▃▆▁▆▇▅▄▅▆▅▄▅▅▆▇▅▃▃▆▇▅▆▆▄▆
avg_episode_reward,▁▂▁▁▁▁▁▂▃▅▄▄▅▄▄▅▄▂▅▅▅▄▆▄▅▅▅▅▆▅▆▆█▇▅▅▄█▆█
critic_loss,▂▂▁▁▁▁▁▂▄▆▄▅▇▅▆▇▇▃▆▇▅▆▆▅▇▆▇▆▇▆▇▇██▇█▇▇▆▇
entropy,███████▇▆▄▃▃▃▂▁▂▂▄▃▃▂▂▂▃▃▃▃▃▂▃▂▂▁▁▂▂▂▂▂▁
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mean_reward_100,▁▂▁▁▁▁▁▁▂▃▄▅▅▄▅▅▅▄▅▅▅▅▆▅▅▆▆▆▆▆▆▇▇██▇▆▆▇█

0,1
actor_loss,0.0168
avg_episode_reward,60.032
critic_loss,210.38103
entropy,0.1874
episode,975.0
mean_reward_100,47.924


In [10]:
overall_results = testing(
    agent=agent,
    test_pairs=prepare(mode="test"),
    agent_type="ppo",
    num_episodes=100,
    env_config=CURRENT_CONFIG,
    save_gifs=True,
    gif_folder=f"GIFs_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)

sucess[f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"] = overall_results['success_rate']

✅ Found 100 pairs out of 100 listed in CSV.
Saved GIF for episode 0 at GIFs_batch32_rewards[5.0, -1.0, -0.2]_3actions/episode_0_002_58.gif
Saved GIF for episode 10 at GIFs_batch32_rewards[5.0, -1.0, -0.2]_3actions/episode_10_013_86.gif
Saved GIF for episode 20 at GIFs_batch32_rewards[5.0, -1.0, -0.2]_3actions/episode_20_024_49.gif
Saved GIF for episode 30 at GIFs_batch32_rewards[5.0, -1.0, -0.2]_3actions/episode_30_038_84.gif
Saved GIF for episode 40 at GIFs_batch32_rewards[5.0, -1.0, -0.2]_3actions/episode_40_052_98.gif
Saved GIF for episode 50 at GIFs_batch32_rewards[5.0, -1.0, -0.2]_3actions/episode_50_104_74.gif
Saved GIF for episode 60 at GIFs_batch32_rewards[5.0, -1.0, -0.2]_3actions/episode_60_176_99.gif
Saved GIF for episode 70 at GIFs_batch32_rewards[5.0, -1.0, -0.2]_3actions/episode_70_204_52.gif
Saved GIF for episode 80 at GIFs_batch32_rewards[5.0, -1.0, -0.2]_3actions/episode_80_260_62.gif
Saved GIF for episode 90 at GIFs_batch32_rewards[5.0, -1.0, -0.2]_3actions/episode_90

In [11]:
CURRENT_CONFIG = {
    'grid_size': 4,
    'rewards': [5.0, -1.0, -0.2], 
    'action_space': gym.spaces.Discrete(3)
}

LR = 3e-4
MAX_EPISODES = 1000
NUM_STEPS = 512  # Start with smaller rollout for testing
BATCH_SIZE = 128

env = GlobalAwareGlioblastoma(*train_pairs[0], **CURRENT_CONFIG)
model = GlobalAwarePPOActorCritic(env, learning_rate=LR, device='cpu')
agent = GlobalAwarePPOAgent(
    env_config=CURRENT_CONFIG,
    model=model,
    train_pairs=train_pairs,
    env_class=GlobalAwareGlioblastoma,  # Use the new environment class
    gamma=0.99,
    clip_epsilon=0.2,
    ppo_epochs=4,
    batch_size=BATCH_SIZE,
    save_name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)


wandb.init(project="TFG_Glioblastoma_PPO", 
           name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions",
           config={
               "learning_rate": LR,
               "max_episodes": MAX_EPISODES,
               "num_steps": NUM_STEPS,
               "batch_size": BATCH_SIZE,
               "configuration": CURRENT_CONFIG
           })

# Start training
agent.train(max_episodes=MAX_EPISODES, num_steps=NUM_STEPS)
wandb.finish()


Starting Global-Aware PPO training...
Episode 0 | Avg Reward: -8.96 | Mean Reward (100): -8.96 | Actor Loss: 0.0004
New best model saved!
Episode 25 | Avg Reward: -9.78 | Mean Reward (100): -9.37 | Actor Loss: -0.0080
Episode 50 | Avg Reward: -7.89 | Mean Reward (100): -8.87 | Actor Loss: -0.0009
New best model saved!
Episode 75 | Avg Reward: -8.61 | Mean Reward (100): -8.81 | Actor Loss: 0.0062
New best model saved!
Episode 100 | Avg Reward: -5.07 | Mean Reward (100): -7.84 | Actor Loss: 0.0197
New best model saved!
Episode 125 | Avg Reward: -7.36 | Mean Reward (100): -7.23 | Actor Loss: 0.0011
New best model saved!
Episode 150 | Avg Reward: -5.86 | Mean Reward (100): -6.72 | Actor Loss: -0.0043
New best model saved!
Episode 175 | Avg Reward: -7.68 | Mean Reward (100): -6.49 | Actor Loss: -0.0125
New best model saved!
Episode 200 | Avg Reward: -8.66 | Mean Reward (100): -7.39 | Actor Loss: -0.0285
Episode 225 | Avg Reward: -5.78 | Mean Reward (100): -6.99 | Actor Loss: -0.0089
Episode

0,1
actor_loss,▂▂▂▂▃▂▂▂▁▂▁▂▂▃▅█▃▂▂▂▂▄▂▂▃▂▂▂▂▃▂▃▂▃▃▄▃▃▃▃
avg_episode_reward,▁▁▁▁▂▁▁▁▁▁▂▂▂▄▅▃▄▅▄▄▄▃▄▂▃▄▅▃▅█▇█▅▄▃▄▄▃▅▅
critic_loss,▁▁▁▁▁▁▁▁▁▁▂▂▂▄▆▄▅▆▅▅▆▄▆▄▅▅▇▄▆█▆▇▆▇▅▄▅▃▆▅
entropy,████████▇▇▇▆▆▃▂▆▃▃▄▄▃▆▃▃▄▃▃▄▃▁▁▁▂▂▃▅▆▆▄▃
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mean_reward_100,▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▄▄▅▄▅▅▄▄▄▃▃▄▄▅▆▆██▇▆▅▄▄▅▅

0,1
actor_loss,0.01542
avg_episode_reward,22.256
critic_loss,163.41984
entropy,0.51148
episode,975.0
mean_reward_100,16.596


In [12]:
overall_results = testing(
    agent=agent,
    test_pairs=prepare(mode="test"),
    agent_type="ppo",
    num_episodes=100,
    env_config=CURRENT_CONFIG,
    save_gifs=True,
    gif_folder=f"GIFs_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)

sucess[f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"] = overall_results['success_rate']

✅ Found 100 pairs out of 100 listed in CSV.
Saved GIF for episode 0 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_0_002_58.gif
Saved GIF for episode 10 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_10_013_86.gif
Saved GIF for episode 20 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_20_024_49.gif
Saved GIF for episode 30 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_30_038_84.gif
Saved GIF for episode 40 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_40_052_98.gif
Saved GIF for episode 50 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_50_104_74.gif
Saved GIF for episode 60 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_60_176_99.gif
Saved GIF for episode 70 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_70_204_52.gif
Saved GIF for episode 80 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_80_260_62.gif
Saved GIF for episode 90 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/

__________________

In [13]:
CURRENT_CONFIG = {
    'grid_size': 4,
    'rewards': [5.0, -1.0, -0.2], 
    'action_space': gym.spaces.Discrete(3)
}

LR = 1e-4
MAX_EPISODES = 1000
NUM_STEPS = 512  # Start with smaller rollout for testing
BATCH_SIZE = 128

env = GlobalAwareGlioblastoma(*train_pairs[0], **CURRENT_CONFIG)
model = GlobalAwarePPOActorCritic(env, learning_rate=LR, device='cpu')
agent = GlobalAwarePPOAgent(
    env_config=CURRENT_CONFIG,
    model=model,
    train_pairs=train_pairs,
    env_class=GlobalAwareGlioblastoma,  # Use the new environment class
    gamma=0.99,
    clip_epsilon=0.2,
    ppo_epochs=4,
    batch_size=BATCH_SIZE,
    save_name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)


wandb.init(project="TFG_Glioblastoma_PPO", 
           name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions",
           config={
               "learning_rate": LR,
               "max_episodes": MAX_EPISODES,
               "num_steps": NUM_STEPS,
               "batch_size": BATCH_SIZE,
               "configuration": CURRENT_CONFIG
           })

# Start training
agent.train(max_episodes=MAX_EPISODES, num_steps=NUM_STEPS)
wandb.finish()


Starting Global-Aware PPO training...
Episode 0 | Avg Reward: -10.02 | Mean Reward (100): -10.02 | Actor Loss: -0.0133
New best model saved!
Episode 25 | Avg Reward: -9.18 | Mean Reward (100): -9.60 | Actor Loss: -0.0044
New best model saved!
Episode 50 | Avg Reward: -8.40 | Mean Reward (100): -9.20 | Actor Loss: 0.0154
New best model saved!
Episode 75 | Avg Reward: -8.70 | Mean Reward (100): -9.08 | Actor Loss: 0.0052
New best model saved!
Episode 100 | Avg Reward: -7.98 | Mean Reward (100): -8.57 | Actor Loss: -0.0040
New best model saved!
Episode 125 | Avg Reward: -6.99 | Mean Reward (100): -8.02 | Actor Loss: -0.0001
New best model saved!
Episode 150 | Avg Reward: -7.14 | Mean Reward (100): -7.70 | Actor Loss: 0.0014
New best model saved!
Episode 175 | Avg Reward: -9.36 | Mean Reward (100): -7.87 | Actor Loss: -0.0068
Episode 200 | Avg Reward: -7.55 | Mean Reward (100): -7.76 | Actor Loss: 0.0147
Episode 225 | Avg Reward: -9.60 | Mean Reward (100): -8.41 | Actor Loss: -0.0148
Episo

0,1
actor_loss,▂▃▅▄▃▃▄▃▅▂▄▁▃▂▃▅█▁▃▃▄▄▄▃▄▃▂▂▂▄▄▃▃▄▃▄▄▄▅▄
avg_episode_reward,▁▁▁▁▁▁▁▁▁▁▁▂▂▃▃▆▆▁▃▅▄▅▄▄▅▂▄▄▆▅▇▃▄▆▄▄▅█▆▄
critic_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▃▄▇▇▃▄▆▆▅▅▆▅▃▄▅▆▇▇▅▅▇▅▄▆█▇▅
entropy,████████████▇▇▅▂▁▃▄▂▂▂▄▄▄▅▅▄▃▂▃▃▄▂▃▃▃▂▂▄
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mean_reward_100,▁▁▁▁▁▁▂▂▂▁▁▂▂▃▃▅▆▆▆▅▅▆▆▆▆▅▅▅▆▇█▇▇▇▆▆▇▇██

0,1
actor_loss,0.00322
avg_episode_reward,6.672
critic_loss,167.68151
entropy,0.62044
episode,975.0
mean_reward_100,17.964


In [14]:
overall_results = testing(
    agent=agent,
    test_pairs=prepare(mode="test"),
    agent_type="ppo",
    num_episodes=100,
    env_config=CURRENT_CONFIG,
    save_gifs=True,
    gif_folder=f"GIFs_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)

sucess[f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"] = overall_results['success_rate']

✅ Found 100 pairs out of 100 listed in CSV.
Saved GIF for episode 0 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_0_002_58.gif
Saved GIF for episode 10 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_10_013_86.gif
Saved GIF for episode 20 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_20_024_49.gif
Saved GIF for episode 30 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_30_038_84.gif
Saved GIF for episode 40 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_40_052_98.gif
Saved GIF for episode 50 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_50_104_74.gif
Saved GIF for episode 60 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_60_176_99.gif
Saved GIF for episode 70 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_70_204_52.gif
Saved GIF for episode 80 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/episode_80_260_62.gif
Saved GIF for episode 90 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_3actions/

In [15]:
CURRENT_CONFIG = {
    'grid_size': 4,
    'rewards': [5.0, -0.5, -0.2], 
    'action_space': gym.spaces.Discrete(3)
}

LR = 1e-4
MAX_EPISODES = 1000
NUM_STEPS = 512  # Start with smaller rollout for testing
BATCH_SIZE = 128

env = GlobalAwareGlioblastoma(*train_pairs[0], **CURRENT_CONFIG)
model = GlobalAwarePPOActorCritic(env, learning_rate=LR, device='cpu')
agent = GlobalAwarePPOAgent(
    env_config=CURRENT_CONFIG,
    model=model,
    train_pairs=train_pairs,
    env_class=GlobalAwareGlioblastoma,  # Use the new environment class
    gamma=0.99,
    clip_epsilon=0.2,
    ppo_epochs=4,
    batch_size=BATCH_SIZE,
    save_name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)


wandb.init(project="TFG_Glioblastoma_PPO",
           name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions",
           config={
               "learning_rate": LR,
               "max_episodes": MAX_EPISODES,
               "num_steps": NUM_STEPS,
               "batch_size": BATCH_SIZE,
               "configuration": CURRENT_CONFIG
           })

# Start training
agent.train(max_episodes=MAX_EPISODES, num_steps=NUM_STEPS)
wandb.finish()


Starting Global-Aware PPO training...
Episode 0 | Avg Reward: 3.32 | Mean Reward (100): 3.32 | Actor Loss: -0.0195
New best model saved!
Episode 25 | Avg Reward: -0.38 | Mean Reward (100): 1.47 | Actor Loss: -0.0050
Episode 50 | Avg Reward: 1.63 | Mean Reward (100): 1.52 | Actor Loss: -0.0062
Episode 75 | Avg Reward: -1.61 | Mean Reward (100): 0.74 | Actor Loss: 0.0010
Episode 100 | Avg Reward: 8.10 | Mean Reward (100): 1.94 | Actor Loss: 0.0121
Episode 125 | Avg Reward: 11.76 | Mean Reward (100): 4.97 | Actor Loss: 0.0237
New best model saved!
Episode 150 | Avg Reward: 2.48 | Mean Reward (100): 5.18 | Actor Loss: 0.0120
New best model saved!
Episode 175 | Avg Reward: 22.87 | Mean Reward (100): 11.30 | Actor Loss: 0.0225
New best model saved!
Episode 200 | Avg Reward: 20.41 | Mean Reward (100): 14.38 | Actor Loss: 0.0293
New best model saved!
Episode 225 | Avg Reward: 13.02 | Mean Reward (100): 14.69 | Actor Loss: 0.0058
New best model saved!
Episode 250 | Avg Reward: 12.29 | Mean Rewa

0,1
actor_loss,▂▄▄▅▆▇▆▇█▅▅▅▄▃▆▅▄▄▄▄▅▄▄▅▆▅▆▄▅▅▄▅▃▅▄▃▁▄▅▄
avg_episode_reward,▂▁▂▁▃▃▂▆▅▄▄▄▇▅▄▃▅▅▅▄▃▆▄▄▂▄▄▅▄▆▅▅▆▇▅▃▅█▆▅
critic_loss,▁▁▁▁▂▄▃▅▇▅▅▅▇▇▅▄▅▆▅▆▄▆▄▃▃▅▅▆▆▆▇▇▇▇▇▅▇██▇
entropy,███▇▇▅▇▄▂▄▄▄▄▃▅▄▄▄▃▃▃▃▄▄▄▃▃▂▃▃▃▂▃▂▃▂▃▂▁▁
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mean_reward_100,▂▁▁▁▁▂▂▄▅▅▆▅▅▆▆▆▅▅▆▆▅▆▅▅▅▄▄▄▅▆▆▆▇█▇▇▆▇▇█

0,1
actor_loss,-0.0055
avg_episode_reward,18.524
critic_loss,161.93659
entropy,0.4338
episode,975.0
mean_reward_100,25.915


In [16]:
overall_results = testing(
    agent=agent,
    test_pairs=prepare(mode="test"),
    agent_type="ppo",
    num_episodes=100,
    env_config=CURRENT_CONFIG,
    save_gifs=True,
    gif_folder=f"GIFs_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)

sucess[f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"] = overall_results['success_rate']

✅ Found 100 pairs out of 100 listed in CSV.
Saved GIF for episode 0 at GIFs_batch128_rewards[5.0, -0.5, -0.2]_3actions/episode_0_002_58.gif
Saved GIF for episode 10 at GIFs_batch128_rewards[5.0, -0.5, -0.2]_3actions/episode_10_013_86.gif
Saved GIF for episode 20 at GIFs_batch128_rewards[5.0, -0.5, -0.2]_3actions/episode_20_024_49.gif
Saved GIF for episode 30 at GIFs_batch128_rewards[5.0, -0.5, -0.2]_3actions/episode_30_038_84.gif
Saved GIF for episode 40 at GIFs_batch128_rewards[5.0, -0.5, -0.2]_3actions/episode_40_052_98.gif
Saved GIF for episode 50 at GIFs_batch128_rewards[5.0, -0.5, -0.2]_3actions/episode_50_104_74.gif
Saved GIF for episode 60 at GIFs_batch128_rewards[5.0, -0.5, -0.2]_3actions/episode_60_176_99.gif
Saved GIF for episode 70 at GIFs_batch128_rewards[5.0, -0.5, -0.2]_3actions/episode_70_204_52.gif
Saved GIF for episode 80 at GIFs_batch128_rewards[5.0, -0.5, -0.2]_3actions/episode_80_260_62.gif
Saved GIF for episode 90 at GIFs_batch128_rewards[5.0, -0.5, -0.2]_3actions/

In [17]:
CURRENT_CONFIG = {
    'grid_size': 4,
    'rewards': [5.0, -0.2, -0.05], 
    'action_space': gym.spaces.Discrete(3)
}

LR = 1e-4
MAX_EPISODES = 1000
NUM_STEPS = 512  # Start with smaller rollout for testing
BATCH_SIZE = 128

env = GlobalAwareGlioblastoma(*train_pairs[0], **CURRENT_CONFIG)
model = GlobalAwarePPOActorCritic(env, learning_rate=LR, device='cpu')
agent = GlobalAwarePPOAgent(
    env_config=CURRENT_CONFIG,
    model=model,
    train_pairs=train_pairs,
    env_class=GlobalAwareGlioblastoma,  # Use the new environment class
    gamma=0.99,
    clip_epsilon=0.2,
    ppo_epochs=4,
    batch_size=BATCH_SIZE,
    save_name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)


wandb.init(project="TFG_Glioblastoma_PPO", 
           name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions",
           config={
               "learning_rate": LR,
               "max_episodes": MAX_EPISODES,
               "num_steps": NUM_STEPS,
               "batch_size": BATCH_SIZE,
               "configuration": CURRENT_CONFIG
           })

# Start training
agent.train(max_episodes=MAX_EPISODES, num_steps=NUM_STEPS)
wandb.finish()


Starting Global-Aware PPO training...
Episode 0 | Avg Reward: 7.54 | Mean Reward (100): 7.54 | Actor Loss: -0.0115
New best model saved!
Episode 25 | Avg Reward: 4.63 | Mean Reward (100): 6.08 | Actor Loss: 0.0016
Episode 50 | Avg Reward: 9.25 | Mean Reward (100): 7.14 | Actor Loss: -0.0159
Episode 75 | Avg Reward: 8.81 | Mean Reward (100): 7.56 | Actor Loss: -0.0141
New best model saved!
Episode 100 | Avg Reward: 23.44 | Mean Reward (100): 11.53 | Actor Loss: -0.0050
New best model saved!
Episode 125 | Avg Reward: 12.02 | Mean Reward (100): 13.38 | Actor Loss: 0.0031
New best model saved!
Episode 150 | Avg Reward: 11.56 | Mean Reward (100): 13.96 | Actor Loss: -0.0088
New best model saved!
Episode 175 | Avg Reward: 27.18 | Mean Reward (100): 18.55 | Actor Loss: 0.0113
New best model saved!
Episode 200 | Avg Reward: 19.85 | Mean Reward (100): 17.65 | Actor Loss: -0.0101
Episode 225 | Avg Reward: 20.11 | Mean Reward (100): 19.68 | Actor Loss: 0.0058
New best model saved!
Episode 250 | A

0,1
actor_loss,▂▄▁▂▃▅▃▆▂▅▄▃▅▇▇▁▆▄▂▃▅▁▃▄▅█▅▇▃▄▄▆▅▂▅▃▆▂▅▂
avg_episode_reward,▁▁▂▂▄▂▂▄▃▃▄▄▃▅▆▄▄▆▅▃▄▄▅▂▇▃▄█▅▄▄▃▅▅▆▄▅▅▄▄
critic_loss,▁▁▁▂▄▃▃▅▅▄▆▆▅▆▆▇▆█▇▅▆▆▆▄█▅▆█▇▆▆▅▇▇▇▅▆▆▅▅
entropy,███▇▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▂▁▁▂▂
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mean_reward_100,▁▁▁▁▂▃▃▄▄▄▅▅▅▅▆▆▇▇▇▇▇▆▆▅▆▆▅█▇██▆▆▇▇▇▇▇▆▆

0,1
actor_loss,-0.01239
avg_episode_reward,24.536
critic_loss,160.59939
entropy,0.25473
episode,975.0
mean_reward_100,27.996


In [18]:
overall_results = testing(
    agent=agent,
    test_pairs=prepare(mode="test"),
    agent_type="ppo",
    num_episodes=100,
    env_config=CURRENT_CONFIG,
    save_gifs=True,
    gif_folder=f"GIFs_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)

sucess[f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"] = overall_results['success_rate']

✅ Found 100 pairs out of 100 listed in CSV.
Saved GIF for episode 0 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_3actions/episode_0_002_58.gif
Saved GIF for episode 10 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_3actions/episode_10_013_86.gif
Saved GIF for episode 20 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_3actions/episode_20_024_49.gif
Saved GIF for episode 30 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_3actions/episode_30_038_84.gif
Saved GIF for episode 40 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_3actions/episode_40_052_98.gif
Saved GIF for episode 50 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_3actions/episode_50_104_74.gif
Saved GIF for episode 60 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_3actions/episode_60_176_99.gif
Saved GIF for episode 70 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_3actions/episode_70_204_52.gif
Saved GIF for episode 80 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_3actions/episode_80_260_62.gif
Saved GIF for episode 90 at GIFs_batch128_rewards[5.0, -0.2, -0.05]

In [19]:
CURRENT_CONFIG = {
    'grid_size': 4,
    'rewards': [5.0, -0.1, -0.02], 
    'action_space': gym.spaces.Discrete(3)
}

LR = 1e-4
MAX_EPISODES = 1000
NUM_STEPS = 512  # Start with smaller rollout for testing
BATCH_SIZE = 128

env = GlobalAwareGlioblastoma(*train_pairs[0], **CURRENT_CONFIG)
model = GlobalAwarePPOActorCritic(env, learning_rate=LR, device='cpu')
agent = GlobalAwarePPOAgent(
    env_config=CURRENT_CONFIG,
    model=model,
    train_pairs=train_pairs,
    env_class=GlobalAwareGlioblastoma,  # Use the new environment class
    gamma=0.99,
    clip_epsilon=0.2,
    ppo_epochs=4,
    batch_size=BATCH_SIZE,
    save_name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)


wandb.init(project="TFG_Glioblastoma_PPO", 
           name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions",
           config={
               "learning_rate": LR,
               "max_episodes": MAX_EPISODES,
               "num_steps": NUM_STEPS,
               "batch_size": BATCH_SIZE,
               "configuration": CURRENT_CONFIG
           })

# Start training
agent.train(max_episodes=MAX_EPISODES, num_steps=NUM_STEPS)
wandb.finish()


Starting Global-Aware PPO training...
Episode 0 | Avg Reward: 4.95 | Mean Reward (100): 4.95 | Actor Loss: -0.0096
New best model saved!
Episode 25 | Avg Reward: 6.35 | Mean Reward (100): 5.65 | Actor Loss: 0.0090
New best model saved!
Episode 50 | Avg Reward: 6.17 | Mean Reward (100): 5.82 | Actor Loss: -0.0203
New best model saved!
Episode 75 | Avg Reward: 7.99 | Mean Reward (100): 6.36 | Actor Loss: -0.0003
New best model saved!
Episode 100 | Avg Reward: 22.64 | Mean Reward (100): 10.79 | Actor Loss: -0.0339
New best model saved!
Episode 125 | Avg Reward: 20.32 | Mean Reward (100): 14.28 | Actor Loss: -0.0050
New best model saved!
Episode 150 | Avg Reward: 27.66 | Mean Reward (100): 19.65 | Actor Loss: 0.0137
New best model saved!
Episode 175 | Avg Reward: 15.19 | Mean Reward (100): 21.45 | Actor Loss: -0.0092
New best model saved!
Episode 200 | Avg Reward: 19.11 | Mean Reward (100): 20.57 | Actor Loss: -0.0170
Episode 225 | Avg Reward: 14.44 | Mean Reward (100): 19.10 | Actor Loss:

0,1
actor_loss,▅▇▃▆▁▅█▅▃▅▄▅▄▄▅▅█▆▆▇▆▅▅▅▃▇▆▄▇▅█▆▇▅▆▇▇▅▅▅
avg_episode_reward,▁▁▁▂▄▄▅▃▄▃▄▅▃▄▄▅▆▆▄▇▇▄▇▇▅▅▅▅▆▄▅▅▄▅▇▆▅▃▆█
critic_loss,▁▁▁▁▄▄▆▄▅▄▄▆▄▅▆▆▆▆▆▇▇▆▇▇▆▇▆▇▆▆▆▆▅▅█▆▆▅▆▇
entropy,██▇▇▆▄▄▄▄▄▄▃▅▃▃▂▃▂▂▃▂▃▁▃▃▁▂▂▃▃▂▃▂▃▂▂▂▂▃▁
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mean_reward_100,▁▁▁▁▂▃▅▅▅▅▄▅▅▅▅▅▆▆▆▇▇▇██▇▇▇▆▇▇▇▆▆▆▇▇▇▇▆▇

0,1
actor_loss,-0.00462
avg_episode_reward,41.44
critic_loss,182.87313
entropy,0.40932
episode,975.0
mean_reward_100,28.6312


In [20]:
overall_results = testing(
    agent=agent,
    test_pairs=prepare(mode="test"),
    agent_type="ppo",
    num_episodes=100,
    env_config=CURRENT_CONFIG,
    save_gifs=True,
    gif_folder=f"GIFs_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"
)

sucess[f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_3actions"] = overall_results['success_rate']

✅ Found 100 pairs out of 100 listed in CSV.
Saved GIF for episode 0 at GIFs_batch128_rewards[5.0, -0.1, -0.02]_3actions/episode_0_002_58.gif
Saved GIF for episode 10 at GIFs_batch128_rewards[5.0, -0.1, -0.02]_3actions/episode_10_013_86.gif
Saved GIF for episode 20 at GIFs_batch128_rewards[5.0, -0.1, -0.02]_3actions/episode_20_024_49.gif
Saved GIF for episode 30 at GIFs_batch128_rewards[5.0, -0.1, -0.02]_3actions/episode_30_038_84.gif
Saved GIF for episode 40 at GIFs_batch128_rewards[5.0, -0.1, -0.02]_3actions/episode_40_052_98.gif
Saved GIF for episode 50 at GIFs_batch128_rewards[5.0, -0.1, -0.02]_3actions/episode_50_104_74.gif
Saved GIF for episode 60 at GIFs_batch128_rewards[5.0, -0.1, -0.02]_3actions/episode_60_176_99.gif
Saved GIF for episode 70 at GIFs_batch128_rewards[5.0, -0.1, -0.02]_3actions/episode_70_204_52.gif
Saved GIF for episode 80 at GIFs_batch128_rewards[5.0, -0.1, -0.02]_3actions/episode_80_260_62.gif
Saved GIF for episode 90 at GIFs_batch128_rewards[5.0, -0.1, -0.02]

In [21]:
CURRENT_CONFIG = {
    'grid_size': 4,
    'rewards': [5.0, -0.2, -0.05], 
    'action_space': gym.spaces.Discrete(5)
}

LR = 1e-4
MAX_EPISODES = 1000
NUM_STEPS = 512  # Start with smaller rollout for testing
BATCH_SIZE = 128

env = GlobalAwareGlioblastoma(*train_pairs[0], **CURRENT_CONFIG)
model = GlobalAwarePPOActorCritic(env, learning_rate=LR, device='cpu')
agent = GlobalAwarePPOAgent(
    env_config=CURRENT_CONFIG,
    model=model,
    train_pairs=train_pairs,
    env_class=GlobalAwareGlioblastoma,  # Use the new environment class
    gamma=0.99,
    clip_epsilon=0.2,
    ppo_epochs=4,
    batch_size=BATCH_SIZE,
    save_name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_5actions"
)


wandb.init(project="TFG_Glioblastoma_PPO", 
           name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_5actions",
           config={
               "learning_rate": LR,
               "max_episodes": MAX_EPISODES,
               "num_steps": NUM_STEPS,
               "batch_size": BATCH_SIZE,
               "configuration": CURRENT_CONFIG
           })

# Start training
agent.train(max_episodes=MAX_EPISODES, num_steps=NUM_STEPS)
wandb.finish()


Starting Global-Aware PPO training...
Episode 0 | Avg Reward: 5.89 | Mean Reward (100): 5.89 | Actor Loss: -0.0219
New best model saved!
Episode 25 | Avg Reward: 14.90 | Mean Reward (100): 10.40 | Actor Loss: 0.0053
New best model saved!
Episode 50 | Avg Reward: 2.68 | Mean Reward (100): 7.82 | Actor Loss: 0.0027
Episode 75 | Avg Reward: 6.56 | Mean Reward (100): 7.51 | Actor Loss: -0.0113
Episode 100 | Avg Reward: 10.81 | Mean Reward (100): 8.74 | Actor Loss: 0.0024
Episode 125 | Avg Reward: 9.48 | Mean Reward (100): 7.38 | Actor Loss: 0.0006
Episode 150 | Avg Reward: 17.03 | Mean Reward (100): 10.97 | Actor Loss: 0.0018
New best model saved!
Episode 175 | Avg Reward: 13.26 | Mean Reward (100): 12.64 | Actor Loss: -0.0111
New best model saved!
Episode 200 | Avg Reward: 16.40 | Mean Reward (100): 14.04 | Actor Loss: -0.0131
New best model saved!
Episode 225 | Avg Reward: 8.00 | Mean Reward (100): 13.67 | Actor Loss: -0.0096
Episode 250 | Avg Reward: 14.09 | Mean Reward (100): 12.93 | A

0,1
actor_loss,▁▆▆▃▆▅▆▃▃▃▃▅▅▃▆▄▂▆▃▅▄█▅▅▃▆█▄▄▅▂▄▇▆▄▅█▄█▄
avg_episode_reward,▂▄▁▂▃▂▄▃▄▂▃▁▄▃▃▄▃▄▃▃▅▇▄▄▄▄▅▅▄▄▇▆▆█▅▆█▄▇▅
critic_loss,▁▃▁▂▂▂▂▂▃▂▂▁▃▂▂▂▂▃▂▂▄▅▄▃▃▃▄▄▃▄▆▆▆█▅▅█▄▇▄
entropy,██████████▇▇▇▇▇▆▇▇▆▆▅▅▅▅▅▅▄▄▄▄▃▃▂▁▂▂▁▂▂▃
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mean_reward_100,▁▂▂▁▂▁▃▃▄▃▃▂▂▃▃▄▃▄▄▃▄▅▅▆▅▄▅▅▅▅▆▆▇█▇▇█▇█▇

0,1
actor_loss,-0.00828
avg_episode_reward,20.164
critic_loss,75.03027
entropy,0.86347
episode,975.0
mean_reward_100,26.449


In [22]:
overall_results = testing(
    agent=agent,
    test_pairs=prepare(mode="test"),
    agent_type="ppo",
    num_episodes=100,
    env_config=CURRENT_CONFIG,
    save_gifs=True,
    gif_folder=f"GIFs_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_5actions"
)

sucess[f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_5actions"] = overall_results['success_rate']

✅ Found 100 pairs out of 100 listed in CSV.
Saved GIF for episode 0 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_5actions/episode_0_002_58.gif
Saved GIF for episode 10 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_5actions/episode_10_013_86.gif
Saved GIF for episode 20 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_5actions/episode_20_024_49.gif
Saved GIF for episode 30 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_5actions/episode_30_038_84.gif
Saved GIF for episode 40 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_5actions/episode_40_052_98.gif
Saved GIF for episode 50 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_5actions/episode_50_104_74.gif
Saved GIF for episode 60 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_5actions/episode_60_176_99.gif
Saved GIF for episode 70 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_5actions/episode_70_204_52.gif
Saved GIF for episode 80 at GIFs_batch128_rewards[5.0, -0.2, -0.05]_5actions/episode_80_260_62.gif
Saved GIF for episode 90 at GIFs_batch128_rewards[5.0, -0.2, -0.05]

In [23]:
for item in sucess:
    print(f"{item}: Success Rate = {sucess[item]*100:.2f}%")

PPO_batch64_rewards[5.0, -1.0, -0.2]_3actions: Success Rate = 43.00%
PPO_batch32_rewards[5.0, -1.0, -0.2]_3actions: Success Rate = 46.00%
PPO_batch128_rewards[5.0, -1.0, -0.2]_3actions: Success Rate = 54.00%
PPO_batch128_rewards[5.0, -0.5, -0.2]_3actions: Success Rate = 28.00%
PPO_batch128_rewards[5.0, -0.2, -0.05]_3actions: Success Rate = 44.00%
PPO_batch128_rewards[5.0, -0.1, -0.02]_3actions: Success Rate = 37.00%
PPO_batch128_rewards[5.0, -0.2, -0.05]_5actions: Success Rate = 37.00%


In [24]:
CURRENT_CONFIG = {
    'grid_size': 4,
    'rewards': [5.0, -1.0, -0.2], 
    'action_space': gym.spaces.Discrete(5)
}

LR = 1e-4
MAX_EPISODES = 1000
NUM_STEPS = 512  # Start with smaller rollout for testing
BATCH_SIZE = 128

env = GlobalAwareGlioblastoma(*train_pairs[0], **CURRENT_CONFIG)
model = GlobalAwarePPOActorCritic(env, learning_rate=LR, device='cpu')
agent = GlobalAwarePPOAgent(
    env_config=CURRENT_CONFIG,
    model=model,
    train_pairs=train_pairs,
    env_class=GlobalAwareGlioblastoma,  # Use the new environment class
    gamma=0.99,
    clip_epsilon=0.2,
    ppo_epochs=4,
    batch_size=BATCH_SIZE,
    save_name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_5actions"
)


wandb.init(project="TFG_Glioblastoma_PPO", 
           name=f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_5actions",
           config={
               "learning_rate": LR,
               "max_episodes": MAX_EPISODES,
               "num_steps": NUM_STEPS,
               "batch_size": BATCH_SIZE,
               "configuration": CURRENT_CONFIG
           })

# Start training
agent.train(max_episodes=MAX_EPISODES, num_steps=NUM_STEPS)
wandb.finish()


Starting Global-Aware PPO training...
Episode 0 | Avg Reward: -1.68 | Mean Reward (100): -1.68 | Actor Loss: -0.0093
New best model saved!
Episode 25 | Avg Reward: 7.71 | Mean Reward (100): 3.02 | Actor Loss: -0.0044
New best model saved!
Episode 50 | Avg Reward: 6.35 | Mean Reward (100): 4.13 | Actor Loss: -0.0074
New best model saved!
Episode 75 | Avg Reward: 5.04 | Mean Reward (100): 4.36 | Actor Loss: 0.0045
New best model saved!
Episode 100 | Avg Reward: 4.32 | Mean Reward (100): 5.86 | Actor Loss: -0.0116
New best model saved!
Episode 125 | Avg Reward: 12.72 | Mean Reward (100): 7.11 | Actor Loss: 0.0331
New best model saved!
Episode 150 | Avg Reward: 4.00 | Mean Reward (100): 6.52 | Actor Loss: -0.0013
Episode 175 | Avg Reward: 6.72 | Mean Reward (100): 6.94 | Actor Loss: -0.0059
Episode 200 | Avg Reward: 9.90 | Mean Reward (100): 8.34 | Actor Loss: 0.0023
New best model saved!
Episode 225 | Avg Reward: 11.55 | Mean Reward (100): 8.04 | Actor Loss: -0.0128
Episode 250 | Avg Rewa

0,1
actor_loss,▂▃▂▄▂▇▃▂▃▂▂▃▃▄▁▂▁▂█▂▂▃▃▂▂▂▃▅▂▂▃▃▃▃▄▅▅▅▃▃
avg_episode_reward,▁▃▃▂▂▄▂▃▃▄▂▃▂▃▃▄▃▂▄▂▄▄▄▄▃▄▄▄▄▄▄▅▆▆█▃▃▆▄▆
critic_loss,▁▃▃▂▂▃▂▂▂▃▂▂▁▃▃▃▃▁▄▂▃▃▄▄▃▃▄▄▃▅▅▆▆▅█▄▃▆▅▇
entropy,█████▇▇▇▇▇▇▇▇▆▆▆▆▇▅▆▆▅▅▆▆▆▆▆▆▅▅▄▃▃▁▂▅▃▁▁
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mean_reward_100,▁▂▂▃▃▃▃▃▄▃▃▄▃▃▃▄▄▃▄▃▄▄▄▅▅▅▅▅▅▅▅▅▆▇█▇▆▆▅▆

0,1
actor_loss,-0.00138
avg_episode_reward,24.496
critic_loss,108.83502
entropy,0.91926
episode,975.0
mean_reward_100,18.516


In [25]:
overall_results = testing(
    agent=agent,
    test_pairs=prepare(mode="test"),
    agent_type="ppo",
    num_episodes=100,
    env_config=CURRENT_CONFIG,
    save_gifs=True,
    gif_folder=f"GIFs_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_5actions"
)

sucess[f"PPO_batch{BATCH_SIZE}_rewards{CURRENT_CONFIG['rewards']}_5actions"] = overall_results['success_rate']

✅ Found 100 pairs out of 100 listed in CSV.
Saved GIF for episode 0 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_5actions/episode_0_002_58.gif
Saved GIF for episode 10 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_5actions/episode_10_013_86.gif
Saved GIF for episode 20 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_5actions/episode_20_024_49.gif
Saved GIF for episode 30 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_5actions/episode_30_038_84.gif
Saved GIF for episode 40 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_5actions/episode_40_052_98.gif
Saved GIF for episode 50 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_5actions/episode_50_104_74.gif
Saved GIF for episode 60 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_5actions/episode_60_176_99.gif
Saved GIF for episode 70 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_5actions/episode_70_204_52.gif
Saved GIF for episode 80 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_5actions/episode_80_260_62.gif
Saved GIF for episode 90 at GIFs_batch128_rewards[5.0, -1.0, -0.2]_5actions/

In [26]:
for item in sucess:
    print(f"{item}: Success Rate = {sucess[item]*100:.2f}%")

PPO_batch64_rewards[5.0, -1.0, -0.2]_3actions: Success Rate = 43.00%
PPO_batch32_rewards[5.0, -1.0, -0.2]_3actions: Success Rate = 46.00%
PPO_batch128_rewards[5.0, -1.0, -0.2]_3actions: Success Rate = 54.00%
PPO_batch128_rewards[5.0, -0.5, -0.2]_3actions: Success Rate = 28.00%
PPO_batch128_rewards[5.0, -0.2, -0.05]_3actions: Success Rate = 44.00%
PPO_batch128_rewards[5.0, -0.1, -0.02]_3actions: Success Rate = 37.00%
PPO_batch128_rewards[5.0, -0.2, -0.05]_5actions: Success Rate = 37.00%
PPO_batch128_rewards[5.0, -1.0, -0.2]_5actions: Success Rate = 32.00%
