In [6]:
from ot2_gym_wrapper import OT2Env
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
import numpy as np


In [7]:
class EarlyStoppingCallback(BaseCallback):
    """
    Stop training once the monitored mean reward stops improving.

    Every ``check_freq`` calls, the callback computes a mean reward and compares
    it with the best value seen so far. A check counts as an improvement only
    when the new mean exceeds the best one by more than ``min_delta``. After
    ``patience`` consecutive checks without improvement, ``_on_step`` returns
    ``False``, which asks the algorithm to stop training.

    The monitored value is the mean return of the episodes stored in the
    model's ``ep_info_buffer`` when that buffer has entries. If no finished
    episode is available, it falls back to the mean of the rewards of the
    current step (``self.locals["rewards"]``).

    :param patience: Number of consecutive checks without improvement that
        triggers early stopping.
    :param min_delta: Minimum increase over the best mean reward that counts
        as an improvement.
    :param verbose: Verbosity level; values above 0 print progress messages.
    :param check_freq: Number of callback calls between two checks.
    :raises ValueError: If ``check_freq`` is not a positive integer.
    """
    def __init__(self, patience: int = 10, min_delta: float = 0.01, verbose: int = 1,
                 check_freq: int = 1000):
        # Let the base class own `verbose` instead of overwriting it afterwards
        super().__init__(verbose=verbose)
        if check_freq <= 0:
            raise ValueError("check_freq must be a positive integer")
        self.patience = patience         # How many consecutive checks without improvement
        self.min_delta = min_delta       # Minimum change to consider an improvement
        self.check_freq = check_freq     # Number of calls between two checks
        self.best_mean_reward = -np.inf  # Best observed mean reward
        self.no_improvement_count = 0    # Counter for no improvement

    def _current_mean_reward(self):
        """Return the mean reward to monitor, or ``None`` if nothing is available."""
        # Prefer complete-episode returns: a single step reward is far too
        # noisy to decide whether training has plateaued.
        model = getattr(self, "model", None)
        episode_infos = getattr(model, "ep_info_buffer", None)
        if episode_infos:
            episode_returns = [info["r"] for info in episode_infos if "r" in info]
            if episode_returns:
                return np.mean(episode_returns)

        # Fallback: rewards of the current step (one entry per environment)
        step_rewards = self.locals.get("rewards")
        if step_rewards is None or len(step_rewards) == 0:
            return None
        return np.mean(step_rewards)

    def _on_step(self) -> bool:
        """
        Check for improvement every ``check_freq`` calls.

        :return: ``False`` to stop training, ``True`` to continue.
        """
        # Skip all work between two scheduled checks
        if self.n_calls % self.check_freq != 0:
            return True

        mean_reward = self._current_mean_reward()
        if mean_reward is None:
            # Nothing to evaluate yet; do not count this as a failed check
            return True

        if self.verbose > 0:
            print(f"Step {self.n_calls}: Mean reward: {mean_reward}")

        # Check for improvement
        if mean_reward > self.best_mean_reward + self.min_delta:
            self.best_mean_reward = mean_reward
            self.no_improvement_count = 0  # Reset counter
        else:
            self.no_improvement_count += 1

        # Early stopping condition
        if self.no_improvement_count >= self.patience:
            if self.verbose > 0:
                print("Early stopping triggered!")
            return False  # Stop training

        return True  # Continue training


In [9]:
# Build the OT2 environment with rendering disabled
env = OT2Env(render=False)

# PPO hyperparameters, gathered in one place so they are easy to tune
ppo_hyperparams = {
    "learning_rate": 0.0003,
    "n_steps": 2048,
    "batch_size": 64,
    "gamma": 0.99,
    "clip_range": 0.2,
}

# Instantiate the PPO agent with an MLP policy on the environment above
model = PPO(policy="MlpPolicy", env=env, verbose=1, **ppo_hyperparams)

# Attach the early-stopping callback and start training.
# NOTE(review): the callback only checks every 1000 calls, so patience=10
# looks unreachable within total_timesteps=3000 - confirm the intended budget.
early_stopping_callback = EarlyStoppingCallback(patience=10, min_delta=0.01, verbose=1)
model.learn(total_timesteps=3000, callback=early_stopping_callback)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Action received: [-0.99303436  0.90163475  0.197805    0.40758848]
Updated pipette position: [-0.99303436  0.90163475  0.297805  ]
Clipped pipette position: [-0.26  0.13  0.2 ]
Distance to target: 0.4390509201537902
Action received: [ 0.9247638   1.         -0.00546418  0.        ]
Updated pipette position: [0.6647638  1.13       0.19453582]
Clipped pipette position: [0.18       0.13       0.19453582]
Distance to target: 0.14444175124713185
Action received: [ 0.30175135 -1.         -1.          0.        ]
Updated pipette position: [ 0.48175135 -0.87       -0.80546418]
Clipped pipette position: [ 0.18 -0.26  0.08]
Distance to target: 0.2657542716897197
Action received: [-0.4125514 -1.         1.         1.       ]
Updated pipette position: [-0.2325514 -1.26       1.08     ]
Clipped pipette position: [-0.2325514 -0.26       0.2      ]
Distance to target: 0.46709519360737134
Action received: [-0

<stable_baselines3.ppo.ppo.PPO at 0x1fde5f25460>

In [10]:

# Persist the trained policy to disk
model.save("ppo_model_3")

# Roll out a single episode with the trained policy, logging every transition
action_low, action_high = env.action_space.low, env.action_space.high
obs, done = env.reset(), False

while not done:
    action = model.predict(obs)[0]
    # Keep the action inside the environment's declared bounds
    action = np.clip(action, action_low, action_high)
    step_result = env.step(action)
    obs, reward, done, info = step_result
    print(f"Action: {action}, Obs: {obs}, Reward: {reward}, Done: {done}")

Action received: [ 1.         -1.          0.65856826  0.8111369 ]
Updated pipette position: [ 1.         -1.          0.75856826]
Clipped pipette position: [ 0.18 -0.26  0.2 ]
Distance to target: 0.40103773890862326
Action: [ 1.         -1.          0.65856826  0.8111369 ], Obs: [ 0.18       -0.26        0.2        -0.07691585  0.02614579  0.08621907], Reward: -0.40103773890862326, Done: False
Action received: [-0.13861495 -0.43931422 -1.          0.        ]
Updated pipette position: [ 0.04138505 -0.69931422 -0.8       ]
Clipped pipette position: [ 0.04138505 -0.26        0.08      ]
Distance to target: 0.3096985492480227
Action: [-0.13861495 -0.43931422 -1.          0.        ], Obs: [ 0.04138505 -0.26        0.08       -0.07691585  0.02614579  0.08621907], Reward: -0.3096985492480227, Done: False
Action received: [-0.5652417   0.91171145  1.          0.        ]
Updated pipette position: [-0.52385665  0.65171145  1.08      ]
Clipped pipette position: [-0.26  0.13  0.2 ]
Distance to

In [None]:
import numpy as np
from stable_baselines3 import PPO
from ot2_gym_wrapper import OT2Env

# Restore the policy from disk
model = PPO.load("ppo_model_3")  # Replace with your model file

# Fresh environment for evaluation, rendering disabled
env = OT2Env(render=False)

# Evaluation settings: a small number of episodes, one total per episode
num_episodes = 5
total_rewards = []

# Play each episode to completion and record its cumulative reward
for episode in range(num_episodes):
    obs, done = env.reset(), False
    step_rewards = []

    while not done:
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        step_rewards.append(reward)

    # Same left-to-right accumulation from 0 as a running total
    episode_reward = sum(step_rewards)
    total_rewards.append(episode_reward)
    print(f"Episode {episode + 1}: Total Reward = {episode_reward:.2f}")

# Report the average of the per-episode totals
mean_reward = np.mean(total_rewards)
print(f"Average Reward over {num_episodes} episodes: {mean_reward:.2f}")

