In [7]:
# Cellule 1 (au d√©but du notebook)
%load_ext autoreload
%autoreload 2

import sys
import os
import numpy as np

# Add the src directory to the path
sys.path.append(os.path.abspath('../src'))
sys.path.append(os.path.abspath('..'))

# Import the BaseAgent class
from src.agents.my_agent import MyAgent
from src.agents.my_agent_DQN import DQNTrainer, MyAgentDQN
# Import the evaluation tools
from src.test_agent_validity import validate_agent, load_agent_class
from src.evaluation import evaluate_agent, visualize_trajectory
from wind_scenarios import get_wind_scenario, WIND_SCENARIOS

# List available wind scenarios
print("Available wind scenarios:")
for windfield_name in sorted(WIND_SCENARIOS.keys()):
    print(f"- {windfield_name}")
from typing import Dict, Any
from typing import Dict, Any


from src.utils import save_my_agent
from src.utils import save_dqn_agent
from src.env_sailing import SailingEnv
from src.wind_scenarios import get_wind_scenario
from src.agents.my_agent_DQN import collect_normalization_stats, compute_physics_features, compute_physics_features_raw


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Available wind scenarios:
- simple_static
- static_headwind
- training_1
- training_2
- training_3


In [3]:
# Scenario used for the single-scenario run.
# Available names: simple_static, static_headwind, training_1, training_2, training_3, etc.
WIND_SCENARIO_NAME = "simple_static"

# Evaluation settings
SEEDS = [1]          # seeds to evaluate with
MAX_HORIZON = 200    # cap on the number of steps in one episode
VERBOSE = True       # display a progress bar
RENDER = True        # turn rendering on (slower, but required for visualization)

#############################################
### DO NOT MODIFY BELOW THIS LINE ##########
#############################################

# Echo the active configuration, one setting per line.
print(
    f"Wind scenario: {WIND_SCENARIO_NAME}",
    f"Using {len(SEEDS)} seeds: {SEEDS}",
    f"Max steps per episode: {MAX_HORIZON}",
    sep="\n",
)

Wind scenario: simple_static
Using 1 seeds: [1]
Max steps per episode: 200


In [None]:




def train_agent(agent=MyAgent,
                num_episodes=1000, max_steps = 200,
                learning_rate=0.1, discount_factor=0.99,
                seed=42,
                TRAIN_SCENARIO = 'training_1',
                TRAIN_SCENARIOS = ['training_1', 'training_2', 'training_3'],
                mu=0.39, prog=9,
                goal=(16, 31)):
    """Train a tabular agent with a shaped reward on randomly drawn wind scenarios.

    Every episode builds a fresh ``SailingEnv`` for a scenario drawn uniformly
    from ``TRAIN_SCENARIOS`` and resets it with ``seed=episode``. The reward
    handed to ``agent.learn`` is::

        prog * (decrease in distance to goal) + mu * speed + env_reward - 0.5

    After each episode the agent's learning rate is multiplied by 0.998 and
    floored at 0.005. The exploration rate is not modified here.

    Args:
        agent: Agent class. It is instantiated with ``learning_rate`` and
            ``discount_factor`` and must provide ``seed``, ``discretize_state``,
            ``act``, ``learn``, a writable ``learning_rate`` attribute and a
            sized ``q_table``.
        num_episodes: Number of training episodes.
        max_steps: Maximum number of environment steps per episode.
        learning_rate: Initial learning rate passed to the agent.
        discount_factor: Discount factor passed to the agent.
        seed: Seed for NumPy's global RNG and for the agent.
        TRAIN_SCENARIO: Single scenario name, used only as a fallback when
            ``TRAIN_SCENARIOS`` is ``None`` or empty.
        TRAIN_SCENARIOS: Scenario names to sample from, one draw per episode.
        mu: Weight of the speed bonus in the shaped reward.
        prog: Weight of the progress-towards-goal term in the shaped reward.
        goal: ``(x, y)`` goal position used for the progress term.

    Returns:
        Tuple ``(trained_agent, rewards_history, steps_history, success_history)``
        where the three histories hold one entry per episode: total shaped
        reward, number of steps taken, and the environment's ``done`` flag at
        the end of the episode.
    """
    import time  # local import so this cell does not depend on another cell importing it

    # Fall back to the single scenario when no scenario list is supplied.
    if TRAIN_SCENARIOS is not None and len(TRAIN_SCENARIOS) > 0:
        scenario_names = list(TRAIN_SCENARIOS)
    else:
        scenario_names = [TRAIN_SCENARIO]

    ql_agent_full = agent(learning_rate=learning_rate, discount_factor=discount_factor)

    # Set fixed seed for reproducibility
    np.random.seed(seed)
    ql_agent_full.seed(seed)

    # Progress tracking
    rewards_history = []
    steps_history = []
    success_history = []

    goal_x, goal_y = goal
    step_penalty = -0.5  # constant per-step cost added to the shaped reward

    print(f"Starting full training with {num_episodes} episodes...")
    start_time = time.time()

    for episode in range(num_episodes):
        # New environment per episode, on a randomly chosen scenario
        env = SailingEnv(**get_wind_scenario(np.random.choice(scenario_names)))

        observation, _ = env.reset(seed=episode)  # Different seed each episode
        state = ql_agent_full.discretize_state(observation)

        total_reward = 0
        steps_taken = 0   # stays 0 if max_steps == 0
        done = False      # stays False if the step loop never runs
        distance_prev = np.sqrt((goal_x - observation[0])**2 + (goal_y - observation[1])**2)

        for step in range(max_steps):
            # Select action and take step
            action = ql_agent_full.act(observation)
            next_observation, reward, done, truncated, _ = env.step(action)
            steps_taken = step + 1

            # Observation layout used here: [0], [1] = position, [2], [3] = velocity
            x, y = next_observation[0], next_observation[1]
            vx, vy = next_observation[2], next_observation[3]

            # Reward shaping: progress towards the goal plus a speed bonus
            distance_curr = np.sqrt((goal_x - x)**2 + (goal_y - y)**2)
            progress_reward = prog * (distance_prev - distance_curr)
            velocity_reward = mu * np.sqrt(vx**2 + vy**2)
            shaped_reward = progress_reward + velocity_reward + reward + step_penalty

            next_state = ql_agent_full.discretize_state(next_observation)

            # Update Q-table
            ql_agent_full.learn(state, action, shaped_reward, next_state)

            # Advance to the next transition
            state = next_state
            observation = next_observation
            total_reward += shaped_reward
            distance_prev = distance_curr

            if done or truncated:
                break

        # Record metrics
        rewards_history.append(total_reward)
        steps_history.append(steps_taken)
        success_history.append(done)

        # Decay the learning rate once per episode (floor at 0.005)
        ql_agent_full.learning_rate = max(0.005, ql_agent_full.learning_rate * 0.998)

        # Print progress every 100 episodes
        if (episode + 1) % 100 == 0:
            success_rate = sum(success_history[-100:]) / 100 * 100
            print(f"Episode {episode+1}/{num_episodes}: Success rate (last 100): {success_rate:.1f}%")

    training_time = time.time() - start_time

    print(f"\nTraining completed in {training_time:.1f} seconds!")
    if success_history:
        # Overall statistics across all episodes
        success_rate = sum(success_history) / len(success_history) * 100
        print(f"Success rate: {success_rate:.1f}%")
        print(f"Average reward: {np.mean(rewards_history):.2f}")
        print(f"Average steps: {np.mean(steps_history):.1f}")
    else:
        print("No episodes were run.")
    print(f"Q-table size: {len(ql_agent_full.q_table)} states")
    return ql_agent_full, rewards_history, steps_history, success_history


In [None]:
# Train the tabular agent.
# TRAIN_SCENARIOS is not passed, so each episode draws its scenario from
# train_agent's default list; TRAIN_SCENARIO does not select the scenario here.
agent, rewards_history, steps_history, success_history = train_agent(
    agent=MyAgent,
    num_episodes=12000,
    max_steps=200,
    learning_rate=0.1,
    discount_factor=0.99,
    seed=42,
    TRAIN_SCENARIO='training_1',
    mu=0.39,
    prog=9,
)

In [None]:
import matplotlib.pyplot as plt

# Rolling averages of the per-episode training metrics.
# mode='valid' only keeps windows that lie fully inside the history, so each
# curve has len(history) - window_size + 1 points.
window_size = 100
kernel = np.ones(window_size) / window_size
rolling_rewards = np.convolve(rewards_history, kernel, mode='valid')
rolling_steps = np.convolve(steps_history, kernel, mode='valid')
rolling_success = np.convolve([1 if s else 0 for s in success_history], kernel, mode='valid') * 100

# x coordinate = 1-based number of the last episode in each window, so the
# axis really is an episode count rather than a window index.
episodes = np.arange(len(rolling_rewards)) + window_size

# One panel per metric, sharing the episode axis
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 12), sharex=True)

# Plot rewards
ax1.plot(episodes, rolling_rewards)
ax1.set_ylabel('Average Reward')
# The title is derived from window_size so it cannot drift from the actual window.
ax1.set_title(f'Training Progress ({window_size}-episode rolling average)')

# Plot steps
ax2.plot(episodes, rolling_steps)
ax2.set_ylabel('Average Steps')

# Plot success rate
ax3.plot(episodes, rolling_success)
ax3.set_ylabel('Success Rate (%)')
ax3.set_xlabel('Episode')

plt.tight_layout()
plt.show()

In [2]:
def print_evaluation_results(results):
    """Print evaluation results in a readable format.

    Args:
        results: Mapping with the keys ``'success_rate'``, ``'mean_reward'``,
            ``'std_reward'``, ``'mean_steps'`` and ``'std_steps'``. If it also
            contains ``'individual_results'``, that value is iterated and each
            item must provide ``'seed'``, ``'reward'``, ``'steps'`` and
            ``'success'``.

    Returns:
        None. The report is written to standard output.
    """
    banner = "=" * 50

    print("\n" + banner)
    print("EVALUATION RESULTS")
    print(banner)

    # Aggregate statistics (success rate is formatted as a percentage)
    print(f"Success Rate: {results['success_rate']:.2%}")
    print(f"Mean Reward: {results['mean_reward']:.2f} ± {results['std_reward']:.2f}")
    print(f"Mean Steps: {results['mean_steps']:.1f} ± {results['std_steps']:.1f}")

    # Optional per-episode breakdown
    if 'individual_results' in results:
        print("\nIndividual Episode Results:")
        for episode in results['individual_results']:
            outcome = '✓' if episode['success'] else '✗'
            print(
                f"  Seed {episode['seed']}: "
                f"Reward={episode['reward']:.1f}, "
                f"Steps={episode['steps']}, "
                f"Success={outcome}"
            )

    print(banner)



In [None]:
#############################################
### MODIFY THESE PARAMETERS AS NEEDED ######
#############################################

# Wind scenarios the agent is evaluated on
TRAINING_WIND_SCENARIOS = ["simple_static", "training_1", "training_2", "training_3"]

# Settings shared by every scenario evaluation
ALL_SEEDS = [42, 43, 44, 45, 46]  # seeds used for all evaluations
ALL_MAX_HORIZON = 200             # cap on the number of steps per episode

#############################################
### DO NOT MODIFY BELOW THIS LINE ##########
#############################################

print(f"Evaluating agent on {len(TRAINING_WIND_SCENARIOS)} wind scenarios (including simple static)...")

# Results of evaluate_agent, keyed by scenario name
all_results = {}

for wind_scenario_name in TRAINING_WIND_SCENARIOS:
    print(f"\nWind scenario: {wind_scenario_name}")

    # Look up the scenario and evaluate the agent on it
    # (quiet mode, since several evaluations run back to back)
    wind_scenario = get_wind_scenario(wind_scenario_name)
    results = evaluate_agent(
        agent=agent,
        wind_scenario=wind_scenario,
        seeds=ALL_SEEDS,
        max_horizon=ALL_MAX_HORIZON,
        verbose=False,
        render=True,
        full_trajectory=True,
    )
    all_results[wind_scenario_name] = results

    # Per-scenario summary
    print(f"  Success Rate: {results['success_rate']:.2%}")
    print(f"  Mean Reward: {results['mean_reward']:.2f}")
    print(f"  Mean Steps: {results['mean_steps']:.1f}")

# Overall performance: unweighted mean of the per-scenario success rates
total_success = sum(res['success_rate'] for res in all_results.values()) / len(all_results)
print("\n" + "="*50)
print(f"OVERALL SUCCESS RATE: {total_success:.2%}")
print("="*50)

In [None]:
# Write the trained tabular agent to the submission file.
save_my_agent(
    agent=agent,
    output_path='../src/submission/my_agent.py',
    agent_class_name='MyAgent'
)

# Confirmation message (check mark restored from mis-encoded text)
print("✓ Agent saved to my_agent.py")

## DQN 


In [2]:
# 1. Build the environment for the 'training_1' wind scenario.
env = SailingEnv(**get_wind_scenario('training_1'))
    
# 2. Collect the normalization statistics (run ONCE).
#    The recorded output below shows a tuple of two 15-element float32 arrays
#    -- presumably per-feature mean and standard deviation; TODO confirm.
#    save_path is presumably where the statistics are written; the DQNTrainer
#    cell later passes the same file name as stats_path.
collect_normalization_stats(env, n_episodes=1000, save_path='normalization_stats.pkl')

Collecting normalization statistics over 1000 episodes...
  Episode 20/1000
  Episode 40/1000
  Episode 60/1000
  Episode 80/1000
  Episode 100/1000
  Episode 120/1000
  Episode 140/1000
  Episode 160/1000
  Episode 180/1000
  Episode 200/1000
  Episode 220/1000
  Episode 240/1000
  Episode 260/1000
  Episode 280/1000
  Episode 300/1000
  Episode 320/1000
  Episode 340/1000
  Episode 360/1000
  Episode 380/1000
  Episode 400/1000
  Episode 420/1000
  Episode 440/1000
  Episode 460/1000
  Episode 480/1000
  Episode 500/1000
  Episode 520/1000
  Episode 540/1000
  Episode 560/1000
  Episode 580/1000
  Episode 600/1000
  Episode 620/1000
  Episode 640/1000
  Episode 660/1000
  Episode 680/1000
  Episode 700/1000
  Episode 720/1000
  Episode 740/1000
  Episode 760/1000
  Episode 780/1000
  Episode 800/1000
  Episode 820/1000
  Episode 840/1000
  Episode 860/1000
  Episode 880/1000
  Episode 900/1000
  Episode 920/1000
  Episode 940/1000
  Episode 960/1000
  Episode 980/1000
  Episode 1000/

(array([ 3.0344770e+01,  1.5336728e+00,  6.4928097e-01,  3.0006409e+00,
        -1.4352226e-01, -4.4055212e-01, -2.5167916e-02,  3.0738291e+01,
        -3.9398366e-01, -1.3322753e+00,  2.9947329e+00, -4.5197472e-04,
         2.8610569e-01, -4.6641007e-01,  3.7271380e-02], dtype=float32),
 array([2.9249268 , 0.2879615 , 0.40528658, 0.22029994, 0.65921944,
        0.5336879 , 1.293241  , 3.3061597 , 0.6777205 , 1.6052179 ,
        0.21716067, 0.08825349, 0.7938838 , 0.7201869 , 0.2786854 ],
       dtype=float32))

In [5]:
# Environment for the 'training_1' wind scenario, handed to the DQN trainer.
env = SailingEnv(**get_wind_scenario('training_1')) # type: ignore
    
# Create the trainer
# NOTE(review): the recorded training log shows "Target network updated" every
# 100 steps although target_update_freq=50 is passed here -- confirm how
# DQNTrainer interprets this value (or whether the log comes from an older run).
trainer = DQNTrainer(
        env,
        stats_path='normalization_stats.pkl',  # same file name as the save_path given to collect_normalization_stats
        learning_rate=3e-4,
        lr_decay=0.9999,
        epsilon_start=1.0,
        epsilon_end=0.05,
        epsilon_decay=0.9995,
        buffer_capacity=100000,
        target_update_freq=50,
        device='cpu'
)


In [6]:
# Train the DQN agent for 200 episodes.
# eval_freq presumably sets how often (in episodes) an evaluation runs -- TODO confirm.
trainer.train(num_episodes=200, eval_freq=50, verbose=True)

Target network updated at step 100
Target network updated at step 200
Episode 0/200 | Reward: 153.95 | Epsilon: 0.905 | LR: 0.000299 | Buffer: 200 | Steps: 200
Target network updated at step 300
Target network updated at step 400
Target network updated at step 500
Target network updated at step 600
Target network updated at step 700
Target network updated at step 800
Target network updated at step 900
Target network updated at step 1000
Target network updated at step 1100
Target network updated at step 1200
Target network updated at step 1300
Target network updated at step 1400
Target network updated at step 1500
Target network updated at step 1600
Target network updated at step 1700
Target network updated at step 1800
Target network updated at step 1900
Target network updated at step 2000
Target network updated at step 2100
Episode 10/200 | Reward: 76.13 | Epsilon: 0.341 | LR: 0.000285 | Buffer: 2154 | Steps: 2154
Target network updated at step 2200
Target network updated at step 2300

In [8]:
# Write the trained DQN agent to the submission file.
# NOTE(review): this path is relative to the notebook's working directory and
# differs from the '../src/submission/my_agent.py' used for the tabular agent --
# confirm which location the submission is expected in.
save_dqn_agent(trainer, 'submission/my_agent.py')


SAVING DQN AGENT FOR SUBMISSION

üìä Agent Statistics:
   Network parameters: 128,921
   Physics features: 15
   Goal position: (16, 31)

üíæ Embedding network weights...
   5/18 layers embedded
   10/18 layers embedded
   15/18 layers embedded
   ‚úì All 18 layers embedded

üìù Writing to submission/my_agent.py...

‚úÖ AGENT SAVED SUCCESSFULLY

üìÑ Output file: submission/my_agent.py
üìä File size: 1.93 MB
üß† Network parameters: 128,921
üéØ Physics features: 15

üìã Next steps:
   1. Validate: python src/test_agent_validity.py submission/my_agent.py
   2. Evaluate: python src/evaluate_submission.py submission/my_agent.py --num-seeds 10
   3. Submit: Upload submission/my_agent.py to competition platform


