In [None]:
# Comparaison des algorithmes de RL sur Monty Hall 1

import json
import os
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.rl_environments.monty_hall_interactive import MontyHallInteractive
from src.rl_algorithms.q_learning import QLearning
from src.rl_algorithms.sarsa import SARSA
from src.rl_algorithms.dyna_q import DynaQ
from src.rl_algorithms.monte_carlo_es import MonteCarloES
from src.rl_algorithms.off_policy_mc_control import OffPolicyMCControl
from src.rl_algorithms.on_policy_first_visit_mc_control import OnPolicyFirstVisitMCControl
from src.rl_algorithms.policy_iteration import PolicyIteration
from src.rl_algorithms.value_iteration import ValueIteration

# Fix the RNG seeds up front so that Restart Kernel -> Run All replays the
# same training runs (the agents below explore with epsilon-greedy policies).
# NOTE(review): this only has an effect if the environment and the agents draw
# from the global `random` / `numpy.random` generators -- confirm in src/.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Set up the Monty Hall 1 environment and the directory that receives the
# saved models and run descriptions.
env = MontyHallInteractive()
output_dir = 'outputs/mh1'
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Q-Learning: build the agent, train it on the Monty Hall 1 environment,
# save the model plus a JSON run description, then plot the per-episode reward.
agent_ql = QLearning(
    state_space_size=env.state_space_size,
    action_space_size=env.action_space_size,
    learning_rate=0.1,
    gamma=0.9,
    epsilon=0.1,
    epsilon_decay=0.995,
    epsilon_min=0.01
)
# Snapshot the hyperparameters BEFORE training. The agent is configured with an
# epsilon-decay schedule, so agent_ql.epsilon is presumably updated in place
# during train() (TODO confirm in QLearning); reading it afterwards would store
# the decayed value under "epsilon" instead of the configured one.
hyperparams_ql = {
    "learning_rate": agent_ql.learning_rate,
    "gamma": agent_ql.gamma,
    "epsilon": agent_ql.epsilon,
    "epsilon_decay": agent_ql.epsilon_decay,
    "epsilon_min": agent_ql.epsilon_min
}
num_episodes = 5000
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
train_stats_ql = agent_ql.train(env, num_episodes=num_episodes, verbose=True)
train_time_ql = time.perf_counter() - start_time

# Persist the trained agent and a description of the run.
agent_ql.save_model(os.path.join(output_dir, 'q_learning_model.pkl'))
with open(os.path.join(output_dir, 'q_learning_description.json'), 'w') as f:
    json.dump({
        "algorithm": "Q-Learning",
        "num_episodes": num_episodes,
        "train_time_seconds": train_time_ql,
        "final_avg_reward": train_stats_ql['final_avg_reward'],
        "hyperparameters": hyperparams_ql
    }, f, indent=2)

# Metrics reused by the final comparison cell.
qlearning_metrics = {
    "rewards": train_stats_ql['episode_rewards'],
    "lengths": train_stats_ql['episode_lengths'],
    "train_time": train_time_ql
}
plt.figure(figsize=(10, 4))
plt.plot(qlearning_metrics['rewards'])
plt.title('Récompense par épisode (Q-Learning)')
plt.xlabel('Épisode')
plt.ylabel('Récompense')
plt.grid()
plt.show()
print("Politique apprise (état -> action):", agent_ql.get_policy())
print("Q-table finale :", agent_ql.q_function)

In [None]:
# SARSA: build the agent, train it on the Monty Hall 1 environment,
# save the model plus a JSON run description, then plot the per-episode reward.
agent_sarsa = SARSA(
    state_space_size=env.state_space_size,
    action_space_size=env.action_space_size,
    learning_rate=0.1,
    gamma=0.9,
    epsilon=0.1,
    epsilon_decay=0.995,
    epsilon_min=0.01
)
# Snapshot the hyperparameters BEFORE training. The agent is configured with an
# epsilon-decay schedule, so agent_sarsa.epsilon is presumably updated in place
# during train() (TODO confirm in SARSA); reading it afterwards would store the
# decayed value under "epsilon" instead of the configured one.
hyperparams_sarsa = {
    "learning_rate": agent_sarsa.learning_rate,
    "gamma": agent_sarsa.gamma,
    "epsilon": agent_sarsa.epsilon,
    "epsilon_decay": agent_sarsa.epsilon_decay,
    "epsilon_min": agent_sarsa.epsilon_min
}
num_episodes_sarsa = 5000
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way time.time() differences can.
start_time_sarsa = time.perf_counter()
train_stats_sarsa = agent_sarsa.train(env, num_episodes=num_episodes_sarsa, verbose=True)
train_time_sarsa = time.perf_counter() - start_time_sarsa

# Persist the trained agent and a description of the run.
agent_sarsa.save_model(os.path.join(output_dir, 'sarsa_model.pkl'))
with open(os.path.join(output_dir, 'sarsa_description.json'), 'w') as f:
    json.dump({
        "algorithm": "SARSA",
        "num_episodes": num_episodes_sarsa,
        "train_time_seconds": train_time_sarsa,
        "final_avg_reward": train_stats_sarsa['final_avg_reward'],
        "hyperparameters": hyperparams_sarsa
    }, f, indent=2)

# Metrics reused by the final comparison cell.
sarsa_metrics = {
    "rewards": train_stats_sarsa['episode_rewards'],
    "lengths": train_stats_sarsa['episode_lengths'],
    "train_time": train_time_sarsa
}
plt.figure(figsize=(10, 4))
plt.plot(sarsa_metrics['rewards'])
plt.title('Récompense par épisode (SARSA)')
plt.xlabel('Épisode')
plt.ylabel('Récompense')
plt.grid()
plt.show()
print("Politique apprise (état -> action):", agent_sarsa.get_policy())
print("Q-table finale :", agent_sarsa.q_function)

In [None]:
# Dyna-Q: build the agent, train it on the Monty Hall 1 environment,
# save the model plus a JSON run description, then plot the per-episode reward.
agent_dynaq = DynaQ(
    state_space_size=env.state_space_size,
    action_space_size=env.action_space_size,
    learning_rate=0.1,
    gamma=0.99,
    epsilon=0.1,
    epsilon_decay=0.995,
    epsilon_min=0.01,
    n_planning_steps=10
)
# Snapshot the hyperparameters BEFORE training. The agent is configured with an
# epsilon-decay schedule, so agent_dynaq.epsilon is presumably updated in place
# during train() (TODO confirm in DynaQ); reading it afterwards would store the
# decayed value under "epsilon" instead of the configured one.
hyperparams_dynaq = {
    "learning_rate": agent_dynaq.learning_rate,
    "gamma": agent_dynaq.gamma,
    "epsilon": agent_dynaq.epsilon,
    "epsilon_decay": agent_dynaq.epsilon_decay,
    "epsilon_min": agent_dynaq.epsilon_min,
    "n_planning_steps": agent_dynaq.n_planning_steps
}
num_episodes_dynaq = 5000
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way time.time() differences can.
start_time_dynaq = time.perf_counter()
train_stats_dynaq = agent_dynaq.train(env, num_episodes=num_episodes_dynaq, verbose=True)
train_time_dynaq = time.perf_counter() - start_time_dynaq

# Persist the trained agent and a description of the run.
agent_dynaq.save_model(os.path.join(output_dir, 'dynaq_model.pkl'))
with open(os.path.join(output_dir, 'dynaq_description.json'), 'w') as f:
    json.dump({
        "algorithm": "Dyna-Q",
        "num_episodes": num_episodes_dynaq,
        "train_time_seconds": train_time_dynaq,
        "final_avg_reward": train_stats_dynaq['final_avg_reward'],
        "hyperparameters": hyperparams_dynaq
    }, f, indent=2)

# Metrics reused by the final comparison cell.
dynaq_metrics = {
    "rewards": train_stats_dynaq['episode_rewards'],
    "lengths": train_stats_dynaq['episode_lengths'],
    "train_time": train_time_dynaq
}
plt.figure(figsize=(10, 4))
plt.plot(dynaq_metrics['rewards'])
plt.title('Récompense par épisode (Dyna-Q)')
plt.xlabel('Épisode')
plt.ylabel('Récompense')
plt.grid()
plt.show()
print("Politique apprise (état -> action):", agent_dynaq.get_policy())
print("Q-table finale :", agent_dynaq.q_function)

In [None]:
# Monte Carlo ES: build the agent, train it on the Monty Hall 1 environment,
# save the model plus a JSON run description, then plot the per-episode reward.
agent_mc_es = MonteCarloES(
    state_space_size=env.state_space_size,
    action_space_size=env.action_space_size,
    gamma=0.99,
    epsilon=0.1,
    epsilon_decay=0.995,
    epsilon_min=0.01
)
# Snapshot the hyperparameters BEFORE training. The agent is configured with an
# epsilon-decay schedule, so agent_mc_es.epsilon is presumably updated in place
# during train() (TODO confirm in MonteCarloES); reading it afterwards would
# store the decayed value under "epsilon" instead of the configured one.
hyperparams_mc_es = {
    "gamma": agent_mc_es.gamma,
    "epsilon": agent_mc_es.epsilon,
    "epsilon_decay": agent_mc_es.epsilon_decay,
    "epsilon_min": agent_mc_es.epsilon_min
}
num_episodes_mc_es = 5000
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way time.time() differences can.
start_time_mc_es = time.perf_counter()
train_stats_mc_es = agent_mc_es.train(env, num_episodes=num_episodes_mc_es, verbose=True)
train_time_mc_es = time.perf_counter() - start_time_mc_es

# Persist the trained agent and a description of the run.
agent_mc_es.save_model(os.path.join(output_dir, 'mc_es_model.pkl'))
with open(os.path.join(output_dir, 'mc_es_description.json'), 'w') as f:
    json.dump({
        "algorithm": "Monte Carlo ES",
        "num_episodes": num_episodes_mc_es,
        "train_time_seconds": train_time_mc_es,
        "final_avg_reward": train_stats_mc_es['final_avg_reward'],
        "hyperparameters": hyperparams_mc_es
    }, f, indent=2)

# Metrics reused by the final comparison cell.
mc_es_metrics = {
    "rewards": train_stats_mc_es['episode_rewards'],
    "lengths": train_stats_mc_es['episode_lengths'],
    "train_time": train_time_mc_es
}
plt.figure(figsize=(10, 4))
plt.plot(mc_es_metrics['rewards'])
plt.title('Récompense par épisode (Monte Carlo ES)')
plt.xlabel('Épisode')
plt.ylabel('Récompense')
plt.grid()
plt.show()
print("Politique apprise (état -> action):", agent_mc_es.get_policy())
print("Q-table finale :", agent_mc_es.q_function)

In [None]:
# Off-policy MC Control: build the agent, train it on the Monty Hall 1
# environment, save the model plus a JSON run description, then plot the
# per-episode reward.
agent_offpolicy_mc = OffPolicyMCControl(
    state_space_size=env.state_space_size,
    action_space_size=env.action_space_size,
    gamma=0.99,
    epsilon=0.1,
    epsilon_decay=0.995,
    epsilon_min=0.01
)
# Snapshot the hyperparameters BEFORE training. The agent is configured with an
# epsilon-decay schedule, so agent_offpolicy_mc.epsilon is presumably updated
# in place during train() (TODO confirm in OffPolicyMCControl); reading it
# afterwards would store the decayed value under "epsilon" instead of the
# configured one.
hyperparams_offpolicy_mc = {
    "gamma": agent_offpolicy_mc.gamma,
    "epsilon": agent_offpolicy_mc.epsilon,
    "epsilon_decay": agent_offpolicy_mc.epsilon_decay,
    "epsilon_min": agent_offpolicy_mc.epsilon_min
}
num_episodes_offpolicy_mc = 5000
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way time.time() differences can.
start_time_offpolicy_mc = time.perf_counter()
train_stats_offpolicy_mc = agent_offpolicy_mc.train(env, num_episodes=num_episodes_offpolicy_mc, verbose=True)
train_time_offpolicy_mc = time.perf_counter() - start_time_offpolicy_mc

# Persist the trained agent and a description of the run.
agent_offpolicy_mc.save_model(os.path.join(output_dir, 'offpolicy_mc_model.pkl'))
with open(os.path.join(output_dir, 'offpolicy_mc_description.json'), 'w') as f:
    json.dump({
        "algorithm": "Off-policy MC Control",
        "num_episodes": num_episodes_offpolicy_mc,
        "train_time_seconds": train_time_offpolicy_mc,
        "final_avg_reward": train_stats_offpolicy_mc['final_avg_reward'],
        "hyperparameters": hyperparams_offpolicy_mc
    }, f, indent=2)

# Metrics reused by the final comparison cell.
offpolicy_mc_metrics = {
    "rewards": train_stats_offpolicy_mc['episode_rewards'],
    "lengths": train_stats_offpolicy_mc['episode_lengths'],
    "train_time": train_time_offpolicy_mc
}
plt.figure(figsize=(10, 4))
plt.plot(offpolicy_mc_metrics['rewards'])
plt.title('Récompense par épisode (Off-policy MC Control)')
plt.xlabel('Épisode')
plt.ylabel('Récompense')
plt.grid()
plt.show()
print("Politique apprise (état -> action):", agent_offpolicy_mc.get_policy())
print("Q-table finale :", agent_offpolicy_mc.q_function)

In [None]:
# On-policy First Visit MC Control: build the agent, train it on the Monty
# Hall 1 environment, save the model plus a JSON run description, then plot
# the per-episode reward.
agent_onpolicy_mc = OnPolicyFirstVisitMCControl(
    state_space_size=env.state_space_size,
    action_space_size=env.action_space_size,
    gamma=0.99,
    epsilon=0.1,
    epsilon_decay=0.995,
    epsilon_min=0.01
)
# Snapshot the hyperparameters BEFORE training. The agent is configured with an
# epsilon-decay schedule, so agent_onpolicy_mc.epsilon is presumably updated in
# place during train() (TODO confirm in OnPolicyFirstVisitMCControl); reading
# it afterwards would store the decayed value under "epsilon" instead of the
# configured one.
hyperparams_onpolicy_mc = {
    "gamma": agent_onpolicy_mc.gamma,
    "epsilon": agent_onpolicy_mc.epsilon,
    "epsilon_decay": agent_onpolicy_mc.epsilon_decay,
    "epsilon_min": agent_onpolicy_mc.epsilon_min
}
num_episodes_onpolicy_mc = 5000
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way time.time() differences can.
start_time_onpolicy_mc = time.perf_counter()
train_stats_onpolicy_mc = agent_onpolicy_mc.train(env, num_episodes=num_episodes_onpolicy_mc, verbose=True)
train_time_onpolicy_mc = time.perf_counter() - start_time_onpolicy_mc

# Persist the trained agent and a description of the run.
agent_onpolicy_mc.save_model(os.path.join(output_dir, 'onpolicy_mc_model.pkl'))
with open(os.path.join(output_dir, 'onpolicy_mc_description.json'), 'w') as f:
    json.dump({
        "algorithm": "On-policy First Visit MC Control",
        "num_episodes": num_episodes_onpolicy_mc,
        "train_time_seconds": train_time_onpolicy_mc,
        "final_avg_reward": train_stats_onpolicy_mc['final_avg_reward'],
        "hyperparameters": hyperparams_onpolicy_mc
    }, f, indent=2)

# Metrics reused by the final comparison cell.
onpolicy_mc_metrics = {
    "rewards": train_stats_onpolicy_mc['episode_rewards'],
    "lengths": train_stats_onpolicy_mc['episode_lengths'],
    "train_time": train_time_onpolicy_mc
}
plt.figure(figsize=(10, 4))
plt.plot(onpolicy_mc_metrics['rewards'])
plt.title('Récompense par épisode (On-policy First Visit MC Control)')
plt.xlabel('Épisode')
plt.ylabel('Récompense')
plt.grid()
plt.show()
print("Politique apprise (état -> action):", agent_onpolicy_mc.get_policy())
print("Q-table finale :", agent_onpolicy_mc.q_function)

In [None]:
# Policy Iteration: build the agent, run it against the Monty Hall 1
# environment, then save the model plus a JSON run description.
agent_policy_iter = PolicyIteration(
    state_space_size=env.state_space_size,
    action_space_size=env.action_space_size,
    gamma=0.999999,
    theta=0.00001,
    max_iterations=1000
)
# Captured before train() so the saved description reflects the configuration
# the run started with, independent of anything train() does to the agent.
hyperparams_policy_iter = {
    "gamma": agent_policy_iter.gamma,
    "theta": agent_policy_iter.theta,
    "max_iterations": agent_policy_iter.max_iterations
}
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way time.time() differences can.
start_time_policy_iter = time.perf_counter()
train_stats_policy_iter = agent_policy_iter.train(env, verbose=True)
train_time_policy_iter = time.perf_counter() - start_time_policy_iter

# Persist the agent and a description of the run.
agent_policy_iter.save_model(os.path.join(output_dir, 'policy_iteration_model.pkl'))
with open(os.path.join(output_dir, 'policy_iteration_description.json'), 'w') as f:
    json.dump({
        "algorithm": "Policy Iteration",
        "iterations": train_stats_policy_iter['iterations'],
        "converged": train_stats_policy_iter['converged'],
        "max_value": train_stats_policy_iter['max_value'],
        "train_time_seconds": train_time_policy_iter,
        "hyperparameters": hyperparams_policy_iter
    }, f, indent=2)

# Metrics reused by the final comparison cell.
policy_iter_metrics = {
    "iterations": train_stats_policy_iter['iterations'],
    "train_time": train_time_policy_iter
}
print("Politique apprise (état -> action):", agent_policy_iter.get_policy())
print("Fonction de valeur finale :", agent_policy_iter.get_value_function())

In [None]:
# Value Iteration: build the agent, run it against the Monty Hall 1
# environment, then save the model plus a JSON run description.
agent_value_iter = ValueIteration(
    state_space_size=env.state_space_size,
    action_space_size=env.action_space_size,
    gamma=0.9,
    theta=1e-6,
    max_iterations=1000
)
# Captured before train() so the saved description reflects the configuration
# the run started with, independent of anything train() does to the agent.
hyperparams_value_iter = {
    "gamma": agent_value_iter.gamma,
    "theta": agent_value_iter.theta,
    "max_iterations": agent_value_iter.max_iterations
}
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way time.time() differences can.
start_time_value_iter = time.perf_counter()
train_stats_value_iter = agent_value_iter.train(env, verbose=True)
train_time_value_iter = time.perf_counter() - start_time_value_iter

# Persist the agent and a description of the run.
agent_value_iter.save_model(os.path.join(output_dir, 'value_iteration_model.pkl'))
with open(os.path.join(output_dir, 'value_iteration_description.json'), 'w') as f:
    json.dump({
        "algorithm": "Value Iteration",
        "iterations": train_stats_value_iter['iterations'],
        "converged": train_stats_value_iter['converged'],
        "max_value": train_stats_value_iter['max_value'],
        "train_time_seconds": train_time_value_iter,
        "hyperparameters": hyperparams_value_iter
    }, f, indent=2)

# Metrics reused by the final comparison cell.
value_iter_metrics = {
    "iterations": train_stats_value_iter['iterations'],
    "train_time": train_time_value_iter
}
print("Politique apprise (état -> action):", agent_value_iter.get_policy())
print("Fonction de valeur finale :", agent_value_iter.get_value_function())

In [None]:
# Side-by-side comparison of every algorithm trained in the cells above.

# Algorithms that learn from sampled episodes (they expose a reward history).
episodic_runs = [
    ('Q-Learning', qlearning_metrics),
    ('SARSA', sarsa_metrics),
    ('Dyna-Q', dynaq_metrics),
    ('Monte Carlo ES', mc_es_metrics),
    ('Off-policy MC Control', offpolicy_mc_metrics),
    ('On-policy First Visit MC Control', onpolicy_mc_metrics),
]
# Dynamic-programming algorithms (they only report an iteration count).
planning_runs = [
    ('Policy Iteration', policy_iter_metrics),
    ('Value Iteration', value_iter_metrics),
]
all_runs = episodic_runs + planning_runs

# Reward curves (experience-based algorithms only)
plt.figure(figsize=(12, 6))
for curve_label, curve_metrics in episodic_runs:
    plt.plot(curve_metrics['rewards'], label=curve_label)
plt.title('Courbe de récompense par épisode')
plt.xlabel('Épisode')
plt.ylabel('Récompense')
plt.legend()
plt.grid()
plt.show()

# Summary table: training time for every run, plus the number of recorded
# episodes (episodic algorithms) or the iteration count (DP algorithms).
summary_data = {
    'Algorithme': [run_label for run_label, run_metrics in all_runs],
    'Temps entraînement (s)': [
        run_metrics['train_time'] for run_label, run_metrics in all_runs
    ],
    'Convergence/Itérations': (
        [len(run_metrics['rewards']) for run_label, run_metrics in episodic_runs]
        + [run_metrics['iterations'] for run_label, run_metrics in planning_runs]
    ),
}
summary_df = pd.DataFrame(summary_data)
display(summary_df)

# Learned policies, one line per algorithm
trained_agents = [
    ('Q-Learning', agent_ql),
    ('SARSA', agent_sarsa),
    ('Dyna-Q', agent_dynaq),
    ('Monte Carlo ES', agent_mc_es),
    ('Off-policy MC Control', agent_offpolicy_mc),
    ('On-policy First Visit MC Control', agent_onpolicy_mc),
    ('Policy Iteration', agent_policy_iter),
    ('Value Iteration', agent_value_iter),
]
print('Politiques optimales apprises :')
for agent_label, trained_agent in trained_agents:
    print(agent_label + ':', trained_agent.get_policy())