In [3]:
import gymnasium as gym
import numpy as np 
env = gym.make("Taxi-v3",render_mode="human")
state_size = env.observation_space.n
action_size= env.action_space.n
#******************table de politique***
policy_table = np.ones((state_size, action_size))/ action_size

value_table = np.zeros(state_size)

print("Premières lignes de policy_table:")
print(policy_table[:5])
print("\nPremières lignes de value_table:")
print(value_table[:5])

Premières lignes de policy_table:
[[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]]

Premières lignes de value_table:
[0. 0. 0. 0. 0.]


In [None]:
def run_random_agent(env, num_episodes=20):
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        total_reward = 0
        actions = []
        rewards = []
        
        print(f"\nÉpisode {episode + 1}")
        print("-----------------")
        
        while not done:
            action = env.action_space.sample()
            next_state, reward, done, _, _ = env.step(action)
            
            actions.append(action)
            rewards.append(reward)
            total_reward += reward
            state = next_state
        
        print("Actions exécutées:", actions)
        print("Récompenses obtenues:", rewards)
        print("Récompense totale:", total_reward)

run_random_agent(env)


Épisode 1
-----------------


In [7]:
import gymnasium as gym
import numpy as np

env = gym.make("Taxi-v3",render_mode="human")
state_size = env.observation_space.n
action_size = env.action_space.n

policy_table = np.full((state_size, action_size), 1/action_size)  
value_table = np.zeros(state_size)  

gamma = 0.99
lr_policy = 0.1
clip_epsilon = 0.2

def calculate_discounted_rewards(rewards):
    discounted = np.zeros_like(rewards, dtype=np.float32)
    running_add = 0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted[t] = running_add
    return discounted

state = env.reset()[0]
done = False
episode_states = []
episode_actions = []
episode_rewards = []

while not done:
    action_probs = policy_table[state] + 1e-8  
    action_probs /= np.sum(action_probs)       
    action = np.random.choice(action_size, p=action_probs)
    
    next_state, reward, done, _, _ = env.step(action)
    
    episode_states.append(state)
    episode_actions.append(action)
    episode_rewards.append(reward)
    state = next_state

discounted_rewards = calculate_discounted_rewards(episode_rewards)
advantages = discounted_rewards - value_table[episode_states]  # At = Rt - V(st)

for t in range(len(episode_states)):
    state = episode_states[t]
    action = episode_actions[t]
    
    old_prob = policy_table[state, action]
    ratio = policy_table[state, action] / old_prob
    
    surr1 = ratio * advantages[t]
    surr2 = np.clip(ratio, 1-clip_epsilon, 1+clip_epsilon) * advantages[t]
    policy_loss = -np.minimum(surr1, surr2)
    
    policy_table[state, action] -= lr_policy * policy_loss
    value_table[state] += lr_policy * advantages[t]

print("Mise à jour PPO terminée avec succès!")
print(f"Nombre d'étapes dans l'épisode: {len(episode_states)}")
print(f"Récompense totale: {sum(episode_rewards)}")
print("Exemple de mise à jour - État 0:")
print(f"Ancienne politique: {np.round(policy_table[0], 3)}")
print(f"Nouvelle valeur: {value_table[0]:.3f}")

Mise à jour PPO terminée avec succès!
Nombre d'étapes dans l'épisode: 755
Récompense totale: -3020
Exemple de mise à jour - État 0:
Ancienne politique: [0.167 0.167 0.167 0.167 0.167 0.167]
Nouvelle valeur: 0.000


In [None]:
# Exercice 4 : Évaluation de l'agent après entraînement

num_eval_episodes = 20  
total_rewards = []

print("\nDébut .....")

for ep in range(num_eval_episodes):
    state = env.reset()[0]
    done = False
    episode_reward = 0
    
    while not done:
        action = np.argmax(policy_table[state])
        next_state, reward, done, _, _ = env.step(action)
        episode_reward += reward
        state = next_state
    
    total_rewards.append(episode_reward)
    print(f"Épisode {ep + 1}: Récompense = {episode_reward}")

avg_reward = np.mean(total_rewards)
std_reward = np.std(total_rewards)

print("\nRésultats finaux:")
print(f"Récompense moyenne sur {num_eval_episodes} épisodes: {avg_reward:.2f}")
print(f"Écart-type: {std_reward:.2f}")

plt.figure(figsize=(10, 5))
plt.bar(['Avant entraînement', 'Après entraînement'], 
        [-8, avg_reward],  
        color=['red', 'green'])
plt.title("Comparaison des performances")
plt.ylabel("Récompense moyenne")
plt.show()