Old Softmax Parametrization

In [None]:
# Softmax Policy Parametrization
class SoftmaxPolicy:
    """Linear softmax policy over a discrete set of actions.

    The preference of each action is the dot product between its column of
    ``theta`` and the state vector; action probabilities are the softmax of
    those preferences. Actions are exposed to callers as the labels ``"a1"``
    (index 0) and ``"a2"`` (any other index).
    """

    def __init__(self, num_actions, num_features):
        """Create a policy with all-zero weights (uniform action probabilities).

        Args:
            num_actions: Number of discrete actions (columns of ``theta``).
            num_features: Length of the state vector (rows of ``theta``).
        """
        self.num_actions = num_actions
        self.num_features = num_features
        # One weight column per action, shape (num_features, num_actions).
        self.theta = np.zeros((num_features, num_actions))

    def select_action(self, state):
        """Sample an action label from the current policy for ``state``.

        Returns:
            ``"a1"`` when action index 0 is drawn, ``"a2"`` otherwise.
        """
        action_probs = self.calculate_softmax_probabilities(state)
        chosen_action = np.random.choice(np.arange(self.num_actions), p=action_probs)
        return "a1" if chosen_action == 0 else "a2"

    def calculate_softmax_probabilities(self, state):
        """Return the probability of each action in ``state``.

        Args:
            state: Sequence of ``num_features`` numbers.

        Returns:
            1-D array of length ``num_actions`` that sums to 1.
        """
        preferences = np.dot(self.theta.T, state)
        # Softmax is invariant to shifting all preferences by a constant;
        # subtracting the max keeps exp() from overflowing to inf (-> NaN).
        exp_values = np.exp(preferences - np.max(preferences))
        return exp_values / np.sum(exp_values)

    def gradient_log_pi_theta(self, state, action, action_probs):
        """Return the gradient of ``log pi_theta(action | state)`` w.r.t. ``theta``.

        For a linear softmax the gradient with respect to the weight column of
        action ``b`` is ``state * (1[b == action] - pi(b | state))``, so every
        column receives a contribution, not only the chosen one.

        Args:
            state: Sequence of ``num_features`` numbers.
            action: Action label; ``"a1"`` maps to index 0, anything else to 1.
            action_probs: Probabilities returned by
                ``calculate_softmax_probabilities(state)``.

        Returns:
            Array of shape ``(num_features, num_actions)``.
        """
        action_index = 0 if action == "a1" else 1
        indicator = np.zeros(self.num_actions)
        indicator[action_index] = 1.0
        return np.outer(state, indicator - np.asarray(action_probs))


In [None]:
# REINFORCE training run: hyperparameters, environment and policy set-up.
theta = np.random.rand(2, 2)  # Random initial policy weights
alpha = 0.01  # Learning rate
gamma = 0.99  # Discount factor
num_episodes = 1000
env = ServerEnvironment()
policy = SoftmaxPolicy(2, 2)  # 2 actions, 2 state features
# Make the policy use `theta` as its weights. Previously the update below was
# applied to a detached `theta` array that the policy never read, so the
# policy kept its initial weights for the whole run and never learned.
policy.theta = theta
# Lists that store the rewards across episodes
total_rewards = []
average_rewards = []

# Reinforcement learning loop
for episode in range(num_episodes):
    # Reset environment for each episode
    env.reset()
    state = env.get_state()
    trajectory = []  # Store (state, action, reward) tuples
    total_reward = 0  # Total reward collected during this episode

    # Roll out a fixed horizon of 100 steps; the third value returned by
    # env.step is ignored.
    compteur = 0
    while compteur < 100:
        action = policy.select_action(state)  # Select action using policy
        next_state, reward, _ = env.step(action)
        trajectory.append((state, action, reward))
        state = next_state  # Move to next state
        compteur += 1

    # Discounted returns Gt for each time step, accumulated backwards in time.
    # Appending and reversing once avoids the quadratic cost of insert(0, ...).
    Gt_list = []
    Gt = 0
    for _, _, reward in reversed(trajectory):
        Gt = gamma * Gt + reward
        Gt_list.append(Gt)
        total_reward += reward
    Gt_list.reverse()

    total_rewards.append(total_reward)
    # Running mean of the episode totals seen so far
    average_reward = np.mean(total_rewards)
    average_rewards.append(average_reward)

    # Update policy parameters using the gradient
    for t, (state, action, _) in enumerate(trajectory):

        # Gradient of the log-probability of the action actually taken
        action_probs = policy.calculate_softmax_probabilities(state)
        log_prob_gradient = policy.gradient_log_pi_theta(state, action, action_probs)

        # REINFORCE update: θ ← θ + α * Gt * ∇θ log πθ(At|St).
        # The extra γ^t factor of the textbook rule is not applied here.
        # In-place update, so `theta` and `policy.theta` stay the same array.
        policy.theta += alpha * Gt_list[t] * log_prob_gradient

# Most probable action index for every state (Q1, Q2): 0 for a1, 1 for a2.
# np.argmax gives the action index; taking max() of the probabilities would
# store a probability instead, which contradicts the 0/1 action scale used
# when this grid is displayed.
actions_optimales = np.zeros((max_Q + 1, max_Q + 1))
for Q1 in range(max_Q + 1):
    for Q2 in range(max_Q + 1):
        actions_optimales[Q1, Q2] = np.argmax(policy.calculate_softmax_probabilities((Q1, Q2)))

# Two line plots with the same layout: total reward per episode, then the
# running average reward per episode.
for series, plot_title, y_label in (
    (total_rewards, 'Total des récompenses par épisode', 'Total des récompenses'),
    (average_rewards, 'Récompense moyenne par épisode', 'Récompense moyenne'),
):
    plt.figure(figsize=(10, 6))
    plt.plot(series)
    plt.title(plot_title)
    plt.xlabel('Épisode')
    plt.ylabel(y_label)
    plt.show()

# Image of the per-state action grid: rows are Q1, columns are Q2.
plt.figure(figsize=(10, 8))
plt.imshow(actions_optimales, interpolation='nearest')
plt.colorbar(label='Action optimale (0 pour a1, 1 pour a2)')
plt.title('Action Optimale en fonction de l\'État (Q1, Q2)')
plt.xlabel('Q2 (Nombre de travaux dans le Serveur 2)')
plt.ylabel('Q1 (Nombre de travaux dans le Serveur 1)')
# Flip the vertical axis of the current axes
current_axes = plt.gca()
current_axes.invert_yaxis()
plt.show()
