# Laboratorio 5: Monte Carlo

**Universidad del Valle de Guatemala**  
**Facultad de Ingeniería**  
**Departamento de Ciencias de la Computación**  
**Aprendizaje por Refuerzo** 

## Integrantes
- Diego Leiva - 21752 
- Pablo Orellana - 21970

## Librerías

In [None]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
import random
import matplotlib.pyplot as plt

## Control Monte Carlo

In [None]:
def epsilon_greedy_policy(Q, state, nA, epsilon):
    """
    Pick an action with an epsilon-greedy rule.

    One uniform sample is always drawn first to decide between exploring
    and exploiting; a second draw happens only when exploring.

    Args:
        Q (dict): Mapping from state to an array of action values.
        state: The state for which an action is requested.
        nA (int): Size of the action space.
        epsilon (float): Probability of choosing a uniformly random action.

    Returns:
        int: Index of the chosen action.
    """
    explore = np.random.rand() < epsilon

    # Exploit: take the action with the highest estimated value
    # (ties resolve to the lowest index, as np.argmax does).
    if not explore:
        return np.argmax(Q[state])

    # Explore: sample uniformly among the nA actions.
    return np.random.choice(nA)

In [None]:
def mc_first_visit(num_episodes=10000,
                   gamma=1.0,
                   epsilon_start=1.0,
                   epsilon_end=0.05,
                   epsilon_decay=0.7,
                   sab=True,
                   seed=42):
    """
    Monte Carlo First Visit Control for Blackjack.

    Runs `num_episodes` episodes with an epsilon-greedy behaviour policy,
    averages the first-visit returns of every state-action pair into Q, and
    finally derives the greedy policy from Q.

    Args:
        num_episodes (int): The number of episodes to run.
        gamma (float): The discount factor.
        epsilon_start (float): The initial exploration rate.
        epsilon_end (float): The minimum exploration rate.
        epsilon_decay (float): Fraction of `num_episodes` over which epsilon
            is linearly annealed from `epsilon_start` to `epsilon_end`;
            afterwards epsilon stays at `epsilon_end`.
        sab (bool): Forwarded to `gym.make('Blackjack-v1', sab=...)`; selects
            the Sutton & Barto rule set of the environment.
        seed (int): The random seed for reproducibility. Episode `ep` resets
            the environment with `seed + ep`.

    Returns:
        policy (dict): Greedy action (int) for every state present in Q.
        Q (dict): The state-action value function (state -> array of nA values).
    """

    # Deterministic seed
    np.random.seed(seed)
    random.seed(seed)

    # Create environment Blackjack
    env = gym.make('Blackjack-v1', sab=sab)

    # try/finally guarantees the environment is released even if an episode fails
    try:
        nA = env.action_space.n  # Number of actions: 0=stick, 1=hit

        # Initialize Q-value function
        Q = defaultdict(lambda: np.zeros(nA))
        returns_sum = defaultdict(lambda: np.zeros(nA))  # Sum of returns
        returns_count = defaultdict(lambda: np.zeros(nA, dtype=int))  # Count of returns

        # Number of episodes during which epsilon is still decaying
        # (loop-invariant, so computed once).
        decay_episodes = int(num_episodes * epsilon_decay)

        def get_epsilon(ep):
            # Linearly decaying epsilon while inside the decay period
            if ep < decay_episodes:
                return epsilon_start - (epsilon_start - epsilon_end) * (ep / decay_episodes)
            return epsilon_end  # Otherwise, return the minimum epsilon

        # Iterate over episodes
        for ep in range(num_episodes):
            epsilon = get_epsilon(ep)

            # Generate an episode; a distinct seed per episode keeps runs reproducible
            state, _ = env.reset(seed=seed + ep)
            episode = []
            terminal = False
            truncated = False

            # Loop until terminal or truncated
            while not (terminal or truncated):
                action = epsilon_greedy_policy(Q, state, nA, epsilon)  # ε-soft action selection
                next_state, reward, terminal, truncated, _ = env.step(action)
                episode.append((state, action, reward))
                state = next_state

            # Record the earliest time step of each state-action pair so that
            # the backward pass below credits the FIRST visit, not the last.
            first_visit = {}
            for t, (s, a, _) in enumerate(episode):
                first_visit.setdefault((s, a), t)

            # First visit MC update, accumulating the return backwards
            G = 0.0
            for t in reversed(range(len(episode))):
                s, a, r = episode[t]
                G = gamma * G + r

                if first_visit[(s, a)] == t:
                    returns_count[s][a] += 1
                    returns_sum[s][a] += G
                    Q[s][a] = returns_sum[s][a] / returns_count[s][a]
    finally:
        env.close()  # Close the environment

    # Final policy definition (after ALL episodes): greedy with respect to Q
    policy = {s: int(np.argmax(Q[s])) for s in Q}
    return policy, Q
