In [1]:
# Peter and the Wolf: Reinforcement Learning Notebook

# --- Imports ---
import numpy as np
import random
import matplotlib.pyplot as plt


In [4]:

# --- Environment Setup ---
class Board:
    """Rectangular game board addressed as ``grid[x][y]``.

    ``x`` ranges over ``0 .. width - 1`` and ``y`` over ``0 .. height - 1``.
    Each cell holds one of the strings ``'ground'``, ``'apple'``, ``'tree'``
    or ``'wolf'``.

    Parameters:
        width (int): Number of valid ``x`` positions.
        height (int): Number of valid ``y`` positions.
    """

    def __init__(self, width=6, height=6):
        self.width = width
        self.height = height
        self.grid = self.create_board()

    def create_board(self):
        """Build a fresh random layout and return it as a nested list.

        Three apple placements, three tree placements and one wolf placement
        are drawn independently, so a later placement may overwrite an
        earlier one. The wolf is placed last and is therefore always present.
        """
        # The outer list has `width` entries and each inner list has `height`
        # entries, so board[x][y] is valid for every in-range (x, y) even when
        # the board is not square.
        board = [['ground' for _ in range(self.height)] for _ in range(self.width)]

        # Randomly place apples, trees, wolf
        for _ in range(3):
            x, y = random.randint(0, self.width-1), random.randint(0, self.height-1)
            board[x][y] = 'apple'
        for _ in range(3):
            x, y = random.randint(0, self.width-1), random.randint(0, self.height-1)
            board[x][y] = 'tree'
        x, y = random.randint(0, self.width-1), random.randint(0, self.height-1)
        board[x][y] = 'wolf'

        return board

    def render(self):
        """Print the board, one line per ``x`` index, followed by a blank line."""
        for row in self.grid:
            print(' | '.join(row))
        print()

# --- Parameters ---
# Action id -> (dx, dy) offset that step() adds to the agent's (x, y) position.
actions = {0: (-1,0), 1: (1,0), 2: (0,-1), 3: (0,1)} # up, down, left, right
# Cap applied when an apple restores energy; also the energy every episode/game starts with.
max_energy = 20
# Scale used when fatigue is discretized for the Q-table.
max_fatigue = 20
# Minimum energy needed to win the wolf encounter.
required_energy = 10
# Highest fatigue at which the wolf encounter can still be won.
max_fatigue_for_battle = 5

# Discretize energy/fatigue for Q-table
# Number of bins per quantity; these set the sizes of the energy and fatigue axes of Q.
n_energy_levels = 5
n_fatigue_levels = 5

# --- Helper Functions ---
def discretize(value, max_value, levels):
    """Convert a quantity into a bin index usable with the Q-table.

    The ratio ``value / max_value`` is scaled onto ``0 .. levels - 1`` and
    truncated with ``int``; anything that lands outside that range is clamped
    to the nearest end.
    """
    top_bin = levels - 1
    bin_index = int(value / max_value * top_bin)
    if bin_index > top_bin:
        bin_index = top_bin
    if bin_index < 0:
        bin_index = 0
    return bin_index

def reward(state, next_state, board):
    """Score the transition into ``next_state``.

    Every move costs 1. On top of that, the cell that was entered adds
    +10 for an apple, +5 for a tree or grass, and for the wolf +50 when the
    agent's energy and fatigue satisfy the battle thresholds or -50 otherwise.
    ``state`` is accepted but not consulted.
    """
    x, y, energy, fatigue = next_state
    landed_on = board.grid[x][y]

    total = -1  # flat cost of taking a step
    if landed_on == 'apple':
        total += 10
    elif landed_on in ('tree', 'grass'):
        total += 5
    elif landed_on == 'wolf':
        fit_for_battle = energy >= required_energy and fatigue <= max_fatigue_for_battle
        total += 50 if fit_for_battle else -50
    return total

def step(state, action, board):
    """Apply one move and return ``(next_state, reward)``.

    Parameters:
        state (tuple): ``(x, y, energy, fatigue)`` before the move.
        action (int): Key into ``actions`` selecting the ``(dx, dy)`` offset.
        board (Board): Board whose ``grid[x][y]`` cells are inspected.

    Returns:
        tuple: ``((new_x, new_y, energy, fatigue), r)`` where ``r`` is the
        value of ``reward`` for the transition.
    """
    x, y, energy, fatigue = state
    dx, dy = actions[action]
    # A move that would leave the board is clamped, so the agent stays on the edge.
    new_x = max(0, min(board.width-1, x + dx))
    new_y = max(0, min(board.height-1, y + dy))

    # Every move costs one energy and adds one fatigue. Both are kept inside
    # [0, max]: the Q-table bins clamp to that range, so letting the raw values
    # drift outside it would make states in the same bin behave differently
    # (e.g. energy -30 and energy 0 share a bin but need different numbers of
    # apples to recover).
    energy = max(energy - 1, 0)
    fatigue = min(fatigue + 1, max_fatigue)

    cell = board.grid[new_x][new_y]
    if cell == 'apple':
        energy = min(energy + 5, max_energy)
    if cell in ['tree', 'grass']:
        fatigue = max(fatigue - 5, 0)

    next_state = (new_x, new_y, energy, fatigue)
    r = reward(state, next_state, board)

    return next_state, r

def choose_action(Q, state, epsilon=0.1):
    """Pick an action epsilon-greedily.

    Parameters:
        Q (np.ndarray): Q-table indexed as ``Q[x, y, e_idx, f_idx, action]``.
        state (tuple): ``(x, y, energy, fatigue)``.
        epsilon (float): Probability of returning a uniformly random action.

    Returns:
        int: A key of ``actions``. With probability ``epsilon`` it is random;
        otherwise it is the first action with the highest Q-value for the
        discretized state.
    """
    # Exploration needs no Q lookup, so decide first and skip the discretization.
    if random.random() < epsilon:
        return random.choice(list(actions.keys()))

    x, y, e, f = state
    e_idx = discretize(e, max_energy, n_energy_levels)
    f_idx = discretize(f, max_fatigue, n_fatigue_levels)
    # np.argmax yields a NumPy integer; convert it so both branches return a plain int.
    return int(np.argmax(Q[x, y, e_idx, f_idx, :]))

# --- Q-Learning Setup ---
def q_learning(board, epochs=5000, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Train a tabular Q-function on ``board`` and return it.

    Parameters:
        board (Board): Environment to train on.
        epochs (int): Number of training episodes.
        alpha (float): Learning rate.
        gamma (float): Discount factor.
        epsilon (float): Exploration probability passed to ``choose_action``.

    Returns:
        np.ndarray: Array of shape ``(board.width, board.height,
        n_energy_levels, n_fatigue_levels, len(actions))``.
    """
    Q = np.zeros((board.width, board.height, n_energy_levels, n_fatigue_levels, len(actions)))

    for _ in range(epochs):
        # Random start position with full energy and no fatigue
        x, y = random.randint(0, board.width-1), random.randint(0, board.height-1)
        state = (x, y, max_energy, 0)
        done = False
        steps = 0

        while not done and steps < 50:  # limit steps per episode
            action = choose_action(Q, state, epsilon)
            next_state, r = step(state, action, board)

            x, y, e, f = state
            new_x, new_y, new_e, new_f = next_state
            e_idx = discretize(e, max_energy, n_energy_levels)
            f_idx = discretize(f, max_fatigue, n_fatigue_levels)
            new_e_idx = discretize(new_e, max_energy, n_energy_levels)
            new_f_idx = discretize(new_f, max_fatigue, n_fatigue_levels)

            # Reaching the wolf ends the episode
            done = board.grid[new_x][new_y] == 'wolf'

            # Nothing follows a terminal transition, so its target is the bare
            # reward. Bootstrapping here would pull in wolf-cell Q-values, which
            # can be non-zero because an episode may start on the wolf cell.
            if done:
                target = r
            else:
                target = r + gamma * np.max(Q[new_x, new_y, new_e_idx, new_f_idx, :])

            Q[x, y, e_idx, f_idx, action] = (1 - alpha) * Q[x, y, e_idx, f_idx, action] + \
                                            alpha * target

            state = next_state
            steps += 1

    return Q

# --- Simulation Functions ---
def run_game(board, Q=None, random_walk=False):
    """
    Simulates a single game for either the RL agent or a random walk agent.

    The agent starts on a random cell with full energy and no fatigue and
    moves for at most 50 steps. Only cells reached by a move are checked for
    the wolf; the starting cell is not.

    Parameters:
        board (Board): The game board.
        Q (np.ndarray): The Q-table for the RL agent. If None, random walk is used.
        random_walk (bool): If True, the agent takes random actions.

    Returns:
        bool: True if the agent wins (defeats the wolf), False otherwise.
    """
    # Without a Q-table there is nothing to act greedily on, so fall back to
    # random actions as the docstring promises instead of failing on Q[...].
    act_randomly = random_walk or Q is None

    # Initialize the agent's starting state
    x, y = random.randint(0, board.width-1), random.randint(0, board.height-1)
    state = (x, y, max_energy, 0)  # Start with full energy and no fatigue
    steps = 0  # Step counter

    while steps < 50:  # Limit the number of steps per game
        if act_randomly:
            # Random walk agent chooses actions randomly
            action = random.choice(list(actions.keys()))
        else:
            # RL agent acts greedily on the Q-table (no exploration)
            action = choose_action(Q, state, epsilon=0)

        # Perform the chosen action and update the state
        state, _ = step(state, action, board)
        x, y, e, f = state

        # Meeting the wolf ends the game; the outcome depends on energy and fatigue
        if board.grid[x][y] == 'wolf':
            return e >= required_energy and f <= max_fatigue_for_battle

        steps += 1  # Increment step counter

    return False  # Step limit reached without encountering the wolf

def compare_agents(n_games=100):
    """
    Trains a Q-learning agent on a new board and pits it against a random walker.

    Parameters:
        n_games (int): How many games each agent plays on the shared board.

    Prints:
        One line per agent with its win count out of ``n_games``.
    """
    # One board is shared by training and by both evaluations
    board = Board()
    Q = q_learning(board, epochs=5000)

    # Trained agent plays first
    rl_wins = 0
    for _ in range(n_games):
        rl_wins += run_game(board, Q)

    # Then the random-walk baseline
    rw_wins = 0
    for _ in range(n_games):
        rw_wins += run_game(board, random_walk=True)

    print(f"RL Agent Wins: {rl_wins}/{n_games}")
    print(f"Random Walk Wins: {rw_wins}/{n_games}")

# --- Run Comparison ---
# Builds a random board, trains on it, and prints both agents' win counts.
# NOTE(review): no random seed is set in the visible cells, so the board and
# the printed counts will differ from run to run.
compare_agents()

RL Agent Wins: 24/100
Random Walk Wins: 11/100
