In [None]:
import numpy as np
import pygame
import random
import time  # Import the time module
from collections import defaultdict

# Grid and environment
# Legend: "_" free cell, "#" obstacle, "sN" start of agent N, "gN" goal of agent N.
# Every row is one comma-separated string and all rows must have the same
# number of cells (is_valid derives the grid width from the first row).
# NOTE(review): the row strings were previously broken up by stray quote
# characters, which produced 18 fragments instead of 10 rows. The tail of the
# last row was unreadable; it is reconstructed as free cells from the map's
# 180-degree rotational symmetry with the first row -- confirm.
grid = [
    "_,_,_,_,#,_,_,_,_,_",
    "_,s1,_,_,#,g2,_,_,s3,_",
    "_,_,_,_,#,#,_,_,_,_",
    "_,_,_,_,_,_,_,_,_,_",
    "#,#,#,_,_,_,_,#,#,#",
    "_,g4,#,_,_,_,_,#,g1,_",
    "_,_,_,_,_,_,_,_,_,_",
    "_,_,_,_,#,#,_,_,_,_",
    "_,s2,_,_,g3,#,_,_,s4,_",
    "_,_,_,_,_,#,_,_,_,_",
]

# Map positions
# (row, col) coordinates; entry N-1 is the cell labelled sN / gN in the grid.
START_POS = [(1, 1), (8, 1), (1, 8), (8, 8)]
GOALS = [(5, 8), (1, 5), (8, 4), (5, 1)]
# (row, col) of every "#" cell, in row-major order.
OBSTACLES = [
    (r, c)
    for r, row in enumerate(grid)
    for c, val in enumerate(row.split(","))
    if val == "#"
]

# Parameters
# Each action is a (row delta, col delta) pair; the comment lists them in order.
ACTIONS = [(0, 1), (0, -1), (1, 0), (-1, 0), (0, 0)]  # Right, Left, Down, Up, Stay
epsilon = 0.1  # Exploration factor: probability of picking a random action
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor

def is_valid(pos):
    """Return True if pos is inside the grid and is not an obstacle cell.

    pos[0] is checked against the number of rows in ``grid`` and pos[1]
    against the number of comma-separated cells in the first row.
    """
    # Guard clauses run in the same order as a short-circuiting `and` chain.
    if not (0 <= pos[0] < len(grid)):
        return False
    if not (0 <= pos[1] < len(grid[0].split(","))):
        return False
    return pos not in OBSTACLES

def _greedy_action(q_table, state):
    """Return the index of the highest-valued action in state (lowest index wins ties)."""
    return max(range(len(ACTIONS)), key=lambda a: q_table[(state, a)])


def q_learning_multi_agent(start_positions, goals, max_episodes=1000, max_steps=None):
    """Train one tabular Q-learning agent per start/goal pair on the shared grid.

    Every agent that is not yet on its goal acts once per time step, in index
    order. A move that fails ``is_valid`` or targets a cell currently held by
    an agent (at its updated position if that agent has already moved this
    step) is cancelled and the agent stays where it is. Each agent keys its
    Q-table on the joint state, i.e. the tuple of all agents' positions.

    Relies on the module-level ``ACTIONS``, ``epsilon``, ``alpha``, ``gamma``
    and ``is_valid``.

    Args:
        start_positions: Sequence of (row, col) start cells, one per agent.
        goals: Sequence of (row, col) goal cells, aligned with
            ``start_positions``.
        max_episodes: Number of training episodes to run.
        max_steps: Optional cap on time steps per episode. ``None`` (the
            default) runs each episode until every agent is on its goal,
            which never terminates if some goal cannot be reached.

    Returns:
        A ``(policy, best_time)`` tuple. ``policy`` is a list with one dict
        per agent mapping a joint state to the index (into ``ACTIONS``) of the
        greedy action. ``best_time`` is the shortest duration, in seconds, of
        any episode in which all agents reached their goals
        (``float('inf')`` if there was none).
    """
    n_agents = len(start_positions)
    n_actions = len(ACTIONS)
    q_tables = [defaultdict(float) for _ in range(n_agents)]
    policy = [defaultdict(int) for _ in range(n_agents)]

    best_time = float('inf')  # Minimum duration over solved episodes

    for _ in range(max_episodes):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()

        # list() (rather than slicing) also accepts tuples of positions.
        positions = list(start_positions)
        steps = 0
        solved = True

        # Run the episode until all agents reach their goal (or the cap hits).
        while any(pos != goal for pos, goal in zip(positions, goals)):
            if max_steps is not None and steps >= max_steps:
                solved = False
                break

            state = tuple(positions)
            new_positions = list(positions)
            taken = []  # (agent index, action index, reward) for this step

            # Phase 1: every active agent picks and executes its action.
            for i, (pos, goal) in enumerate(zip(positions, goals)):
                if pos == goal:
                    continue

                # Epsilon-greedy action selection
                if random.random() < epsilon:
                    action = random.randrange(n_actions)
                else:
                    action = _greedy_action(q_tables[i], state)

                d_row, d_col = ACTIONS[action]
                next_pos = (pos[0] + d_row, pos[1] + d_col)

                # Collision and boundary handling
                if not is_valid(next_pos) or next_pos in new_positions:
                    next_pos = pos
                new_positions[i] = next_pos

                reward = 0 if next_pos == goal else -1
                taken.append((i, action, reward))

            # Phase 2: update Q-tables against the complete joint next state.
            # Bootstrapping from a half-updated state (later agents not yet
            # moved) would read values of a state no agent ever decides in,
            # so learned values would not propagate backwards.
            next_state = tuple(new_positions)
            for i, action, reward in taken:
                q_table = q_tables[i]
                best_next_value = max(
                    q_table[(next_state, a)] for a in range(n_actions)
                )
                q_table[(state, action)] += alpha * (
                    reward + gamma * best_next_value - q_table[(state, action)]
                )
                policy[i][state] = _greedy_action(q_table, state)

            positions = new_positions
            steps += 1

        episode_time = time.perf_counter() - start_time

        # Only episodes that actually reached all goals count as "solved".
        if solved and episode_time < best_time:
            best_time = episode_time

    return policy, best_time


# Train every agent on the configured map, then report the fastest episode.
policy, best_time = q_learning_multi_agent(
    start_positions=START_POS,
    goals=GOALS,
)
print(f"Best time (minimum time taken to solve): {best_time:.10f} seconds")


Best time (minimum time taken to solve): 0.0050051212 seconds
