In [4]:
import numpy as np
import random
import time

class GridWorld:
    """Square grid environment in which an agent walks from a start cell to a goal cell.

    Positions are ``(row, col)`` tuples. Rewards returned by :meth:`step`:
    ``-10`` when the move would enter an obstacle (the agent stays put),
    ``100`` when the goal is reached (the episode ends), ``-1`` otherwise.
    """

    def __init__(self, size=5, start=(0, 0), goal=(4, 4), obstacles=None):
        """Create the grid.

        Args:
            size: Side length of the square grid; must be at least 1.
            start: ``(row, col)`` where the agent is placed on reset.
            goal: ``(row, col)`` that ends the episode when reached.
            obstacles: Iterable of ``(row, col)`` cells the agent cannot enter.

        Raises:
            ValueError: If ``size`` is smaller than 1, a position does not have
                exactly two coordinates or lies outside the grid, or the start
                or goal cell is also an obstacle.
        """
        # Environment
        if size < 1:
            raise ValueError(f"size must be at least 1, got {size}")
        self.size = size
        # Positions are normalised to tuples: step() builds tuple positions, and
        # a list would neither compare equal to them nor index a single cell of
        # the NumPy grid in render().
        self.start_pos = self._as_cell(start, "start")
        self.goal_pos = self._as_cell(goal, "goal")
        raw_obstacles = obstacles if obstacles is not None else []
        self.obstacles = [self._as_cell(obs, "obstacle") for obs in raw_obstacles]
        # A start or goal covered by an obstacle would make the goal unreachable
        # and every episode endless.
        if self.start_pos in self.obstacles:
            raise ValueError(f"start {self.start_pos} is also an obstacle")
        if self.goal_pos in self.obstacles:
            raise ValueError(f"goal {self.goal_pos} is also an obstacle")
        self.agent_pos = self.start_pos
        # Define the action space: 0: up, 1: down, 2: left, 3: right
        self.actions = [0, 1, 2, 3]

    def _as_cell(self, pos, label):
        """Return ``pos`` as an in-bounds ``(row, col)`` tuple or raise ValueError."""
        cell = tuple(pos)
        if len(cell) != 2:
            raise ValueError(f"{label} must be (row, col), got {pos!r}")
        if not all(0 <= coord < self.size for coord in cell):
            raise ValueError(
                f"{label} {cell} is outside the {self.size}x{self.size} grid"
            )
        return cell

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.agent_pos = self.start_pos
        return self.get_state()

    def get_state(self):
        """Return the agent's current ``(row, col)`` position."""
        return self.agent_pos

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Moves are clamped at the grid border. An action other than 0-3 leaves
        the position unchanged and is then scored like any other move.
        """
        row, col = self.agent_pos

        # Move the agent based on the action
        if action == 0:  # Up
            row = max(0, row - 1)
        elif action == 1:  # Down
            row = min(self.size - 1, row + 1)
        elif action == 2:  # Left
            col = max(0, col - 1)
        elif action == 3:  # Right
            col = min(self.size - 1, col + 1)

        next_pos = (row, col)

        # Check for collisions with obstacles
        if next_pos in self.obstacles:
            # Agent stays in the same position if it hits an obstacle
            reward = -10
            done = False
        else:
            self.agent_pos = next_pos
            # Check if the goal is reached
            if self.agent_pos == self.goal_pos:
                reward = 100
                done = True
            else:
                reward = -1
                done = False

        return self.get_state(), reward, done

    def render(self):
        """Print the grid: S start, G goal, X obstacle, A agent, _ empty."""
        grid = np.full((self.size, self.size), '_', dtype=str)
        grid[self.start_pos] = 'S'
        grid[self.goal_pos] = 'G'
        for obs in self.obstacles:
            grid[obs] = 'X'
        # Drawn last so the agent marker wins over S/G on a shared cell.
        grid[self.agent_pos] = 'A'
        print("\n".join(" ".join(row) for row in grid))
        print("-" * (2 * self.size - 1))


class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    The Q-table is a zero-initialised array of shape
    ``state_space_size + (action_space_size,)``; states are index tuples into
    its leading axes.
    """

    def __init__(self, state_space_size, action_space_size, learning_rate=0.1, discount_factor=0.99, exploration_rate=1.0, exploration_decay=0.995, min_exploration_rate=0.01):
        """Set up the Q-table and hyper-parameters.

        Args:
            state_space_size: Sequence of state-axis sizes, e.g. ``(rows, cols)``.
            action_space_size: Number of discrete actions.
            learning_rate: Weight given to the new estimate in each update.
            discount_factor: Multiplier applied to the best next-state value.
            exploration_rate: Initial epsilon (probability of a random action).
            exploration_decay: Factor applied to epsilon by ``decay_exploration``.
            min_exploration_rate: Floor that decaying never pushes epsilon below.
        """
        # tuple() lets callers pass a list; list + tuple would raise TypeError.
        self.q_table = np.zeros(tuple(state_space_size) + (action_space_size,))
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = exploration_rate
        self.epsilon_decay = exploration_decay
        self.min_epsilon = min_exploration_rate
        self.actions = list(range(action_space_size))

    def choose_action(self, state):
        """Return a random action with probability epsilon, else the greedy one.

        Ties between equal Q-values resolve to the lowest action index. The
        result is always a plain ``int``.
        """
        if random.random() < self.epsilon:
            return random.choice(self.actions)  # Explore
        # tuple(state) forces single-cell indexing; a list would be treated by
        # NumPy as a fancy index selecting several rows.
        return int(np.argmax(self.q_table[tuple(state)]))  # Exploit

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition."""
        key = tuple(state) + (action,)
        old_value = self.q_table[key]
        next_max = np.max(self.q_table[tuple(next_state)])

        # Q-learning formula
        new_value = (1 - self.lr) * old_value + self.lr * (reward + self.gamma * next_max)
        # Assign through one full index so the write always reaches the table
        # itself rather than a temporary copy.
        self.q_table[key] = new_value

    def decay_exploration(self):
        """Multiply epsilon by the decay factor without going below the floor.

        An epsilon already at or below the floor (for example one set to 0 for
        evaluation) is left untouched.
        """
        if self.epsilon > self.min_epsilon:
            self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)


def train_agent(env, agent, episodes=1000, max_steps=None):
    """Train ``agent`` on ``env`` for a number of episodes.

    Each step asks the agent for an action, applies it to the environment and
    feeds the transition to ``agent.update_q_table``. Exploration is decayed
    once per episode, and the last episode's total reward is printed every
    100 episodes.

    Args:
        env: Object providing ``reset()`` and ``step(action)`` returning
            ``(next_state, reward, done)``.
        agent: Object providing ``choose_action``, ``update_q_table`` and
            ``decay_exploration``.
        episodes: Number of training episodes.
        max_steps: Optional cap on steps per episode. ``None`` (the default)
            runs each episode until the environment reports ``done``, which
            never returns if the terminal state cannot be reached.
    """
    print("--- Starting Training ---")
    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0
        steps = 0

        # The step cap guards against environments whose goal is unreachable.
        while not done and (max_steps is None or steps < max_steps):
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.update_q_table(state, action, reward, next_state)
            state = next_state
            total_reward += reward
            steps += 1

        agent.decay_exploration()

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}/{episodes} | Total Reward: {total_reward}")
    print("--- Training Finished ---")


def evaluate_agent(env, agent, episodes=5, delay=0.5, max_steps=None):
    """Run greedy evaluation episodes, rendering the environment after each step.

    Exploration is switched off for the duration of the evaluation and the
    agent's previous epsilon is restored afterwards, even if an error occurs.

    Args:
        env: Object providing ``reset()``, ``step(action)`` and ``render()``.
        agent: Object providing ``choose_action`` and an ``epsilon`` attribute.
        episodes: Number of evaluation episodes.
        delay: Seconds to pause between rendered steps; 0 disables the pause.
        max_steps: Optional cap on steps per episode. ``None`` (the default)
            keeps stepping until the environment reports ``done``, which never
            happens if the greedy policy cycles between states.
    """
    print("\n--- Evaluating Agent ---")
    previous_epsilon = agent.epsilon
    agent.epsilon = 0  # Turn off exploration for evaluation
    try:
        for episode in range(episodes):
            state = env.reset()
            done = False
            total_reward = 0
            steps = 0
            path = [state]
            print(f"\n--- Evaluation Episode {episode + 1} ---")
            env.render()

            while not done:
                if max_steps is not None and steps >= max_steps:
                    print(f"Stopped after {max_steps} steps without finishing the episode.")
                    break
                action = agent.choose_action(state)
                next_state, reward, done = env.step(action)
                state = next_state
                total_reward += reward
                steps += 1
                path.append(state)
                if delay > 0:
                    time.sleep(delay)
                env.render()

            print(f"Path taken: {path}")
            print(f"Total Reward: {total_reward}")
    finally:
        # Give the agent its exploration rate back so later training still explores.
        agent.epsilon = previous_epsilon


def _parse_position(text, size):
    """Parse ``"row,col"`` into an in-grid ``(row, col)`` tuple or raise ValueError."""
    parts = text.split(',')
    if len(parts) != 2:
        raise ValueError(f"expected 'row,col', got {text!r}")
    row, col = int(parts[0]), int(parts[1])
    if not (0 <= row < size and 0 <= col < size):
        raise ValueError(f"position {row},{col} is outside the {size}x{size} grid")
    return row, col


def get_user_input():
    """Gets environment parameters from the user.

    Prompts for the grid size, the start and goal positions, and then obstacle
    positions one per line until the user types ``done``.

    Returns:
        ``(size, start, goal, obstacles)`` on success, where positions are
        ``(row, col)`` tuples and ``obstacles`` is a list of them. If any entry
        is malformed, lies outside the grid, or an obstacle coincides with the
        start or goal, a message is printed and ``(None, None, None, None)``
        is returned.
    """
    try:
        size = int(input("Enter grid size (e.g., 5 for a 5x5 grid): "))
        if size < 1:
            raise ValueError(f"grid size must be at least 1, got {size}")

        start_str = input("Enter start position (row,col), e.g., 0,0: ")
        start = _parse_position(start_str, size)

        goal_str = input(f"Enter goal position (row,col), e.g., {size-1},{size-1}: ")
        goal = _parse_position(goal_str, size)

        obstacles = []
        print("Enter obstacle positions (row,col), one per line. Type 'done' when finished.")
        while True:
            # Strip so that surrounding whitespace does not hide the sentinel.
            obs_str = input("> ").strip()
            if obs_str.lower() == 'done':
                break
            obstacle = _parse_position(obs_str, size)
            # An obstacle on the start or goal cell would make the goal unreachable.
            if obstacle in (start, goal):
                raise ValueError(f"obstacle {obs_str!r} overlaps the start or goal cell")
            obstacles.append(obstacle)

        return size, start, goal, obstacles
    except ValueError as err:
        print("Invalid input. Please use the correct format.")
        print(f"Details: {err}")
        return None, None, None, None


if __name__ == "__main__":
    # Ask the user for the grid layout; every field is None when parsing failed.
    size, start, goal, obstacles = get_user_input()

    if size is not None:
        # Step 1: build the grid environment from the user's answers.
        environment = GridWorld(size, start, goal, obstacles)

        # Step 2: tabular Q-learning agent with one value per (row, col, action).
        action_space_size = len(environment.actions)
        state_space_size = (size, size)
        rl_agent = QLearningAgent(state_space_size, action_space_size)

        # Step 3: train for 2000 episodes.
        train_agent(environment, rl_agent, 2000)

        # Step 4: run the evaluation episodes with the trained agent.
        evaluate_agent(environment, rl_agent)


Enter grid size (e.g., 5 for a 5x5 grid):  5
Enter start position (row,col), e.g., 0,0:  0,0
Enter goal position (row,col), e.g., 4,4:  4,4


Enter obstacle positions (row,col), one per line. Type 'done' when finished.


>  1,1
>  2,2
>  done


--- Starting Training ---
Episode 100/2000 | Total Reward: 56
Episode 200/2000 | Total Reward: 76
Episode 300/2000 | Total Reward: 87
Episode 400/2000 | Total Reward: 82
Episode 500/2000 | Total Reward: 93
Episode 600/2000 | Total Reward: 93
Episode 700/2000 | Total Reward: 93
Episode 800/2000 | Total Reward: 93
Episode 900/2000 | Total Reward: 93
Episode 1000/2000 | Total Reward: 93
Episode 1100/2000 | Total Reward: 93
Episode 1200/2000 | Total Reward: 93
Episode 1300/2000 | Total Reward: 93
Episode 1400/2000 | Total Reward: 93
Episode 1500/2000 | Total Reward: 93
Episode 1600/2000 | Total Reward: 93
Episode 1700/2000 | Total Reward: 93
Episode 1800/2000 | Total Reward: 93
Episode 1900/2000 | Total Reward: 93
Episode 2000/2000 | Total Reward: 93
--- Training Finished ---

--- Evaluating Agent ---

--- Evaluation Episode 1 ---
A _ _ _ _
_ X _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ G
---------
S A _ _ _
_ X _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ G
---------
S _ A _ _
_ X _ _ _
_ _ X _ _
_ _ _ _ _
_