In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
from time import sleep
from ipythonblocks import BlockGrid
from IPython.display import clear_output

# Define the Q-Network
class DQN(nn.Module):
    """Fully connected Q-network mapping a position to per-action Q-values.

    Architecture: 2 inputs -> 128 -> 128 -> 4 outputs, with ReLU after each
    of the two hidden layers. The input is the agent's (x, y) position and
    each output is the Q-value of one action (up, down, left, right).
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept stable: the names are
        # the state_dict() keys and the order fixes the random initialisation.
        self.fc1 = nn.Linear(in_features=2, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=4)

    def forward(self, x):
        """Return the Q-values computed for the position tensor ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        # The output layer is linear: Q-values are unbounded.
        return self.fc3(hidden)

# Agent class for DQN
class DQNAgent:
    """Epsilon-greedy DQN agent with a replay buffer and a target network.

    ``model`` is the online Q-network that is trained; ``target_model`` is a
    copy used to compute bootstrap targets and is only refreshed when
    ``update_target_network`` is called.
    """

    def __init__(self):
        self.model = DQN()
        self.target_model = DQN()  # Target network for stable training
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.loss_fn = nn.MSELoss()
        self.memory = deque(maxlen=10000)  # Replay buffer; oldest entries are dropped first
        self.gamma = 0.99  # Discount factor
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_decay = 0.995  # Multiplicative decay applied once per replay() call
        self.epsilon_min = 0.01  # Floor for the exploration rate
        self.batch_size = 64  # Number of transitions sampled per replay() call
        self.update_target_every = 10  # Read by the caller to schedule target syncs
        self.steps = 0  # Not used inside this class

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, greedy=False):
        """Choose an action index in ``{0, 1, 2, 3}`` for ``state``.

        Args:
            state: Position of the agent, convertible to a float tensor.
            greedy: When True, skip exploration and always return the action
                with the highest predicted Q-value. Defaults to False, which
                explores with probability ``self.epsilon``.

        Returns:
            The chosen action index as a Python int.
        """
        if not greedy and np.random.rand() < self.epsilon:
            return random.choice([0, 1, 2, 3])  # Random action (explore)
        state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        # Action selection never needs gradients, so skip building a graph.
        with torch.no_grad():
            q_values = self.model(state_t)
        return torch.argmax(q_values).item()  # Highest Q-value (exploit)

    def replay(self):
        """Train on a random minibatch from memory, then decay epsilon.

        Does nothing until at least ``batch_size`` transitions are stored.
        Each sampled transition gets its own optimizer step. For terminal
        transitions (``done`` truthy) the target is the reward alone.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            next_state_t = torch.as_tensor(next_state, dtype=torch.float32).unsqueeze(0)

            with torch.no_grad():
                next_q_max = torch.max(self.target_model(next_state_t))
            td_target = reward + self.gamma * next_q_max * (1.0 - float(done))

            # One forward pass: the target is a detached copy of the
            # prediction with only the taken action's entry replaced, so the
            # loss (and gradient) comes solely from that entry.
            q_pred = self.model(state_t)
            target = q_pred.detach().clone()
            target[0][action] = td_target

            self.optimizer.zero_grad()
            loss = self.loss_fn(q_pred, target)
            loss.backward()
            self.optimizer.step()

        # Decay exploration, clamping so it never drops below epsilon_min.
        # An epsilon already at or below the floor is left untouched.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

# Grid-world environment with random slips: transitions, rewards, and rendering
class FLModel:
    """Square grid world with random slips, step rewards and notebook rendering.

    States are (x, y) positions with both coordinates in ``[0, grid_size)``.
    Decisions are the strings ``"up"``, ``"down"``, ``"left"`` and ``"right"``.

    Args:
        grid_size: Side length of the square grid. Defaults to 4.
        goal: (x, y) goal position. Defaults to the corner
            ``(grid_size - 1, grid_size - 1)``, i.e. (3, 3) for the default grid.
        slip_prob: Probability that ``exog_info`` reports a slip. Defaults to 0.3.

    Raises:
        ValueError: If ``grid_size`` is smaller than 1 or ``goal`` lies
            outside the grid.
    """

    def __init__(self, grid_size=4, goal=None, slip_prob=0.3):
        if grid_size < 1:
            raise ValueError("grid_size must be at least 1")
        self.grid_size = grid_size
        self.slip_prob = slip_prob
        corner = (grid_size - 1, grid_size - 1)
        self.goal = np.array(corner if goal is None else goal)
        if not self._in_bounds(self.goal):
            raise ValueError("goal must lie inside the grid")
        # Displacement added to the (x, y) position for each decision.
        self.decision_dict = {"up": np.array((0, -1)), "down": np.array((0, 1)), "left": np.array((-1, 0)), "right": np.array((1, 0))}

    def _in_bounds(self, position):
        """Return True when every coordinate of ``position`` is inside the grid."""
        position = np.asarray(position)
        return bool(((position >= 0) & (position < self.grid_size)).all())

    def exog_info(self, state, decision):
        """Sample the exogenous outcome: ``"slip"`` with probability ``slip_prob``, else ``"move"``.

        ``state`` and ``decision`` are accepted but not used.
        """
        if random.random() < self.slip_prob:
            return "slip"
        else:
            return "move"

    def transition_function(self, state, decision, exog_info):
        """Return the next position after applying ``decision`` in ``state``.

        Without a slip, the intended move is taken if it stays inside the
        grid. On a slip, or when the intended move would leave the grid, the
        next position is drawn uniformly from the in-bounds neighbours of
        ``state``. If there are no such neighbours, a copy of ``state`` is
        returned.
        """
        position = np.asarray(state)
        if exog_info != "slip":
            new_position = position + self.decision_dict[decision]
            if self._in_bounds(new_position):
                return new_position

        # Neighbours are listed in decision_dict order so that a seeded
        # `random` module yields the same choice every run.
        possible_positions = [
            candidate
            for candidate in (position + delta for delta in self.decision_dict.values())
            if self._in_bounds(candidate)
        ]
        if not possible_positions:
            return position.copy()
        return random.choice(possible_positions)

    def is_complete(self, state):
        """Return whether ``state`` equals the goal position."""
        return (np.asarray(state) == self.goal).all()

    def get_reward(self, state):
        """Return 10 when ``state`` is the goal, otherwise -1."""
        if self.is_complete(state):
            return 10  # Goal reached
        return -1  # Small penalty for each move

    def draw_state(self, state, delay=0.5):
        """Render the grid with the agent and goal, then pause for ``delay`` seconds.

        The previous cell output is cleared first, so successive calls
        produce an animation. The agent is drawn after the goal, so it
        covers the goal colour when both occupy the same cell.
        """
        agent_colour = (0, 0, 255)
        goal_colour = (210, 30, 30)
        clean_colour = (255, 255, 255)

        # Create the grid with the given width and height
        grid = BlockGrid(self.grid_size, self.grid_size, fill=clean_colour)

        # The grid is indexed [row, column], i.e. [y, x].
        grid[int(self.goal[1]), int(self.goal[0])] = goal_colour
        grid[int(state[1]), int(state[0])] = agent_colour

        clear_output(wait=True)
        # Show the grid
        grid.show()

        # Add a small delay for visualization purposes
        sleep(delay)

    # Method to run the trained agent in the environment and visualize
    def run_trained_agent(self, S0, agent, n_iterations: int = 100):
        """Roll out ``agent`` from ``S0`` for up to ``n_iterations`` steps, drawing each state.

        ``agent.act(state)`` must return an index into
        ``["up", "down", "left", "right"]``. It is called exactly as during
        training, so an epsilon-greedy agent may still take exploratory
        actions here. Prints how many steps were needed, or that the goal
        was not reached within the step budget.
        """
        action_names = ["up", "down", "left", "right"]
        current_state = S0
        for n in range(n_iterations):
            self.draw_state(current_state)  # Visualize the state
            action = agent.act(current_state)  # Let the trained agent decide an action
            action_name = action_names[action]  # Convert action index to action name
            exog = self.exog_info(current_state, action_name)  # Get external environment info
            current_state = self.transition_function(current_state, action_name, exog)  # Transition to next state

            if self.is_complete(current_state):  # Check if goal is reached
                self.draw_state(current_state)  # Final state visualization
                print(f"Goal reached in {n+1} steps!")
                break
        else:
            # Runs only when the loop finished without hitting `break`.
            print("Agent did not reach the goal.")

Episode 1: Total Reward = -43
Episode 2: Total Reward = -61
Episode 3: Total Reward = -71
Episode 4: Total Reward = -3
Episode 5: Total Reward = -1
Episode 6: Total Reward = -23
Episode 7: Total Reward = -91
Episode 8: Total Reward = -89
Episode 9: Total Reward = -7
Episode 10: Total Reward = -11
Episode 11: Total Reward = -13
Episode 12: Total Reward = -29
Episode 13: Total Reward = -3
Episode 14: Total Reward = -67
Episode 15: Total Reward = -19
Episode 16: Total Reward = -67
Episode 17: Total Reward = -13
Episode 18: Total Reward = -61
Episode 19: Total Reward = -3
Episode 20: Total Reward = 3
Episode 21: Total Reward = -41
Episode 22: Total Reward = -31
Episode 23: Total Reward = -9
Episode 24: Total Reward = -5
Episode 25: Total Reward = -1
Episode 26: Total Reward = -19
Episode 27: Total Reward = -13
Episode 28: Total Reward = -3
Episode 29: Total Reward = -19
Episode 30: Total Reward = -53
Episode 31: Total Reward = -7
Episode 32: Total Reward = -73
Episode 33: Total Reward = -5

In [None]:
# Training loop
# Seed every RNG that exploration, slips, replay sampling and weight
# initialisation draw from, so a fresh "Restart & Run All" repeats the run.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = FLModel()
agent = DQNAgent()
start_state = np.array((0, 0))
action_names = ["up", "down", "left", "right"]  # Index -> decision name

n_episodes = 500
for episode in range(n_episodes):
    state = start_state
    done = False
    total_reward = 0

    # An episode ends only when the goal is reached; there is no step cap.
    while not done:
        action = agent.act(state)
        action_name = action_names[action]
        exog = env.exog_info(state, action_name)
        next_state = env.transition_function(state, action_name, exog)
        reward = env.get_reward(next_state)
        done = env.is_complete(next_state)

        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

    # One replay pass (and one epsilon decay) per finished episode.
    agent.replay()
    if episode % agent.update_target_every == 0:
        agent.update_target_network()

In [23]:
# Roll out the trained agent from the start state, rendering every step.
env.run_trained_agent(S0=start_state, agent=agent, n_iterations=100)

Goal reached in 6 steps!
