In [1]:
!pip install gym==0.23.1 --quiet
!pip install tensorflow --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/626.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.2/626.2 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for gym (pyproject.toml) ... [?25l[?25hdone


In [2]:
#Library for environments
import gym
from gym.envs.registration import register
from gym.envs.toy_text.frozen_lake import generate_random_map

#Libraries to represent the output
from IPython.display import clear_output
import time
import matplotlib.pyplot as plt

#Essential libraries for computation
import numpy as np
import random
import tensorflow as tf
from collections import deque

  from jax import xla_computation as _xla_computation


In [None]:
# Build a random 10x10 FrozenLake layout, create the environment from it,
# reset it to the start state and render the initial board.
# NOTE(review): in gym's generate_random_map, `p` appears to be the probability
# that a tile is frozen (not a hole); p=0.3 would then give a mostly-hole map
# and the generator may need many attempts to find a solvable one — confirm.
random_map = generate_random_map(size=10, p=0.3)
env = gym.make("FrozenLake-v1", desc=random_map)
env.reset()
env.render()

In [3]:
# Report which GPU, if any, TensorFlow exposes as its default device
if not tf.test.gpu_device_name():
    print('\nNo GPU device found')
else:
    print('\nDefault GPU Device:', tf.test.gpu_device_name())


Default GPU Device: /device:GPU:0


In [4]:
class DQNAgent:
    """
    Deep Q-Network agent for the FrozenLake environment.

    A small fully connected network maps a one-hot encoded state to one
    Q-value per action. Actions are picked epsilon-greedily, transitions are
    kept in a bounded replay buffer, and the network is fitted on random
    minibatches sampled from that buffer.
    """
    def __init__(self, state_size, action_size):
        """
        Set hyper-parameters, create the replay buffer and build the network.

        Args:
            state_size: Number of discrete states (length of the one-hot input).
            action_size: Number of discrete actions (width of the output layer).
        """
        # Set device before other initializations
        self.device_name = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
        print(f"Using device: {self.device_name}")

        # Environment parameters
        self.state_size = state_size
        self.action_size = action_size

        # Learning parameters
        self.gamma = 0.95  # Discount factor
        self.epsilon = 1.0  # Initial exploration rate
        self.epsilon_min = 0.1  # Minimum exploration rate
        self.epsilon_decay = 0.995  # Decay rate for exploration
        self.learning_rate = 0.001  # Learning rate for the optimizer

        # Memory for experience replay (oldest transitions are dropped first)
        self.memory = deque(maxlen=2000)
        self.batch_size = 32

        # Create the neural network model
        self.model = self._build_model()

    def _build_model(self):
        """
        Build and compile the Q-network with the Keras functional API.

        * Input: state vector of length ``state_size``
        * Output: one Q-value per action (``action_size`` linear units)

        Returns:
            The compiled ``tf.keras.Model`` (Adam optimizer, MSE loss).
            The model summary is printed as a side effect.
        """
        with tf.device(self.device_name):
            # Define input layer explicitly
            inputs = tf.keras.layers.Input(shape=(self.state_size,))

            # Hidden layers
            x = tf.keras.layers.Dense(24, activation='relu')(inputs)
            x = tf.keras.layers.Dense(24, activation='relu')(x)

            # Output layer
            outputs = tf.keras.layers.Dense(self.action_size, activation='linear')(x)

            # Create model
            model = tf.keras.Model(inputs=inputs, outputs=outputs)

            # Compile model
            model.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
                loss='mse'
            )

            # Print model summary
            model.summary()

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """
        Choose an action with the epsilon-greedy policy.

        Args:
            state: Encoded state; it is reshaped to ``(1, state_size)``.

        Returns:
            int: A uniformly random action with probability ``epsilon``,
            otherwise the action with the highest predicted Q-value.
        """
        if random.random() <= self.epsilon:
            return random.randrange(self.action_size)

        with tf.device(self.device_name):
            state = np.reshape(state, [1, self.state_size])
            # Call the model directly instead of Model.predict(): predict() is
            # built for large batched inputs and is slow when invoked once per
            # environment step on a single sample.
            act_values = np.asarray(self.model(state, training=False))
            return int(np.argmax(act_values[0]))

    def replay(self):
        """
        Fit the network on one random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. For each sampled transition the target of the taken
        action is ``reward`` when ``done`` is set, otherwise
        ``reward + gamma * max(Q(next_state))``; the other actions keep the
        network's current predictions. After fitting, ``epsilon`` is
        multiplied by ``epsilon_decay`` while it is above ``epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return

        # Sample random experiences from memory
        minibatch = random.sample(self.memory, self.batch_size)

        with tf.device(self.device_name):
            states = np.array([x[0] for x in minibatch])
            next_states = np.array([x[3] for x in minibatch])

            # Q-values for current and next states. The model is called
            # directly (not via predict()) because the batch is small;
            # np.array() makes a writable copy that can be edited in place.
            current_q_values = np.array(self.model(states, training=False))
            next_q_values = np.array(self.model(next_states, training=False))

            # Create training data: start from the current predictions and
            # overwrite only the entry of the action that was actually taken
            X = states
            Y = current_q_values

            for i, (state, action, reward, next_state, done) in enumerate(minibatch):
                if done:
                    target = reward
                else:
                    target = reward + self.gamma * np.amax(next_q_values[i])
                Y[i][action] = target

            # Train the model
            self.model.fit(X, Y, batch_size=self.batch_size, epochs=1, verbose=0)

        # Decay epsilon once per training call
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def convert_state(self, state):
        """
        Convert a state index to a one-hot vector.

        Args:
            state: Integer index of the state.

        Returns:
            A NumPy array of length ``state_size`` with a 1 at ``state``.
        """
        state_one_hot = np.zeros(self.state_size)
        state_one_hot[state] = 1
        return state_one_hot

# Training loop helper function
def train_dqn_agent(env, agent, episodes=1000):
    """
    Train the DQN agent on the environment.

    Works with both gym API shapes: the legacy one (``reset()`` returns the
    observation, ``step()`` returns ``(obs, reward, done, info)``) and the
    newer one (``reset()`` returns ``(obs, info)``, ``step()`` returns
    ``(obs, reward, terminated, truncated, info)``).

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        agent: Agent exposing ``convert_state``, ``act``, ``remember``,
            ``replay`` and an ``epsilon`` attribute.
        episodes: Number of episodes to run.

    Returns:
        list: Total reward collected in each episode.
    """
    scores = []
    for episode in range(episodes):
        # Reset environment; newer gym returns (observation, info)
        reset_result = env.reset()
        if (isinstance(reset_result, tuple) and len(reset_result) == 2
                and isinstance(reset_result[1], dict)):
            state = reset_result[0]
        else:
            state = reset_result
        state = agent.convert_state(state)
        score = 0
        done = False

        while not done:
            # Choose and take action
            action = agent.act(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                # Only a true terminal state is stored as such in the replay
                # buffer, but a time-limit truncation must also end the
                # episode, otherwise this loop would never exit.
                terminal = terminated
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
                terminal = done

            # Convert next_state to one-hot and store experience
            next_state = agent.convert_state(next_state)
            agent.remember(state, action, reward, next_state, terminal)

            # Move to next state
            state = next_state
            score += reward

            # Train the network
            agent.replay()

        scores.append(score)

        # Print progress
        if (episode + 1) % 100 == 0:
            avg_score = np.mean(scores[-100:])
            print(f"Episode: {episode + 1}, Average Score: {avg_score:.2f}, Epsilon: {agent.epsilon:.2f}")

    return scores

# Helper function to create and initialize the agent
def create_agent(env):
    """
    Build a DQNAgent sized to the given environment.

    The network input width is the number of discrete observations and the
    output width is the number of discrete actions, both read from the
    environment's spaces.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    return DQNAgent(n_states, n_actions)

In [7]:
# Create a random 4x4 slippery FrozenLake and an agent sized to it.
# NOTE(review): generate_random_map's `p` appears to be the probability that a
# tile is frozen, so p=0.1 would give a map that is mostly holes — confirm.
random_map = generate_random_map(size=4, p=0.1)
env = gym.make("FrozenLake-v1", is_slippery=True, desc=random_map)
agent = create_agent(env)

# Training parameters
n_episodes = 1000
max_steps = 100  # Cap on the number of steps taken in one episode

# One list per tracked metric, appended to once per episode
training_history = {
    metric: [] for metric in ('scores', 'avg_scores', 'epsilons', 'steps')
}

  and should_run_async(code)


Using device: /GPU:0


In [9]:
# Training loop: run n_episodes episodes of at most max_steps steps each,
# training the agent after every step and recording per-episode metrics.
for episode in range(n_episodes):
    # gym 0.23.1 API: reset() returns only the initial observation
    state = env.reset()
    state = agent.convert_state(state)
    score = 0
    done = False
    steps = 0

    for step in range(max_steps):
        # Choose and take action
        action = agent.act(state)
        # gym 0.23.1 API: step() returns (observation, reward, done, info)
        next_state, reward, done, _ = env.step(action)

        # Convert next_state and store experience
        next_state = agent.convert_state(next_state)
        agent.remember(state, action, reward, next_state, done)

        # Move to next state
        state = next_state
        score += reward
        steps += 1

        # Train the network
        agent.replay()

        if done:
            break

    # Store training history
    training_history['scores'].append(score)
    training_history['epsilons'].append(agent.epsilon)
    training_history['steps'].append(steps)

    # Calculate average score over last 100 episodes
    avg_score = np.mean(training_history['scores'][-100:])
    training_history['avg_scores'].append(avg_score)

    # Print progress
    if (episode + 1) % 50 == 0:
        print(f"Episode: {episode + 1}/{n_episodes}")
        print(f"Score: {score:.2f}")
        print(f"Average Score (last 100): {avg_score:.2f}")
        print(f"Epsilon: {agent.epsilon:.3f}")
        print(f"Steps: {steps}")
        print("-" * 40)
        # Optional: render the environment every 100th episode. The check
        # must use (episode + 1) like the enclosing one: inside this branch
        # `episode % 100` is always 49 or 99, so it could never equal 0.
        if (episode + 1) % 100 == 0:
            env.render()

# Print final training results
print("\nTraining completed!")
print(f"Final average score: {training_history['avg_scores'][-1]:.2f}")
print(f"Final epsilon: {agent.epsilon:.3f}")

KeyboardInterrupt: 