# Prg 3: DQN agent that cleans dirty tiles on a grid

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

2025-03-31 12:29:07.166573: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-31 12:29:07.403552: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743449358.199915    3355 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743449358.238266    3355 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-31 12:29:07.767242: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

### Environment creation

In [3]:
# grid setup
m = 10  # grid size along the first array axis (rows)
n = 10  # grid size along the second array axis (columns)
p = 10  # number of dirty tiles placed on the grid

def create_grid(m, n, p):
    """Build an m-by-n grid with p dirty tiles and the agent in cell (0, 0).

    Cell encoding: 0 = clean tile, 1 = dirty tile, 2 = agent position.

    Args:
        m: Number of rows (first array axis).
        n: Number of columns (second array axis).
        p: Number of dirty tiles to place. Must satisfy 0 <= p <= m * n - 1,
            because the agent's start cell (0, 0) is never made dirty.

    Returns:
        A float numpy array of shape (m, n).

    Raises:
        ValueError: If p is negative or larger than the number of free cells.
    """
    free_cells = m * n - 1  # every cell except the agent's start cell
    if not 0 <= p <= free_cells:
        raise ValueError(
            "p must be between 0 and m * n - 1 ({}), got {}".format(free_cells, p)
        )

    grid = np.zeros((m, n))

    # add dirty tiles randomly
    if p > 0:
        # Draw distinct flat indices from 1..m*n-1 (flat index 0 is cell (0, 0)).
        # Sampling without replacement always terminates, unlike retrying
        # random cells until a free one is hit.
        dirty_cells = np.random.choice(np.arange(1, m * n), size=p, replace=False)
        grid.flat[dirty_cells] = 1

    grid[0, 0] = 2
    return grid

# Build one sample environment and show it (1 = dirty tile, 2 = cell (0, 0))
grid = create_grid(m, n, p)

print(grid)


[[2. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


### Setup action and reward 

In [4]:
# Reward per step (components add up):
# Base cost of every step     = -0.1
# Same pos (blocked by wall)  = -0.1 - 0.4 = -0.5
# Moving onto a dirty tile    = -0.1 + 10  =  9.9
# Moving onto a clean tile    = -0.1
def move(grid, pos, action, p):
    """Apply one action to the environment and return the resulting transition.

    The input grid is not modified; a copy is updated and returned.

    Args:
        grid: 2-D numpy array (0 = clean, 1 = dirty, 2 = agent).
        pos: Current agent position as a tuple (i, j) used as grid[i, j].
        action: Action id. In terms of array indices:
            0 ("north") -> j - 1, 1 ("east") -> i + 1,
            2 ("south") -> j + 1, 3 ("west") -> i - 1.
            Any other value leaves the agent in place.
        p: Number of dirty tiles. Unused; kept so existing callers keep working.

    Returns:
        A tuple (pos, reward, grid): the new position, the step reward and the
        updated copy of the grid.
    """
    grid = grid.copy()
    old_pos = pos
    reward = -0.1

    # pos[0] indexes axis 0 and pos[1] indexes axis 1, so each bound check must
    # use the matching axis length (this matters for non-square grids).
    last_i = grid.shape[0] - 1
    last_j = grid.shape[1] - 1

    if action == 0: # north
        if pos[1] != 0:
            pos = (pos[0], pos[1] - 1)
    elif action == 1: # east
        if pos[0] != last_i:
            pos = (pos[0] + 1, pos[1])
    elif action == 2: # south
        if pos[1] != last_j:
            pos = (pos[0], pos[1] + 1)
    elif action == 3: # west
        if pos[0] != 0:
            pos = (pos[0] - 1, pos[1])

    if pos == old_pos:
        # The move was blocked, so add an extra penalty
        reward -= 0.4
    elif grid[pos[0], pos[1]] == 1:
        # Stepped onto a dirty tile; it is cleaned by the overwrite below
        reward += 10

    grid[old_pos[0], old_pos[1]] = 0
    grid[pos[0], pos[1]] = 2

    return pos, reward, grid

### Constructing model

In [4]:
# Size of the action space; used as the width of the Q-network's output layer
num_actions = 4

def create_q_model():
    """Build the Q-network for a 10 x 10 single-channel grid.

    Architecture: Conv(32, 3x3) -> MaxPool(2x2) -> Conv(64, 3x3) -> Flatten
    -> Dense(128) -> Dense(num_actions) with a linear output.

    Returns:
        An uncompiled keras.Model mapping a (10, 10, 1) input to num_actions
        output values.
    """
    grid_input = layers.Input(shape=(10, 10, 1))

    # Convolutional feature extractor
    x = layers.Conv2D(32, (3, 3), padding="same", activation="relu")(grid_input)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(64, (3, 3), padding="same", activation="relu")(x)

    # Fully connected head; linear activation on the final layer
    x = layers.Flatten()(x)
    x = layers.Dense(128, activation="relu")(x)
    q_values = layers.Dense(num_actions, activation="linear")(x)

    return keras.Model(inputs=grid_input, outputs=q_values)

# Two networks with the same architecture: `model` is the one being trained,
# `model_target` is a periodically synced copy used to compute training targets.
model = create_q_model()
model_target = create_q_model()

I0000 00:00:1743407149.284773 2443516 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3539 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


### Train model

In [None]:
# DQN training loop for the 10 x 10 grid.
# Uses create_grid(), move(), model, model_target, m, n, p and num_actions
# defined in the cells above.
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

gamma = 0.9  # Discount factor for future rewards
epsilon = 1.0  # Epsilon greedy parameter
epsilon_min = 0.1  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon_interval = (
    epsilon_max - epsilon_min
)  # Span between max and min epsilon (not used by the multiplicative decay below)
epsilon_decay = 0.99  # Epsilon is multiplied by this after every episode

batch_size = 32  # Size of batch taken from replay buffer
max_steps_per_episode = 250

# Experience replay buffers (parallel lists, one entry per transition)
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []

episode_reward_history = []
running_reward = 0
episode_count = 0

max_memory_length = 100000  # Maximum number of transitions kept in the buffer
update_target_network = 50  # Sync the target network every this many episodes
loss_function = keras.losses.Huber()

while True:  # Run until solved
    grid = create_grid(m, n, p)
    pos = (0, 0)
    state = np.expand_dims(grid, axis=-1)  # fit shape for conv
    episode_reward = 0

    # Note: range(1, N) performs at most N - 1 steps per episode
    for timestep in range(1, max_steps_per_episode):
        # Use epsilon-greedy for exploration
        if epsilon > np.random.rand(1)[0]:
            # Take random action
            action = np.random.choice(num_actions)
        else:
            # Predict action Q-values
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)
            # Take best action
            action = tf.argmax(action_probs[0]).numpy()

        # Apply the sampled action in our environment
        pos, reward, grid = move(grid, pos, action, p)
        state_next = np.expand_dims(grid, axis=-1)  # Add dimension
        done = np.sum(grid == 1) == 0  # Check if all tiles are cleaned

        episode_reward += reward

        # Save actions and states in replay buffer
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next

        # Update the model if there's enough data in the replay buffer
        if len(done_history) > batch_size:
            # Get indices of samples for replay buffers
            indices = np.random.choice(len(done_history), size=batch_size)

            # Sample from replay buffer
            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = tf.convert_to_tensor(
                [rewards_history[i] for i in indices], dtype=tf.float32
            )
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor([float(done_history[i]) for i in indices])

            # Build the updated Q-values for the sampled future states.
            # Calling the model directly is faster than predict() for a single
            # small batch.
            future_rewards = model_target(state_next_sample, training=False)
            max_next_q = tf.reduce_max(future_rewards, axis=1)
            # Terminal transitions (grid fully cleaned) keep their real reward and
            # simply do not bootstrap from the next state. Forcing the target to -1
            # there would penalise finishing the job.
            updated_q_values = rewards_sample + gamma * max_next_q * (1.0 - done_sample)

            # Create a mask so we only calculate loss on the updated Q-values
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # Train the model on the states and updated Q-values
                q_values = model(state_sample)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Limit the replay buffer size
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            print("Job done at step {}".format(timestep))
            break

    # Decay probability of taking random action
    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    # Update running reward (mean over the last 100 episodes) to check condition for solving
    episode_reward_history.append(episode_reward)

    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    # Periodically copy the trained weights into the target network
    if episode_count % update_target_network == 0:
        model_target.set_weights(model.get_weights())

    episode_count += 1
    template = "Running reward: {:.2f} at episode {}, Reward: {:.2f}, Epsilon: {:0.5f}"
    print(template.format(running_reward, episode_count, episode_reward, epsilon))

    # Show the final grid when the episode ended with dirty tiles left
    if np.sum(grid == 1) != 0:
        print(grid)

    if running_reward > 93:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

I0000 00:00:1743407150.397324 2443516 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1743407151.308860 2443596 service.cc:148] XLA service 0x7f49ec003da0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1743407151.308980 2443596 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4050 Laptop GPU, Compute Capability 8.9
2025-03-31 00:45:51.329239: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1743407152.115961 2443596 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Job done at step 111
Running reward: 90.12 at episode 385, Reward: 88.90, Epsilon: 0.10000
Job done at step 45
Running reward: 90.12 at episode 386, Reward: 95.50, Epsilon: 0.10000
Job done at step 59
Running reward: 90.11 at episode 387, Reward: 94.10, Epsilon: 0.10000
Job done at step 50
Running reward: 90.10 at episode 388, Reward: 95.00, Epsilon: 0.10000
Job done at step 39
Running reward: 90.29 at episode 389, Reward: 96.10, Epsilon: 0.10000
Job done at step 46
Running reward: 90.36 at episode 390, Reward: 95.40, Epsilon: 0.10000
Job done at step 88
Running reward: 90.31 at episode 391, Reward: 91.20, Epsilon: 0.10000
Job done at step 60
Running reward: 90.29 at episode 392, Reward: 93.60, Epsilon: 0.10000
Job done at step 37
Running reward: 90.28 at episode 393, Reward: 96.30, Epsilon: 0.10000
Job done at step 38
Running reward: 90.38 at episode 394, Reward: 96.20, Epsilon: 0.10000
Job done at step 57
Running reward: 90.39 at episode 395, Reward: 94.30, Epsilon: 0.10000
Job done 

### Testing model

In [10]:
def test_model(model, m, n, p, max_steps=250):
    """Roll out the greedy policy of `model` on a fresh random grid and print the path.

    Prints the initial grid, then a copy of it in which every cell the agent
    entered holds the step number of its most recent visit.

    Args:
        model: Q-network called on a (1, m, n, 1) state tensor.
        m: Number of grid rows.
        n: Number of grid columns.
        p: Number of dirty tiles.
        max_steps: The rollout stops once the step counter (starting at 1)
            reaches this value, even if dirty tiles remain.

    Returns:
        None. Results are printed.
    """
    grid = create_grid(m, n, p)
    move_grid = grid.copy()
    pos = (0, 0)
    step_count = 1
    print(grid)

    # Boolean `and` is required here: bitwise `&` binds tighter than the
    # comparisons and would silently drop the step limit.
    while np.sum(grid == 1) != 0 and step_count < max_steps:
        state = np.expand_dims(grid.astype(float), axis=-1)

        state_tensor = tf.convert_to_tensor(state)
        state_tensor = tf.expand_dims(state_tensor, 0)
        action_probs = model(state_tensor, training=False)
        # Always act greedily (no exploration) during testing
        action = tf.argmax(action_probs[0]).numpy()

        pos, reward, grid = move(grid, pos, action, p)
        move_grid[pos[0], pos[1]] = step_count
        step_count += 1

    print(move_grid)
# Run one greedy test episode with `model`
test_model(model, m, n, p)

[[2. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 1. 0. 0.]]
[[ 2.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  2.  3.  4.  0.  0.  0.  0. 31. 32.]
 [ 0.  0.  0.  5.  6.  0.  0.  0. 34. 33.]
 [ 0.  0.  0.  0.  7.  0.  0.  0. 35.  0.]
 [ 0.  0.  0.  0.  8.  0.  0.  0. 36.  0.]
 [ 0.  0.  0.  0.  9.  0. 25. 26. 37.  0.]
 [ 0.  0. 20. 21. 22. 23. 24.  0. 38.  0.]
 [ 0.  0. 19. 12.  0.  0.  0. 40. 39.  0.]
 [ 0.  0. 18. 15. 14.  0.  0. 41.  0.  0.]
 [ 0.  0. 17.  0.  0.  0.  0. 42.  0.  0.]]


### Training the model on a 20 by 20 grid with 10 dirty tiles

In [14]:
# Larger environment: 20 x 20 grid with 10 dirty tiles
m = 20
n = 20
p = 10

def create_q_model_20():
    """Build the Q-network for a 20 x 20 single-channel grid.

    Architecture: Conv(64, 5x5) -> MaxPool(2x2) -> Conv(128, 3x3) -> MaxPool(2x2)
    -> Conv(256, 2x2) -> Flatten -> Dense(512) -> Dense(256)
    -> Dense(num_actions) with a linear output.

    Returns:
        An uncompiled keras.Model mapping a (20, 20, 1) input to num_actions
        output values.
    """
    grid_input = layers.Input(shape=(20, 20, 1))

    # Convolutional feature extractor with two pooling stages
    x = layers.Conv2D(64, (5, 5), padding="same", activation="relu")(grid_input)
    x = layers.MaxPooling2D((2, 2))(x)

    x = layers.Conv2D(128, (3, 3), padding="same", activation="relu")(x)
    x = layers.MaxPooling2D((2, 2))(x)

    x = layers.Conv2D(256, (2, 2), padding="same", activation="relu")(x)

    # Fully connected head; linear activation on the final layer
    x = layers.Flatten()(x)
    x = layers.Dense(512, activation="relu")(x)
    x = layers.Dense(256, activation="relu")(x)
    q_values = layers.Dense(num_actions, activation="linear")(x)

    return keras.Model(inputs=grid_input, outputs=q_values)

import pickle  # used for the periodic replay-buffer checkpoint below

# DQN training loop for the 20 x 20 grid.
# Uses create_grid(), move(), create_q_model_20(), m, n, p and num_actions
# defined above.
model_20 = create_q_model_20()
model_target_20 = create_q_model_20()

optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

gamma = 0.9  # Discount factor for future rewards
epsilon = 1.0  # Epsilon greedy parameter
epsilon_min = 0.1  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon_interval = (
    epsilon_max - epsilon_min
)  # Span between max and min epsilon (not used by the multiplicative decay below)
epsilon_decay = 0.995  # Epsilon is multiplied by this after every episode

batch_size = 32  # Size of batch taken from replay buffer
max_steps_per_episode = 300

# Experience replay buffers (parallel lists, one entry per transition)
action_history_20 = []
state_history_20 = []
state_next_history_20 = []
rewards_history_20 = []
done_history_20 = []

episode_reward_history_20 = []
running_reward = 0
episode_count = 0

max_memory_length = 100000  # Maximum number of transitions kept in the buffer
update_target_network = 50  # Sync target network and checkpoint every this many episodes
loss_function = keras.losses.Huber()

while True:  # Run until solved
    grid = create_grid(m, n, p)
    pos = (0, 0)
    state = np.expand_dims(grid, axis=-1)  # fit shape for conv
    episode_reward = 0

    # Note: range(1, N) performs at most N - 1 steps per episode
    for timestep in range(1, max_steps_per_episode):
        # Use epsilon-greedy for exploration
        if epsilon > np.random.rand(1)[0]:
            # Take random action
            action = np.random.choice(num_actions)
        else:
            # Predict action Q-values
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model_20(state_tensor, training=False)
            # Take best action
            action = tf.argmax(action_probs[0]).numpy()

        # Apply the sampled action in our environment
        pos, reward, grid = move(grid, pos, action, p)
        state_next = np.expand_dims(grid, axis=-1)  # Add dimension
        done = np.sum(grid == 1) == 0  # Check if all tiles are cleaned

        episode_reward += reward

        # Save actions and states in replay buffer
        action_history_20.append(action)
        state_history_20.append(state)
        state_next_history_20.append(state_next)
        done_history_20.append(done)
        rewards_history_20.append(reward)
        state = state_next

        # Update the model if there's enough data in the replay buffer
        if len(done_history_20) > batch_size:
            # Get indices of samples for replay buffers
            indices = np.random.choice(len(done_history_20), size=batch_size)

            # Sample from replay buffer
            state_sample = np.array([state_history_20[i] for i in indices])
            state_next_sample = np.array([state_next_history_20[i] for i in indices])
            rewards_sample = tf.convert_to_tensor(
                [rewards_history_20[i] for i in indices], dtype=tf.float32
            )
            action_sample = [action_history_20[i] for i in indices]
            done_sample = tf.convert_to_tensor([float(done_history_20[i]) for i in indices])

            # Build the updated Q-values for the sampled future states.
            # Calling the model directly is faster than predict() for a single
            # small batch.
            future_rewards = model_target_20(state_next_sample, training=False)
            max_next_q = tf.reduce_max(future_rewards, axis=1)
            # Terminal transitions (grid fully cleaned) keep their real reward and
            # simply do not bootstrap from the next state. Forcing the target to -1
            # there would penalise finishing the job.
            updated_q_values = rewards_sample + gamma * max_next_q * (1.0 - done_sample)

            # Create a mask so we only calculate loss on the updated Q-values
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # Train the model on the states and updated Q-values
                q_values = model_20(state_sample)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model_20.trainable_variables)
            optimizer.apply_gradients(zip(grads, model_20.trainable_variables))

        # Limit the replay buffer size
        if len(rewards_history_20) > max_memory_length:
            del rewards_history_20[:1]
            del state_history_20[:1]
            del state_next_history_20[:1]
            del action_history_20[:1]
            del done_history_20[:1]

        if done:
            print("Job done at step {}".format(timestep))
            break

    # Decay probability of taking random action
    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    # Update running reward (mean over the last 100 episodes) to check condition for solving
    episode_reward_history_20.append(episode_reward)

    if len(episode_reward_history_20) > 100:
        del episode_reward_history_20[:1]
    running_reward = np.mean(episode_reward_history_20)

    if episode_count % update_target_network == 0:
        # Sync the target network, then checkpoint weights and training state
        model_target_20.set_weights(model_20.get_weights())
        model_20.save_weights("model_20.weights.h5")
        model_target_20.save_weights("target_model_20.weights.h5")

        with open("replay_buffer_20.pkl", "wb") as f:
            pickle.dump({
                "action_history_20": action_history_20,
                "state_history_20": state_history_20,
                "state_next_history_20": state_next_history_20,
                "rewards_history_20": rewards_history_20,
                "done_history_20": done_history_20,
                "episode_reward_history_20": episode_reward_history_20,
                "epsilon": epsilon,
                "episode_count": episode_count
            }, f)

    episode_count += 1
    template = "Running reward: {:.2f} at episode {}, Reward: {:.2f}, Epsilon: {:0.5f}"
    print(template.format(running_reward, episode_count, episode_reward, epsilon))

    # Show the final grid when the episode ended with dirty tiles left
    if np.sum(grid == 1) != 0:
        print(grid)

    if running_reward > 80:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

Running reward: -25.50 at episode 1, Reward: -25.50, Epsilon: 0.99500
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 2. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0

### Train more

In [None]:
import pickle  # used for the periodic replay-buffer checkpoint below

# Continue training model_20 until the running reward exceeds 85.
# NOTE(review): this cell defines none of its state. It relies on model_20,
# model_target_20, optimizer, loss_function, the *_history_20 buffers, epsilon,
# episode_count and the hyperparameters already existing in the kernel.
while True:  # Run until solved
    grid = create_grid(m, n, p)
    pos = (0, 0)
    state = np.expand_dims(grid, axis=-1)  # fit shape for conv
    episode_reward = 0

    # Note: range(1, N) performs at most N - 1 steps per episode
    for timestep in range(1, max_steps_per_episode):
        # Use epsilon-greedy for exploration
        if epsilon > np.random.rand(1)[0]:
            # Take random action
            action = np.random.choice(num_actions)
        else:
            # Predict action Q-values
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model_20(state_tensor, training=False)
            # Take best action
            action = tf.argmax(action_probs[0]).numpy()

        # Apply the sampled action in our environment
        pos, reward, grid = move(grid, pos, action, p)
        state_next = np.expand_dims(grid, axis=-1)  # Add dimension
        done = np.sum(grid == 1) == 0  # Check if all tiles are cleaned

        episode_reward += reward

        # Save actions and states in replay buffer
        action_history_20.append(action)
        state_history_20.append(state)
        state_next_history_20.append(state_next)
        done_history_20.append(done)
        rewards_history_20.append(reward)
        state = state_next

        # Update the model if there's enough data in the replay buffer
        if len(done_history_20) > batch_size:
            # Get indices of samples for replay buffers
            indices = np.random.choice(len(done_history_20), size=batch_size)

            # Sample from replay buffer
            state_sample = np.array([state_history_20[i] for i in indices])
            state_next_sample = np.array([state_next_history_20[i] for i in indices])
            rewards_sample = tf.convert_to_tensor(
                [rewards_history_20[i] for i in indices], dtype=tf.float32
            )
            action_sample = [action_history_20[i] for i in indices]
            done_sample = tf.convert_to_tensor([float(done_history_20[i]) for i in indices])

            # Build the updated Q-values for the sampled future states.
            # Calling the model directly is faster than predict() for a single
            # small batch.
            future_rewards = model_target_20(state_next_sample, training=False)
            max_next_q = tf.reduce_max(future_rewards, axis=1)
            # Terminal transitions (grid fully cleaned) keep their real reward and
            # simply do not bootstrap from the next state. Forcing the target to -1
            # there would penalise finishing the job.
            updated_q_values = rewards_sample + gamma * max_next_q * (1.0 - done_sample)

            # Create a mask so we only calculate loss on the updated Q-values
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # Train the model on the states and updated Q-values
                q_values = model_20(state_sample)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model_20.trainable_variables)
            optimizer.apply_gradients(zip(grads, model_20.trainable_variables))

        # Limit the replay buffer size
        if len(rewards_history_20) > max_memory_length:
            del rewards_history_20[:1]
            del state_history_20[:1]
            del state_next_history_20[:1]
            del action_history_20[:1]
            del done_history_20[:1]

        if done:
            print("Job done at step {}".format(timestep))
            break

    # Decay probability of taking random action
    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    # Update running reward (mean over the last 100 episodes) to check condition for solving
    episode_reward_history_20.append(episode_reward)

    if len(episode_reward_history_20) > 100:
        del episode_reward_history_20[:1]
    running_reward = np.mean(episode_reward_history_20)

    if episode_count % update_target_network == 0:
        # Sync the target network, then checkpoint weights and training state
        model_target_20.set_weights(model_20.get_weights())
        model_20.save_weights("model_20.weights.h5")
        model_target_20.save_weights("target_model_20.weights.h5")

        with open("replay_buffer_20.pkl", "wb") as f:
            pickle.dump({
                "action_history_20": action_history_20,
                "state_history_20": state_history_20,
                "state_next_history_20": state_next_history_20,
                "rewards_history_20": rewards_history_20,
                "done_history_20": done_history_20,
                "episode_reward_history_20": episode_reward_history_20,
                "epsilon": epsilon,
                "episode_count": episode_count
            }, f)

    episode_count += 1
    template = "Running reward: {:.2f} at episode {}, Reward: {:.2f}, Epsilon: {:0.5f}"
    print(template.format(running_reward, episode_count, episode_reward, epsilon))

    # Show the final grid when the episode ended with dirty tiles left
    if np.sum(grid == 1) != 0:
        print(grid)

    if running_reward > 85:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

I0000 00:00:1743449443.844834    3355 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1743449444.718647    4038 service.cc:148] XLA service 0x7fd6240155c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1743449444.719092    4038 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4050 Laptop GPU, Compute Capability 8.9
2025-03-31 12:30:44.735743: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1743449446.396926    4038 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Running reward: 74.91 at episode 501, Reward: -115.50, Epsilon: 0.10000
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.

### Test model

In [8]:
def test_model(model, m, n, p, max_steps=250):
    """Roll out the greedy policy of `model` on a fresh random grid and print the path.

    Prints the initial grid, then a copy of it in which every cell the agent
    entered holds the step number of its most recent visit.

    Args:
        model: Q-network called on a (1, m, n, 1) state tensor.
        m: Number of grid rows.
        n: Number of grid columns.
        p: Number of dirty tiles.
        max_steps: The rollout stops once the step counter (starting at 1)
            reaches this value, even if dirty tiles remain.

    Returns:
        None. Results are printed.
    """
    grid = create_grid(m, n, p)
    move_grid = grid.copy()
    pos = (0, 0)
    step_count = 1
    print(grid)

    # Boolean `and` is required here: bitwise `&` binds tighter than the
    # comparisons and would silently drop the step limit.
    while np.sum(grid == 1) != 0 and step_count < max_steps:
        state = np.expand_dims(grid.astype(float), axis=-1)

        state_tensor = tf.convert_to_tensor(state)
        state_tensor = tf.expand_dims(state_tensor, 0)
        action_probs = model(state_tensor, training=False)
        # Always act greedily (no exploration) during testing
        action = tf.argmax(action_probs[0]).numpy()

        pos, reward, grid = move(grid, pos, action, p)
        move_grid[pos[0], pos[1]] = step_count
        step_count += 1

    print(move_grid)
# Run one greedy test episode with `model_20`, using the current m, n, p
test_model(model_20, m, n, p)

[[2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.