In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random
from collections import deque
import time
from IPython.display import display, clear_output
from tqdm import tqdm

In [102]:
import importlib
import games #import the module here, so that it can be reloaded.
importlib.reload(games)
Game2 = games.Game2

In [150]:
class DeepQNetwork:
    def __init__(self, model, exp_size):
        self.epsilon = 1.0
        self.gamma = 0.97
        self.epsilin_down_factor = 0.995
        
        self.policy_network = model
        self.target_network = keras.models.clone_model(self.policy_network)
        self.target_network.set_weights(self.policy_network.get_weights())
        self.policy_network.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), 
                                    loss=keras.losses.mean_squared_error)
        
        self.replay_buffer = deque(maxlen=exp_size)
        
        self.steps = 0
        self.c = 3
        self.win_episodes = []
        self.consecutive_wins = 0
        self.rewards_gained = 0
        
    
    def get_action(self, state_input, action_list, random=True):
        if not random:
            action = np.argmax(self.policy_network.predict(state_input, verbose=0))
        elif np.random.rand() > self.epsilon:
            action = np.argmax(self.policy_network.predict(state_input, verbose=0))
        else:
            action = np.random.choice(action_list, 1).item()
        return action
    
    
    def action(self, state_input):
        np.argmax(self.target_network.predict(state_input))
    
    
    def train(self, batch_size, state_input, action, reward, next_state_input, done):
        self.replay_buffer.append((state_input, action, reward, next_state_input, done))
        
        if batch_size >= len(self.replay_buffer):
            return
        else:
            memories = random.sample(self.replay_buffer, batch_size)
        states = np.squeeze(np.array([memory[0] for memory in memories]))
        actions = np.array([memory[1] for memory in memories])
        rewards = np.array([memory[2] for memory in memories])
        next_states = np.squeeze(np.array([memory[3] for memory in memories]))
        dones = np.array([memory[4] for memory in memories])
        
        q_values = self.policy_network.predict(states, verbose=0)
        next_q_values = self.target_network.predict(next_states, verbose=0)
        
        targets = np.copy(q_values)
        for i in range(batch_size):
            targets[i, int(actions[i])] = rewards[i] + self.gamma * np.max(next_q_values[i]) * (1 - dones[i])
        
        self.policy_network.fit(states, targets, batch_size=32, epochs=1, verbose=0)
        
        self.steps += 1
        if self.steps == self.c:
            self.target_network.set_weights(self.policy_network.get_weights()) 
            self.steps = 0
        
        if reward > 0:
            self.rewards_gained += 1
            self.epsilon = max(0.1, self.epsilon * self.epsilin_down_factor)
        
        if done:
            gain_perc = self.rewards_gained / 8
            print(f'{gain_perc:0.5f}')
            self.rewards_gained = 0
            if gain_perc >= 0.7:
                self.win_episodes.append(episode)
                self.consecutive_wins += 1
            elif gain_perc < 0.7:
                self.consecutive_wins = 0


In [151]:
x, y = 5, 5
num_actions = 4
action_list = np.array(range(num_actions))
consecutive_wins_lmt = 5
max_steps_replay = 5

model = keras.Sequential(
    [
        keras.Input(shape=(x*y*max_steps_replay)),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(4, activation='linear')
    ]
)
agent = DeepQNetwork(model, 1000000)

In [None]:
# init_agent_pos = 13
init_agent_pos = 15
# init_rewards_pos = [3, 4, 8, 11, 22, 24, 17, 7]
init_rewards_pos = [3, 2, 13, 11, 23, 24, 16, 0]
init_holes_pos = []
max_steps = 30
env = Game2(x, y, init_agent_pos, init_rewards_pos, init_holes_pos, max_steps, max_steps_replay)

agent.epsilon = 1.0
agent.consecutive_wins = 0
agent.win_episodes = []

for episode in range(500):
    print(f'episode: {episode}, wins: {agent.win_episodes}, epsilon: {agent.epsilon}')
    observation, reward, done = env.reset()
    while not done:
#         state_input = observation.reshape(-1, x * y)
        action = agent.get_action(observation, action_list)
        next_observation, reward, done = env.step(action)
#         next_state_input = next_observation.reshape(-1, x * y)
        agent.train(256, observation, action, reward, next_observation, done)
        observation = next_observation
    if agent.consecutive_wins >= consecutive_wins_lmt:
        break

episode: 0, wins: [], epsilon: 1.0
episode: 1, wins: [], epsilon: 1.0
episode: 2, wins: [], epsilon: 1.0
episode: 3, wins: [], epsilon: 1.0
episode: 4, wins: [], epsilon: 1.0
episode: 5, wins: [], epsilon: 1.0
episode: 6, wins: [], epsilon: 1.0
episode: 7, wins: [], epsilon: 1.0
episode: 8, wins: [], epsilon: 1.0
episode: 9, wins: [], epsilon: 1.0
episode: 10, wins: [], epsilon: 1.0
episode: 11, wins: [], epsilon: 1.0
episode: 12, wins: [], epsilon: 1.0
episode: 13, wins: [], epsilon: 1.0
episode: 14, wins: [], epsilon: 1.0
episode: 15, wins: [], epsilon: 1.0
episode: 16, wins: [], epsilon: 1.0
episode: 17, wins: [], epsilon: 1.0
episode: 18, wins: [], epsilon: 1.0
episode: 19, wins: [], epsilon: 1.0
episode: 20, wins: [], epsilon: 1.0
episode: 21, wins: [], epsilon: 1.0
episode: 22, wins: [], epsilon: 1.0
episode: 23, wins: [], epsilon: 1.0
episode: 24, wins: [], epsilon: 1.0
episode: 25, wins: [], epsilon: 1.0
episode: 26, wins: [], epsilon: 1.0
episode: 27, wins: [], epsilon: 1.0
ep

0.00000
episode: 158, wins: [133, 141], epsilon: 0.37251769488706843
0.00000
episode: 159, wins: [133, 141], epsilon: 0.37251769488706843
0.25000
episode: 160, wins: [133, 141], epsilon: 0.36880183088056995
0.25000
episode: 161, wins: [133, 141], epsilon: 0.36512303261753626
0.00000
episode: 162, wins: [133, 141], epsilon: 0.36512303261753626
0.25000
episode: 163, wins: [133, 141], epsilon: 0.3614809303671764
0.12500
episode: 164, wins: [133, 141], epsilon: 0.3596735257153405
0.25000
episode: 165, wins: [133, 141], epsilon: 0.35608578229633
0.12500
episode: 166, wins: [133, 141], epsilon: 0.3543053533848483
0.12500
episode: 167, wins: [133, 141], epsilon: 0.35253382661792404
0.00000
episode: 168, wins: [133, 141], epsilon: 0.35253382661792404
0.25000
episode: 169, wins: [133, 141], epsilon: 0.34901730169741024
0.00000
episode: 170, wins: [133, 141], epsilon: 0.34901730169741024
0.00000
episode: 171, wins: [133, 141], epsilon: 0.34901730169741024
0.12500
episode: 172, wins: [133, 141], 

0.50000
episode: 258, wins: [133, 141, 196, 209, 219, 227, 229, 233, 236, 238, 249], epsilon: 0.11820406108847166
0.75000
episode: 259, wins: [133, 141, 196, 209, 219, 227, 229, 233, 236, 238, 249, 258], epsilon: 0.11470197137452155
0.62500
episode: 260, wins: [133, 141, 196, 209, 219, 227, 229, 233, 236, 238, 249, 258], epsilon: 0.11186295456362313
0.75000
episode: 261, wins: [133, 141, 196, 209, 219, 227, 229, 233, 236, 238, 249, 258, 260], epsilon: 0.1085487359239089
1.00000
episode: 262, wins: [133, 141, 196, 209, 219, 227, 229, 233, 236, 238, 249, 258, 260, 261], epsilon: 0.1042820154910064
0.00000
episode: 263, wins: [133, 141, 196, 209, 219, 227, 229, 233, 236, 238, 249, 258, 260, 261], epsilon: 0.1042820154910064
0.00000
episode: 264, wins: [133, 141, 196, 209, 219, 227, 229, 233, 236, 238, 249, 258, 260, 261], epsilon: 0.1042820154910064
0.00000
episode: 265, wins: [133, 141, 196, 209, 219, 227, 229, 233, 236, 238, 249, 258, 260, 261], epsilon: 0.1042820154910064
0.00000
episo

In [134]:
# init_agent_pos = 15
init_agent_pos = 13
# init_rewards_pos = [3, 2, 13, 11, 23, 24, 16, 0]
init_rewards_pos = [3, 4, 8, 11, 22, 24, 17, 7]
init_holes_pos = []
max_steps = 30
env = Game2(x, y, init_agent_pos, init_rewards_pos, init_holes_pos, max_steps, max_steps_replay)
observation, reward, terminated = env.reset()
env.render()
time.sleep(0.2)
while not terminated:
#     state_input = observation.reshape(-1, x * y)
#     action = agent.get_action(state_input, action_list)
    action = np.argmax(agent.policy_network.predict(observation))
    next_observation, reward, terminated = env.step(action)
    clear_output(wait=True)
    env.render()
    time.sleep(0.2)
    observation = next_observation

| | | |O|O|
| | |O|O| |
| | | | | |
| | |O| | |
| | |O| |O|


In [128]:
len(agent.replay_buffer)

10000

# Game testing

In [80]:
init_agent_pos = 0
init_rewards_pos = [3, 4, 8, 11, 22, 24, 17, 7]
init_holes_pos = []
max_steps = 30
env = Game2(x, y, init_agent_pos, init_rewards_pos, init_holes_pos, max_steps, 3)
observation, reward, terminated = env.reset()
env.render()
time.sleep(0.2)
while not terminated:
    action = random.randint(0, 3)
    next_observation, reward, terminated = env.step(action)
    clear_output(wait=True)
    env.render()
    time.sleep(0.2)

| | | |O|O|
| | |O|O| |
| |O| | | |
| | |O| | |
| | |O| |O|


In [79]:
init_agent_pos = 0
init_rewards_pos = [3, 4, 8, 11, 22, 24, 17, 7]
init_holes_pos = []
max_steps = 30
env = Game2(x, y, init_agent_pos, init_rewards_pos, init_holes_pos, max_steps, 5)
while True:
    observation, reward, terminated = env.reset()
    env.render()
    time.sleep(0.2)
    while not terminated:
        action = random.randint(0, 3)
        next_observation, reward, terminated = env.step(action)
        clear_output(wait=True)
        env.render()
        time.sleep(0.2)

| | | |O|O|
| | |O|O| |
| |Y| | | |
| | |O| | |
| | |O| |O|


KeyboardInterrupt: 