In [1]:
from platform import python_version
import gym
from gym import envs
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import itertools
# Record the interpreter and TensorFlow versions used to produce the outputs below.
print(python_version(), tf.__version__, sep="\n")

3.7.5
2.2.0


In [2]:
# Definindo nosso buffer de experience replay:

class ExperienceReplay():
    """Fixed-size circular buffer of (s, a, r, s2, mask) transitions.

    Storage is pre-allocated as NumPy arrays of length ``mem_max``. Once the
    buffer is full, new transitions overwrite the oldest slots.
    """

    def __init__(self, mem_max, input_dims):
        """Allocate zero-filled storage.

        Args:
            mem_max: maximum number of transitions kept in the buffer.
            input_dims: iterable with the shape of a single observation.
        """
        self.mem_max = mem_max    # capacity of the buffer
        self.mem_counter = 0      # total number of transitions ever stored

        state_shape = (self.mem_max, *input_dims)
        self.s = np.zeros(state_shape, dtype=np.float32)           # state before the action
        self.s2 = np.zeros(state_shape, dtype=np.float32)          # state after the action
        self.r = np.zeros(self.mem_max, dtype=np.float32)          # rewards
        self.a = np.zeros(self.mem_max, dtype=np.int32)            # action indices
        # Continuation mask: 1 when the transition did NOT end the episode,
        # 0 when it did (see store_transition).
        self.terminal = np.zeros(self.mem_max, dtype=np.int32)

    def store_transition(self, s, a, r, s2, teminado):
        """Write one transition into the next slot, wrapping around when full.

        Args:
            s: state before the action.
            a: action index.
            r: reward received.
            s2: state after the action.
            teminado: truthy when the episode ended on this transition.
        """
        slot = self.mem_counter % self.mem_max
        self.s[slot], self.s2[slot] = s, s2
        self.a[slot], self.r[slot] = a, r
        # Stored inverted so it can be multiplied directly into the bootstrap term.
        self.terminal[slot] = 1 - int(teminado)
        self.mem_counter += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Only slots that have been written are eligible. Because sampling is
        without replacement, NumPy raises ``ValueError`` if ``batch_size``
        exceeds the number of stored transitions.

        Returns:
            Tuple ``(s, a, r, s2, terminal)`` of arrays indexed by the sample.
        """
        filled = min(self.mem_counter, self.mem_max)
        picks = np.random.choice(filled, batch_size, replace=False)
        fields = (self.s, self.a, self.r, self.s2, self.terminal)
        return tuple(field[picks] for field in fields)



In [3]:
# Agora montaremos a estrutura de nossa DQN() :

##

def fazerDQN(alpha, n_acoes, input_dims, fc1, fc2):
    """Build and compile the fully connected Q-network.

    The network flattens the observation, passes it through two ReLU hidden
    layers and ends in a linear layer with one output per action.

    Args:
        alpha: learning rate passed to the Adam optimizer.
        n_acoes: number of units in the output layer (one per action).
        input_dims: shape of a single observation.
        fc1: number of units in the first hidden layer.
        fc2: number of units in the second hidden layer.

    Returns:
        The compiled ``tf.keras`` Sequential model (Huber loss, Adam).
    """
    keras = tf.keras
    rede = keras.models.Sequential([
        keras.layers.Flatten(input_shape=(input_dims)),
        keras.layers.Dense(fc1, activation='relu'),
        keras.layers.Dense(fc2, activation='relu'),
        keras.layers.Dense(n_acoes, activation=None),  # linear output
    ])

    otimizador = keras.optimizers.Adam(learning_rate=alpha)
    rede.compile(optimizer=otimizador, loss='huber_loss')
    return rede


In [4]:
class Agent():
    """Epsilon-greedy DQN agent backed by an experience-replay buffer.

    A single network (``q_eval``) is used both to pick actions and to compute
    the bootstrap targets; there is no separate target network.
    """

    def __init__(self, alpha, gamma, n_acoes, epsilon, batch_size, input_dims,
                 epsilon_dec=1e-3, epsilon_end=0.01, mem_max=1000000, fname='dqn_save.h5' ):
        """Create the replay buffer and the Q-network.

        Args:
            alpha: learning rate for the network optimizer.
            gamma: discount factor applied to the bootstrap term.
            n_acoes: number of discrete actions.
            epsilon: initial exploration probability.
            batch_size: number of transitions sampled per learning step.
            input_dims: shape of a single observation.
            epsilon_dec: amount subtracted from epsilon after each learning step.
            epsilon_end: lower bound for epsilon.
            mem_max: capacity of the replay buffer.
            fname: path used by save_model / load_model.
        """
        self.acoes = list(range(n_acoes))
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.eps_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memoria = ExperienceReplay(mem_max, input_dims)
        self.q_eval = fazerDQN(alpha, n_acoes, input_dims, 256, 256)

    def store_transition(self, s, a, r, s2, terminado):
        """Forward one transition to the replay buffer."""
        self.memoria.store_transition(s, a, r, s2, terminado)

    def escolher_acao(self, obs):
        """Pick an action for ``obs`` with an epsilon-greedy policy.

        With probability ``epsilon`` a random action is returned; otherwise the
        action with the highest predicted Q-value.
        """
        if np.random.random() < self.epsilon:
            acao = np.random.choice(self.acoes)
        else:
            s = np.array([obs])  # add the batch dimension expected by the network
            acoes = self.q_eval.predict(s)
            acao = np.argmax(acoes)
        return acao

    def learn(self):
        """Run one training step on a random batch and decay epsilon.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if self.memoria.mem_counter < self.batch_size:
            return
        s, a, r, s2, terminados = self.memoria.sample_buffer(self.batch_size)
        q_eval = self.q_eval.predict(s)
        q_next = self.q_eval.predict(s2)

        # Start from the current predictions so that only the entry of the
        # action actually taken contributes to the loss.
        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        # `terminados` is the buffer's 0/1 mask; 0 removes the bootstrap term
        # for transitions that ended the episode.
        q_target[batch_index, a] = r + self.gamma*np.max(q_next, axis=1)*terminados

        self.q_eval.train_on_batch(s, q_target)

        # Clamp so epsilon never drops below its floor, not even for one step.
        self.epsilon = max(self.epsilon - self.epsilon_dec, self.eps_min)

    def save_model(self):
        """Save the Q-network to ``self.model_file``."""
        self.q_eval.save(self.model_file)

    def load_model(self):
        """Replace the Q-network with the model stored at ``self.model_file``."""
        self.q_eval = tf.keras.models.load_model(self.model_file)

In [5]:
# Carregando agente, ambiente e hiperparametros

# Hyperparameters, environment and agent used by the training cell below.
lr = 0.001       # learning rate for the Q-network optimizer
n_games = 500    # number of training episodes

env = gym.make('LunarLander-v2')

agent = Agent(gamma=0.99, epsilon = 1.0, alpha=lr, input_dims=env.observation_space.shape,
              n_acoes=env.action_space.n, mem_max=1000000,
              batch_size=64, epsilon_end=0.01)

# Resume from a previous checkpoint when one can be loaded. This stays
# best-effort (training starts from scratch otherwise), but the reason is
# reported instead of being silently discarded, and KeyboardInterrupt /
# SystemExit are no longer swallowed.
try:
    agent.load_model()
except Exception as err:
    print("Could not load '%s', starting from scratch: %s" % (agent.model_file, err))

In [6]:
# Treinando o modelo e salvando:


# Train for n_games episodes, learning after every environment step and
# saving the network at the end of each episode.
scores = []        # total reward of each finished episode
eps_history = []   # epsilon value at the end of each episode
for i in range(n_games):
    done = False
    score = 0
    observation = env.reset()
    while not done:
        action = agent.escolher_acao(observation)
        observation_, reward, done, info = env.step(action)
        score += reward
        # Gym's TimeLimit wrapper also sets done=True when the step limit is
        # hit and flags that case in `info`. Such a transition is not a real
        # terminal state, so it must keep its bootstrap term in the target.
        terminal = done and not info.get('TimeLimit.truncated', False)
        agent.store_transition(observation, action, reward, observation_, terminal)
        observation = observation_
        agent.learn()
    eps_history.append(agent.epsilon)
    scores.append(score)

    avg_score = np.mean(scores[-100:])  # moving average over the last 100 episodes
    print("episode: ", i, "score %.2f" % score, "average_score %.2f" % avg_score, "epsilon %.2f" % agent.epsilon)
    agent.save_model()

episode:  0 score -151.71 average_score -151.71 epsilon 0.96
episode:  1 score -155.59 average_score -153.65 epsilon 0.90
episode:  2 score -212.37 average_score -173.23 epsilon 0.81
episode:  3 score -389.37 average_score -227.26 epsilon 0.71
episode:  4 score 29.10 average_score -175.99 epsilon 0.57
episode:  5 score -53.26 average_score -155.53 epsilon 0.46
episode:  6 score -187.91 average_score -160.16 epsilon 0.37


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# Plot the score of every episode that actually finished. The x axis is built
# from len(scores) rather than n_games so the plot still works when training
# was interrupted before all episodes ran.
x = list(range(1, len(scores) + 1))
plt.plot(x, scores)
plt.title("Curva de aprendizado")
plt.xlabel("Número de Jogos")
plt.ylabel("Pontuação")
plt.savefig(fname="learning_curve")
plt.show()

In [None]:
# Jogando o jogo:

# Play one rendered episode with the current agent (no learning, nothing is
# stored in the replay buffer). Actions still come from escolher_acao, so
# they remain epsilon-greedy with the agent's current epsilon.
done = False
score = 0
observation = env.reset()
try:
    while not done:
        action = agent.escolher_acao(observation)
        observation_, reward, done, info = env.step(action)
        env.render()
        score += reward
        observation = observation_
    print("Score: ", score)
finally:
    # Always release the render window, even if the episode is interrupted.
    env.close()