In [None]:
import gym
import cv2
import numpy as np
from collections import deque
import imageio
import os
import random
import matplotlib.pyplot as plt
import pickle
import time
import pandas as pd
import itertools

import tensorflow as tf
from keras import backend as K

from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from keras.optimizers import Adam


In [None]:
# Global switch read by Agent.update_target_model and GameConstructor.run:
# when False, no replay/fit step and no target-network sync is performed.
TRAIN = True
# Maximum number of transitions kept in the agent's replay deque; also used
# by the per-episode log to report how full the memory is.
LEN_MEMORY_QUEUE = 60000

In [None]:

def huber_loss(y_true, y_pred, clip_delta=1.0):
    """Element-wise Huber loss between ``y_true`` and ``y_pred``.

    Errors whose magnitude is below ``clip_delta`` are penalised
    quadratically (``0.5 * error**2``); all other errors are penalised
    linearly (``clip_delta * (|error| - 0.5 * clip_delta)``).

    Args:
        y_true: Target tensor.
        y_pred: Predicted tensor.
        clip_delta: Threshold at which the loss switches from quadratic to
            linear.

    Returns:
        A tensor of per-element losses (no reduction is applied).
    """
    backend = tf.keras.backend
    residual = y_true - y_pred

    # Branch used for small residuals.
    quadratic_branch = 0.5 * backend.square(residual)
    # Branch used once the residual magnitude reaches clip_delta.
    linear_branch = clip_delta * (backend.abs(residual) - 0.5 * clip_delta)

    is_small = backend.abs(residual) < clip_delta
    return tf.where(is_small, quadratic_branch, linear_branch)

def huber_loss_mean(y_true, y_pred, clip_delta=1.0):
    """Mean of the element-wise Huber loss; used as the Keras training loss.

    Args:
        y_true: Target tensor.
        y_pred: Predicted tensor.
        clip_delta: Quadratic/linear switch point forwarded to ``huber_loss``.

    Returns:
        The mean of ``huber_loss(y_true, y_pred, clip_delta)`` over all
        elements.
    """
    per_element_loss = huber_loss(y_true, y_pred, clip_delta)
    return tf.keras.backend.mean(per_element_loss)

In [None]:
class Agent():
    """Deep Q-learning agent with replay memory and a target network.

    The agent keeps single preprocessed frames in a bounded replay deque and
    builds network inputs by stacking ``k_frames`` consecutive frames along
    the last axis. ``brain`` is the network that is trained; ``brain_target``
    is a periodically synchronised copy used to compute bootstrap targets.

    Model checkpointing and an exploration schedule map are not implemented
    in this class.
    """

    def __init__(self, state_size, action_size):
        """Create the replay memory, hyper-parameters and both networks.

        Args:
            state_size: ``(height, width)`` of one preprocessed frame.
            action_size: Number of discrete actions (width of the output
                layer).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=LEN_MEMORY_QUEUE)

        # NOTE(review): min_learning_rate (.001) is larger than
        # max_learning_rate (.0008), so the guard in update_learning_rate is
        # never true and the optimizer stays at max_learning_rate. Confirm
        # whether the two values were meant to be swapped.
        self.min_learning_rate = .001
        self.max_learning_rate = .0008
        self.epochs_interval_lr = 1
        self.learning_rate_decay = (self.max_learning_rate - self.min_learning_rate) / self.epochs_interval_lr

        self.gamma = .99
        self.exploration_rate = .7
        self.exploration_min = .1
        # Subtracted from exploration_rate after every replay step.
        self.exploration_decay = 1 / 90000

        self.k_frames = 4
        self.frame_height = self.state_size[0]
        self.frame_width = self.state_size[1]

        self.brain = self._build_model()
        self.brain_target = self._build_model()
        # Start the target network as an exact copy of the online network;
        # otherwise the first bootstrap targets come from an unrelated random
        # initialisation until the first periodic sync.
        self.brain_target.set_weights(self.brain.get_weights())

        self.freq_update_nn = 1000
        self.initial_start_size = 5000

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras ``Sequential`` model taking inputs of shape
            ``(state_size[0], state_size[1], k_frames)`` and producing one
            linear output per action.
        """
        model = Sequential()

        # Convolutional feature extractor.
        model.add(Conv2D(32, (8,8), strides=4, input_shape=(self.state_size[0], self.state_size[1], self.k_frames), activation='relu', padding='valid'))
        model.add(Conv2D(64, (4,4), strides=2, activation='relu', padding='valid'))
        model.add(Conv2D(64, (3,3), strides=1, activation='relu', padding='valid'))

        model.add(Flatten())

        # Fully connected head: one Q-value per action.
        model.add(Dense(512, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss=huber_loss_mean, optimizer=Adam(lr=self.max_learning_rate))

        model.summary()
        return model

    def get_last_k_frames(self, state):
        """Stack the ``next_state`` of the last ``k_frames`` stored transitions.

        Args:
            state: Unused; kept for interface compatibility. The stack is
                built only from the replay memory.

        Returns:
            Array of shape ``(frame_height, frame_width, k_frames)`` with the
            oldest frame first along the last axis.

        Raises:
            IndexError: If the memory holds fewer than ``k_frames``
                transitions.
        """
        frames = np.empty((self.k_frames, self.frame_height, self.frame_width))

        first = len(self.memory) - self.k_frames
        for i in range(self.k_frames):
            _, _, _, n_s, _ = self.memory[first + i]
            frames[i] = n_s

        # (k, H, W) -> (H, W, k): channels-last layout expected by the model.
        return np.transpose(frames, axes=(1,2,0))

    def act(self, state, current):
        """Choose an action with an epsilon-greedy policy.

        A random action is returned when the exploration draw succeeds, when
        the memory holds fewer than ``k_frames + 1`` transitions, or while
        ``current`` is below ``initial_start_size``.

        Args:
            state: Current frame; forwarded to ``get_last_k_frames``.
            current: Global frame counter maintained by the caller.

        Returns:
            Index of the selected action.
        """
        # The random draw is evaluated first, as in the original condition.
        explore = (
            np.random.rand() <= self.exploration_rate
            or len(self.memory) < self.k_frames+1
            or current < self.initial_start_size
        )
        if explore:
            return random.randrange(self.action_size)

        k_frames_state = self.get_last_k_frames(state)
        k_frames_state = np.expand_dims(k_frames_state, axis=0)
        act_values = self.brain.predict(k_frames_state, verbose=0)
        return np.argmax(act_values[0])

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.memory.append((state, action, reward, next_state, done))

    def pack_K_frames(self, sample_batch_size):
        """Sample a batch of stacked-frame transitions from memory.

        For every sample a random start index is drawn and ``k_frames``
        consecutive transitions are stacked; action, reward and done flag are
        taken from the last transition of the window.

        NOTE(review): windows are not checked for episode boundaries, so a
        stack may mix frames from two episodes.

        Args:
            sample_batch_size: Number of samples to draw.

        Returns:
            ``(state, action, reward, next_state, done)`` where ``state`` and
            ``next_state`` have shape
            ``(sample_batch_size, frame_height, frame_width, k_frames)``, or
            ``None`` when the memory holds fewer than ``k_frames + 2``
            transitions.
        """
        if len(self.memory) < self.k_frames+2:
            return

        batch_shape = (sample_batch_size, self.k_frames, self.frame_height, self.frame_width)
        state = np.empty(batch_shape)
        action = np.empty(sample_batch_size, dtype=np.uint8)
        reward = np.empty(sample_batch_size, dtype=np.float32)
        next_state = np.empty(batch_shape)
        done = np.empty(sample_batch_size, dtype=np.bool_)

        # Largest admissible window start (random.randint is inclusive).
        last_start = len(self.memory) - self.k_frames - 2

        for k in range(sample_batch_size):
            index = random.randint(0, last_start)

            for i in range(self.k_frames):
                s, a, r, n_s, d = self.memory[index + i]
                state[k][i] = s
                next_state[k][i] = n_s

            # Values of the most recent transition in the window.
            done[k] = d
            action[k] = a
            reward[k] = r

        # (batch, k, H, W) -> (batch, H, W, k)
        return np.transpose(state, axes=(0,2,3,1)), action, reward, np.transpose(next_state, axes=(0,2,3,1)), done

    def replay(self, sample_batch_size):
        """Run one training step on a batch sampled from the replay memory.

        Targets are ``reward + gamma * max_a Q_target(next_state, a)`` with
        the bootstrap term zeroed for terminal transitions. After fitting,
        ``exploration_rate`` is reduced by ``exploration_decay`` while it is
        above ``exploration_min``.

        Args:
            sample_batch_size: Number of transitions in the training batch.

        Returns:
            ``(history, exploration_rate)`` where ``history`` is the object
            returned by ``brain.fit``; ``None`` when the memory does not yet
            hold enough transitions to sample a batch.
        """
        # pack_K_frames needs at least k_frames + 2 transitions; checking only
        # the batch size would let it return None and break the unpacking.
        if len(self.memory) < max(sample_batch_size, self.k_frames + 2):
            return

        state, action, reward, next_state, done = self.pack_K_frames(sample_batch_size)

        predicted = self.brain_target.predict(next_state, verbose=0)  # Q-values for the next states
        target_f = self.brain.predict(state, verbose=0)               # Q-values for the current states

        # Only the entry of the action actually taken is replaced, so the
        # loss for the remaining actions is zero.
        targets = reward + (self.gamma * np.amax(predicted, axis=1) * (1 - done))
        target_f[np.arange(sample_batch_size), action] = targets

        history = self.brain.fit(state, target_f, batch_size=sample_batch_size, epochs=1, verbose=0)

        if self.exploration_rate > self.exploration_min:
            self.exploration_rate -= self.exploration_decay

        return history, self.exploration_rate

    def update_target_model(self, current_frame):
        """Copy the online weights into the target network when due.

        The copy happens when training is enabled, ``current_frame`` is a
        multiple of ``freq_update_nn`` and the warm-up of
        ``initial_start_size`` frames is over.

        Args:
            current_frame: Global frame counter maintained by the caller.
        """
        if (current_frame % self.freq_update_nn == 0 and TRAIN and current_frame > self.initial_start_size):
            self.brain_target.set_weights(self.brain.get_weights())
            print('-------------------------UPDATED TARGET MODEL() ------------------------------')

    def update_learning_rate(self):
        """Step the optimizer learning rate down by ``learning_rate_decay``.

        Only acts while ``max_learning_rate > min_learning_rate`` and
        ``epochs_interval_lr`` is positive; with the values set in
        ``__init__`` the first condition is never met.
        """
        if self.max_learning_rate > self.min_learning_rate:
            if self.epochs_interval_lr > 0:
                self.max_learning_rate-=self.learning_rate_decay
                self.epochs_interval_lr-=1
                K.set_value(self.brain.optimizer.lr, self.max_learning_rate)  # apply the new learning rate
                print('Current Learning rate: {}'.format(K.eval(self.brain.optimizer.lr)))


In [None]:
class GameConstructor():
    """Runs the training loop of an ``Agent`` on a gym environment.

    Handles frame preprocessing (crop, resize, grayscale, scaling), stepping
    the environment, feeding transitions to the agent, triggering replay and
    target updates, periodic video export and per-episode logging.
    """

    # Directory where episode videos are written.
    MOVIES_DIR = 'experiments/Pong-useToPaper01/movies/'

    def __init__(self, game):
        """Create the environment, the agent and the bookkeeping counters.

        Args:
            game: Environment id passed to ``gym.make``.
        """
        self.env = gym.make(game)
        self.sample_batch_size = 32
        self.episodes = 1000
        self.action_size = self.env.action_space.n
        self.state_size = (84,84)
        self.agent = Agent(self.state_size, self.action_size)
        self.best_score = -99999999
        # Pixels removed from each border before resizing
        # (presumably scoreboard/side walls of the raw frame - confirm).
        self.crop_on_top = 34
        self.crop_on_bottom = 16
        self.crop_on_left = 7
        self.crop_on_right = 7
        # A replay step is run every `frames_skip` frames.
        self.frames_skip = 1
        self.current_frame = 0
        self.time_train_init = time.time()
        self.frames_in_atual_episode = 0
        self.seconds_in_atual_episode = 0
        # A video is saved every `freq_save_video` episodes.
        self.freq_save_video = 10

        self.initialize_dirs()

    def initialize_dirs(self):
        """Create the output directories if they do not exist yet."""
        for directory in [self.MOVIES_DIR]:
            os.makedirs(directory, exist_ok=True)

    def to_gray_scale(self, img):
        """Convert an ``(H, W, 3)`` image to ``(H, W)`` using fixed luma weights."""
        return 0.299*img[:,:,0] + 0.587*img[:,:,1] + 0.114*img[:,:,2]

    def get_frames_per_seconds_in_atual_episode(self):
        """Return the integer frames-per-second rate of the current episode.

        Returns 0 when no time has elapsed since the episode start, instead
        of dividing by zero.
        """
        elapsed = time.time() - self.seconds_in_atual_episode
        if elapsed <= 0:
            return 0
        return int(self.frames_in_atual_episode / elapsed)

    def crop_img(self, img):
        """Remove the configured number of pixels from each border of ``img``."""
        return img[self.crop_on_top:-self.crop_on_bottom, self.crop_on_left:-self.crop_on_right]

    def preprocess_img(self, img):
        """Crop, resize to ``state_size``, convert to grayscale, scale by 1/255.

        Args:
            img: Raw ``(H, W, 3)`` frame from the environment.

        Returns:
            Array of shape ``(state_size[0], state_size[1])``.
        """
        height, width = self.state_size[0], self.state_size[1]
        # cv2.resize expects dsize as (width, height).
        resized = cv2.resize(self.crop_img(img), (width, height), interpolation=cv2.INTER_AREA)
        return self.to_gray_scale(resized) / 255

    def save_image_epoch(self, gif_to_save, epoch, best_current_play=False):
        """Write the frames of an episode to an ``.mp4`` file.

        Args:
            gif_to_save: Sequence of preprocessed frames; multiplied by 255
                and cast to ``uint8`` before writing. Nothing is written when
                it is empty.
            epoch: Episode number used as the file name.
            best_current_play: When true, ``'_best'`` is appended to the
                file name.
        """
        if best_current_play:
            epoch = str(epoch)+'_best'
        else:
            epoch = str(epoch)

        if (len(gif_to_save) > 0):
            imageio.mimwrite(self.MOVIES_DIR+epoch+'.mp4', np.multiply(gif_to_save, 255).astype(np.uint8), fps = 100)
            print('Gif Salvo')

    def restart_chronometer(self):
        """Reset the per-episode frame counter and start timestamp."""
        self.frames_in_atual_episode=0
        self.seconds_in_atual_episode=0

    def run(self):
        """Play ``self.episodes`` episodes, training the agent when ``TRAIN`` is set.

        The environment is always closed on exit, including when the loop is
        interrupted by an exception.
        """
        try:
            for i_episodes in range(self.episodes):
                state = self.env.reset()
                state = self.preprocess_img(state)

                done = False
                current_images_episode = []
                total_reward=0
                history_list = []
                # Value shown in the log until the first replay step of the
                # episode reports the agent's actual exploration rate.
                exploration = 1.0
                self.seconds_in_atual_episode= time.time()

                while not done:
                    action = self.agent.act(state, self.current_frame)
                    next_state, reward, done, info = self.env.step(action)

                    # Clip rewards to {-1, 0, 1}.
                    reward = np.sign(reward)
                    total_reward+=reward
                    next_state = self.preprocess_img(next_state)

                    current_images_episode.append(next_state)

                    self.agent.remember(state, action, reward, next_state, done)
                    state = next_state

                    self.current_frame+=1
                    self.frames_in_atual_episode+=1
                    if TRAIN:
                        if self.current_frame > self.agent.initial_start_size and (self.current_frame % self.frames_skip) == 0:
                            # replay returns None when the memory is still too
                            # small to sample a batch; skip such steps.
                            replay_result = self.agent.replay(self.sample_batch_size)
                            if replay_result is not None:
                                history, exploration = replay_result
                                history_list.append(history.history['loss'])

                        self.agent.update_target_model(self.current_frame)

                if TRAIN and self.current_frame > self.agent.initial_start_size:
                    self.agent.update_learning_rate()
                    if i_episodes % self.freq_save_video == 0:
                        self.save_image_epoch( current_images_episode, i_episodes)

                # Drop the frame buffer before the next episode.
                current_images_episode = []
                # No replay steps happen during warm-up; avoid NumPy's
                # "mean of empty slice" warning while still logging nan.
                mean_loss = np.mean(history_list) if history_list else float('nan')
                print("Episode {}# r: {}# Loss: {:.6} # Trains: {} # eps: {:.3}# Space: {}% 'fps: {}:".format(i_episodes, total_reward, mean_loss, (self.current_frame / self.frames_skip), exploration, round(((len(self.agent.memory) / (LEN_MEMORY_QUEUE)) * 100), 2), self.get_frames_per_seconds_in_atual_episode() ))
                self.restart_chronometer()

        finally:
            if TRAIN:
                print('Finish')

            self.env.close()

In [None]:
# Entry point: build the Pong-v0 training harness and run the episode loop.
# The instance is bound to a module-level name so it stays inspectable from
# later notebook cells.
if __name__ == '__main__':
    game_constructor = GameConstructor('Pong-v0')
    game_constructor.run()