In [1]:
import tensorflow as tf
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace
from IPython.display import clear_output

from keras.models import save_model, load_model
import time

In [2]:
with tf.device("/gpu:0"):
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, RIGHT_ONLY)

In [13]:
total_reward = 0
done = True

for step in range(1000):
    env.render()
    
    if done:
        state = env.reset()
    state, reward, done, info = env.step(env.action_space.sample())
    print(info)
    total_reward += reward
    clear_output(wait=True)
    
env.close()

{'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 350, 'world': 1, 'x_pos': 594, 'y_pos': 84}


In [8]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_space = state_size
        self.action_space = action_size
        self.memory = deque(maxlen=5000)
        self.gamma = 0.8
        self.chosenAction = 0
        
        self.epsilon = 1
        self.max_epsilon = 1
        self.min_epsilon = 0.01
        self.decay_epsilon = 0.0001
        
        self.main_network = self.build_network()
        self.target_network = self.build_network()
        self.update_target_network()
        
    def build_network(self):
        model = Sequential()
        model.add(Conv2D(64, (4,4), strides=4, padding='same', input_shape=self.state_space))
        model.add(Activation('relu'))
        
        model.add(Conv2D(64, (4,4), strides=4, padding='same'))
        model.add(Activation('relu'))
        
        model.add(Conv2D(64, (3,3), strides=4, padding='same'))
        model.add(Activation('relu'))
        model.add(Flatten())
        
        model.add(Dense(512, activation='relu'))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_space, activation='linear'))
        
        model.compile(loss='mse', optimizer=Adam())
        
        return model
        
    def update_target_network(self):
        self.target_network.set_weights(self.main_network.get_weights())
        
    def act(self, state, onGround):
        if onGround < 83:
            print('on ground')
            if random.uniform(0,1) < self.epsilon:
                self.chosenAction = np.random.randint(self.action_space)
                return self.chosenAction
            q_value = self.main_network.predict(state)
            self.chosenAction = np.argmax(q_value[0])
            #print(q_value)
            return self.chosenAction
        else:
            print('not on ground')
            return self.chosenAction
    
    def update_epsilon(self, episode):
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay_epsilon * episode)
        
    def train(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        
        for state, action, reward, next_state, done in minibatch:
            target = self.main_network.predict(state)
            
            if done:
                target[0][action] = reward
            else:
                target[0][action] = (reward + self.gamma * np.amax(self.target_network.predict(next_state)))
                
            self.main_network.fit(state, target, epochs=1, verbose=0)
            
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def get_pred_act(self, state):
        q_values = self.main_network.predict(state)
        return np.argmax(q_values[0])
        
    def load(self, name):
        self.main_network = load_model(name)
        self.target_network = load_model(name)
        
    def save(self, name):
        save_model(self.main_network, name)


In [9]:
action_size = env.action_space.n
state_size = (80, 88, 1)

from PIL import Image

def preprocess_state(state):
    image = Image.fromarray(state)
    image = image.resize((88,80))
    image = image.convert('L')
    image = np.array(image)
    
    return image

In [10]:
num_episodes = 1000000
num_timesteps = 400000
batch_size = 64
DEBUG_LENGTH = 200

In [11]:
with tf.device("/gpu:0"):
    dqn = DQNAgent(state_size, action_size)

In [None]:
with tf.device("/gpu:0"):
    print('STARTING TRAINING')

    stuck_buffer = deque(maxlen=DEBUG_LENGTH)

    for i in range(num_episodes):
        Return = 0
        done = False
        time_step = 0
        onGround = 79

        state = preprocess_state(env.reset())
        state = state.reshape(-1, 80, 88, 1)

        for t in range(num_timesteps):
            #env.render()
            time_step += 1

            if t > 1 and stuck_buffer.count(stuck_buffer[-1]) > DEBUG_LENGTH - 50:
                action = dqn.act(state, onGround=79)
            else:
                action = dqn.act(state, onGround)

            next_state, reward, done, info = env.step(action)

            onGround = info['y_pos']
            stuck_buffer.append(info['x_pos'])

            next_state = preprocess_state(next_state)
            next_state = next_state.reshape(-1, 80, 88, 1)

            dqn.store_transition(state, action, reward, next_state, done)
            state = next_state

            Return += reward
            print(f"Episode is: {i}\nTotal Time Step: {time_step}\nCurrent Reward: {Return}\nEpsilon is: {dqn.epsilon}")

            clear_output(wait=True)

            if done:
                break

            if len(dqn.memory) > batch_size and i > 5:
                dqn.train(batch_size)

        dqn.update_epsilon(i)
        clear_output(wait=True)
        dqn.update_target_network()

    env.close()

not on ground
Episode is: 6
Total Time Step: 10
Current Reward: 3
Epsilon is: 0.9995051237293776


In [10]:
dqn.save('Mario.h5')

AttributeError: 'DQNAgent' object has no attribute 'save'

In [None]:
dqn.load('Mario.h5')

In [None]:
while 1:
    done = False
    state = preprocess_state(env.reset())
    state = state.reshape(-1, 80, 88, 1)
    total_reward = 0
    onGround = 79
    
    while not done:
        env.render()
        action = dqn.act(state, onGround)
        next_state, reward, done, info = env.step(action)
        
        onGround = info['y_pos']
        
        next_state = preprocess_state(next_state)
        next_state = next_state.reshape(-1, 80, 88, 1)
        state = next_state
        
env.close()