In [33]:
# importing mario dependencies
import tensorflow as tf
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace
from IPython.display import clear_output

from keras.models import save_model
from keras.models import load_model 

import time

In [1]:
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, RIGHT_ONLY)



NameError: name 'gym_super_mario_bros' is not defined

total_reward = 0
done = True

for step in range(100000):
    env.render()
    
    if done:
        state = env.reset()
    state, reward, done, info = env.step(env.action_space.sample())
    print(info)
    total_reward += reward
    clear_output(wait=True)
env.close()

In [35]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        # Create variables for agent
        self.state_space = state_size
        self.action_space = action_size
        self.memory = deque(maxlen = 5000)
        self.gamma = 0.8
        self.chosenAction = 0
        
        # Exploration vs explotation
        self.epsilon = 1
        self.max_epsilon = 1
        self.min_epsilon = 0.01
        self.decay_epsilon = 0.0001
        
        # Building Neural Networks for Agent
        self.main_network = self.build_network()
        self.target_network = self.build_network()
        self.update_target_network()
        
    def build_network(self):
        model = Sequential()
        model.add(Conv2D(64, (4,4), strides=4, padding='same', input_shape=self.state_space))
        model.add(Activation('relu'))
        
        model.add(Conv2D(64, (4,4), strides=2, padding='same'))
        model.add(Activation('relu'))
        
        model.add(Conv2D(64, (3,3), strides=1, padding='same'))
        model.add(Activation('relu'))
        model.add(Flatten())
        
        model.add(Dense(512, activation='relu'))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_space, activation='linear'))
        
        model.compile(loss='mse', optimizer=Adam())
        
        return model
    
    def update_target_network(self):
        self.target_network.set_weights(self.main_network.get_weights())  
        
    def act(self, state, onGround):
        if onGround < 83:
            print('On Ground')
            if random.uniform(0,1) < self.epsilon:
                self.chosenAction = np.random.randint(self.action_space)
                return self.chosenAction
            Q_value = self.main_network.predict(state)
            self.chosenAction = np.argmax(Q_value[0])
    #        print(Q_value)
            return self.chosenAction
        else:
            print('Not on Ground')
            return self.chosenAction

    def update_epsilon(self, episode):
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay_epsilon * episode)

    # Train the network
    def train(self, batch_size):
        # minibatch from memory
        minibatch = random.sample(self.memory, batch_size)
        
        # Get variables from batch so we can find q-value
        for state, action, reward, next_state, done in minibatch:
            target = self.main_network.predict(state)
            print(target)
            
            if done:
                target[0][action] = reward
            else:
                target[0][action] = (reward + self.gamma * np.amax(self.target_network.predict(next_state)))
                
            self. main_network.fit(state, target, epochs=1, verbose=0)    
        
        
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def get_pred_act(self, state):
        Q_values = self.main_network.predict(state)
        return np.argmax(Q_values[0])
        
        
    def load(self, name):
        self.main_network = load_model(name)
        self.target_network = load_model(name)
        
    def save(self, name):
        save_model(self.main_network, name)
        
            

In [36]:
action_space = env.action_space.n
state_space = (80, 88, 1)

from PIL import Image

def preprocess_state(state):
    image = Image.fromarray(state)
    image = image.resize((88,80))
    image = image.convert('L')
    image = np.array(image)
    
    return image

#dqn = DQNAgent(state_size)

In [37]:
num_episodes = 1000000
num_timesteps = 400000
batch_size = 64
DEBUG_LENGTH = 300

In [38]:
dqn = DQNAgent(state_space, action_space)

In [39]:
dqn.load('MarioRL.h5')

In [43]:
print('STARTING TRAINING')

stuck_buffer = deque(maxlen=DEBUG_LENGTH)
y_buffer = deque(maxlen=DEBUG_LENGTH)

for i in range(num_episodes):
    Return = 0
    done = False
    time_step = 0
    onGround = 79
    
    state = preprocess_state(env.reset())
    state = state.reshape(-1, 80, 88, 1)
    
    for t in range(num_timesteps):
        env.render()
        time_step += 1
        
        if t > 1 and stuck_buffer.count(stuck_buffer[-1]) > DEBUG_LENGTH - 50:
            action = dqn.act(state, onGround = 79)
        else:
            action = dqn.act(state, onGround)
            
    
            
        
        print('ACTION IS ' + str(action)) 
        

        
        next_state, reward, done, info = env.step(action)
        
        #print(info['y_pos'])
        onGround = info['y_pos']
        stuck_buffer.append(info['x_pos'])
        y_buffer.append(info['y_pos'])
        
        next_state = preprocess_state(next_state)
        next_state = next_state.reshape(-1, 80, 88, 1)
        
        dqn.store_transition(state, action, reward, next_state, done)
        state = next_state
        
        Return += reward
        print('Episode is: {}\nTotal Time Step: {}\nCurrent Reward: {}\nEpsilon is: {}'.format(str(i), str(time_step), str(Return), str(dqn.epsilon)))
        
        clear_output(wait=True)
        
        if done:
            break
        
        if len(dqn.memory) > batch_size and i > 5:
            dqn.train(batch_size)
            
    dqn.update_epsilon(i)
    clear_output(wait=True)
    dqn.update_target_network()
    # Save Model
    
env.close()

[[-2.2937758 23.531214  13.650911  33.302364  24.283241 ]]
[[ 4.3187737 13.163657   9.982404  19.301273   6.511949 ]]
[[ 5.7367096 10.284222   9.135763  15.296214   2.6382604]]
[[ 6.12954   10.53536    9.196941  15.60462    2.9019296]]
[[11.982199  15.12371   13.132543  22.341642   3.6461887]]
[[18.729708 18.463772 15.879629 27.140587  3.842517]]
[[16.527405    6.4254904  10.737882   14.334472    0.83452004]]
[[14.602374    0.15833834  8.981685    6.365984   -2.2022147 ]]
[[14.81761    -0.72502446 11.588421    5.9502664  -1.640629  ]]
[[9.277575  1.3161894 8.731137  5.9944057 2.4218855]]
[[7.5541615 2.806184  7.222246  6.229103  2.8733866]]
[[7.79278  4.495775 7.303487 7.700537 4.015381]]
[[11.756711   7.2552543 11.360169  11.809618   8.232837 ]]
[[12.453796  8.773743 12.582499 12.007861 10.843824]]
[[13.955826 10.49309  15.153606 12.487238 15.023069]]
[[11.462698  9.091023 12.648635  9.653771 13.10728 ]]
[[ 9.658142   7.981949  10.815672   7.6566887 11.184681 ]]
[[ 9.304737   7.850981

KeyboardInterrupt: 

In [44]:
dqn.save('marioRL.h5')

dqn.load('MarioRL.h5')

# Visualizing model

while 1:
    done = False
    state = preprocess_state(env.reset())
    state = state.reshape(-1, 80, 88, 1)
    total_reward = 0
    onGround = 79
    
    while not done:
        env.render()
        action = dqn.act(state, onGround)
        next_state, reward, done, info = env.step(action)
        
        onGround = info['y_pos']
        
        next_state = preprocess_state(next_state)
        next_state = next_state.reshape(-1, 80, 88, 1)
        state = next_state
        clear_output(wait=True)
        
env.close()