In [18]:
### importing Dependencies 
import tensorflow as tf
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPool2D
from tensorflow.keras.optimizers import Adam
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace
from IPython.display import clear_output

from keras.models import save_model
from keras.models import load_model 
from tqdm.keras import TqdmCallback


import time
tf.__version__

'2.5.0'

In [19]:
# Build the Super Mario Bros environment and restrict its controller to the
# RIGHT_ONLY action set in a single expression.
env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'), RIGHT_ONLY)

In [3]:
# total_reward = 0 
# done = True

# for step in range(10000):
#     env.render()

#     if done:
#         state = env.reset()
#     state, reward, done, info = env.step(env.action_space.sample())
#     print(info)
#     total_reward += reward
#     clear_output(wait=True)

# env.close()


In [20]:
class DQNAgent:
    """Deep Q-Network agent with a main network and a separate target network.

    The agent keeps a bounded replay memory of transitions, selects actions
    epsilon-greedily from the main network, and trains the main network on
    targets bootstrapped from the target network.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build both networks.

        Args:
            state_size: Input shape passed to the first Conv2D layer.
            action_size: Number of discrete actions (size of the output layer).
        """
        # Create the Variables for our agent
        self.state_space = state_size
        self.action_space = action_size
        self.memory = deque(maxlen=5000)  # replay buffer; oldest entries are dropped
        self.gamma = 0.8                  # discount applied to the bootstrapped Q-value
        self.chosenAction = 0             # last action returned by act()

        # Exploration vs Exploitation
        self.epsilon = 1
        self.max_epsilon = 1
        self.min_epsilon = 0.01
        self.decay_epsilon = 0.001

        # Building NN for Agent
        self.main_network = self.build_network()
        self.target_network = self.build_network()
        self.update_target_network()

    def build_network(self):
        """Build and compile the convolutional Q-network.

        Three Conv2D+ReLU stages are followed by a Flatten, three ReLU Dense
        layers (512, 512, 256) and a linear output with one unit per action.

        Returns:
            A compiled Keras Sequential model (MSE loss, Adam optimizer).
        """
        model = Sequential()
        model.add(Conv2D(64, (4, 4), strides=4, padding='same', input_shape=self.state_space))
        model.add(Activation('relu'))

        model.add(Conv2D(64, (4, 4), strides=2, padding='same'))
        model.add(Activation('relu'))

        model.add(Conv2D(64, (3, 3), strides=1, padding='same'))
        model.add(Activation('relu'))
        model.add(Flatten())

        model.add(Dense(512, activation='relu'))
        model.add(Dense(512, activation='relu'))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_space, activation='linear'))

        model.compile(loss='mse', optimizer=Adam())

        return model

    def update_target_network(self):
        """Copy the main network's weights into the target network."""
        self.target_network.set_weights(self.main_network.get_weights())

    def act(self, state, onGround):
        """Pick an action epsilon-greedily, or repeat the previous one.

        Args:
            state: Batched observation passed to the main network's predict().
            onGround: Value compared against 83; a new action is only chosen
                when it is below 83.
                NOTE(review): presumably a vertical-position reading used to
                detect that Mario is on the ground - confirm against caller.

        Returns:
            The chosen action index (also stored in ``self.chosenAction``).
        """
        if onGround < 83:
            if random.uniform(0, 1) < self.epsilon:
                # Explore: uniformly random action.
                self.chosenAction = np.random.randint(self.action_space)
            else:
                # Exploit: greedy action from the main network.
                Q_value = self.main_network.predict(state)
                self.chosenAction = np.argmax(Q_value[0])
        # Fix: the greedy branch previously fell through without returning,
        # so callers received None. Every path now returns the stored action.
        return self.chosenAction

    def update_epsilon(self, episode):
        """Decay epsilon exponentially from max_epsilon towards min_epsilon.

        Args:
            episode: Episode index used as the decay exponent's multiplier.
        """
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay_epsilon * episode)

    # Train the network
    def train(self, batch_size):
        """Fit the main network on a random minibatch from replay memory.

        Each sampled transition is fitted individually (10 epochs per sample).
        If the memory holds fewer than ``batch_size`` transitions the call is a
        no-op, because random.sample would otherwise raise ValueError.

        Args:
            batch_size: Number of transitions to sample.
        """
        if len(self.memory) < batch_size:
            return

        # minibatch from memory
        minibatch = random.sample(self.memory, batch_size)
        # Get Variables from batch so we can find q-value
        for state, action, reward, next_state, done in minibatch:
            target = self.main_network.predict(state)

            if done:
                # Terminal transition: no bootstrapped future value.
                target[0][action] = reward
            else:
                target[0][action] = (reward + self.gamma * np.amax(self.target_network.predict(next_state)))

            self.main_network.fit(state, target, epochs=10, verbose=0, callbacks=[TqdmCallback(verbose=2)])

    def store_transistion(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) tuple to memory."""
        self.memory.append((state, action, reward, next_state, done))

    # Correctly spelled alias; the original misspelled name is kept for callers.
    store_transition = store_transistion

    def get_pred_act(self, state):
        """Return the greedy action for ``state`` from the main network (no exploration)."""
        Q_value = self.main_network.predict(state)
        return np.argmax(Q_value[0])

    def load(self, name):
        """Load the model file ``name`` into both the main and target networks."""
        self.main_network = load_model(name)
        self.target_network = load_model(name)

    def save(self, name):
        """Save the main network to the file ``name``."""
        save_model(self.main_network, name)







In [21]:
# Shape handed to the network's first layer; it matches the
# reshape(-1, 80, 88, 1) applied to preprocessed frames.
state_space = (80, 88, 1)
# Number of discrete actions exposed by the wrapped environment.
action_space = env.action_space.n

from PIL import Image

def preprocess_state(state):
    """Shrink a frame to 88x80 (width x height), convert it to greyscale and
    return it as a NumPy array.

    Args:
        state: Image array accepted by ``Image.fromarray``.
            # assumes an RGB frame from env.reset()/env.step() - TODO confirm

    Returns:
        The resized, single-channel ('L' mode) image as a NumPy array.
    """
    greyscale = Image.fromarray(state).resize((88, 80)).convert('L')
    return np.array(greyscale)

In [22]:
# Training-run settings.
num_episodes = 1_000_000   # upper bound on episodes
num_timesteps = 400_000    # upper bound on steps per episode
batch_size = 64            # transitions sampled per training call
DEBUG_LENGTH = 300



In [23]:
# Instantiate the agent with the frame shape and the size of the action set.
dqn = DQNAgent(state_size=state_space, action_size=action_space)

KeyError: None

In [9]:
# Persist the agent's main network to disk.
dqn.save(name='mariorl.h5')

In [24]:
# Viz: replay episodes forever using the agent's greedy policy.
# The loop has no natural exit, so it is stopped with a kernel interrupt; the
# try/finally guarantees env.close() runs (previously that line was
# unreachable and the render window was never released).
try:
    while True:
        done = False
        state = preprocess_state(env.reset())
        state = state.reshape(-1, 80, 88, 1)
        total_reward = 0

        while not done:
            env.render()
            action = dqn.get_pred_act(state)
            next_state, reward, done, info = env.step(action)
            # Accumulate the episode return (it was initialised but never updated).
            total_reward += reward

            next_state = preprocess_state(next_state)
            next_state = next_state.reshape(-1, 80, 88, 1)

            state = next_state
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop the visualisation.
    pass
finally:
    env.close()

In [25]:
# Restore previously saved weights into the agent's networks.
dqn.load(name='mariorl.h5')

KeyboardInterrupt: 