In [2]:
# Imports

import numpy as np
import gym
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPool2D, Flatten, BatchNormalization
from keras.optimizers import rmsprop
import keras.backend as K
import random
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [3]:
# Build the Ms. Pac-Man environment and list the discrete action ids the agent picks from
env = gym.make("MsPacman-v4")
action_space = list(range(9))  # action ids 0 through 8: all possible movements


In [4]:
def prepare(render):
    """Crop and downsample a raw frame into the single-channel array fed to the model.

    Keeps the first 166 rows (cutting off everything below them), then takes
    every second row and every second column of channel 0.

    Args:
        render: Array indexable as ``[row, column, channel]``.

    Returns:
        ``float64`` array with a trailing singleton channel axis; a
        ``(210, 160, 3)`` input yields shape ``(83, 80, 1)``.
    """
    # Cut off the bottom of the frame
    cropped = render[:166]

    # Halve the resolution and keep one channel: smaller input, easier for the model to digest
    reduced = cropped[::2, ::2, 0]

    # np.float64 is the dtype the removed ``np.float`` alias used to resolve to
    return reduced.astype(np.float64)[:, :, None]

In [5]:

def rewards(r, gamma=0.75):
    """Compute the discounted return for every time step of an episode.

    Walks the rewards backwards and accumulates the Bellman recursion
    ``G[t] = r[t] + gamma * G[t + 1]``, so each step is credited with the
    rewards that follow it. The running total is carried across non-zero
    rewards; it is never reset inside an episode.

    Args:
        r: 1-D sequence of per-step rewards.
        gamma: Discount factor applied to future rewards.

    Returns:
        ``float64`` array with the same length as ``r``.
    """
    # Force a float accumulator: zeros_like on integer rewards would give an
    # integer array and silently truncate every discounted value
    r = np.asarray(r, dtype=np.float64)
    discounted = np.zeros_like(r)
    running_total = 0.0

    for t in reversed(range(len(r))):
        # Cheap running total that carries momentum from later rewards
        running_total = r[t] + running_total * gamma
        discounted[t] = running_total

    return discounted


def discounted_reward(r):
    """Discount an episode's rewards and standardise them to z-scores.

    Args:
        r: 1-D sequence of per-step rewards, passed straight to ``rewards``.

    Returns:
        The discounted returns with their mean subtracted and divided by
        their standard deviation. When the standard deviation is zero (every
        discounted return is identical, e.g. an episode with no reward at
        all) an all-zero array is returned instead of NaN values.
    """
    dr = rewards(r)
    centered = dr - dr.mean()
    spread = dr.std()

    # A zero spread would turn every z-score into NaN (0 / 0)
    if spread == 0:
        return np.zeros_like(centered)

    return centered / spread



In [6]:
# Creating model with conv2d layers to observe image
model = Sequential()
model.add(Conv2D(16, kernel_size=(1,1), padding='same', activation='relu', input_shape = (83,80,1)))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Conv2D(32, kernel_size=(1,1), padding='same', activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(1,1), padding='same', activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Flatten())

# Setting the output size with softmax actication or a probabiity dist
model.add(Dense(len(action_space), activation='softmax'))

# RMSPROP seemed to work alot better tha adam, which seemed to die after a while
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy') 





Instructions for updating:
keep_dims is deprecated, use keepdims instead




In [7]:
# Training configuration and per-episode bookkeeping
episode = 0
total_episodes = 1000
reward_sums = np.zeros(total_episodes)  # total (undiscounted) reward of each episode
loss = np.zeros(total_episodes)         # evaluated loss after each episode's update
time_steps = np.zeros(total_episodes)   # number of steps each episode lasted

running_reward = 0

# Size of the game frame after it has been prepared
input_size = (83, 80, 1)

prev_frame = None
steps = 15000 # How many action steps are allowed before the env is restarted

# Report and checkpoint roughly 100 times over the run. The floor of 1 keeps
# the modulo below from dividing by zero when total_episodes is under 100.
log_every = max(1, total_episodes // 100)

# Buffers sized for the longest possible episode, reused for every episode.
# Frames are kept as float32 (Keras' default float type) to halve the memory
# of this buffer; frame differences are small integers, so nothing is lost.
xs = np.zeros((steps,)+input_size, dtype=np.float32)
ys = np.zeros((steps,1))
rs = np.zeros((steps))

# Spin up env
current_steps = 0
observation = env.reset()

while episode < total_episodes:
    # env.render() # uncomment line to show the gameplay and learning

    # The model is fed the difference between consecutive prepared frames
    # (all zeros on the first step of an episode)
    x = prepare(observation)
    xs[current_steps] = x - prev_frame if prev_frame is not None else np.zeros(input_size)
    prev_frame = x

    # Sample the action from the model's output distribution instead of taking
    # the argmax, so the agent keeps exploring the environment
    p = model.predict(xs[current_steps][None,:,:,:])
    a = np.random.choice(len(action_space), p=p[0])
    action = action_space[a]
    ys[current_steps] = a # save the index of the chosen action as the training label

    # Take a step in the environment and record what came back
    observation, reward, done, info = env.step(action)
    running_reward += reward # record total rewards
    rs[current_steps] = reward # record reward

    current_steps += 1

    if done or current_steps == steps:
        reward_sums[episode] = running_reward
        running_reward = 0

        # Trim the buffers to the part this episode actually filled
        episode_x = xs[:current_steps]
        episode_y = ys[:current_steps]
        episode_r = rs[:current_steps]

        # Discount and standardize rewards; they become the per-sample weights
        episode_r = discounted_reward(episode_r)

        # Weights that are all zero or not finite carry no learning signal and
        # would only risk pushing NaNs into the network, so skip the update
        has_signal = np.all(np.isfinite(episode_r)) and np.any(episode_r)

        # Train model: each sampled action is reinforced in proportion to its weight
        if has_signal:
            model.fit(episode_x,episode_y, sample_weight=episode_r, batch_size=512, epochs=1, verbose=0)

        time_steps[episode] = current_steps

        # Reset env for new game
        current_steps = 0
        prev_frame = None
        observation = env.reset()

        # loss[episode] stays 0 for an episode whose update was skipped
        if has_signal:
            loss[episode] = model.evaluate(episode_x,episode_y, sample_weight=episode_r, batch_size=512, verbose=0)

        # Iterate episode
        episode += 1

        # Print a progress line and save the model every log_every episodes;
        # the averages cover the most recent 200 episodes (fewer early on)
        if episode % log_every == 0:
            avg_reward = np.mean(reward_sums[max(0,episode-200):episode])
            avg_loss = np.mean(loss[max(0,episode-200):episode])
            avg_time = np.mean(time_steps[max(0,episode-200):episode])
            model.save('iLoveYoutNOT')
            print(f"{avg_reward} {avg_loss} {avg_time}")
        








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


314.0 -0.375464635045125 796.0
328.0 -0.2606655054145767 765.35
335.0 -0.22350086265722752 744.5
371.25 -0.19029911307721387 780.275
380.6 -0.17677964520050776 792.14
400.0 -0.16964681085125635 789.3666666666667
399.7142857142857 -0.1639542481300238 786.8428571428572
399.125 -0.15368137607167326 797.325
395.55555555555554 -0.1496067949832614 790.1555555555556
400.7 -0.14888389275287423 795.85
418.27272727272725 -0.15614858094549441 794.3363636363637
422.75 -0.16420821972793426 785.8583333333333
425.84615384615387 -0.1599876157535034 784.1692307692308
430.07142857142856 -0.16137067789615983 782.4714285714285
431.8 -0.16891697053754878 776.9
425.9375 -0.1744406007569319 766.36875
425.4117647058824 -0.17822646663009958 760.0058823529412
423.3888888888889 -0.180279432445229

In [25]:
# Plays one episode on screen, choosing actions with the current model
import time

observation = env.reset()
try:
    env.render()

    # Pause to be able to get my window in the correct area
    time.sleep(5)

    # First frame, so there is no previous frame and the episode is not done
    prev_frame = None
    done = False
    while not done:

        # Prepare the image and calculate the difference of frames. zeros_like(x)
        # stands in for the first frame, so this cell does not rely on the shape
        # constant defined in the training cell.
        x = prepare(observation)
        diff = x - prev_frame if prev_frame is not None else np.zeros_like(x)
        prev_frame = x

        # Predict the action distribution and sample from it
        p = model.predict(diff[None,:,:,:])
        a = np.random.choice(len(action_space), p=p[0])
        action = action_space[a]

        # Render the frame, then advance the game
        env.render()
        observation, reward, done, info = env.step(action)
        time.sleep(.02)
finally:
    # Always close the environment, even if prediction or stepping raised
    env.close()

In [24]:
# Round-trip check: load the saved checkpoint back to make sure it is readable.
# NOTE(review): `keras` here is `tensorflow.keras` (from `from tensorflow import keras`),
# while the model was built with `Sequential` imported from the standalone `keras`
# package; confirm the two are compatible for this file.
reload_model = keras.models.load_model(filepath='iLoveYoutNOT')