# 1. Install Dependencies

In [None]:
!pip install gym[atari]
!pip install numpy
!pip install keras==2.1.2
!pip install tensorflow==1.15.5
!pip install matplotlib

# 2. Import Dependencies

In [None]:
import numpy as np

import gym
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPool2D, Flatten, BatchNormalization
from keras.optimizers import rmsprop
import keras.backend as K

import matplotlib.pyplot as plt
%matplotlib inline

# 3. Create The Environment

In [None]:
# Gym environment id for the Pong game used throughout the notebook.
ENV_ID = 'PongDeterministic-v4'
env = gym.make(ENV_ID)

# The agent only chooses between these two emulator action ids; index 0 is
# treated as "up" and index 1 as "down" by the rest of the notebook.
action_space = [4, 5]

# 4. Prepare The Image

In [None]:
def prepro(I):
    """Convert a 210x160x3 uint8 frame into an 80x80x1 float mask.

    The frame is cropped to rows 35..194, downsampled by a factor of 2 in
    both spatial directions, and reduced to its first colour channel. Pixels
    equal to 0, 144 or 109 (the two background shades) become 0.0; every
    other pixel (paddles, ball) becomes 1.0.

    Unlike a slice-and-assign implementation, this never writes into `I`,
    so the caller's frame is left untouched.

    Args:
        I: array-like frame indexed as (row, column, channel).

    Returns:
        np.ndarray of shape (80, 80, 1) and dtype float64 holding 0.0 / 1.0.
    """
    # Crop + downsample + channel select in one slice; this is a view, so it
    # must only be read from, never assigned to.
    channel = np.asarray(I)[35:195:2, ::2, 0]
    # Foreground = anything that is neither already 0 nor a background shade.
    foreground = (channel != 0) & (channel != 144) & (channel != 109)
    # Builtin `float` replaces the `np.float` alias, which newer NumPy removed.
    return foreground.astype(float)[:, :, None]

# 5. Create Discounted Reward

In [None]:
gamma = 0.99  # discount factor applied per time step
def discount_rewards(r):
    """Compute the discounted return for every step of a 1D reward sequence.

    Walking backwards through `r`, each entry becomes
    `r[t] + gamma * discounted[t + 1]`. Whenever `r[t]` is non-zero the
    running sum is reset first, because in Pong a non-zero reward marks the
    end of a rally (a game boundary).

    Args:
        r: 1D array-like of per-step rewards. Integer input is accepted; the
            result is always floating point so discounted values are not
            truncated.

    Returns:
        np.ndarray with the same length as `r`. Floating-point inputs keep
        their dtype; any other dtype yields float64.
    """
    r = np.asarray(r)
    # zeros_like(r) would inherit an integer dtype and silently truncate
    # values such as 0.99 to 0, so pick a floating dtype explicitly.
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    discounted_r = np.zeros(r.shape, dtype=out_dtype)
    running_add = 0.0
    for t in reversed(range(len(discounted_r))):
        if r[t] != 0:
            running_add = 0.0  # reset the sum at a game boundary (Pong specific)
        running_add = r[t] + running_add * gamma  # discounted return recursion
        discounted_r[t] = running_add
    return discounted_r

def discount_n_standardise(r):
    """Discount a reward sequence and standardise it to z-scores.

    The rewards are first passed through `discount_rewards`; the result is
    then shifted to zero mean and scaled to unit standard deviation.

    If the discounted rewards have no spread (all values equal, e.g. a batch
    in which no point was scored), dividing by the standard deviation would
    produce NaNs. In that case the centred values (all zeros) are returned
    instead, so downstream sample weights stay finite.

    Args:
        r: 1D array-like of per-step rewards.

    Returns:
        np.ndarray of standardised discounted rewards, same length as `r`.
    """
    dr = discount_rewards(r)
    centred = dr - dr.mean()
    spread = dr.std()
    # `not spread > 0` is also true for NaN, which is what std() gives for
    # an empty input.
    if not spread > 0:
        return centred
    return centred / spread

# 6. Create The Machine Learning Model

In [None]:
# Policy network: three Conv2D + MaxPool2D stages (16, 32, 64 filters)
# followed by a softmax over the entries of `action_space`.
# NOTE(review): every convolution uses a 1x1 kernel, so spatial context only
# enters through the pooling layers - confirm this is intended.
model = Sequential([
    Conv2D(16, kernel_size=(1, 1), padding='same', activation='relu',
           input_shape=(80, 80, 1)),
    MaxPool2D(pool_size=(2, 2)),
    Conv2D(32, kernel_size=(1, 1), padding='same', activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(1, 1), padding='same', activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Flatten(),
    Dense(len(action_space), activation='softmax'),
])
# Labels are integer action indices, hence the sparse cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

model.summary()

# 7. Train the Model

In [None]:
# --- Bookkeeping -----------------------------------------------------------
episodes = 0                         # number of completed training batches
n_episodes = 1000                    # upper bound on training batches
reward_sums = np.zeros(n_episodes)   # total reward collected per batch
losses = np.zeros(n_episodes)        # weighted loss per batch (after the update)
time_steps = np.zeros(n_episodes)    # number of environment steps per batch
reward_sum = 0
input_shape = (80,80,1)

# --- Rollout buffers -------------------------------------------------------
prev_x = None                        # previous preprocessed frame (None at batch start)
steps = 2500                         # a batch is cut off after this many steps
xs = np.zeros((steps,)+input_shape)  # frame differences fed to the model
ys = np.zeros((steps,1))             # index (into action_space) of the sampled action
rs = np.zeros((steps))               # reward received at each step

# Stats are reported roughly 20 times over the run; max(1, ...) avoids a
# modulo-by-zero when n_episodes is smaller than 20.
report_every = max(1, n_episodes // 20)

k = 0                                # write position inside the buffers
observation = env.reset()

while episodes < n_episodes:
    env.render() # shows the gameplay while learning; comment out to train without a window
    
    # Feed the model the difference between consecutive preprocessed frames
    # (all zeros for the first frame of a batch).
    x = prepro(observation)
    xs[k] = x - prev_x if prev_x is not None else np.zeros(input_shape)
    prev_x = x
    
    # Sample an action from the policy's output distribution so the agent
    # keeps exploring instead of always taking the most likely action.
    p = model.predict(xs[k][None,:,:,:])
    a = np.random.choice(len(action_space), p=p[0]) # index 0 or 1
    action = action_space[a] # map the index to the emulator action id
    ys[k] = a # remember which action was taken
    
    # Advance the environment by one step.
    observation, reward, done, info = env.step(action)
    reward_sum += reward # running total for this batch
    rs[k] = reward # per-step reward
    
    k += 1
    
    if done or k==steps:
        reward_sums[episodes] = reward_sum
        reward_sum = 0
        
        # Slice out what was collected and turn the rewards into
        # standardised discounted returns.
        ep_x = xs[:k]
        ep_y = ys[:k]
        ep_r = rs[:k]
        ep_r = discount_n_standardise(ep_r)
        
        # Policy-gradient style update: the taken actions are the labels and
        # the standardised returns are the per-sample weights.
        model.fit(ep_x, ep_y, sample_weight=ep_r, batch_size=512, epochs=1, verbose=0)
        
        time_steps[episodes] = k
        k = 0
        prev_x = None
        observation = env.reset()
        losses[episodes] = model.evaluate(ep_x, 
                                          ep_y,
                                          sample_weight=ep_r,
                                          batch_size=512, 
                                          verbose=0)
        
        episodes += 1
        # Stop early once the mean reward of the last 10 batches reaches 19.
        if np.mean(reward_sums[max(0,episodes-10):episodes]) >= 19:
            break

        # Periodically print the averages over the last 200 batches and plot
        # the reward curve so far.
        if episodes % report_every == 0:
            avg_reward = np.mean(reward_sums[max(0,episodes-200):episodes])
            avg_loss = np.mean(losses[max(0,episodes-200):episodes])
            avg_time = np.mean(time_steps[max(0,episodes-200):episodes])
            print("episode %d | avg reward %.2f | avg loss %.4f | avg steps %.1f"
                  % (episodes, avg_reward, avg_loss, avg_time))
            plt.plot(reward_sums[:episodes])
            plt.show()
env.close() # closes the window opened by env.render()

# 8. Plot The Reward Over Episodes

In [None]:
# The tracking arrays are pre-allocated with zeros for n_episodes entries, so
# only plot up to `episodes` (the number of batches that actually ran);
# otherwise an early stop shows up as a misleading run of zeros.
plot_from = 0
plot_until = episodes

plt.plot(reward_sums[plot_from:plot_until])
plt.title("Total reward per episode")
plt.xlabel("episode")
plt.ylabel("reward")
plt.savefig("reward_image.jpg")  # must be saved before show() clears the figure
plt.show()

plt.plot(losses[plot_from:plot_until])
plt.title("Weighted loss per episode")
plt.xlabel("episode")
plt.ylabel("loss")
plt.show()

plt.plot(time_steps[plot_from:plot_until])
plt.title("Steps per episode")
plt.xlabel("episode")
plt.ylabel("steps")
plt.show()


In [None]:
import time

# 9. Watch The Model Play!

In [None]:
# Play one game with the trained policy, rendering every frame.
observation = env.reset()
prev_frame = None
done = False
while not done:
    # Same input as during training: difference of consecutive preprocessed
    # frames, all zeros for the very first frame.
    x = prepro(observation)
    if prev_frame is None:
        diff = np.zeros(input_shape)
    else:
        diff = x - prev_frame
    prev_frame = x

    # Sample the action from the policy's output distribution.
    p = model.predict(diff[np.newaxis, :, :, :])
    a = np.random.choice(len(action_space), p=p[0])
    action = action_space[a]

    # Draw the current frame, then advance the game by one step.
    env.render()
    observation, reward, done, info = env.step(action)
    time.sleep(0.01)  # short pause between frames
env.close()
