In [1]:
import numpy as np
import random
from collections import deque
import gym

In [2]:
import tensorflow as tf

In [3]:
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam


In [4]:
env_name = 'CartPole-v1'

In [5]:
env = gym.make(env_name)

In [6]:
# env.reset()
# for step in range(500):
#     env.render(mode='human')
#     random_action = env.action_space.sample()
#     env.step(random_action)
    
# env.close()

In [7]:
# 4 observations we have in cartpole
env.observation_space.shape

(4,)

In [8]:
num_observations = env.observation_space.shape[0]

In [9]:
env.observation_space.shape

(4,)

In [10]:
num_observations

4

In [11]:
num_actions = env.action_space.n

In [12]:
num_actions

2

In [13]:
# input shape of the ANN should be equal to number of observation --> 4 here
# output size of the ANN should be equal to number of actions --> 2 here

In [14]:
model = Sequential()
# by adding the input_shape we become sure that we match currently to the firt layer regardless of num of obs
model.add(Dense(16, input_shape=(1, num_observations)))
# model.add(Dense(16))

model.add(Activation('relu'))

model.add(Dense(32))
model.add(Activation('relu'))


# Neurons == action_space
model.add(Dense(num_actions))
model.add(Activation('linear'))

In [15]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1, 16)             80        
                                                                 
 activation (Activation)     (None, 1, 16)             0         
                                                                 
 dense_1 (Dense)             (None, 1, 32)             544       
                                                                 
 activation_1 (Activation)   (None, 1, 32)             0         
                                                                 
 dense_2 (Dense)             (None, 1, 2)              66        
                                                                 
 activation_2 (Activation)   (None, 1, 2)              0         
                                                                 
Total params: 690
Trainable params: 690
Non-trainable pa

In [16]:
target_model = clone_model(model)

In [17]:
# Now these tasks ought to be done:
#     1- Define Hyper parameters
#     2- Epsilon greedy action selection
#     3- Undersatnd Deque Object
#     4- Create Replay Function
#     5- Tartget Model Update Function

In [18]:
EPOCHS = 1000
BATCH_SIZE = 32

epsilon = 1.0
EPSILON_REDUCE = 0.995

LEARNIN_RATE = 0.001
GAMMA = 0.95

In [19]:
def epsilon_greedy_action_selection(model, epsilon, observation, batch_size=32):
    if np.random.random() > epsilon:

        prediction = model.predict(observation.reshape([1, 1, 4]))
        action = np.argmax(prediction)
    else:
        action = np.random.randint(0, env.action_space.n)
        
    return action

In [20]:
# deque add a limit to the number of values that can be added to it and once it reach the threshold it will stop adding elements
# and if continue appending items to it it will remove the very first one and add th last one
deque_1 = deque(maxlen=5)

In [21]:
deque_1

deque([])

In [22]:
for i in range(10):
    deque_1.append(i)
    print(deque_1)

deque([0], maxlen=5)
deque([0, 1], maxlen=5)
deque([0, 1, 2], maxlen=5)
deque([0, 1, 2, 3], maxlen=5)
deque([0, 1, 2, 3, 4], maxlen=5)
deque([1, 2, 3, 4, 5], maxlen=5)
deque([2, 3, 4, 5, 6], maxlen=5)
deque([3, 4, 5, 6, 7], maxlen=5)
deque([4, 5, 6, 7, 8], maxlen=5)
deque([5, 6, 7, 8, 9], maxlen=5)


In [23]:
# We use the deque to define replay buffer
replay_buffer = deque(maxlen=20000)
update_target_model = 10

In [24]:
def replay(replay_buffer, batch_size, model, target_model):
    if len(replay_buffer)<batch_size:
        return
   
    samples = random.sample(replay_buffer, batch_size)

    target_batch = list()
    zipped_samples = list(zip(*samples))

    states, actions, rewards, new_states, dones = zipped_samples
    states = np.array(states).reshape([batch_size, 1, 4])
    new_states = np.array(new_states).reshape([batch_size, 1, 4])
    targets = target_model.predict(states)
    # predict q values for all the samples
#     q_values = model.predict(np.array(new_states))
    q_values = model.predict(new_states)

    for i in range(batch_size):
        q_value = max(q_values[i][0])
        target = targets[i].copy()
        if dones[i]:
            target[0][actions[i]] = rewards[i]
        else:
            target[0][actions[i]] = rewards[i]+q_value*GAMMA
        target_batch.append(target)

    model.fit(np.array(states), np.array(target_batch), epochs=1, verbose=0)
    

In [25]:
def update_model_handler(epoch, update_target_model, model, target_moel):
    if epoch > 0 and epoch%update_target_model==0:
        target_model.set_weights(model.get_weights())

In [26]:
model.compile(loss='mse', optimizer=(Adam(lr=LEARNIN_RATE)))

  super(Adam, self).__init__(name, **kwargs)


In [27]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1, 16)             80        
                                                                 
 activation (Activation)     (None, 1, 16)             0         
                                                                 
 dense_1 (Dense)             (None, 1, 32)             544       
                                                                 
 activation_1 (Activation)   (None, 1, 32)             0         
                                                                 
 dense_2 (Dense)             (None, 1, 2)              66        
                                                                 
 activation_2 (Activation)   (None, 1, 2)              0         
                                                                 
Total params: 690
Trainable params: 690
Non-trainable pa

In [28]:
best_so_far = 0

for epoch in range(EPOCHS):
    observation = env.reset()
    # (1, X) X observation size here is 4 we resgape to be (1, 4)
    obsrvation = observation.reshape([1, 4])
    done = False
    
    points = 0
    
    while not done:
        action = epsilon_greedy_action_selection(model, epsilon, observation)
        next_observation, reward, done, info = env.step(action)
        next_observation = next_observation.reshape([1, 4])
        replay_buffer.append((observation, action, reward, next_observation, done))
        observatin = next_observation
        
        points += 1
        
        replay(replay_buffer, BATCH_SIZE, model, target_model)
    
    epsilon *= EPSILON_REDUCE # eps*0.995
    update_model_handler(epoch, update_target_model, model, target_model)
    if points > best_so_far:
        best_so_far = points
        
    if epoch%25 == 0:
        print(f"{epoch}: POINTS: {points} eps: {epsilon} BSF: {best_so_far}")

0: POINTS: 18 eps: 0.995 BSF: 18
25: POINTS: 13 eps: 0.8778091417340573 BSF: 72
50: POINTS: 29 eps: 0.7744209942832988 BSF: 72
75: POINTS: 14 eps: 0.6832098777212641 BSF: 72
100: POINTS: 17 eps: 0.6027415843082742 BSF: 72
125: POINTS: 15 eps: 0.531750826943791 BSF: 72
150: POINTS: 13 eps: 0.46912134373457726 BSF: 72
175: POINTS: 10 eps: 0.41386834584198684 BSF: 72
200: POINTS: 13 eps: 0.36512303261753626 BSF: 72
225: POINTS: 9 eps: 0.322118930542046 BSF: 72
250: POINTS: 10 eps: 0.28417984116121187 BSF: 72
275: POINTS: 9 eps: 0.2507092085103961 BSF: 72
300: POINTS: 11 eps: 0.2211807388415433 BSF: 72
325: POINTS: 12 eps: 0.19513012515638165 BSF: 72
350: POINTS: 10 eps: 0.17214774642209296 BSF: 72
375: POINTS: 11 eps: 0.1518722266715875 BSF: 72
400: POINTS: 8 eps: 0.13398475271138335 BSF: 72
425: POINTS: 11 eps: 0.11820406108847166 BSF: 72
450: POINTS: 10 eps: 0.1042820154910064 BSF: 72
475: POINTS: 9 eps: 0.09199970504166631 BSF: 72
500: POINTS: 9 eps: 0.0811640021330769 BSF: 72
525: POI

In [32]:
observation = env.reset()
for counter in range(300):
    env.render()
    
    # TODO: Get discretized observation
    action = np.argmax(model.predict(observation.reshape([1, 1, 4])))
    
    # TODO: Perform the action 
    observation, reward, done, info = env.step(action) # Finally perform the action
    
    if done:
        print(f"done")
        break
env.close()

done
