In [1]:
import gym
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from collections import deque
import random

In [2]:
# keras model approach
from tensorflow.keras import Model,Sequential
from tensorflow.keras.layers import Conv2D,MaxPooling2D,Flatten,BatchNormalization,Dense, Input
from tensorflow.keras.activations import relu

In [3]:
# ADDING THE CODE SO THAT TENSORFLOW DOES NOT EAT THE WHOLE GPU MEMORY
physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.keras.backend.set_floatx('float32')

In [4]:
env = gym.make('CartPole-v1')
env.action_space

Discrete(2)

In [5]:
adam = tf.keras.optimizers.Adam(learning_rate = 0.001)

In [6]:
def model_keras():
    
    inputs = Input(shape=(6,))
    
    x = Dense(100,activation='relu',kernel_initializer="glorot_uniform")(inputs)
    x = BatchNormalization()(x)
    x = Dense(100,activation='relu',kernel_initializer="glorot_uniform")(x)
    x = BatchNormalization()(x)
    x = Dense(100,activation='relu',kernel_initializer="glorot_uniform")(x)
    x = BatchNormalization()(x)
    x = Dense(100,activation='relu',kernel_initializer="glorot_uniform")(x)
    x = BatchNormalization()(x)
    output = Dense(1,activation='linear',kernel_initializer="glorot_uniform")(x)
    model = Model(inputs=inputs, outputs=output, name="RL_Value_Function")
    
    print(model.summary())
    
    model.compile(optimizer=adam,loss='mean_squared_error',metrics=['mean_squared_error'])
    return model

# model 2 is the target model
model_1 = model_keras()
model_2 = model_keras()

Model: "RL_Value_Function"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 6)]               0         
_________________________________________________________________
dense (Dense)                (None, 100)               700       
_________________________________________________________________
batch_normalization (BatchNo (None, 100)               400       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 100)               400       
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)         

In [7]:
def custom_loss(y_true,y_pred):
    return tf.keras.losses.mean_squared_error(y_true,y_pred)

In [8]:
# lets build some memory into the model to perform decorrelated batch updates
# this is TD learning

# so apparantly the NN has to be adjusted only according to the action taken by it
# for example if action 2 is taken then only the weights for action 2 should be changed
# for this reason we should only update the q_2 vector with regarding the chosen action
# rewards will also be added to that action only
# q_2 is what we thought the value of the state will be after doing action
# we also add the reward and make this the target for the NN

replay_batch = deque(maxlen = 3000)

warmup = 10 #will start training after these many episodes have passed

# to balance exploration
epsilon = {
"epsilon" : 1.0,
"epsilon_decay": 0.999,
"epsilon_min":0.01,
}
possible_actions = [0,1]

In [9]:

#@tf.function
def batch_train(model_1,model_2,gamma,batch_size,epsilon):
    
    #decaying the exploration
    if epsilon['epsilon'] > epsilon['epsilon_min']:
         epsilon['epsilon'] =  epsilon['epsilon'] * epsilon['epsilon_decay']
   
    batch = random.sample(replay_batch,batch_size)
    
    batch_reward = []
    batch_action = []
    batch_done = []
    
    batch_curr_actions = []
    
    batch_current_state = []
    batch_next_state = []

    for i in range(batch_size):
        batch_reward.append(batch[i][3])
        batch_action.append(batch[i][2])
        
        batch_current_state.append(batch[i][0])
        batch_curr_actions.append(batch[i][1])
        
        batch_next_state.append(batch[i][4])
        
        batch_done.append(batch[i][6])
        

    max_q = []
    action_chosen_states = []
    # extracting only req states by the selected action
    for i in range(batch_size):
        next_q = model_2(batch_next_state[i],training = False)
        max_q.append(np.amax(next_q))
        action_chosen_states.append(batch_current_state[i][batch_action[i]])
        
    max_q = np.array(max_q,dtype = 'float32')
    
    
    target = batch_reward + gamma*max_q # this is the Q learning Target
    # VERY IMPORTANT NOTE, IF THE EPISODE ENDS THE DONE VALUE BECOMES TRUE
    # IT IS VERY IMPORTANT THAT THE NN UPDATES TOWARDS THIS TRUE VALUE RATHER THAN
    # ITS OWN THINKING VALUE (r + gamma*max(action)) THAT WE USE FOR ALL
    # NON TERMINAL REWARDS 
    # THIS MAKES OR BREAKS THE NETWORK VERY VERY IMPORTANT
    for i in range(batch_size):
        if batch_done[i]:
            target[i] = batch_reward[i]
    
    with tf.GradientTape() as tape:
        # logits is the forward pass
        logits = model_1(np.array(action_chosen_states), training=True)
        
        # calculating the loss
        loss_value = custom_loss(target,logits)
    
    #we retrieve the gradients
    grads = tape.gradient(loss_value, model_1.trainable_weights)
    
    #THIS IS ONE STEP OF GRAD DESCENT (Minimizes the loss)
    adam.apply_gradients(zip(grads, model_1.trainable_weights))
    
    
def policy(q_vals,q_actions,eps):
    # q_actions is nothing but the corrensponding actions to q_vals
    # lets implement a policy which decays
    if np.random.rand() <= eps:  
        return random.randrange(2)
    else:
        action = np.argmax(q_vals)
        action = q_actions[action]
        return action
def update_target_network():
    model_2.set_weights(model_1.get_weights())
def one_hot_action(action):
    if action == 0:
        return np.array([1,0],dtype = "float32")
    else:
        return np.array([0,1],dtype = "float32")
def generate_NN_input(state,poss_actions):
    output = []
    output_actions = []
    for i in poss_actions:
        one_hot = one_hot_action(i)
        output.append(np.append(state,one_hot))
        output_actions.append(i)
    output = np.array(output)
    return output,output_actions

In [15]:
global_steps = 0
# to have same networks in the starting
update_target_network()

for i in tqdm(range(500)):
    observation = env.reset()
    observation = np.expand_dims(observation, axis=0)
    done = False
    total_reward = 0
    while not done:        
        # getting all possible current states
        current_states,current_actions = generate_NN_input(observation,possible_actions)
        # this is the current q values
        q_state = model_1(current_states,training = False)
        
        action = policy(q_state,current_actions,epsilon["epsilon"]) # current action
        observation,reward,done,info = env.step(action)
        print(action)
        # calculating the total reward
        total_reward = total_reward + reward

        next_states,next_actions = generate_NN_input(observation,possible_actions)
        
        state_2 = observation
        state_reward = reward
        
        replay_batch.append((current_states,current_actions,action,state_reward,next_states,next_actions,done))
        
        if i>warmup:
            batch_train(model_1,model_2,0.99,64,epsilon)
            global_steps = global_steps + 1
            
        if done:
            update_target_network()
            
    print(total_reward)

  0%|          | 2/500 [00:00<00:30, 16.39it/s]

0
0
0
1
0
0
0
0
0
0
10.0
0
0
0
0
0
0
0
0
0
0
10.0
0
0
0
1
0
0
0
0
0
0
10.0
0
0
0
0

  1%|          | 6/500 [00:00<00:30, 16.33it/s]


0
0
0
0
0
9.0
0
0
0
0
0
0
0
0
0
9.0
0
0
0
1
0
0
0
0
0
0
0
11.0
0
0
0
0

  2%|▏         | 8/500 [00:00<00:28, 16.97it/s]


0
0
0
0
0
0
10.0
0
0
0
0
0
0
0
0
8.0
0
0
0
0
0
0
0
0
0
0
0
11.0
0
0
0
1
0
1
0

  2%|▏         | 10/500 [00:00<00:29, 16.34it/s]


0
0
0
0
0
12.0
0
0
0
0
0
0
0
0
0
0
10.0
0
0
0
0
0
0
0
0
0


  2%|▏         | 12/500 [00:03<04:06,  1.98it/s]

9.0
0
0
0
0
0
0
0
0
1
1


  3%|▎         | 13/500 [00:07<12:09,  1.50s/it]

10.0
1
1
0
0
0
0
1
1
1
0
0
0
0
1
1
0
0
0
0


  3%|▎         | 14/500 [00:14<25:30,  3.15s/it]

19.0
1
1
0
0
0
0
0
0
0
0
0
0
0


  3%|▎         | 15/500 [00:18<28:35,  3.54s/it]

13.0
0
0
0
0
0
0
0
0
0


  3%|▎         | 16/500 [00:22<28:03,  3.48s/it]

9.0
0


KeyboardInterrupt: 

In [13]:
env.close()

In [14]:
#lets test the nn
for i in tqdm(range(500)):
    observation = env.reset()
    observation = np.expand_dims(observation, axis=0)
    total_reward = 0
    for j in range(1000):
        env.render()
        nn_out = model_2.predict(observation)
        action = policy(nn_out,0)
        print(nn_out[0])
        print(action)
        observation,reward,done,info = env.step(action)
        observation = np.expand_dims(observation, axis=0)
        total_reward = total_reward + reward
        if done:
            print("episode ended")
            break
env.close()

  0%|          | 0/500 [00:00<?, ?it/s]


ValueError: Error when checking input: expected input_2 to have shape (6,) but got array with shape (4,)

In [None]:
env.close()

In [None]:
# random action
for i in tqdm(range(50)):
    observation = env.reset()
    observation = np.expand_dims(observation, axis=0)
    for j in range(1000):
        env.render()
        observation,reward,done,info = env.step(env.action_space.sample())
        print(reward,done)
        if done:
            break