In [None]:
import gym
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from collections import deque

In [None]:
# keras model approach
from tensorflow.keras import Model,Sequential
from tensorflow.keras.layers import Conv2D,MaxPooling2D,Flatten,BatchNormalization,Dense, Input
from tensorflow.keras.activations import relu

In [None]:
# Enable on-demand GPU memory allocation so TensorFlow does not reserve the
# whole GPU up front.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
# Loop instead of indexing [0]: on a CPU-only machine the list is empty and
# physical_devices[0] would raise IndexError. Looping also applies the same
# setting to every visible GPU.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
# Use float32 as the default Keras float type.
tf.keras.backend.set_floatx('float32')

In [None]:
# Create the MountainCar-v0 environment that the loops below step through.
env = gym.make('MountainCar-v0')
# Bare last expression so the notebook displays the observation space.
# NOTE(review): presumably 2-dimensional, matching Input(shape=(2,)) in
# model_keras — confirm from the cell output.
env.observation_space

In [None]:
# Shared Adam optimizer (step size 1e-3); batch_train() uses it to apply gradients.
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [None]:
def model_keras(input_dim=2, n_actions=3, hidden_units=100, n_hidden_layers=4):
    """Build the Q-value network used as the RL value function.

    The network is a stack of ``Dense(relu) -> BatchNormalization`` blocks
    followed by a linear ``Dense`` head with one output per action.

    Args:
        input_dim: Size of the observation vector fed to the network.
        n_actions: Number of output units (one Q-value per action).
        hidden_units: Width of every hidden ``Dense`` layer.
        n_hidden_layers: Number of ``Dense`` + ``BatchNormalization`` blocks.

    Returns:
        An uncompiled ``tf.keras.Model`` named ``"RL_Value_Function"``. It is
        left uncompiled because training elsewhere in this notebook applies
        gradients manually through a ``tf.GradientTape``.

    NOTE(review): BatchNormalization uses batch statistics when the model is
    called with ``training=True``; with single-sample batches those statistics
    are degenerate (zero variance). Confirm how the caller invokes the model.
    """
    inputs = Input(shape=(input_dim,))

    # Same four identical blocks as before, expressed as a loop.
    x = inputs
    for _ in range(n_hidden_layers):
        x = Dense(hidden_units, activation='relu', kernel_initializer="glorot_uniform")(x)
        x = BatchNormalization()(x)

    output = Dense(n_actions, activation='linear', kernel_initializer="glorot_uniform")(x)
    model = Model(inputs=inputs, outputs=output, name="RL_Value_Function")

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    return model

sample_model = model_keras()

In [None]:

def custom_loss(y_true,y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    Thin wrapper that delegates to ``tf.keras.losses.mean_squared_error``.
    """
    mse = tf.keras.losses.mean_squared_error
    return mse(y_true, y_pred)

In [None]:
# Scratch check of NumPy indexing: pick row 2, column 1 of a 3x2 array.
z = np.array([
    [1, 2],
    [4, 2],
    [5, 1],
])
z[2, 1]

In [None]:

# Number of episodes that should pass before training starts.
# (Not referenced by the visible cells of this notebook.)
warmup = 10

# Counter meant to be bumped after every batch-training step; after a certain
# number of batches the oldest data (at the start of the list) is to be removed.
# (Not referenced by the visible cells of this notebook.)
training_count = 0

# Exploration schedule: batch_train() multiplies "epsilon" by "epsilon_decay"
# on each call for as long as it is above "epsilon_min".
epsilon = {
    "epsilon": 1.0,
    "epsilon_decay": 0.999,
    "epsilon_min": 0.01,
}

In [None]:
import random

# One seed for every RNG the exploration policy touches.
SEED = 2020
random.seed(SEED)
# policy() draws its explore/exploit decision from np.random.rand(), so seeding
# only the stdlib `random` module left runs non-reproducible.
np.random.seed(SEED)


#@tf.function
def batch_train(model,gamma,SARSA):
    """Run one SARSA gradient step on a single transition.

    Args:
        model: Keras model mapping a state batch to per-action Q-values.
        gamma: Discount factor applied to the next state's Q-value.
        SARSA: Sequence ``(state, action, reward, next_state, next_action, done)``.
            The loss below indexes row 0 of the model output, so ``state`` and
            ``next_state`` are expected to carry a leading batch axis of size 1.

    Side effects:
        * Decays the module-level ``epsilon['epsilon']`` by ``epsilon_decay``
          while it is above ``epsilon_min``.
        * Updates ``model.trainable_weights`` in place using the module-level
          ``adam`` optimizer.

    Returns:
        None.
    """
    # Decay the exploration rate once per training step.
    if epsilon['epsilon'] > epsilon['epsilon_min']:
        epsilon['epsilon'] = epsilon['epsilon'] * epsilon['epsilon_decay']

    curr_state, action, reward, next_state, next_action, done = SARSA[:6]

    if done:
        # Terminal transition: nothing to bootstrap from, so skip the forward pass.
        target = reward
    else:
        # Call the model directly rather than model.predict(): predict() carries
        # per-call overhead and Keras recommends __call__ for small inputs.
        q_next = model(next_state, training=False).numpy()[0]
        target = reward + q_next[next_action]*gamma

    with tf.GradientTape() as tape:
        # training=False on purpose. With a batch of one, BatchNormalization in
        # training mode normalises with the sample's own mean (x - mean == 0),
        # so every BN layer would output only its beta: the Q-values being
        # optimised would not depend on the state, the earlier Dense layers would
        # get zero gradient, and the moving variance would be pulled toward 0.
        # Inference mode also keeps this forward pass identical to the one used
        # for action selection.
        logits = model(curr_state, training=False)

        # The target equals the current prediction everywhere except the taken
        # action, so only that entry contributes to the loss. np.array() yields a
        # plain NumPy array, so no gradient flows through the target.
        q_target = np.array(logits)
        q_target[0][action] = target

        loss_value = custom_loss(q_target,logits)

    # Gradients of the loss with respect to the trainable weights.
    grads = tape.gradient(loss_value, model.trainable_weights)

    # One optimizer step (minimises the loss).
    adam.apply_gradients(zip(grads, model.trainable_weights))

def policy(q_vals,eps):
    """Epsilon-greedy action selection.

    Args:
        q_vals: Array-like of shape ``(1, n_actions)`` holding the Q-values of
            the current state (a NumPy array or an eager tensor).
        eps: Probability of taking a uniformly random action.

    Returns:
        The index of the chosen action.
    """
    if np.random.rand() <= eps:
        # Explore over every action the network scores. The previous hard-coded
        # randrange(2) could never pick the last of the three actions.
        n_actions = int(np.shape(q_vals)[-1])
        return random.randrange(n_actions)
    # Exploit: greedy action for the single state in the batch.
    return np.argmax(q_vals[0])

In [None]:
# SARSA training: 1200 episodes, at most 400 steps each.
for i in tqdm(range(1200)):
    observation = env.reset()
    observation = np.expand_dims(observation, axis=0)  # add the batch axis
    total_reward = 0

    # SARSA is on-policy: the next action used in the update target must be the
    # action that is actually executed next. Pick the first action here and
    # carry `action_2` forward below, instead of sampling a fresh (possibly
    # different) action at the top of every step.
    action = policy(sample_model(observation,training = False),epsilon["epsilon"])

    for j in range(400):
        # State the current action is taken from.
        state_1 = observation

        observation,reward,done,info = env.step(action)

        # Running undiscounted return for this episode.
        total_reward = total_reward + reward

        observation = np.expand_dims(observation, axis=0)
        state_2 = observation
        state_reward = reward

        # Next action, chosen with the current exploration rate.
        action_2 = policy(sample_model(observation,training = False),epsilon["epsilon"])

        SARSA = (state_1,action,state_reward,state_2,action_2,done)

        batch_train(sample_model,0.99,SARSA)

        if done:
            break

        # Execute on the next step exactly the action the update just assumed.
        action = action_2

        # Only render the last few episodes.
        if i >1150:
            env.render()
    print(total_reward)

In [None]:
# Close the environment now that the training loop above has finished.
env.close()

In [None]:
# Test the trained network: 500 rendered episodes with a purely greedy policy.
try:
    for i in tqdm(range(500)):
        observation = env.reset()
        observation = np.expand_dims(observation, axis=0)  # add the batch axis
        total_reward = 0
        done = False
        while not done:
            env.render()
            # Direct call instead of predict(): Keras recommends __call__ for
            # small inputs, and policy() accepts either a tensor or an array.
            nn_out = sample_model(observation, training=False)
            action = policy(nn_out,0)  # eps = 0 -> greedy
            observation,reward,done,info = env.step(action)
            observation = np.expand_dims(observation, axis=0)
            total_reward = total_reward + reward
        # Report the episode return. It was accumulated but never shown, while
        # the per-step Q-value/action prints flooded the output.
        print(total_reward)
finally:
    # Close the environment even if the cell is interrupted part-way through.
    env.close()

In [None]:
# Repeated close of the environment.
# NOTE(review): presumably a manual fallback for when the evaluation cell above
# is interrupted before reaching its own env.close() — confirm it is still needed.
env.close()

In [None]:
# Random-action rollouts: 50 rendered episodes of at most 1000 steps, with every
# action sampled uniformly from the action space. Prints (reward, done) per step.
for i in tqdm(range(50)):
    observation = np.expand_dims(env.reset(), axis=0)
    for j in range(1000):
        env.render()
        random_action = env.action_space.sample()
        observation, reward, done, info = env.step(random_action)
        print(reward, done)
        if done:
            break