In [1]:
import gym
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from collections import deque

In [2]:
# keras model approach
from tensorflow.keras import Model,Sequential
from tensorflow.keras.layers import Conv2D,MaxPooling2D,Flatten,BatchNormalization,Dense, Input
from tensorflow.keras.activations import relu

In [3]:
# ADDING THE CODE SO THAT TENSORFLOW DOES NOT EAT THE WHOLE GPU MEMORY
physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.keras.backend.set_floatx('float32')

In [4]:
env = gym.make('CartPole-v1')
env.action_space

Discrete(2)

In [5]:
adam = tf.keras.optimizers.Adam(learning_rate = 0.001)

In [6]:
def model_keras():
    
    inputs = Input(shape=(4,))
    
    x = Dense(100,activation='relu',kernel_initializer="glorot_uniform")(inputs)
    x = BatchNormalization()(x)
    x = Dense(100,activation='relu',kernel_initializer="glorot_uniform")(x)
    x = BatchNormalization()(x)
    x = Dense(100,activation='relu',kernel_initializer="glorot_uniform")(x)
    x = BatchNormalization()(x)
    x = Dense(100,activation='relu',kernel_initializer="glorot_uniform")(x)
    x = BatchNormalization()(x)
    output = Dense(2,activation='linear',kernel_initializer="glorot_uniform")(x)
    model = Model(inputs=inputs, outputs=output, name="RL_Value_Function")
    
    print(model.summary())
    
    model.compile(optimizer=adam,loss='mean_squared_error',metrics=['mean_squared_error'])
    
    return model
def model_keras():    
    model = Sequential()
    model.add(Dense(24, input_dim=4, activation='relu',
                    kernel_initializer='he_uniform'))
    model.add(Dense(24, activation='relu',
                    kernel_initializer='he_uniform'))
    model.add(Dense(2, activation='linear',
                    kernel_initializer='he_uniform'))
    print(model.summary())
    model.compile(loss='mse', optimizer='adam')
    return model
# model 2 is the target model
model_1 = model_keras()
model_2 = model_keras()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 24)                120       
_________________________________________________________________
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_4 (Dense)              (None, 24)                600       
_______________________________

In [7]:
def custom_loss(y_true,y_pred):
    return tf.keras.losses.mean_squared_error(y_true,y_pred)

In [8]:
# lets build some memory into the model to perform decorrelated batch updates
# this is TD learning

# so apparantly the NN has to be adjusted only according to the action taken by it
# for example if action 2 is taken then only the weights for action 2 should be changed
# for this reason we should only update the q_2 vector with regarding the chosen action
# rewards will also be added to that action only
# q_2 is what we thought the value of the state will be after doing action
# we also add the reward and make this the target for the NN

replay_batch = deque(maxlen = 3000)

warmup = 10 #will start training after these many episodes have passed

# to balance exploration
epsilon = {
"epsilon" : 1.0,
"epsilon_decay": 0.999,
"epsilon_min":0.01,
}

In [9]:
import random
random.seed(2020)

#@tf.function
def batch_train(model_1,model_2,gamma,batch_size,epsilon):
    
    #decaying the exploration
    if epsilon['epsilon'] > epsilon['epsilon_min']:
         epsilon['epsilon'] =  epsilon['epsilon'] * epsilon['epsilon_decay']
   
    batch = random.sample(replay_batch,batch_size)
    
    batch_reward = []
    batch_action = []
    batch_done = []
    
    batch_current_state = np.zeros((batch_size, 4))
    batch_next_state = np.zeros((batch_size, 4))

    for i in range(batch_size):
        batch_reward.append(batch[i][2])
        batch_action.append(batch[i][1])
        batch_current_state[i] = batch[i][0]
        batch_next_state[i] = batch[i][3]
        batch_done.append(batch[i][4])
    #lets calculate the next state value as the current value will be calculated in 
    # in gradient tape
    
    next_q = model_2.predict(batch_next_state)
    
    max_q = []
    for i in next_q:
        max_q.append(max(i))
    max_q = np.array(max_q,dtype = 'float32')
    
    target = batch_reward + gamma*max_q # this is the Q learning Target
    
    with tf.GradientTape() as tape:
        # logits is the forward pass
        logits = model_1(batch_current_state, training=True)
        
        q_target = np.array(logits)
        
        # VERY IMPORTANT NOTE, IF THE EPISODE ENDS THE DONE VALUE BECOMES TRUE
        # IT IS VERY IMPORTANT THAT THE NN UPDATES TOWARDS THIS TRUE VALUE RATHER THAN
        # ITS OWN THINKING VALUE (r + gamma*max(action)) THAT WE USE FOR ALL
        # NON TERMINAL REWARDS 
        # THIS MAKES OR BREAKS THE NETWORK VERY VERY IMPORTANT
        for i in range(batch_size):
            q_target[i][batch_action[i]] = target[i]
            if batch_done[i]:
                q_target[i][batch_action[i]] = batch_reward[i]
        
        # calculating the loss
        loss_value = custom_loss(q_target,logits)
    
    #we retrieve the gradients
    grads = tape.gradient(loss_value, model_1.trainable_weights)
    
    #THIS IS ONE STEP OF GRAD DESCENT (Minimizes the loss)
    adam.apply_gradients(zip(grads, model_1.trainable_weights))

def policy(q_vals,eps):
    # lets implement a policy which decays
    if np.random.rand() <= eps:  
        return random.randrange(2)
    else:
        action = np.argmax(q_vals[0])
        return action
def update_target_network():
    model_2.set_weights(model_1.get_weights())

In [10]:
global_steps = 0
# to have same networks in the starting
update_target_network()

for i in tqdm(range(150)):
    observation = env.reset()
    observation = np.expand_dims(observation, axis=0)
    done = False
    total_reward = 0
    while not done:        
        #storing the current state
        state_1 = observation
        
        # this is the current q values
        q_state = model_1(observation)
        
        action = policy(q_state,epsilon["epsilon"]) # current action
        observation,reward,done,info = env.step(action)
        
        # calculating the total reward
        total_reward = total_reward + reward

        observation = np.expand_dims(observation, axis=0)
        state_2 = observation
        state_reward = reward
        
        replay_batch.append((state_1,action,state_reward,state_2,done))
        
        if i>warmup:
            batch_train(model_1,model_2,0.99,64,epsilon)
            global_steps = global_steps + 1
            
        if done:
            update_target_network()
            
    print(total_reward)

  0%|          | 0/150 [00:00<?, ?it/s]



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



  4%|▍         | 6/150 [00:00<00:23,  6.00it/s]

21.0
21.0
35.0
15.0
24.0
17.0
12.0
13.0
25.0
24.0
15.0


  8%|▊         | 12/150 [00:01<00:20,  6.65it/s]

16.0
33.0


  9%|▉         | 14/150 [00:02<00:41,  3.30it/s]

11.0


 10%|█         | 15/150 [00:02<00:51,  2.60it/s]

21.0


 11%|█         | 16/150 [00:03<00:53,  2.50it/s]

11.0


 11%|█▏        | 17/150 [00:04<01:08,  1.95it/s]

28.0


 12%|█▏        | 18/150 [00:05<01:42,  1.28it/s]

46.0


 13%|█▎        | 19/150 [00:06<01:30,  1.44it/s]

17.0


 13%|█▎        | 20/150 [00:06<01:15,  1.72it/s]

11.0


 14%|█▍        | 21/150 [00:06<01:07,  1.90it/s]

14.0


 15%|█▍        | 22/150 [00:07<01:17,  1.66it/s]

20.0


 15%|█▌        | 23/150 [00:07<01:09,  1.83it/s]

14.0


 16%|█▌        | 24/150 [00:08<01:00,  2.07it/s]

12.0


 17%|█▋        | 25/150 [00:09<01:12,  1.71it/s]

29.0


 17%|█▋        | 26/150 [00:09<01:12,  1.71it/s]

21.0


 18%|█▊        | 27/150 [00:10<01:10,  1.75it/s]

11.0


 19%|█▊        | 28/150 [00:10<01:03,  1.93it/s]

14.0


 19%|█▉        | 29/150 [00:10<00:54,  2.24it/s]

9.0


 20%|██        | 30/150 [00:11<00:48,  2.47it/s]

11.0


 21%|██        | 31/150 [00:11<00:53,  2.20it/s]

20.0


 21%|██▏       | 32/150 [00:12<00:58,  2.03it/s]

21.0


 22%|██▏       | 33/150 [00:12<00:51,  2.28it/s]

10.0


 23%|██▎       | 34/150 [00:13<01:00,  1.93it/s]

15.0


 23%|██▎       | 35/150 [00:13<00:55,  2.06it/s]

15.0


 24%|██▍       | 36/150 [00:14<00:50,  2.25it/s]

12.0


 25%|██▍       | 37/150 [00:14<00:49,  2.30it/s]

15.0


 25%|██▌       | 38/150 [00:15<00:53,  2.10it/s]

21.0


 26%|██▌       | 39/150 [00:15<00:47,  2.36it/s]

11.0


 27%|██▋       | 40/150 [00:15<00:43,  2.54it/s]

12.0


 27%|██▋       | 41/150 [00:15<00:37,  2.94it/s]

8.0


 28%|██▊       | 42/150 [00:16<00:32,  3.29it/s]

8.0


 29%|██▊       | 43/150 [00:16<00:35,  3.04it/s]

14.0


 29%|██▉       | 44/150 [00:17<00:53,  1.98it/s]

21.0


 30%|███       | 45/150 [00:17<00:52,  2.00it/s]

18.0


 31%|███       | 46/150 [00:18<00:44,  2.36it/s]

9.0


 31%|███▏      | 47/150 [00:18<00:48,  2.11it/s]

22.0


 32%|███▏      | 48/150 [00:19<00:44,  2.27it/s]

13.0


 33%|███▎      | 49/150 [00:19<00:38,  2.62it/s]

9.0


 33%|███▎      | 50/150 [00:19<00:37,  2.68it/s]

13.0


 34%|███▍      | 51/150 [00:19<00:32,  3.07it/s]

8.0


 35%|███▍      | 52/150 [00:20<00:31,  3.07it/s]

12.0


 35%|███▌      | 53/150 [00:20<00:35,  2.77it/s]

16.0


 36%|███▌      | 54/150 [00:21<00:37,  2.59it/s]

16.0


 37%|███▋      | 55/150 [00:21<00:35,  2.65it/s]

13.0


 37%|███▋      | 56/150 [00:22<00:50,  1.86it/s]

18.0


 38%|███▊      | 57/150 [00:22<00:42,  2.18it/s]

10.0


 39%|███▊      | 58/150 [00:22<00:37,  2.47it/s]

10.0


 39%|███▉      | 59/150 [00:23<00:33,  2.72it/s]

10.0


 40%|████      | 60/150 [00:23<00:33,  2.67it/s]

13.0


 41%|████      | 61/150 [00:24<00:32,  2.71it/s]

13.0


 41%|████▏     | 62/150 [00:24<00:33,  2.63it/s]

15.0


 42%|████▏     | 63/150 [00:24<00:34,  2.51it/s]

16.0


 43%|████▎     | 64/150 [00:25<00:32,  2.66it/s]

12.0


 43%|████▎     | 65/150 [00:25<00:32,  2.64it/s]

14.0


 44%|████▍     | 66/150 [00:25<00:30,  2.76it/s]

12.0


 45%|████▍     | 67/150 [00:26<00:27,  3.06it/s]

9.0


 45%|████▌     | 68/150 [00:26<00:26,  3.05it/s]

12.0


 46%|████▌     | 69/150 [00:26<00:28,  2.88it/s]

14.0


 47%|████▋     | 70/150 [00:27<00:27,  2.89it/s]

12.0


 47%|████▋     | 71/150 [00:27<00:30,  2.62it/s]

16.0


 48%|████▊     | 72/150 [00:30<01:24,  1.09s/it]

81.0


 49%|████▊     | 73/150 [00:32<01:39,  1.29s/it]

65.0


 49%|████▉     | 74/150 [00:35<02:30,  1.98s/it]

110.0


 50%|█████     | 75/150 [00:37<02:17,  1.83s/it]

55.0


 51%|█████     | 76/150 [00:38<01:58,  1.60s/it]

39.0


 51%|█████▏    | 77/150 [00:39<01:57,  1.62s/it]

59.0


 52%|█████▏    | 78/150 [00:41<01:58,  1.65s/it]

64.0


 53%|█████▎    | 79/150 [00:42<01:48,  1.53s/it]

46.0


 53%|█████▎    | 80/150 [00:44<01:40,  1.43s/it]

44.0


 54%|█████▍    | 81/150 [00:45<01:41,  1.46s/it]

28.0


 55%|█████▍    | 82/150 [00:46<01:32,  1.37s/it]

42.0


 55%|█████▌    | 83/150 [00:47<01:27,  1.31s/it]

43.0


 56%|█████▌    | 84/150 [00:50<01:55,  1.75s/it]

101.0


 57%|█████▋    | 85/150 [00:51<01:36,  1.48s/it]

31.0


 57%|█████▋    | 86/150 [00:53<01:38,  1.55s/it]

64.0


 58%|█████▊    | 87/150 [00:57<02:19,  2.21s/it]

97.0


 59%|█████▊    | 88/150 [00:58<02:01,  1.96s/it]

49.0


 59%|█████▉    | 89/150 [00:59<01:43,  1.70s/it]

38.0


 60%|██████    | 90/150 [01:00<01:32,  1.54s/it]

43.0


 61%|██████    | 91/150 [01:01<01:22,  1.39s/it]

38.0


 61%|██████▏   | 92/150 [01:02<01:17,  1.33s/it]

43.0


 62%|██████▏   | 93/150 [01:05<01:30,  1.58s/it]

80.0


 63%|██████▎   | 94/150 [01:06<01:23,  1.49s/it]

47.0


 63%|██████▎   | 95/150 [01:07<01:15,  1.38s/it]

40.0


 64%|██████▍   | 96/150 [01:08<01:13,  1.37s/it]

48.0


 65%|██████▍   | 97/150 [01:11<01:33,  1.76s/it]

52.0


 65%|██████▌   | 98/150 [01:16<02:21,  2.72s/it]

177.0


 66%|██████▌   | 99/150 [01:18<02:06,  2.48s/it]

71.0


 67%|██████▋   | 100/150 [01:24<03:02,  3.65s/it]

235.0


 67%|██████▋   | 101/150 [01:28<02:57,  3.62s/it]

75.0


 68%|██████▊   | 102/150 [01:32<02:58,  3.72s/it]

143.0


 69%|██████▊   | 103/150 [01:34<02:30,  3.21s/it]

72.0


 69%|██████▉   | 104/150 [01:36<02:11,  2.85s/it]

69.0


 70%|███████   | 105/150 [01:38<02:03,  2.75s/it]

86.0


 71%|███████   | 106/150 [01:42<02:18,  3.15s/it]

135.0


 71%|███████▏  | 107/150 [01:44<01:59,  2.78s/it]

66.0


 72%|███████▏  | 108/150 [01:47<01:55,  2.75s/it]

96.0


 73%|███████▎  | 109/150 [01:51<02:08,  3.13s/it]

79.0


 73%|███████▎  | 110/150 [01:54<01:58,  2.97s/it]

93.0


 74%|███████▍  | 111/150 [01:56<01:52,  2.89s/it]

95.0


 75%|███████▍  | 112/150 [02:00<01:57,  3.08s/it]

121.0


 75%|███████▌  | 113/150 [02:06<02:27,  3.99s/it]

219.0


 76%|███████▌  | 114/150 [02:11<02:34,  4.28s/it]

179.0


 77%|███████▋  | 115/150 [02:19<03:04,  5.28s/it]

187.0


 77%|███████▋  | 116/150 [02:31<04:07,  7.28s/it]

435.0


 78%|███████▊  | 117/150 [02:41<04:32,  8.25s/it]

393.0


 79%|███████▊  | 118/150 [02:49<04:19,  8.12s/it]

184.0


 79%|███████▉  | 119/150 [02:55<03:56,  7.63s/it]

241.0


 80%|████████  | 120/150 [03:01<03:28,  6.95s/it]

200.0


 81%|████████  | 121/150 [03:14<04:17,  8.90s/it]

500.0


 81%|████████▏ | 122/150 [03:30<05:03, 10.85s/it]

443.0


 82%|████████▏ | 123/150 [03:36<04:20,  9.64s/it]

251.0


 83%|████████▎ | 124/150 [03:50<04:41, 10.81s/it]

500.0


 83%|████████▎ | 125/150 [04:02<04:43, 11.34s/it]

451.0


 84%|████████▍ | 126/150 [04:21<05:22, 13.43s/it]

500.0


 85%|████████▍ | 127/150 [04:30<04:36, 12.04s/it]

322.0


 85%|████████▌ | 128/150 [04:43<04:36, 12.58s/it]

500.0


 86%|████████▌ | 129/150 [04:58<04:36, 13.15s/it]

492.0


 87%|████████▋ | 130/150 [05:05<03:46, 11.34s/it]

249.0


 87%|████████▋ | 131/150 [05:12<03:09,  9.99s/it]

236.0


 88%|████████▊ | 132/150 [05:24<03:12, 10.68s/it]

243.0


 89%|████████▊ | 133/150 [05:30<02:37,  9.24s/it]

209.0


 89%|████████▉ | 134/150 [05:35<02:07,  7.99s/it]

184.0


 90%|█████████ | 135/150 [05:39<01:41,  6.78s/it]

146.0


 91%|█████████ | 136/150 [05:44<01:25,  6.12s/it]

168.0


 91%|█████████▏| 137/150 [05:47<01:08,  5.26s/it]

112.0


 92%|█████████▏| 138/150 [05:49<00:52,  4.34s/it]

71.0


 93%|█████████▎| 139/150 [05:52<00:41,  3.81s/it]

91.0


 93%|█████████▎| 140/150 [05:54<00:35,  3.51s/it]

98.0


 94%|█████████▍| 141/150 [06:00<00:35,  3.98s/it]

180.0


 95%|█████████▍| 142/150 [06:07<00:39,  4.91s/it]

255.0


 95%|█████████▌| 143/150 [06:14<00:39,  5.68s/it]

280.0


 96%|█████████▌| 144/150 [06:22<00:37,  6.23s/it]

279.0


 97%|█████████▋| 145/150 [06:35<00:42,  8.47s/it]

247.0


 97%|█████████▋| 146/150 [06:49<00:40, 10.07s/it]

500.0


 98%|█████████▊| 147/150 [07:03<00:33, 11.17s/it]

500.0


 99%|█████████▊| 148/150 [07:17<00:24, 12.03s/it]

500.0


 99%|█████████▉| 149/150 [07:19<00:09,  9.12s/it]

80.0


100%|██████████| 150/150 [07:22<00:00,  2.95s/it]

114.0





In [None]:
env.close()

In [11]:
#lets test the nn
for i in tqdm(range(500)):
    observation = env.reset()
    observation = np.expand_dims(observation, axis=0)
    total_reward = 0
    for j in range(1000):
        env.render()
        nn_out = model_2.predict(observation)
        action = policy(nn_out,0)
        print(nn_out[0])
        print(action)
        observation,reward,done,info = env.step(action)
        observation = np.expand_dims(observation, axis=0)
        total_reward = total_reward + reward
        if done:
            print("episode ended")
            break
env.close()


  0%|          | 0/500 [00:00<?, ?it/s][A

[79.41802  80.228676]
1
[79.88904 77.98007]
0
[79.7922  80.12785]
1
[79.79381 77.46305]
0
[80.15454 79.97158]
0
[76.915276 79.88707 ]
1
[80.14157 79.75447]
0
[77.2135  79.88257]
1
[80.132355 79.553185]
0
[77.49714  79.878746]
1
[80.12646 79.3618 ]
0
[77.772896 79.87449 ]
1
[80.119484 79.18199 ]
0
[78.04724 79.86868]
1
[80.11024  79.010155]
0
[78.3265  79.86023]
1
[80.10439 78.83025]
0
[78.61708 79.84793]
1
[80.101654 78.6372  ]
0
[78.925575 79.830536]
1
[80.101776 78.425606]
0
[79.25896 79.80666]
1
[80.05612  78.105095]
0
[79.61746 79.76999]
1
[79.99204 77.71726]
0
[79.63201 79.47102]
0
[76.699646 79.40876 ]
1
[79.62061 79.35282]
0
[76.95575 79.51327]
1
[79.612595 79.26398 ]
0
[77.18036 79.6185 ]
1
[79.60743 79.20015]
0
[77.3781  79.72436]
1
[79.60468  79.157646]
0
[77.55287 79.83075]
1
[79.60397 79.13336]
0
[77.70793  79.937584]
1
[79.60499 79.12472]
0
[77.846054 80.04482 ]
1
[79.60745 79.12955]
0
[77.96952 80.15243]
1
[79.611115 79.14604 ]
0
[78.080246 80.260315]
1
[79.61579 79.17271


  0%|          | 1/500 [00:09<1:18:40,  9.46s/it][A

[59.57892 57.07643]
0
[57.181557 56.835228]
0
[53.943108 55.065994]
1
[56.70844 56.05567]
0
episode ended
[79.69784 80.28251]
1
[79.76599  77.879166]
0
[79.91855 80.19908]
1
[79.708336 77.511795]
0
[80.1535  80.07321]
0
[76.818245 79.979546]
1
[80.14122 79.976  ]
0
[76.97207 79.99902]
1
[80.125854 79.90497 ]
0
[77.10804 80.01513]
1
[80.11358  79.845505]
0
[77.229546 80.03298 ]
1
[80.10392 79.79543]
0
[77.33937 80.05222]
1
[80.0965  79.75278]
0
[77.439926 80.07247 ]
1
[80.09099  79.715836]
0
[77.53343 80.09341]
1
[80.08713  79.683044]
0
[77.62183 80.11472]
1
[80.08468  79.652954]
0
[77.70699  80.136116]
1
[80.08346 79.62423]
0
[77.7906   80.157326]
1
[80.08333  79.595566]
0
[77.87436 80.17804]
1
[80.08415  79.565704]
0
[77.95988  80.197975]
1
[80.08583 79.53332]
0
[78.04885 80.21683]
1
[80.08828 79.4971 ]
0
[78.143005 80.234245]
1
[80.09146  79.455605]
0
[78.2442  80.24986]
1
[80.09532 79.40732]
0
[78.35444  80.263275]
1
[80.09986 79.35054]
0
[78.47591 80.27404]
1
[80.10505  79.283424]



  0%|          | 2/500 [00:17<1:15:56,  9.15s/it][A


1
episode ended
[79.15304 80.16695]
1
[79.77528  78.412544]
0
[79.16611 80.13332]
1
[79.77285 78.30936]
0
[79.22415 80.09148]
1
[79.75895 78.15487]
0
[79.328316 80.040886]
1
[79.73356  77.946335]
0
[79.48089  79.980835]
1
[79.69636  77.679665]
0
[79.68532  79.910324]
1
[79.64673 77.34941]
0
[79.94636 79.82817]
0
[76.550316 79.629135]
1
[80.03464 79.82083]
0
[76.67871 79.67273]
1
[80.0266  79.76962]
0
[76.78438 79.71692]
1
[80.01999 79.73715]
0
[76.869644 79.76161 ]
1
[80.01026  79.729225]
0
[76.93633 79.80679]
1
[80.0026  79.73477]
0
[76.985756 79.85247 ]
1
[79.996635 79.75303 ]
0
[77.0188 79.8986]
1
[79.99205  79.783615]
0
[77.03592 79.9452 ]
1
[79.98857 79.82644]
0
[77.03716 79.99232]
1
[79.986   79.88174]
0
[77.02218 80.03993]
1
[79.98408 79.95002]
0
[76.99025 80.08812]
1
[79.98264  80.032166]
1
[79.65787 77.46147]
0
[80.01593 79.8961 ]
0
[77.10198 80.10908]
1
[80.01182 79.93501]
0
[77.108246 80.15604 ]
1
[80.0087  79.98667]
0
[77.09795 80.20355]
1
[80.00633  80.051506]
1
[79.69099

KeyboardInterrupt: 

In [12]:
env.close()

In [None]:
# random action
for i in tqdm(range(50)):
    observation = env.reset()
    observation = np.expand_dims(observation, axis=0)
    for j in range(1000):
        env.render()
        observation,reward,done,info = env.step(env.action_space.sample())
        print(reward,done)
        if done:
            break