In [1]:
import tensorflow as tf 
from tensorflow import keras 
import matplotlib.pyplot as plt
import numpy as np



In [2]:
physical_devices = tf.config.list_physical_devices(
    device_type=None
)



2022-02-11 08:48:46.433625: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-11 08:48:46.438755: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-02-11 08:48:46.438769: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [3]:
physical_devices


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [4]:
tf.config.experimental.set_memory_growth(physical_devices[1], True)


IndexError: list index out of range

In [None]:
env = gym.make('LunarLanderContinuous-v2')
import gym
from gym import wrappers

env.reset()

In [None]:
print( 'state space :' ,env.observation_space)
print( 'action space :' ,env.action_space)

In [6]:
class Noise(object):
    def __init__(self, mu, sigma = 0.15,theta =0.2,dt = 1e-2,x0=None):
        self.theta = theta 
        self.mu = mu
        self.sigma = sigma 
        self.dt = dt
        self.x0 = x0
        self.reset()
        
    def __call__(self):
        x = self.x_prev + self.theta*(self.mu-self.x_prev)*self.dt+\
        self.sigma*np.sqrt(self.dt)*np.random.normal(size = self.mu.shape)
        self.prev_x = x 
        return x
    
    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

In [5]:
class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_ctr = 0 
        self.state_memory = np.zeros((self.mem_size,input_shape))
        self.new_state_memory = np.zeros((self.mem_size,input_shape))
        self.action_memory = np.zeros((self.mem_size,n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size,dtype = np.float32)
        
    def store_transition(self,state,action,reward,next_state,done):
        index = self.mem_ctr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = next_state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done 
        self.mem_ctr+=1
        
    def sample_buffer(self,batch_size):
        max_mem = min(self.mem_ctr,self.mem_size)
        batch = np.random.choice(max_mem, batch_size)
        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        terminal = self.terminal_memory[batch]
        
        return states, actions, rewards, new_states, terminal 


class Actor(tf.keras.Model):
    def __init__(self,n_actions,action_bounds,input_dims):
        super(Actor,self).__init__()
        self.list_layers = []
        self.n_layers = 2
        self.action_bounds = action_bounds
        self.list_layers.append(keras.layers.Input(shape=(input_dims,)))
        for i in range(self.n_layers):
            self.list_layers.append(keras.layers.BatchNormalization())
            self.list_layers.append(tf.keras.layers.Dense(512,activation = 'relu'))
        self.list_layers.append(keras.layers.BatchNormalization())
        self.list_layers.append(keras.layers.Dense(n_actions,activation = 'tanh'))
        self.actor = keras.Sequential(self.list_layers)
    def __call__(self,inputs):
        Z = self.actor(inputs)
        v_min,v_max = self.action_bounds
        Z = tf.clip_by_value(Z,v_min,v_max)
        return Z 
    
class Critic(tf.keras.Model):
    def __init__(self,input_dims,n_actions,num_layers = 2):
        super(Critic,self).__init__()
        self.list_layers = []
        self.list_layers.append(keras.layers.Input(shape=(input_dims+n_actions,)))
        for i in range(num_layers):
            self.list_layers.append(keras.layers.Dense(512,activation = 'relu'))
        self.list_layers.append(keras.layers.Dense(1))
        self.critic = keras.Sequential(self.list_layers)
    def call(self,state,action):
        Z = self.critic(tf.concat([state,action],axis = 1 ))
        return Z 

            
        

In [6]:
class Agent(object):
    
    def __init__(self,memory_size,n_actions,action_bounds,input_dims,optimizers,gamma=0.999):
        self.action_bounds = action_bounds
        self.Actor_local = Actor(n_actions=n_actions,action_bounds= action_bounds,input_dims=input_dims)
        self.Actor_target = Actor(n_actions=n_actions,action_bounds= action_bounds,input_dims=input_dims)
        self.Critic_local = Critic(input_dims,n_actions)
        self.Critic_target = Critic(input_dims,n_actions)
        self.ReplayBuffer = ReplayBuffer(memory_size,input_dims,n_actions)
        self.actor_opt,self.critic_opt = optimizers
        self.Actor_local.compile(loss = 'mse',optimizer = self.actor_opt)
        self.Critic_local.compile(loss = 'mse',optimizer = self.critic_opt)
        self.h = tf.keras.losses.Huber()
        #self.noise = Noise()
        self.gamma = gamma
        
    def remember(self,exp):
        state,action,reward,next_state,done = exp
        self.ReplayBuffer.store_transition(state,action,reward,next_state,done )
    
    def choose_action(self,state,noise=True):
        action = self.Actor_local(state[np.newaxis]).numpy()[0]
        if noise:
            action+= np.random.normal(0,0.05)
        

        return np.clip(action,self.action_bounds[0],self.action_bounds[1])
    
    def soft_update(self,eta= 0.001):
        target_weights_actor = self.Actor_target.get_weights()
        local_weights_actor = self.Actor_local.get_weights()
        target_weights_critic = self.Critic_target.get_weights()
        local_weights_critic = self.Critic_local.get_weights()
        update_weights_actor = []
        update_weights_critic = []
        for i in range(len(target_weights_actor)):
            update_weights_actor.append((1-eta)*target_weights_actor[i]+eta*local_weights_actor[i])
        self.Actor_target.set_weights(update_weights_actor)
        for i in range(len(target_weights_critic)):
            update_weights_critic.append((1-eta)*target_weights_critic[i]+eta*local_weights_critic[i])
        self.Critic_target.set_weights(update_weights_critic)
        
        
    def learn(self,batch_size):
        exp = self.ReplayBuffer. sample_buffer(batch_size)
        states,actions,rewards,next_states,dones = exp 
        
        with tf.GradientTape() as tape_critic:
            next_actions = self.Actor_target(next_states)
            next_critic_values = tf.squeeze(self.Critic_target(next_states,next_actions),1)
            target_values = rewards + self.gamma*(1-dones)*next_critic_values
            critic_values = self.Critic_local(states,actions)
            loss_critic = self.h(target_values,critic_values)
        gradient_critic = tape_critic.gradient(loss_critic,self.Critic_local.trainable_variables)
        self.critic_opt.apply_gradients(zip(gradient_critic,self.Critic_local.trainable_variables))
        
        with tf.GradientTape() as tape_actor:
            critic_policy_actions = self.Actor_local(states)
            critic_policy_values = self.Critic_local(states,critic_policy_actions)
            actor_loss = -tf.reduce_mean(critic_policy_values)
            
        gradient_actor = tape_actor.gradient(actor_loss, self.Actor_local.trainable_variables)
        self.actor_opt.apply_gradients(zip(gradient_actor,self.Actor_local.trainable_variables))
        
        self.soft_update()
            
            
        
        
        
    

In [7]:
n_actions = 2
input_dims = env.observation_space.shape[0]
memory_size = 2000
action_bounds =(-1,1)
optimizers= (keras.optimizers.Adam(lr=1e-4),keras.optimizers.Adam(lr=1e-4))
batch_size = 64


NameError: name 'env' is not defined

In [11]:
agent = Agent(memory_size,n_actions = n_actions,action_bounds = action_bounds,input_dims = input_dims,optimizers = optimizers)

2022-02-07 22:45:05.925242: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-07 22:45:05.925840: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-07 22:45:05.925972: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: Quadro RTX 4000 computeCapability: 7.5
coreClock: 1.545GHz coreCount: 36 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 387.49GiB/s
2022-02-07 22:45:05.925998: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.s

In [14]:



done = False 
history = []
avg_score = 0
n_episodes = 1500



max_avg_score = 0
lr_step = 0
for ep in range(n_episodes):
    state = env.reset()
    score = 0
    done = False
    step =0
  
    while not done:
        lr_step+=1
        action = agent.choose_action(state)
        next_state,reward,done,info = env.step(action)
        score+=reward
        step+=1 
        exp = (state,action,reward,next_state,done)
        agent.remember(exp)
        state = next_state
        if ep>50: 
            agent.learn(batch_size)
   
    history.append(score)
    avg_score =  np.mean(history[-100:])
    if (ep>50) & (avg_score>=max_avg_score+10):
        max_avg_score = avg_score
        agent.Actor_local.actor.save('Actor_LunarLanderContinuous_1.hf5')
        agent.Critic_local.critic.save('Critic_LunarLanderContinuous_1.hf5')
        
        
    print('episode :', ep ,'score %.2f average_score %.2f  '%(score,avg_score))
        
        
        

episode : 0 score 177.61 average_score 177.61  
episode : 1 score -51.42 average_score 63.09  
episode : 2 score 236.05 average_score 120.75  
episode : 3 score 211.54 average_score 143.45  
episode : 4 score -34.31 average_score 107.89  
episode : 5 score 265.38 average_score 134.14  
episode : 6 score 167.94 average_score 138.97  
episode : 7 score 167.77 average_score 142.57  
episode : 8 score 125.66 average_score 140.69  
episode : 9 score 233.47 average_score 149.97  
episode : 10 score 230.53 average_score 157.29  
episode : 11 score 139.77 average_score 155.83  
episode : 12 score 244.81 average_score 162.68  
episode : 13 score 262.70 average_score 169.82  
episode : 14 score 276.22 average_score 176.91  
episode : 15 score 210.82 average_score 179.03  
episode : 16 score 4.56 average_score 168.77  
episode : 17 score 202.51 average_score 170.64  
episode : 18 score 157.21 average_score 169.94  
episode : 19 score 173.83 average_score 170.13  
episode : 20 score 198.56 average

2022-02-08 09:12:24.836173: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: Actor_LunarLanderContinuous_1.hf5/assets
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
INFO:tensorflow:Assets written to: Critic_LunarLanderContinuous_1.hf5/assets
episode : 51 score 220.82 average_score 190.29  
episode : 52 score 221.93 average_score 190.89  
episode : 53 score 205.24 average_score 191.15  
episode : 54 score 217.37 average_score 191.63  
episode : 55 score -242.31 average_score 183.88  
episode : 56 score 265.75 average_score 185.32  
episode : 57 score 68.23 average_score 183.30  
episode : 58 score 182.76 average_score 183.29  
episode :

episode : 195 score -3.20 average_score 137.16  
episode : 196 score 180.70 average_score 136.15  
episode : 197 score -266.35 average_score 130.88  
episode : 198 score 30.55 average_score 128.83  
episode : 199 score 3.98 average_score 128.93  
episode : 200 score 18.12 average_score 129.38  
episode : 201 score -27.84 average_score 126.65  
episode : 202 score -0.33 average_score 126.53  
episode : 203 score 9.02 average_score 124.59  
episode : 204 score -38.52 average_score 123.94  
episode : 205 score -37.21 average_score 120.77  
episode : 206 score -1.73 average_score 118.71  
episode : 207 score -19.06 average_score 118.60  
episode : 208 score 33.60 average_score 116.86  
episode : 209 score 7.37 average_score 119.83  
episode : 210 score 14.66 average_score 118.70  
episode : 211 score -41.55 average_score 118.41  
episode : 212 score 11.25 average_score 118.25  
episode : 213 score 20.28 average_score 115.80  
episode : 214 score 230.01 average_score 117.83  
episode : 215 

episode : 361 score -46.91 average_score -93.39  
episode : 362 score -61.13 average_score -93.82  
episode : 363 score -37.71 average_score -94.07  
episode : 364 score -370.50 average_score -97.65  
episode : 365 score -13.96 average_score -95.91  
episode : 366 score -50.11 average_score -97.95  
episode : 367 score 21.82 average_score -96.56  
episode : 368 score -8.97 average_score -96.88  
episode : 369 score 157.39 average_score -93.33  
episode : 370 score -213.23 average_score -93.70  
episode : 371 score -88.32 average_score -93.90  
episode : 372 score -100.57 average_score -92.54  
episode : 373 score -96.32 average_score -90.80  
episode : 374 score -93.88 average_score -91.55  
episode : 375 score -62.40 average_score -91.06  
episode : 376 score -91.65 average_score -91.56  
episode : 377 score -179.02 average_score -91.29  
episode : 378 score -185.38 average_score -92.26  
episode : 379 score -190.82 average_score -94.29  
episode : 380 score -181.52 average_score -94.

episode : 527 score -194.86 average_score -26.01  
episode : 528 score -309.31 average_score -29.20  
episode : 529 score -242.55 average_score -31.58  
episode : 530 score 76.56 average_score -31.14  
episode : 531 score -248.55 average_score -30.10  
episode : 532 score -237.26 average_score -35.36  
episode : 533 score -473.95 average_score -40.75  
episode : 534 score -207.75 average_score -44.44  
episode : 535 score -518.68 average_score -52.06  
episode : 536 score -485.21 average_score -56.54  
episode : 537 score -436.42 average_score -60.99  
episode : 538 score -552.23 average_score -69.27  
episode : 539 score -491.53 average_score -73.51  
episode : 540 score -498.01 average_score -78.82  
episode : 541 score -362.60 average_score -83.07  
episode : 542 score -546.17 average_score -88.83  
episode : 543 score -553.44 average_score -93.84  
episode : 544 score -486.12 average_score -98.90  
episode : 545 score -499.22 average_score -106.03  
episode : 546 score -286.66 aver

episode : 685 score -543.75 average_score -432.35  
episode : 686 score -538.58 average_score -433.77  
episode : 687 score -372.71 average_score -433.79  
episode : 688 score -400.02 average_score -433.70  
episode : 689 score -344.27 average_score -432.12  
episode : 690 score -436.88 average_score -432.18  
episode : 691 score -576.14 average_score -434.35  
episode : 692 score -351.96 average_score -434.10  
episode : 693 score -429.46 average_score -433.67  
episode : 694 score -546.72 average_score -433.03  
episode : 695 score -367.88 average_score -433.28  
episode : 696 score -524.77 average_score -435.16  
episode : 697 score -541.99 average_score -436.60  
episode : 698 score -355.04 average_score -435.88  
episode : 699 score -484.68 average_score -435.21  
episode : 700 score -410.81 average_score -436.01  
episode : 701 score -584.67 average_score -438.31  
episode : 702 score -559.88 average_score -440.62  
episode : 703 score -440.37 average_score -441.98  
episode : 70

episode : 843 score -584.54 average_score -510.70  
episode : 844 score -438.94 average_score -510.28  
episode : 845 score -416.04 average_score -510.71  
episode : 846 score -543.41 average_score -510.58  
episode : 847 score -406.31 average_score -509.21  
episode : 848 score -630.28 average_score -510.06  
episode : 849 score -684.33 average_score -511.28  
episode : 850 score -628.59 average_score -513.21  
episode : 851 score -583.47 average_score -515.01  
episode : 852 score -514.05 average_score -514.64  
episode : 853 score -535.93 average_score -516.02  
episode : 854 score -593.61 average_score -516.05  
episode : 855 score -447.34 average_score -516.99  
episode : 856 score -439.49 average_score -515.46  
episode : 857 score -654.18 average_score -516.79  
episode : 858 score -674.49 average_score -517.89  
episode : 859 score -504.77 average_score -517.45  
episode : 860 score -439.38 average_score -516.36  
episode : 861 score -494.85 average_score -517.62  
episode : 86

episode : 1001 score -520.53 average_score -540.24  
episode : 1002 score -544.17 average_score -540.94  
episode : 1003 score -654.87 average_score -543.64  
episode : 1004 score -622.14 average_score -543.22  
episode : 1005 score -607.76 average_score -543.19  
episode : 1006 score -557.49 average_score -542.61  
episode : 1007 score -521.30 average_score -543.64  
episode : 1008 score -434.51 average_score -541.84  
episode : 1009 score -455.00 average_score -540.30  
episode : 1010 score -480.11 average_score -539.82  
episode : 1011 score -477.23 average_score -538.68  
episode : 1012 score -484.88 average_score -537.52  
episode : 1013 score -555.53 average_score -538.25  
episode : 1014 score -559.23 average_score -539.26  
episode : 1015 score -488.72 average_score -537.25  
episode : 1016 score -627.40 average_score -539.18  
episode : 1017 score -514.95 average_score -540.12  
episode : 1018 score -467.34 average_score -539.55  
episode : 1019 score -405.80 average_score -53

episode : 1156 score -551.63 average_score -533.21  
episode : 1157 score -667.50 average_score -534.62  
episode : 1158 score -438.64 average_score -533.81  
episode : 1159 score -530.08 average_score -533.90  
episode : 1160 score -474.40 average_score -534.15  
episode : 1161 score -443.99 average_score -533.26  
episode : 1162 score -620.77 average_score -532.96  
episode : 1163 score -693.83 average_score -534.07  
episode : 1164 score -688.14 average_score -536.59  
episode : 1165 score -338.07 average_score -534.15  
episode : 1166 score -657.78 average_score -535.09  
episode : 1167 score -579.37 average_score -534.53  
episode : 1168 score -750.75 average_score -536.06  
episode : 1169 score -447.08 average_score -535.16  
episode : 1170 score -393.35 average_score -535.64  
episode : 1171 score -730.36 average_score -537.84  
episode : 1172 score -508.51 average_score -537.91  
episode : 1173 score -482.54 average_score -536.99  
episode : 1174 score -617.83 average_score -53

episode : 1311 score -493.64 average_score -557.02  
episode : 1312 score -564.81 average_score -557.16  
episode : 1313 score -574.04 average_score -557.22  
episode : 1314 score -703.42 average_score -557.24  
episode : 1315 score -720.70 average_score -558.06  
episode : 1316 score -497.55 average_score -558.27  
episode : 1317 score -428.94 average_score -556.87  
episode : 1318 score -405.79 average_score -554.11  
episode : 1319 score -417.33 average_score -552.96  
episode : 1320 score -419.67 average_score -553.14  
episode : 1321 score -630.89 average_score -555.72  
episode : 1322 score -665.09 average_score -555.48  
episode : 1323 score -478.33 average_score -555.82  
episode : 1324 score -655.30 average_score -556.10  
episode : 1325 score -419.85 average_score -554.32  
episode : 1326 score -494.97 average_score -552.57  
episode : 1327 score -687.57 average_score -553.07  
episode : 1328 score -511.00 average_score -551.77  
episode : 1329 score -727.45 average_score -55

episode : 1466 score -658.10 average_score -577.33  
episode : 1467 score -705.25 average_score -578.16  
episode : 1468 score -640.27 average_score -577.45  
episode : 1469 score -629.48 average_score -578.88  
episode : 1470 score -560.64 average_score -579.21  
episode : 1471 score -547.68 average_score -580.15  
episode : 1472 score -758.10 average_score -581.67  
episode : 1473 score -645.75 average_score -581.99  
episode : 1474 score -656.12 average_score -581.19  
episode : 1475 score -516.92 average_score -579.45  
episode : 1476 score -613.16 average_score -580.80  
episode : 1477 score -540.31 average_score -581.70  
episode : 1478 score -726.53 average_score -584.56  
episode : 1479 score -572.87 average_score -583.94  
episode : 1480 score -675.01 average_score -586.33  
episode : 1481 score -699.37 average_score -588.41  
episode : 1482 score -538.81 average_score -589.69  
episode : 1483 score -562.66 average_score -589.66  
episode : 1484 score -668.30 average_score -58

In [16]:

0.75**2+0.75**2

1.125

{'env': <Monitor<Monitor<Monitor<Monitor<TimeLimit<LunarLanderContinuous<LunarLanderContinuous-v2>>>>>>>,
 '_action_space': None,
 '_observation_space': None,
 '_reward_range': None,
 '_metadata': None,
 'videos': [],
 'stats_recorder': <gym.wrappers.monitoring.stats_recorder.StatsRecorder at 0x7f06541ebf10>,
 'video_recorder': None,
 'enabled': True,
 'episode_id': 0,
 '_monitor_id': 5,
 'env_semantics_autoreset': None,
 'directory': '/home/delcey/Documents/Python/results',
 'video_callable': <function gym.wrappers.monitor.capped_cubic_video_schedule(episode_id)>,
 'file_prefix': 'openaigym',
 'file_infix': '5.3792',
 'write_upon_reset': False}

In [14]:
state = env.reset()
plt.imshow(env.render(mode='rgb_array'))
n = 200
score =0
done = False
while not done:
    action = agent.choose_action(state,epsilon = 0.0)
    state,reward,done,_ = env.step(action)
    
    score+=reward
    env.render('rgb_array')
print(score)

libGL error: MESA-LOADER: failed to open iris: /usr/lib/dri/iris_dri.so: cannot open shared object file: No such file or directory (search paths /usr/lib/x86_64-linux-gnu/dri:\$${ORIGIN}/dri:/usr/lib/dri, suffix _dri)
libGL error: failed to load driver: iris
libGL error: MESA-LOADER: failed to open iris: /usr/lib/dri/iris_dri.so: cannot open shared object file: No such file or directory (search paths /usr/lib/x86_64-linux-gnu/dri:\$${ORIGIN}/dri:/usr/lib/dri, suffix _dri)
libGL error: failed to load driver: iris
libGL error: MESA-LOADER: failed to open swrast: /usr/lib/dri/swrast_dri.so: cannot open shared object file: No such file or directory (search paths /usr/lib/x86_64-linux-gnu/dri:\$${ORIGIN}/dri:/usr/lib/dri, suffix _dri)
libGL error: failed to load driver: swrast


ContextException: Could not create GL context

In [22]:
env.reset()

array([-0.00402613,  1.4208692 , -0.4078105 ,  0.44217455,  0.00467197,
        0.09237514,  0.        ,  0.        ], dtype=float32)