In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices()
tf.config.experimental.set_memory_growth(physical_devices[3], True)

In [2]:
from tensorflow.keras.layers import Dense,Dropout,Conv2D,Flatten,MaxPooling2D
from tensorflow.keras.models import Sequential
from collections import deque
import numpy as np
import gym
import random
import os
from tensorflow.keras.optimizers import Adam

In [3]:
output_dir = os.getcwd()

memory_size = 8_000
gamma = 0.9995
learning_rate= 0.001
mini_batch_size = 32

state_size = (210,160,1)
n_episodes = 10_000

epsilon_max = 0.75
epsilon_min = 0.1

In [4]:
env = gym.make("Breakout-v0")
action_size = env.action_space.n
action_size

4

In [5]:
def process(state1,state2,state3,state4):
    
    def pre(state):
        s1 = state.astype('float')
        s2 = s1/255
        s2 = s2[30:200, 0:160]
        s3 = tf.image.resize(s2, (84, 84))
        s4 = tf.image.rgb_to_grayscale(s3)
        
        return s4
    
    st1 = pre(state1)
    st2 = pre(state2)
    st3 = pre(state3)
    st4 = pre(state4)
    
    states =  np.array(tf.concat((st1,st2,st3,st4),axis=2))
    
    states =  np.expand_dims(states,axis=0)
    
    return states



In [6]:
class DQNAgent:
  def __init__(self,memory_size,action_size):
    self.action_size=action_size
    self.gamma = 0.99
    self.epsilon = 1
    self.epsilon_decay = 0.999999
    self.epsilon_min = 0.1
    self.action_model  = self.build_model()
    self.target_model =  self.build_model()
    self.target_model.set_weights(self.action_model.get_weights())
    self.swap_model = self.build_model()
    self.memory = deque()   
    self.learning_rate = learning_rate

  def build_model(self):
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(8,8),strides=4, activation='relu', input_shape=(84,84,4)))
    model.add(Conv2D(64, kernel_size=(4,4),strides=2, activation='relu'))
    model.add(Conv2D(64, kernel_size=(3,3), strides = 1, activation='relu'))
    model.add(Flatten())
    model.add(Dense(512,activation='relu'))
    model.add(Dense(action_size,activation='linear'))
    model.compile(loss = tf.keras.losses.Huber(),optimizer = tf.keras.optimizers.Adam(lr = 0.00025))
    return model

  def act(self,state, epsilon = epsilon_min):
    if np.random.random()<=epsilon:
        return random.randrange(self.action_size)
    else:
        act_value  = self.action_model.predict(state)
        return np.argmax(act_value[0])

  def load(self,name):
    self.action_model.load_weights(name)
    self.target_model.load_weights(name)

In [7]:
agent = DQNAgent(memory_size,action_size)
agent.load(output_dir + '/' + 'Breakout'+ '.hdf5')

In [8]:
def test():
    done = False
    states = deque()
    env.reset()
    time_step_count = 0
    for i in range(4):
        init_state = env.step(random.randrange(action_size))
        states.append(init_state[0])
    updated_states = states.copy()
    current_states = process(states[0],states[1],states[2],states[3])

    for episode in range(100):
        env.reset()
        cum_reward=0
        for time_step in range(5000):
            action = agent.act(current_states)
            next_state,reward,done,_=env.step(action)
            env.render()
            updated_states.popleft()
            updated_states.append(next_state)
            new_states = process(updated_states[0],updated_states[1],updated_states[2],updated_states[3])
            
            reward = reward if not done else -20
            cum_reward +=reward
            del current_states
            current_states =  new_states.copy()
            del new_states
            
            if time_step_count <2000:
                print(f"Time Step:{time_step_count}, Action: {action}, Reward: {reward}")
            
            time_step_count += 1
            
            if done:
                
                print("time step: {}, average reward: {}, episode: {}".format(time_step_count, cum_reward ,episode))
                break
        
    env.close()    

In [None]:
test()

Time Step:0, Action: 0, Reward: 0.0
Time Step:1, Action: 3, Reward: 0.0
Time Step:2, Action: 3, Reward: 0.0
Time Step:3, Action: 2, Reward: 0.0
Time Step:4, Action: 1, Reward: 0.0
Time Step:5, Action: 0, Reward: 0.0
Time Step:6, Action: 2, Reward: 0.0
Time Step:7, Action: 0, Reward: 0.0
Time Step:8, Action: 0, Reward: 0.0
Time Step:9, Action: 0, Reward: 0.0
Time Step:10, Action: 3, Reward: 0.0
Time Step:11, Action: 0, Reward: 0.0
Time Step:12, Action: 0, Reward: 0.0
Time Step:13, Action: 2, Reward: 0.0
Time Step:14, Action: 3, Reward: 0.0
Time Step:15, Action: 2, Reward: 0.0
Time Step:16, Action: 3, Reward: 0.0
Time Step:17, Action: 3, Reward: 0.0
Time Step:18, Action: 2, Reward: 0.0
Time Step:19, Action: 2, Reward: 0.0
Time Step:20, Action: 2, Reward: 0.0
Time Step:21, Action: 2, Reward: 0.0
Time Step:22, Action: 2, Reward: 0.0
Time Step:23, Action: 2, Reward: 0.0
Time Step:24, Action: 0, Reward: 0.0
Time Step:25, Action: 0, Reward: 0.0
Time Step:26, Action: 3, Reward: 0.0
Time Step:2

Time Step:223, Action: 0, Reward: 0.0
Time Step:224, Action: 0, Reward: 0.0
Time Step:225, Action: 0, Reward: 0.0
Time Step:226, Action: 3, Reward: 0.0
Time Step:227, Action: 2, Reward: 0.0
Time Step:228, Action: 1, Reward: 0.0
Time Step:229, Action: 0, Reward: 0.0
Time Step:230, Action: 3, Reward: 0.0
Time Step:231, Action: 1, Reward: 0.0
Time Step:232, Action: 3, Reward: 0.0
Time Step:233, Action: 2, Reward: 0.0
Time Step:234, Action: 3, Reward: 0.0
Time Step:235, Action: 3, Reward: 0.0
Time Step:236, Action: 3, Reward: 0.0
Time Step:237, Action: 3, Reward: 0.0
Time Step:238, Action: 3, Reward: 0.0
Time Step:239, Action: 2, Reward: 0.0
Time Step:240, Action: 2, Reward: 0.0
Time Step:241, Action: 2, Reward: 0.0
Time Step:242, Action: 2, Reward: 0.0
Time Step:243, Action: 3, Reward: 0.0
Time Step:244, Action: 3, Reward: 0.0
Time Step:245, Action: 0, Reward: 0.0
Time Step:246, Action: 0, Reward: 0.0
Time Step:247, Action: 0, Reward: 0.0
Time Step:248, Action: 0, Reward: 0.0
Time Step:24

Time Step:439, Action: 3, Reward: 0.0
Time Step:440, Action: 3, Reward: 0.0
Time Step:441, Action: 2, Reward: 0.0
Time Step:442, Action: 3, Reward: 0.0
Time Step:443, Action: 2, Reward: 0.0
Time Step:444, Action: 2, Reward: 0.0
Time Step:445, Action: 2, Reward: 0.0
Time Step:446, Action: 3, Reward: 0.0
Time Step:447, Action: 0, Reward: 0.0
Time Step:448, Action: 3, Reward: 0.0
Time Step:449, Action: 3, Reward: 0.0
Time Step:450, Action: 0, Reward: 0.0
Time Step:451, Action: 3, Reward: 0.0
Time Step:452, Action: 0, Reward: 0.0
Time Step:453, Action: 0, Reward: 0.0
Time Step:454, Action: 3, Reward: 0.0
Time Step:455, Action: 3, Reward: 0.0
Time Step:456, Action: 0, Reward: 0.0
Time Step:457, Action: 3, Reward: 0.0
Time Step:458, Action: 2, Reward: 0.0
Time Step:459, Action: 3, Reward: 0.0
Time Step:460, Action: 3, Reward: 0.0
Time Step:461, Action: 0, Reward: 0.0
Time Step:462, Action: 2, Reward: 0.0
Time Step:463, Action: 2, Reward: 0.0
Time Step:464, Action: 0, Reward: 0.0
Time Step:46

Time Step:658, Action: 0, Reward: 0.0
Time Step:659, Action: 3, Reward: 0.0
Time Step:660, Action: 1, Reward: 0.0
Time Step:661, Action: 2, Reward: 0.0
Time Step:662, Action: 2, Reward: 0.0
Time Step:663, Action: 2, Reward: 0.0
Time Step:664, Action: 2, Reward: 0.0
Time Step:665, Action: 2, Reward: 0.0
Time Step:666, Action: 1, Reward: 0.0
Time Step:667, Action: 3, Reward: 0.0
Time Step:668, Action: 3, Reward: 0.0
Time Step:669, Action: 2, Reward: 0.0
Time Step:670, Action: 3, Reward: 0.0
Time Step:671, Action: 3, Reward: 0.0
Time Step:672, Action: 3, Reward: 0.0
Time Step:673, Action: 1, Reward: 0.0
Time Step:674, Action: 0, Reward: 0.0
Time Step:675, Action: 1, Reward: 0.0
Time Step:676, Action: 1, Reward: 0.0
Time Step:677, Action: 1, Reward: 0.0
Time Step:678, Action: 1, Reward: 0.0
Time Step:679, Action: 2, Reward: -20
time step: 680, average reward: -9.0, episode: 0
Time Step:680, Action: 1, Reward: 0.0
Time Step:681, Action: 2, Reward: 0.0
Time Step:682, Action: 0, Reward: 0.0
T

Time Step:877, Action: 0, Reward: 0.0
Time Step:878, Action: 2, Reward: 0.0
Time Step:879, Action: 0, Reward: 0.0
Time Step:880, Action: 1, Reward: 0.0
Time Step:881, Action: 1, Reward: 0.0
Time Step:882, Action: 1, Reward: 0.0
Time Step:883, Action: 3, Reward: 0.0
Time Step:884, Action: 3, Reward: 0.0
Time Step:885, Action: 0, Reward: 0.0
Time Step:886, Action: 0, Reward: 0.0
Time Step:887, Action: 0, Reward: 0.0
Time Step:888, Action: 3, Reward: 0.0
Time Step:889, Action: 0, Reward: 0.0
Time Step:890, Action: 0, Reward: 0.0
Time Step:891, Action: 0, Reward: 0.0
Time Step:892, Action: 0, Reward: 0.0
Time Step:893, Action: 0, Reward: 0.0
Time Step:894, Action: 3, Reward: 0.0
Time Step:895, Action: 3, Reward: 0.0
Time Step:896, Action: 2, Reward: 0.0
Time Step:897, Action: 3, Reward: 0.0
Time Step:898, Action: 2, Reward: 0.0
Time Step:899, Action: 1, Reward: 0.0
Time Step:900, Action: 3, Reward: 1.0
Time Step:901, Action: 0, Reward: 0.0
Time Step:902, Action: 0, Reward: 0.0
Time Step:90

Time Step:1093, Action: 1, Reward: 0.0
Time Step:1094, Action: 0, Reward: 0.0
Time Step:1095, Action: 1, Reward: 0.0
Time Step:1096, Action: 3, Reward: 1.0
Time Step:1097, Action: 1, Reward: 0.0
Time Step:1098, Action: 0, Reward: 0.0
Time Step:1099, Action: 0, Reward: 0.0
Time Step:1100, Action: 3, Reward: 0.0
Time Step:1101, Action: 3, Reward: 0.0
Time Step:1102, Action: 3, Reward: 0.0
Time Step:1103, Action: 2, Reward: 0.0
Time Step:1104, Action: 3, Reward: 0.0
Time Step:1105, Action: 3, Reward: 0.0
Time Step:1106, Action: 3, Reward: 0.0
Time Step:1107, Action: 3, Reward: 0.0
Time Step:1108, Action: 2, Reward: 0.0
Time Step:1109, Action: 1, Reward: 0.0
Time Step:1110, Action: 2, Reward: 0.0
Time Step:1111, Action: 0, Reward: 0.0
Time Step:1112, Action: 0, Reward: 0.0
Time Step:1113, Action: 2, Reward: 0.0
Time Step:1114, Action: 2, Reward: 0.0
Time Step:1115, Action: 2, Reward: 0.0
Time Step:1116, Action: 2, Reward: 0.0
Time Step:1117, Action: 1, Reward: 0.0
Time Step:1118, Action: 3

Time Step:1307, Action: 2, Reward: 0.0
Time Step:1308, Action: 2, Reward: 0.0
Time Step:1309, Action: 3, Reward: 0.0
Time Step:1310, Action: 1, Reward: 0.0
Time Step:1311, Action: 2, Reward: 0.0
Time Step:1312, Action: 2, Reward: 0.0
Time Step:1313, Action: 2, Reward: 0.0
Time Step:1314, Action: 3, Reward: 0.0
Time Step:1315, Action: 0, Reward: 0.0
Time Step:1316, Action: 2, Reward: 0.0
Time Step:1317, Action: 2, Reward: 0.0
Time Step:1318, Action: 0, Reward: 0.0
Time Step:1319, Action: 2, Reward: 0.0
Time Step:1320, Action: 0, Reward: 0.0
Time Step:1321, Action: 0, Reward: 0.0
Time Step:1322, Action: 0, Reward: 0.0
Time Step:1323, Action: 2, Reward: 0.0
Time Step:1324, Action: 3, Reward: 0.0
Time Step:1325, Action: 0, Reward: 0.0
Time Step:1326, Action: 0, Reward: 0.0
Time Step:1327, Action: 0, Reward: 0.0
Time Step:1328, Action: 0, Reward: 0.0
Time Step:1329, Action: 0, Reward: 0.0
Time Step:1330, Action: 3, Reward: 0.0
Time Step:1331, Action: 1, Reward: 0.0
Time Step:1332, Action: 0

Time Step:1519, Action: 1, Reward: 0.0
Time Step:1520, Action: 2, Reward: 0.0
Time Step:1521, Action: 1, Reward: 0.0
Time Step:1522, Action: 1, Reward: 0.0
Time Step:1523, Action: 3, Reward: 0.0
Time Step:1524, Action: 3, Reward: 0.0
Time Step:1525, Action: 3, Reward: 0.0
Time Step:1526, Action: 2, Reward: 0.0
Time Step:1527, Action: 2, Reward: 0.0
Time Step:1528, Action: 2, Reward: 0.0
Time Step:1529, Action: 0, Reward: 0.0
Time Step:1530, Action: 3, Reward: 0.0
Time Step:1531, Action: 3, Reward: 0.0
Time Step:1532, Action: 0, Reward: 0.0
Time Step:1533, Action: 3, Reward: 0.0
Time Step:1534, Action: 0, Reward: 0.0
Time Step:1535, Action: 0, Reward: 0.0
Time Step:1536, Action: 3, Reward: 0.0
Time Step:1537, Action: 3, Reward: 0.0
Time Step:1538, Action: 3, Reward: 0.0
Time Step:1539, Action: 0, Reward: 0.0
Time Step:1540, Action: 0, Reward: 0.0
Time Step:1541, Action: 3, Reward: 0.0
Time Step:1542, Action: 2, Reward: 0.0
Time Step:1543, Action: 3, Reward: 0.0
Time Step:1544, Action: 3

Time Step:1730, Action: 0, Reward: 0.0
Time Step:1731, Action: 0, Reward: 0.0
Time Step:1732, Action: 0, Reward: 0.0
Time Step:1733, Action: 0, Reward: 0.0
Time Step:1734, Action: 2, Reward: 0.0
Time Step:1735, Action: 0, Reward: 0.0
Time Step:1736, Action: 0, Reward: 0.0
Time Step:1737, Action: 1, Reward: 0.0
Time Step:1738, Action: 0, Reward: 0.0
Time Step:1739, Action: 3, Reward: 0.0
Time Step:1740, Action: 0, Reward: 0.0
Time Step:1741, Action: 2, Reward: 0.0
Time Step:1742, Action: 2, Reward: 0.0
Time Step:1743, Action: 2, Reward: 0.0
Time Step:1744, Action: 3, Reward: 0.0
Time Step:1745, Action: 1, Reward: 0.0
Time Step:1746, Action: 1, Reward: 0.0
Time Step:1747, Action: 3, Reward: 0.0
Time Step:1748, Action: 2, Reward: 0.0
Time Step:1749, Action: 1, Reward: 0.0
Time Step:1750, Action: 2, Reward: 0.0
Time Step:1751, Action: 2, Reward: 0.0
Time Step:1752, Action: 2, Reward: 0.0
Time Step:1753, Action: 2, Reward: 0.0
Time Step:1754, Action: 2, Reward: 0.0
Time Step:1755, Action: 2

Time Step:1945, Action: 0, Reward: 0.0
Time Step:1946, Action: 0, Reward: 0.0
Time Step:1947, Action: 0, Reward: 0.0
Time Step:1948, Action: 0, Reward: 0.0
Time Step:1949, Action: 0, Reward: 0.0
Time Step:1950, Action: 0, Reward: 0.0
Time Step:1951, Action: 0, Reward: -20
time step: 1952, average reward: 15.0, episode: 1
Time Step:1952, Action: 0, Reward: 0.0
Time Step:1953, Action: 0, Reward: 0.0
Time Step:1954, Action: 0, Reward: 0.0
Time Step:1955, Action: 0, Reward: 0.0
Time Step:1956, Action: 2, Reward: 0.0
Time Step:1957, Action: 3, Reward: 0.0
Time Step:1958, Action: 1, Reward: 0.0
Time Step:1959, Action: 3, Reward: 0.0
Time Step:1960, Action: 0, Reward: 0.0
Time Step:1961, Action: 3, Reward: 0.0
Time Step:1962, Action: 3, Reward: 0.0
Time Step:1963, Action: 3, Reward: 0.0
Time Step:1964, Action: 3, Reward: 0.0
Time Step:1965, Action: 0, Reward: 0.0
Time Step:1966, Action: 0, Reward: 0.0
Time Step:1967, Action: 0, Reward: 0.0
Time Step:1968, Action: 2, Reward: 0.0
Time Step:1969

Exception ignored in: <function IteratorResourceDeleter.__del__ at 0x7f74fd42ee50>
Traceback (most recent call last):
  File "/home/himanshu/anaconda3/envs/project/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 536, in __del__
    gen_dataset_ops.delete_iterator(
  File "/home/himanshu/anaconda3/envs/project/lib/python3.8/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 1277, in delete_iterator
    _result = pywrap_tfe.TFE_Py_FastPathExecute(
KeyboardInterrupt: 


In [None]:
env.close()