In [2]:
import numpy as np
# import pandas as pd
import random
from collections import deque
from keras.models import Sequential
from keras.layers import Dense,Activation,Conv2D,Flatten

#from keras.layers import MaxPooling2D



In [38]:
class DeepQNetwork:
    """Deep Q-Network agent: an eval network, a target network and a replay buffer.

    The eval network is the one that gets trained and queried for actions.
    The target network supplies the bootstrap value for the TD target and is
    overwritten with the eval network's weights every ``replay_target_iter``
    training steps (one training step == one ``fit`` call on one transition).

    Parameters
    ----------
    n_actions : int
        Number of discrete actions; also the width of the network's output layer.
    learning_rate : float
        Step size used in ``learn`` to move the predicted Q-value toward the
        TD target before fitting.
    reward_decay : float
        Discount factor (gamma) applied to the target network's best next-state value.
    e_greedy : float
        Probability of taking the greedy action in ``choose_action``;
        with probability ``1 - e_greedy`` a random action is taken instead.
    replay_target_iter : int
        Number of training steps between target-network syncs.
    memory_size : int
        Maximum number of transitions kept in the replay buffer.
    batch_size : int
        Number of transitions sampled per ``learn`` call.
    """

    def __init__(
        self,
        n_actions,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replay_target_iter=300,
        memory_size=500,
        batch_size=32
    ):
        self.n_actions = n_actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.memory_size = memory_size
        self.batch_size = batch_size

        # Build the eval network and the target network (same architecture).
        self.eval_net = self._build_net()
        self.target_net = self._build_net()
        # Each build draws its own random initial weights; copy the eval
        # weights over so both networks start from the same parameters.
        self.target_net_para_setting(self.eval_net, self.target_net)

        # Replay buffer: once full, the oldest transition is dropped on append.
        self.memory = deque(maxlen=self.memory_size)

        # Training-step counter and target-network sync period.
        self.learning_step_counter = 0
        self.replay_target_iter = replay_target_iter

    def _build_net(self):
        """Build and compile the convolutional Q-network.

        The network takes inputs of shape (84, 84, 3) and outputs one linear
        value per action. It is compiled with MSE loss and RMSprop.
        """
        model = Sequential()

        # Convolutional feature extractor.
        model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                         input_shape=(84, 84, 3)))
        model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))

        # Flatten the feature maps for the dense layers.
        model.add(Flatten())

        # Hidden dense layer.
        model.add(Dense(512))
        model.add(Activation('relu'))

        # Output layer: one linear Q-value per action.
        model.add(Dense(self.n_actions, activation='linear'))

        model.compile(loss='mse', optimizer='RMSprop')

        return model

    def target_net_para_setting(self, eval_n, target_n):
        """Copy all weights from ``eval_n`` into ``target_n``."""
        target_n.set_weights(eval_n.get_weights())

    def store_transition_in_memory(self, s, a, r, s_, done):
        """Append one ``(state, action, reward, next_state, done)`` transition to the replay buffer."""
        self.memory.append((s, a, r, s_, done))

    def choose_action(self, state):
        """Pick an action for ``state``.

        With probability ``self.epsilon`` the action with the highest predicted
        Q-value is returned; otherwise an action is drawn uniformly at random.
        ``state`` is passed to ``predict`` unchanged, so it must already carry
        whatever batch axis the network expects.
        """
        if np.random.rand() < self.epsilon:
            action_values = self.eval_net.predict(state, verbose=0)
            # Index of the largest Q-value in the first row, i.e. the action id.
            action = np.argmax(action_values[0])
        else:
            action = np.random.choice(self.n_actions)
        return action

    def learn(self):
        """Run one training pass over a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        Each sampled transition triggers one ``fit`` call on the eval network;
        the target network is synced whenever the running count of such calls
        reaches a multiple of ``replay_target_iter``.
        """
        # random.sample raises ValueError when asked for more items than exist.
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)

        for state, action, reward, next_state, done in minibatch:
            # TD target: reward only for terminal transitions, otherwise
            # bootstrap from the target network's best next-state value.
            if done:
                q_target_s_a = reward
            else:
                q_target_s_a = reward + self.gamma * np.max(
                    self.target_net.predict(next_state, verbose=0))

            # Only the taken action's value is changed, so the other outputs
            # contribute zero error to the MSE loss.
            # NOTE(review): the prediction is moved toward the TD target by
            # self.lr (tabular-style update) rather than replaced by it, while
            # the optimizer keeps its own default step size - confirm intended.
            q_eval = self.eval_net.predict(state, verbose=0)
            q_predict_s_a = q_eval[0][action]
            q_predict_s_a += self.lr * (q_target_s_a - q_predict_s_a)
            q_eval[0][action] = q_predict_s_a

            self.eval_net.fit(state, q_eval, verbose=0)
            self.learning_step_counter += 1

            # Check right after the increment so that no multiple of
            # replay_target_iter is skipped when batch_size does not divide it.
            if self.learning_step_counter % self.replay_target_iter == 0:
                self.target_net_para_setting(self.eval_net, self.target_net)
    
    

In [4]:
import gym
import cv2

In [5]:
# Environment: create the Breakout environment and print its action and observation spaces.
env = gym.make('Breakout-v0')
print('action space :',env.action_space)
print('observation space :',env.observation_space)

action space : Discrete(4)
observation space : Box(210, 160, 3)


In [6]:
# Agent: the network gets one output per discrete environment action.
RL = DeepQNetwork(n_actions=env.action_space.n)

In [7]:
RL.memory

deque([])

In [8]:
RL.eval_net

<keras.engine.sequential.Sequential at 0xebe3860>

In [10]:
RL.eval_net.input.shape

TensorShape([None, 84, 84, 3])

In [21]:
# Reset the environment and inspect the shape of the observation it returns.
observation = env.reset()
observation.shape

(210, 160, 3)

In [22]:
# Resize the frame to 84x84 so it matches the network's input_shape of (84, 84, 3).
observation = cv2.resize(src=observation, dsize=(84,84))
observation.shape

(84, 84, 3)

In [23]:
len(observation.shape)

3

In [24]:
# Prepend a batch axis so the frame can be passed to the network's predict().
# NOTE: observation is overwritten here, so re-running this cell adds another axis.
observation = np.expand_dims(observation,axis=0)  # new axis goes first
observation.shape

(1, 84, 84, 3)

In [25]:
# Greedy action with probability epsilon, otherwise a uniformly random action.
action = RL.choose_action(observation)

In [26]:
action

3

In [34]:
# Apply the chosen action; step() returns the next frame, the reward, the done flag and an info dict.
observation_, reward, done, info = env.step(action)

In [35]:
observation_.shape

(210, 160, 3)

In [29]:
reward

0.0

In [30]:
done

False

In [31]:
info

{'ale.lives': 5}

In [36]:
# Same preprocessing as for the first frame: resize to 84x84, then prepend the batch axis.
observation_ = cv2.resize(src=observation_,dsize=(84,84))
observation_ = np.expand_dims(observation_,axis=0)
observation_.shape

(1, 84, 84, 3)

In [40]:
# Save the (state, action, reward, next_state, done) tuple in the agent's replay buffer.
RL.store_transition_in_memory(observation,action,reward,observation_,done)

In [41]:
# NOTE(review): displaying the deque prints the stored frame arrays themselves;
# len(RL.memory) would be a lighter way to confirm the transition was stored.
RL.memory

deque([(array([[[[0, 0, 0],
                 [0, 0, 0],
                 [0, 0, 0],
                 ...,
                 [0, 0, 0],
                 [0, 0, 0],
                 [0, 0, 0]],
        
                [[0, 0, 0],
                 [0, 0, 0],
                 [0, 0, 0],
                 ...,
                 [0, 0, 0],
                 [0, 0, 0],
                 [0, 0, 0]],
        
                [[0, 0, 0],
                 [0, 0, 0],
                 [0, 0, 0],
                 ...,
                 [0, 0, 0],
                 [0, 0, 0],
                 [0, 0, 0]],
        
                ...,
        
                [[0, 0, 0],
                 [0, 0, 0],
                 [0, 0, 0],
                 ...,
                 [0, 0, 0],
                 [0, 0, 0],
                 [0, 0, 0]],
        
                [[0, 0, 0],
                 [0, 0, 0],
                 [0, 0, 0],
                 ...,
                 [0, 0, 0],
                 [0, 0, 0],
        