In [70]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import gym

# inspired by following implementations:

[Kim Keon](https://github.com/keon/deep-q-learning/blob/master/dqn_batch.py)

[Machine Learning with Phil](https://www.youtube.com/watch?v=5fHngyN8Qhw&t=8s)

[Jon Krohn](https://www.youtube.com/watch?v=OYhFoMySoVs&t=905s)

In [23]:
np.amax([1, 2, 3, 10])

10

In [10]:
deque(maxlen = 1000)

deque([])

In [49]:
def build_dq_network(input_size, action_size, fc1_size, fc2_size, lr):
    """Build Deep Q network"""
    model = Sequential()
    model.add(Dense(fc1_size, activation='relu', input_dim=input_size))
    model.add(Dense(fc2_size, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam(lr=lr))
    return model

In [50]:
model = build_dq_network(4, 2, 24, 24, 0.002)

In [155]:
model.predict(np.array([[1.0, 2.0, 3.0, 10.0]]))[0]

array([0.55645853, 2.028011  ], dtype=float32)

In [141]:
class ReplayBuffer():
    
    def __len__(self):
        return len(self.memory)
    
    def __init__(self, max_length, batch_size):
        self.memory = deque(maxlen=max_length)
        self.max_length = max_length
        self.batch_size = batch_size

    def save_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def sample_buffer(self):
        """samples random observations"""
        return random.sample(self.memory, self.batch_size)
        
    
    

In [152]:
buffer = ReplayBuffer(10, 3)
for _ in range(3):
    state, action, reward, next_state, done = [np.random.normal() for _ in range(5)]
    buffer.save_transition(state, action, reward, next_state, done)

In [153]:
len(buffer)

3

In [139]:
class Agent

SyntaxError: invalid syntax (<ipython-input-139-b2ed39a9d883>, line 1)

In [140]:
env = gym.make('CartPole-v0')

In [81]:
env.reset()

array([-0.00634945, -0.01501653, -0.04470524,  0.04936773])

In [89]:
env.step(0)

(array([-0.00747663, -0.01269823, -0.04239002, -0.00182496]), 1.0, False, {})

In [182]:
class DQNAgent():
    
    def __init__(self, state_size, action_size, gamma=0.99, epsilon=1.0, epsilon_decay=0.996, epsilon_min=0.01, learning_rate=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        
        self.memory = ReplayBuffer(2000, 32)
        self.model = build_dq_network(state_size, action_size, 24, 24, learning_rate)
        
    def memorize(self, state, action, reward, next_state, done):
        self.memory.save_transition(state, action, reward, next_state, done)
    
    def act(self, state):
        """Takes action (from state) and predicts the Q-value for each action and chooses the maximum over that"""
        if np.random.uniform(0, 1) <= self.epsilon:
            return np.random.randint(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])
    
    
    def replay(self):
        minibatch = self.memory.sample_buffer()
        # target_f : value_function
        states, Vs = [], []
        for state, action, reward, next_state, done in minibatch:
            # reward in terminating period
            V = reward
            if not done:
                # EV_next is the expected Value function in next state. the [0] just handles subsetting array. 
                # amax (takes highes value of array)
                EV_next = np.amax(self.model.predict(next_state)[0])
                V = reward + self.gamma * EV_next
                
            target_f = self.model.predict(state)[0]
            target_f[action] = V
            states.append(state[0]), Vs.append(target_f)
            history = self.model.fit(np.array(states), np.array(Vs), epochs=1, verbose=0)
        
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss
    
    def load(self, name):
        self.model.load_weights(name)
    
    def save(self, name):
        self.model.save_weights(name)
    
    
    

In [183]:
DQNAgent(6, 2, epsilon=0.1).act([[np.random.normal() for _ in range(6)]])

1

In [184]:
EPISODES = 1000

In [185]:
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        # env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.memorize(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > agent.memory.batch_size:
            loss = agent.replay()
            # Logging training loss every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, loss: {:.4f}"
                    .format(e, EPISODES, time, loss))  

episode: 0/1000, score: 12, e: 1.0
episode: 1/1000, score: 11, e: 1.0
episode: 2/1000, time: 10, loss: 2.5407
episode: 2/1000, score: 13, e: 0.98
episode: 3/1000, time: 0, loss: 8.0056
episode: 3/1000, time: 10, loss: 45.8228
episode: 3/1000, score: 13, e: 0.93
episode: 4/1000, time: 0, loss: 117.5594
episode: 4/1000, time: 10, loss: 389.1401
episode: 4/1000, score: 17, e: 0.87
episode: 5/1000, time: 0, loss: 1010.9200
episode: 5/1000, time: 10, loss: 200.1587
episode: 5/1000, score: 13, e: 0.82
episode: 6/1000, time: 0, loss: 434.3073
episode: 6/1000, score: 9, e: 0.79
episode: 7/1000, time: 0, loss: 484.7032
episode: 7/1000, time: 10, loss: 43.4319


KeyboardInterrupt: 

NameError: name 'self' is not defined