In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

def build_model(state_size, action_size):
    """Create and compile the fully connected network used as the Q-function.

    The network takes an input vector of length ``state_size``, passes it
    through two 24-unit ReLU layers, and produces ``action_size`` linear
    outputs. It is compiled with mean-squared-error loss and an Adam optimizer
    (learning rate 0.001).

    Args:
        state_size: length of the input vector.
        action_size: number of output units.

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden = [Dense(24, activation='relu') for _ in range(2)]
    layers = [
        Input(shape=(state_size,)),
        *hidden,
        Dense(action_size, activation='linear'),
    ]

    network = Sequential(layers)
    network.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
    return network

# Size the network from the environment's spaces: the first dimension of the
# observation shape is the input width, and the number of discrete actions is
# the output width.
# NOTE(review): `env` is not created in the visible cells; it is assumed to be
# defined by an earlier cell.
state_size, action_size = env.observation_space.shape[0], env.action_space.n
model = build_model(state_size=state_size, action_size=action_size)

In [3]:
import random
import tensorflow as tf
from collections import deque

# Epsilon-greedy exploration settings. `epsilon` is the probability threshold
# `act` compares against when deciding to pick a random action; `replay`
# multiplies it by `epsilon_decay` while it is still above `epsilon_min`.
# NOTE(review): `epsilon_min` looks intended as a hard floor - confirm.
epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.99

# Replay buffer of (state, action, reward, next_state, done) tuples. The deque
# is bounded at 2000 entries, so appending to a full buffer drops the oldest.
memory = deque([], 2000)

def remember(state, action, reward, next_state, done):
    """Append one transition to the global replay buffer ``memory``.

    The five arguments are stored together, in this order, as a single tuple
    ``(state, action, reward, next_state, done)``.
    """
    transition = (state, action, reward, next_state, done)
    memory.append(transition)

def replay(batch_size=64, gamma=0.95):
    """Fit the Q-network on one random minibatch drawn from ``memory``.

    Returns immediately (no training, no epsilon decay) while ``memory`` holds
    fewer than ``batch_size`` transitions. Otherwise it samples ``batch_size``
    transitions without replacement and, for each one, replaces the predicted
    Q-value of the action that was taken with the target

        reward                                   if the transition is done
        reward + gamma * max_a Q(next_state, a)  otherwise

    It then fits ``model`` on the resulting targets for one epoch and decays
    the global ``epsilon``.

    Args:
        batch_size: number of transitions to sample and train on.
        gamma: discount factor applied to the bootstrapped next-state value.

    Side effects:
        Updates the weights of the global ``model`` and rebinds the global
        ``epsilon``.
    """
    global epsilon

    if len(memory) < batch_size:
        return

    minibatch = random.sample(memory, batch_size)

    # Each stored state is a 2-D row, so vstack yields one row per transition.
    states = np.vstack([x[0] for x in minibatch])
    actions = np.array([x[1] for x in minibatch])
    rewards = np.array([x[2] for x in minibatch])
    next_states = np.vstack([x[3] for x in minibatch])
    dones = np.array([x[4] for x in minibatch], dtype=bool)

    # verbose=0 keeps these frequent calls from printing a progress bar each
    # time, matching the silent fit() below.
    q_next = model.predict(next_states, verbose=0)
    q_target = model.predict(states, verbose=0)

    # Vectorised target: done transitions use the raw reward, the rest
    # bootstrap from the best next-state value.
    targets = np.where(dones, rewards, rewards + gamma * np.max(q_next, axis=1))
    # Only the taken action's value changes; the other outputs keep their
    # current predictions and so contribute zero error.
    q_target[np.arange(batch_size), actions] = targets

    # NOTE(review): fit() uses Keras' default batch size here, so a minibatch
    # larger than that default is split into several gradient steps.
    model.fit(states, q_target, epochs=1, verbose=0)

    # Decay exploration, clamping so the last multiplication cannot push
    # epsilon below its configured floor.
    if epsilon > epsilon_min:
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

def act(state):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    With probability given by the global ``epsilon`` a uniformly random action
    index in ``range(action_size)`` is returned; otherwise the action with the
    highest value predicted by the global ``model`` is returned.

    Args:
        state: model input holding a single state as its first row (the
            training loop passes an array reshaped to ``[1, state_size]``).

    Returns:
        The chosen action index as a plain Python ``int``.
    """
    if np.random.rand() <= epsilon:
        return random.randrange(action_size)
    # This runs once per environment step; verbose=0 stops Keras from printing
    # a progress bar for every single-row prediction.
    act_values = model.predict(state, verbose=0)
    # Cast so both branches return the same type (argmax yields a NumPy int).
    return int(np.argmax(act_values[0]))

episodes = 10
train_frequency = 5
max_steps = 200         # per-episode cap on environment steps
terminal_penalty = -10  # reward substituted on the step that terminates an episode

# Training loop: run `episodes` episodes, store every transition in the replay
# buffer, and train on a minibatch every `train_frequency` steps.
for e in range(episodes):
    state, _ = env.reset()
    state = np.reshape(state, [1, state_size])
    # Reported if the episode survives the whole step cap without ending.
    score = max_steps

    # `step` rather than `time`, so the loop does not shadow the stdlib module name.
    for step in range(max_steps):
        action = act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)

        # Only a true termination is penalised and stored as terminal. A
        # truncation (e.g. a time limit) is not a failure, so its transition
        # keeps the environment reward and still bootstraps in replay().
        if terminated:
            reward = terminal_penalty
        next_state = np.reshape(next_state, [1, state_size])
        remember(state, action, reward, next_state, terminated)
        state = next_state

        # Either signal ends the episode; the environment must be reset.
        if terminated or truncated:
            score = step
            break

        if step % train_frequency == 0:
            replay(batch_size=64)

    # Printed once per episode, including episodes that reach the step cap.
    print(f"episode: {e+1}/{episodes}, score: {score}, e: {epsilon:.2}")

env.close()

episode: 1/10, score: 18, e: 1.0
episode: 2/10, score: 14, e: 1.0
episode: 3/10, score: 56, e: 0.94
episode: 4/10, score: 22, e: 0.9
episode: 5/10, score: 16, e: 0.86
episode: 6/10, score: 17, e: 0.83
episode: 7/10, score: 11, e: 0.8
episode: 8/10, score: 14, e: 0.78
episode: 9/10, score: 10, e: 0.76
episode: 10/10, score: 15, e: 0.74


In [7]:
# Persist the trained network to "dqn_cartpole.h5", resolved relative to the
# current working directory.
model.save(filepath="dqn_cartpole.h5")