In [12]:
!pip install gym
!pip install tensorflow



## Constants

In [13]:
# Step size handed to the Adam optimizer when the model is compiled.
LEARNING_RATE = 0.001
# Discount factor (gamma) applied to the best predicted next-state value
# when the Q-learning target is computed.
DISCOUNT_RATE = 0.95

# Factor epsilon is multiplied by after each replay pass.
EPSILON_DECAY = 0.995
# Decay is only applied while epsilon is above this value.
EPSILON_MIN = 0.1

# Number of transitions sampled from memory per replay pass; replay is
# skipped while the memory holds fewer than this many transitions.
BATCH_SIZE = 32
# Maximum length of the replay memory deque (oldest entries are dropped).
MAX_MEM = 2000

# Number of episodes the training/evaluation loop runs.
N_EPISODES = 1000
# Upper bound on environment steps within a single episode.
MAX_TIMESTEP = 10000

## Imports

In [14]:
import gym

import random
import numpy as np
from collections import deque

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

## Environment

In [15]:
# Notebook-level environment handle; note that train() creates its own
# environment from the name it is given and derives the sizes again there.
env = gym.make('CartPole-v0')
# Length of the first axis of the observation space shape.
states = env.observation_space.shape[0]
# Number of actions reported by the action space (`action_space.n`).
actions = env.action_space.n

## Model

In [16]:
def build_model(states, actions, file_name, train):
    """Create and compile the Q-value network.

    The network flattens an input of shape ``(states,)``, passes it through
    two 24-unit ReLU layers and ends in a linear layer with one output per
    action. It is compiled with Adam (``LEARNING_RATE``) and an MSE loss.

    Args:
        states: Size of the observation vector.
        actions: Number of output units (one Q-value per action).
        file_name: Base name of the weights file; ``".h5"`` is appended.
        train: When falsy, previously saved weights are loaded from
            ``file_name + ".h5"`` before the model is returned.

    Returns:
        The compiled Keras model.
    """
    network = Sequential([
        Flatten(input_shape=(states, )),
        Dense(24, activation="relu"),
        Dense(24, activation="relu"),
        Dense(actions, activation='linear'),
    ])
    network.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss='mse')

    # Fresh weights are only wanted for training runs.
    if train:
        return network

    network.load_weights(file_name + ".h5")
    return network

## Trainer

In [17]:
class QTrainer:
    """Applies one-transition Q-learning updates to a Keras-style model.

    The model only has to provide ``predict(x, verbose=...)`` and
    ``fit(x, y, verbose=...)``.
    """

    def __init__(self, model, lr, gamma) -> None:
        """Store the model and hyper-parameters.

        Args:
            model: Object exposing ``predict`` and ``fit``.
            lr: Learning rate; kept on the instance but not read by
                ``train_step``.
            gamma: Discount factor applied to the best next-state Q-value.
        """
        self.lr = lr
        self.gamma = gamma
        self.model = model

    def train_step(self, state, action, reward, next_state, done):
        """Fit the model towards the Q-learning target for one transition.

        The target for ``action`` is ``reward`` when ``done`` is truthy,
        otherwise ``reward + gamma * max(Q(next_state))``. The Q-values of
        the other actions are left at the model's current predictions, so
        only the taken action contributes to the loss.

        Args:
            state: Model input for the state the action was taken in.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: Model input for the resulting state.
            done: Whether the episode ended with this transition.
        """
        target = reward

        if not done:
            # verbose=0 keeps predict() silent, matching the fit() call below;
            # otherwise every single lookup may emit progress output.
            next_q_values = self.model.predict(next_state, verbose=0)[0]
            target = reward + self.gamma * np.amax(next_q_values)

        target_f = self.model.predict(state, verbose=0)
        # Overwrite only the entry of the action that was actually taken.
        target_f[0][action] = target

        self.model.fit(state, target_f, verbose=0)

## Agent

In [18]:
class Agent:
    """Epsilon-greedy agent with a bounded replay memory and a Q-network."""

    def __init__(self, actions, states, file_name, train):
        """Build the model, trainer and empty replay memory.

        Args:
            actions: Number of available actions.
            states: Size of the observation vector.
            file_name: Base name of the weights file used by ``build_model``.
            train: Forwarded to ``build_model``; when falsy, saved weights
                are loaded.
        """
        self.n_episodes = 0
        self.epsilon = 1  # randomness
        self.actions = actions
        self.memory = deque(maxlen=MAX_MEM)
        self.model = build_model(states, actions, file_name, train)
        self.trainer = QTrainer(model=self.model, lr=LEARNING_RATE, gamma=DISCOUNT_RATE)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state, train):
        """Pick an action for ``state``.

        When ``train`` is truthy a random action is chosen with probability
        ``epsilon``; otherwise (and always when ``train`` is falsy) the action
        with the highest predicted Q-value is chosen.

        Returns:
            The action index as a plain ``int`` on both branches.
        """
        if train and np.random.rand() <= self.epsilon:
            return random.randrange(self.actions)

        # verbose=0 keeps predict() silent for this per-step call.
        prediction = self.model.predict(state, verbose=0)
        # int() so the greedy branch returns the same type as randrange().
        return int(np.argmax(prediction[0]))

    def train_long_memory(self):
        """Replay a random mini-batch from memory and decay epsilon.

        Returns:
            The number of transitions replayed: ``0`` when the memory holds
            fewer than ``BATCH_SIZE`` transitions, ``BATCH_SIZE`` otherwise.
        """
        if len(self.memory) < BATCH_SIZE:
            return 0

        mini_sample = random.sample(self.memory, BATCH_SIZE)  # list of tuples

        for state, action, reward, next_state, done in mini_sample:
            self.trainer.train_step(state, action, reward, next_state, done)

        if self.epsilon > EPSILON_MIN:
            # Clamp so the final decay step cannot push epsilon below the floor.
            self.epsilon = max(EPSILON_MIN, self.epsilon * EPSILON_DECAY)

        return len(mini_sample)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Run a single training step on one transition."""
        self.trainer.train_step(state, action, reward, next_state, done)

## Training

In [19]:
def _reset_env(env, states):
    """Reset ``env`` and return the first observation shaped ``(1, states)``."""
    result = env.reset()
    # gym >= 0.26 returns (observation, info); older releases return the
    # observation on its own.
    observation = result[0] if isinstance(result, tuple) else result
    return np.asarray(observation).reshape(1, states)


def _step_env(env, action, states):
    """Step ``env`` once and return ``(observation, reward, done)``."""
    result = env.step(action)
    if len(result) == 5:
        # gym >= 0.26: (observation, reward, terminated, truncated, info)
        observation, reward, terminated, truncated, _ = result
        done = terminated or truncated
    else:
        # Classic API: (observation, reward, done, info)
        observation, reward, done, _ = result
    return np.asarray(observation).reshape(1, states), reward, done


def train(env_name, train=True):
    """Run ``N_EPISODES`` episodes of ``env_name`` with a DQN agent.

    With ``train`` truthy, every transition is stored in the agent's replay
    memory, one replay pass is run after each episode, and the weights are
    written to ``env_name + ".h5"`` once all episodes have finished. With
    ``train`` falsy, the agent acts greedily with the weights loaded by
    ``build_model`` and nothing is learned or saved.

    Args:
        env_name: Gym environment id; also the base name of the weights file.
        train: Whether to explore, learn and save (note that inside this
            function the name ``train`` refers to this flag, not the function).
    """
    env = gym.make(env_name)
    try:
        states = env.observation_space.shape[0]
        actions = env.action_space.n

        agent = Agent(actions, states, env_name, train)

        for episode in range(N_EPISODES):
            old_state = _reset_env(env, states)

            total_reward = 0
            agent.n_episodes += 1

            for t in range(MAX_TIMESTEP):
                # env.render()  # uncomment to view the episode
                action = agent.get_action(old_state, train=train)

                new_state, reward, done = _step_env(env, action, states)

                if train:
                    agent.remember(old_state, action, reward, new_state, done)

                total_reward += reward
                old_state = new_state

                if done:
                    break

            if train:
                agent.train_long_memory()

            print("episode: {}/{} | score: {} | e: {:.3f}".format(episode +
                  1, N_EPISODES, total_reward, agent.epsilon))

        if train:
            agent.model.save_weights(env_name + ".h5", overwrite=True)
    finally:
        # Release the environment even if the run is interrupted or fails.
        env.close()

## Main

In [20]:
# Evaluation run: with train=False, build_model loads weights from
# "CartPole-v0.h5", a file train() only writes at the end of a train=True run,
# so that file has to exist before this cell is executed.
if __name__ == "__main__":
    train('CartPole-v0', train=False)

episode: 1/1000 | score: 200.0 | e: 1.000
episode: 2/1000 | score: 200.0 | e: 1.000
episode: 3/1000 | score: 200.0 | e: 1.000
episode: 4/1000 | score: 200.0 | e: 1.000
episode: 5/1000 | score: 200.0 | e: 1.000
episode: 6/1000 | score: 200.0 | e: 1.000
episode: 7/1000 | score: 200.0 | e: 1.000
episode: 8/1000 | score: 200.0 | e: 1.000


KeyboardInterrupt: 