# AcroBot DQN

### import the dependencies

In [1]:
import random
import gym
import numpy as np
from collections import deque
import tensorflow as tf
from keras.models import Sequential 
from keras.layers import Dense
from keras.optimizers import Adam
import os

### set parameters

In [2]:
gamma = 0.95
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01
learning_rate = 0.001
batch_size = 32
n_episodes = 100

In [3]:
env = gym.make('Acrobot-v1')

In [4]:
state_size = env.observation_space.shape[0]
state_size
env.reset()

(array([ 0.9996594 ,  0.02609696,  0.99999917, -0.00130742, -0.06038488,
         0.0350698 ], dtype=float32),
 {})

In [5]:
action_size = env.action_space.n
action_size

3

In [6]:
output_dir = 'model_output/AcroBot'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

### Define Agent

In [7]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen = 2000)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        self.model = self._build_model()
    
    def _build_model(self):
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(24, input_dim = self.state_size, activation = 'relu'))
        model.add(tf.keras.layers.Dense(24, activation = 'relu'))
        model.add(tf.keras.layers.Dense(self.action_size, activation = 'linear'))
        model.compile(loss = 'mse', optimizer = tf.keras.optimizers.Adam(learning_rate = self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((self, state, action, reward, next_state, done))

    def act(self, state):
        if(np.random.rand() <= self.epsilon):
            return random.randrange(self.action_size)
        else:
            act_values = self.model.predict(state)
            return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        # print(minibatch)
        for _, state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs = 1, verbose = 0)
        if(self.epsilon > self.epsilon_min):
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weight(name)

    def save(self, name):
        self.model.save_weight(name)

In [8]:
agent = DQNAgent(state_size, action_size)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
# done = False
# for episode in range(2):
#     a = env.reset()
#     # print(a)
#     state = np.array(a, dtype=object)
#     print(state[0])
#     # state = np.reshape(state, (1, state_size))

### Starting the environment

In [10]:
done = False
for episode in range(n_episodes):
    state = env.reset()
    # print(state)
    state = np.reshape(state[0], (1, state_size))
    for time in range(5000):
        env.render()
        action = agent.act(state)
        # value = env.step(action)
        # print(value)
        next_state, reward, done, _, _ = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print(f'episode : {episode} / {n_episodes}, score: {time}, e: {agent.epsilon}')
            break
    # print(agent.memory[1])
    # minibatch = random.sample(agent.memory, batch_size)
    # print(minibatch[0])
    if(len(agent.memory) > batch_size):
        agent.replay(batch_size)
        # minibatch = random.sample(agent.memory, batch_size)
        # # print(minibatch)
        # for _, state, action, reward, next_state, done in minibatch:
        #     target = reward
        #     if not done:
        #         target = (reward + agent.gamma * np.amax(agent.model.predict(next_state)[0]))
        #     target_f = agent.model.predict(state)
        #     target_f[0][action] = target
        #     agent.model.fit(state, target_f, epoch = 1, verbose = 0)
        # if(agent.epsilon > agent.epsilon_min):
        #     agent.epsilon *= agent.epsilon_decay


  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


episode : 0 / 100, score: 3711, e: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━

KeyboardInterrupt: 