In [187]:
%matplotlib inline

# --- Standard library ---------------------------------------------------
import random
import sys
from collections import deque

# Extend the module search path BEFORE the project-local import below.
# Originally this ran after `from lib import plotting`, so it could not help
# that import on a fresh kernel.
# NOTE(review): presumably `lib` lives in the parent directory -- confirm.
if "../" not in sys.path:
    sys.path.append("../")

# --- Third-party --------------------------------------------------------
import gym
import matplotlib
import numpy as np
import sklearn.preprocessing
from IPython.display import clear_output
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

# --- Project-local ------------------------------------------------------
from lib import plotting

matplotlib.style.use('ggplot')

In [188]:
class DQ_Agent():
    """
    Deep Q-learning agent with an experience-replay buffer.

    A small fully connected Keras network maps a state vector to one Q-value
    per discrete action. Actions are selected epsilon-greedily, every observed
    transition is stored in a bounded replay memory, and the network is fitted
    on a random minibatch of stored transitions after each environment step.

    Args:
        env: Environment object. Only ``env.action_space.n`` and
            ``env.observation_space.shape[0]`` are read here.
        epsilon: Initial probability of picking a random action.
        epsilon_decay: Factor multiplied into ``epsilon`` on every ``update``.
        discount_factor: Weight of the bootstrapped next-state value in the
            TD target.
    """

    def __init__(self, env, epsilon=1.0, epsilon_decay=0.995, discount_factor=1.0):
        self.env = env
        # Network dimensions are taken from the environment instead of being
        # hard-coded, so the agent is not tied to one particular task.
        self.action_size = env.action_space.n
        self.state_size = env.observation_space.shape[0]
        self.epsilon_decay = epsilon_decay
        self.epsilon = epsilon
        self.discount_factor = discount_factor
        self.batch_size = 32
        self.learning_rate = 0.001
        # Bounded buffer: once full, the oldest transitions are dropped.
        self.memory = deque(maxlen=2000)
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (state vector -> one value per action)."""
        model = Sequential()
        # The layer width is passed positionally: it is the first positional
        # argument of Dense under both its Keras 1 name (`output_dim`) and its
        # Keras 2 name (`units`).
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        # Linear output layer: raw Q-value estimates, one per action.
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(optimizer=Adam(lr=self.learning_rate), loss='mse')
        return model

    def act(self, state):
        '''
        Choose an action epsilon-greedily.

        Similar to a policy function, but instead of action probabilities it
        returns the chosen action index directly.

        Args:
            state: Observation with ``self.state_size`` elements (any shape
                that can be reshaped to ``(1, state_size)``).

        Returns:
            Index of the selected action.
        '''
        state = np.reshape(state, (1, self.state_size))
        # Explore: with probability epsilon take a uniformly random action.
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Exploit: take the action with the highest predicted Q-value.
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        """
        Fit the network on a random minibatch drawn from the replay memory.

        At most ``self.batch_size`` transitions are sampled (fewer while the
        memory is still filling up). For each one, only the Q-value of the
        action actually taken is moved towards the TD target; the other
        outputs keep their current predictions.
        """
        size = min(len(self.memory), self.batch_size)
        minibatch = random.sample(self.memory, size)

        for state, action, reward, next_state, done in minibatch:
            q_values = self.model.predict(state)[0]
            if done:
                # Terminal transition: there is no future return to bootstrap.
                target = reward
            else:
                # Bootstrap from the NEXT state. The previous code evaluated
                # `state` here, so the target never looked one step ahead.
                next_max_q = np.max(self.model.predict(next_state)[0])
                target = reward + self.discount_factor * next_max_q
            q_values[action] = target
            q_values = np.reshape(q_values, (1, self.action_size))
            self.model.fit(state, q_values, verbose=0)

    def update(self, state, reward, action, next_state, done):
        """
        Store one transition, run a replay training step and decay epsilon.

        Note the argument order: ``reward`` comes before ``action`` here,
        unlike in ``remember``.
        """
        # Store states in the (1, state_size) shape that the network expects.
        state = np.reshape(state, (1, self.state_size))
        next_state = np.reshape(next_state, (1, self.state_size))

        self.remember(state, action, reward, next_state, done)

        self.replay()

        # Decay exploration after every step.
        self.epsilon = self.epsilon * self.epsilon_decay

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

In [185]:
# Create the CartPole-v1 environment and a fresh agent bound to it.
env = gym.make('CartPole-v1')
agent = DQ_Agent(env=env)

In [186]:
num_episodes = 1000
max_steps_per_episode = 200  # hard cap on the number of steps in one episode

# Training loop.
# Keeps track of useful statistics (per-episode length and summed reward).
stats = plotting.EpisodeStats(
    episode_lengths=np.zeros(num_episodes),
    episode_rewards=np.zeros(num_episodes))

for i_episode in range(num_episodes):
    # Per step:
    #   1. Generate one step of the episode
    #   2. Update Q(s, a) according to the reward and next_state of the step
    #   3. The epsilon-greedy policy improves implicitly, because agent.act()
    #      always queries the freshly updated network
    #   4. Continue with the next step

    # Score of the previous episode for the progress line. There is no
    # previous episode on the first iteration, so show 0.0 explicitly instead
    # of relying on index -1 wrapping around to the (still zero) last slot.
    last_reward = stats.episode_rewards[i_episode - 1] if i_episode > 0 else 0.0
    state = env.reset()
    clear_output()

    for t in range(max_steps_per_episode):
        # Print out which episode/step we're on, useful for debugging.
        sys.stdout.write("\rEpisode {0}/{1} Last Score: {2}, Step: # {3}".format(
                            i_episode + 1, num_episodes, last_reward, t))
        sys.stdout.flush()
        action = agent.act(state)

        # 1. Generate one step of the episode
        next_state, reward, done, _ = env.step(action)
        # Penalize the step that ends the episode.
        if done:
            reward = -10

        # Update stats for plotting
        stats.episode_rewards[i_episode] += reward
        stats.episode_lengths[i_episode] += 1

        # 2. Update Q(s, a) according to the reward and next_state of the step
        agent.update(state, reward, action, next_state, done)

        state = next_state

        if done:
            # Episode is over
            print("\rEpisode {}/{} ({})".format(i_episode + 1, num_episodes, t), end="")
            break

# `stats` stays available at notebook level for the plotting cell; the former
# top-level `return stats` was a SyntaxError outside a function and is gone.

Episode 13/1000 Last Score: 1.0, Step: # 7

KeyboardInterrupt: 

In [None]:
# Visualize the per-episode statistics collected in `stats` by the training cell.
# NOTE(review): smoothing_window=25 presumably sets a moving-average window in
# episodes -- confirm against lib.plotting.
plotting.plot_episode_stats(stats, smoothing_window=25)

In [21]:
import time

# Roll out and render one episode with the trained agent acting purely
# greedily (no exploration). The Q-network is queried directly because the
# helpers used before (`make_epsilon_greedy_policy`, `estimator`) are not
# defined anywhere in this notebook and raise NameError on a fresh kernel.
state = env.reset()
done = False
while not done:
    # The network expects a batch dimension: (1, number of state features).
    q_values = agent.model.predict(np.reshape(state, (1, -1)))
    action = np.argmax(q_values[0])

    state, reward, done, _ = env.step(action)

    env.render()
    time.sleep(0.1)  # slow the animation down so it can be watched