In [1]:
import numpy as np
import gym
import random
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from keras.optimizers import Adam
from collections import deque
import numpy as np
import time
import threading
import pickle
from sklearn.externals import joblib 


Using TensorFlow backend.


In [None]:
# Create the CartPole environment. DQNAgent.predict() reads this notebook-level
# `env` global directly, so this cell must run before training starts.
env = gym.make("CartPole-v0")

In [5]:
class DQNAgent:
    """Deep Q-Network agent for the CartPole environment.

    Holds the hyper-parameters, a bounded replay memory and two identically
    shaped Keras networks: ``main_model`` (fitted after every environment
    step) and ``target_model`` (a periodically refreshed copy used to compute
    the bootstrap targets).

    The training loop in :meth:`predict` reads the environment from the
    notebook-level global ``env``. Call :meth:`instantiateModels` before
    :meth:`predict`.
    """

    def __init__(self):
        """Set the hyper-parameters and create an empty replay memory."""
        self.total_episodes = 1000
        self.max_steps = 500  # environment steps taken per training "episode"
        self.gamma = 0.95  # discount factor applied to the bootstrapped value
        self.epsilon = 1.0  # current exploration probability
        self.min_epsilon = 0.01
        self.max_epsilon = 1.0
        self.decay_rate = 0.005  # exponential epsilon decay, applied per episode
        self.minibatch_size = 100
        self.replay_memory_size = 50000
        self.min_replay_memory_size = 1000  # transitions needed before training
        self.update_rate = 5  # sync the target network every 5 counted episodes
        self.action_size = 2
        self.state_size = 4
        self.start_train = False  # set to True by the first effective train() call
        self.memory = deque(maxlen=self.replay_memory_size)

    def createModel(self):
        """Build and compile one Q-network.

        Returns:
            A compiled Keras ``Sequential`` model mapping a state vector of
            length ``state_size`` to ``action_size`` linear Q-value outputs.
        """
        model = Sequential()
        model.add(Dense(24, input_shape=(self.state_size,), activation="relu"))
        model.add(Dense(48, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(self.action_size, activation="linear"))
        # The network regresses Q-values, so no classification metric such as
        # 'accuracy' is tracked; the MSE loss is the only meaningful signal.
        model.compile(loss="mse", optimizer=Adam(lr=0.005))
        return model

    def instantiateModels(self):
        """Create the main and target networks with identical initial weights."""
        self.main_model = self.createModel()
        self.target_model = self.createModel()
        self.target_model.set_weights(self.main_model.get_weights())

    def train(self):
        """Fit the main network on one random minibatch of stored transitions.

        Does nothing until the replay memory holds at least
        ``min_replay_memory_size`` transitions. Once it does, ``start_train``
        is set and ``minibatch_size`` transitions are sampled; the target for
        the action actually taken is ``reward`` for terminal transitions and
        ``reward + gamma * max(target_model Q(next_state))`` otherwise. The
        Q-values of the other actions are left at the main network's own
        predictions so they contribute no error.
        """
        if len(self.memory) < self.min_replay_memory_size:
            return

        self.start_train = True
        mini_batch = random.sample(self.memory, self.minibatch_size)
        current_states = np.array([sample[0] for sample in mini_batch])
        current_qs = self.main_model.predict(current_states)
        next_states = np.array([sample[3] for sample in mini_batch])
        future_qs = self.target_model.predict(next_states)

        X = []
        y = []
        for index, (state, action, reward, next_state, done) in enumerate(mini_batch):
            if not done:
                target_q = reward + self.gamma * np.max(future_qs[index])
            else:
                target_q = reward

            # Overwrite only the taken action's Q-value with the new target.
            current_q = current_qs[index]
            current_q[action] = target_q

            X.append(state)
            y.append(current_q)

        self.main_model.fit(np.array(X),np.array(y), batch_size=self.minibatch_size, verbose=0, use_multiprocessing=True, shuffle=False, workers=8)

    def predict(self):
        """Run the epsilon-greedy training loop against the global ``env``.

        Despite its name this method trains the agent: for each of
        ``total_episodes`` it takes ``max_steps`` environment steps, stores
        every transition in the replay memory and calls :meth:`train` after
        each step. Whenever the environment reports ``done`` the transition is
        stored with a reward of -5 and an all-zero next state, the running
        reward total is appended to the reward log, and the environment is
        reset and given one random action before play continues.

        At the end it prints the elapsed wall-clock time and the list of
        per-run reward totals. Nothing is returned.
        """
        rewards = []
        state = env.reset()
        start = time.time()
        counter = 1
        for episode in range(self.total_episodes):
            total_reward = 0
            for t in range(self.max_steps):
                # Epsilon-greedy: exploit the main network with probability
                # (1 - epsilon), otherwise sample a random action.
                random_num = random.uniform(0, 1)
                if random_num > self.epsilon:
                    action_value = self.main_model.predict(np.expand_dims(state, axis=0))
                    action = np.argmax(action_value)
                else:
                    action = env.action_space.sample()

                next_state, reward, done, _ = env.step(action)

                if done:
                    # To sharpen the learning signal the terminal step is
                    # rewarded with -5 and its next state is all zeros.
                    # NOTE(review): any `done` from the environment is treated
                    # as a crash, including one caused by an episode step
                    # limit, if the environment has one - confirm intended.
                    reward = -5
                    total_reward += reward
                    rewards.append(total_reward)
                    total_reward = 0
                    next_state = np.zeros(state.shape)
                    self.memory.append((state, action, reward, next_state, done))

                    # Start a fresh run and give the cart a little push.
                    # Continue from the observation of that push: keeping the
                    # pre-terminal `state` here would pair a stale state with
                    # transitions of the new run in the replay memory.
                    env.reset()
                    state, _, _, _ = env.step(env.action_space.sample())
                else:
                    total_reward += reward
                    self.memory.append((state, action, reward, next_state, done))
                    state = next_state

                self.train()

            # Episodes are only counted once training has started; the target
            # network is synced every `update_rate` counted episodes.
            if self.start_train:
                counter += 1
            if self.start_train and (counter % self.update_rate) == 0:
                self.target_model.set_weights(self.main_model.get_weights())
            if (counter % 100) == 0:
                print("Log episode number: ", episode)
            # Exponential epsilon decay, applied once per episode.
            self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon)*np.exp(-self.decay_rate*episode)
        end = time.time()
        print('total time in seconds: ', end-start)
        print("rewards over time: ", rewards)

In [None]:
# Build the agent, create its main and target networks, then run the full
# training loop (predict() is the training entry point despite its name).
agent = DQNAgent()
agent.instantiateModels()
agent.predict()

In [23]:
# t2 = threading.Thread(name='predict', target=agent.predict)
# t2.start()
# Persist both trained networks to the working directory via joblib (pickle).
# NOTE(review): pickling Keras models is not the library's documented
# persistence path; consider model.save(...) / load_model(...) instead, and
# change the matching load cell at the same time.
joblib.dump(agent.main_model, 'main_modelv1.pkl') 
joblib.dump(agent.target_model, 'target_modelv1.pkl') 

['target_modelv1.pkl']

In [30]:
# Reload the persisted networks; requires the two .pkl files written by the
# joblib.dump cell to exist in the working directory.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load files produced by this notebook.
main_model = joblib.load('main_modelv1.pkl')  
target_model = joblib.load('target_modelv1.pkl')

In [31]:
# Test whether the main DQN is able to keep the pole-cart stable: act greedily
# (no exploration) and print how many steps each test run lasted.
env = gym.make("CartPole-v0")
test_episodes = 3
test_max_steps = 1000
# Keep the initial observation: the loop reads `state` on its very first
# iteration, so it must be defined here rather than leak in from another cell.
state = env.reset()
try:
    for ep in range(test_episodes):
        t = 0
        while t < test_max_steps:
            env.render()

            # Get the greedy action from the Q-network
            action_value = main_model.predict(np.expand_dims(state, axis=0))
            action = np.argmax(action_value)

            # Take the action, get the new state and reward
            next_state, reward, done, _ = env.step(action)

            if done:
                print(t)
                # Force the while-loop to end so the next test run starts.
                t = test_max_steps
                env.reset()
                # Take one random step to get the pole and cart moving
                state, reward, done, _ = env.step(env.action_space.sample())
            else:
                state = next_state
                t += 1
finally:
    # Always release the render window, even if a step or prediction fails.
    env.close()


  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
199
198
198
