In [1]:
import gym
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import numpy as np
import random

Using TensorFlow backend.


In [2]:
# Create the CartPole-v0 environment; the training cell below resets, steps, renders and closes it.
env = gym.make('CartPole-v0')

### Agent

In [3]:
class Agent:
    """Deep Q-learning agent with experience replay and an epsilon-greedy policy.

    The action-value function is approximated by a small fully connected
    Keras network (see `create_model`). Transitions are stored with
    `remember` and replayed in `train`.
    """

    def __init__(self,state_size,action_size):
        """Store the problem dimensions, set hyper-parameters and build the network.

        Args:
            state_size: number of network inputs (used as `input_dim`).
            action_size: number of discrete actions, i.e. network outputs.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer; oldest transitions drop out once full
        self.gamma = 0.95                 # discount applied to the bootstrapped next-state value
        self.epsilon = 1.0                # probability of a random action; starts fully exploratory
        self.epsilon_decay = 0.995        # multiplicative decay applied at the end of each train() call
        self.epsilon_min = 0.01           # lower bound for epsilon
        self.learning_rate = 0.001        # step size handed to the Adam optimizer
        self.model = self.create_model()

    def create_model(self):
        """Build and compile the Q-network: two hidden ReLU layers of 24 units, linear output.

        Returns:
            A compiled Keras `Sequential` model with `action_size` outputs,
            MSE loss and an Adam optimizer using `self.learning_rate`.
        """
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu'))
        model.add(Dense(24,activation='relu'))
        model.add(Dense(self.action_size,activation='linear'))
        # Use the configured learning rate rather than a second hard-coded literal,
        # so changing `self.learning_rate` actually takes effect.
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action epsilon-greedily.

        Args:
            state: observation in the form the model's `predict` accepts; the
                training loop passes it reshaped to `[1, state_size]`.

        Returns:
            A Python int: a uniformly random action index with probability
            `epsilon`, otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # int(...) so both branches return the same plain Python type.
        return int(np.argmax(self.model.predict(state)[0]))

    def train(self,batch_size=32):
        """Replay a random minibatch and fit the network on the Q-learning targets.

        For each sampled transition the target for the taken action is
        `reward` if the transition was terminal, else
        `reward + gamma * max_a Q(next_state, a)`; the other outputs keep the
        network's current predictions. After the batch, `epsilon` is decayed
        once (never below `epsilon_min`).

        Does nothing if the buffer holds fewer than `batch_size` transitions,
        instead of letting `random.sample` raise `ValueError`.

        Args:
            batch_size: number of transitions to sample from the buffer.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory,batch_size)

        for state,action,reward,next_state,done in minibatch:
            target = reward
            if not done:
                # Bootstrap from the best predicted value of the successor state.
                target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])

            # Start from the current predictions so only the taken action's
            # output contributes to the loss.
            target_f = self.model.predict(state)
            target_f[0][action] = target

            self.model.fit(state,target_f,epochs=1,verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the final decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min,self.epsilon*self.epsilon_decay)

    def load(self,name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self,name):
        """Save network weights to the file `name`."""
        self.model.save_weights(name)

### Training

In [4]:
# Number of episodes the training cell will run.
n_episodes = 500

# Read the network's input/output sizes from the environment instead of
# hard-coding them twice, so the agent always matches `env`
# (for CartPole-v0 this yields 4 observations and 2 actions).
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = Agent(state_size=state_size,action_size=action_size)
done = False

W0822 18:15:25.660416 17120 deprecation_wrapper.py:119] From C:\Users\Hardik Kharbanda\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0822 18:15:25.676373 17120 deprecation_wrapper.py:119] From C:\Users\Hardik Kharbanda\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0822 18:15:25.679366 17120 deprecation_wrapper.py:119] From C:\Users\Hardik Kharbanda\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0822 18:15:25.742197 17120 deprecation_wrapper.py:119] From C:\Users\Hardik Kharbanda\Anaconda3\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [5]:
# Minibatch size for experience replay. Defined once (it is loop-invariant)
# and passed to agent.train() so the size guard and the sample size cannot drift apart.
batch_size = 32

try:
    for episode in range(n_episodes):
        state = env.reset()
        state = np.reshape(state,[1,state_size])

        for timestep in range(500):
            env.render()
            action = agent.act(state)
            next_state,reward,done,other_info = env.step(action)
            # Replace the reward of the step that ends the episode with a penalty.
            # NOTE(review): `done` is presumably also set when the environment's own
            # step limit is reached, so a full-length episode is penalised too - confirm intended.
            reward = reward if not done else -10
            next_state = np.reshape(next_state,[1,state_size])
            agent.remember(state,action,reward,next_state,done)
            state = next_state

            if done:
                print("Game episode: {}/{}  High score : {}  Exploration rate : {:.2}".format(episode,n_episodes,timestep,agent.epsilon))
                break

        # One replay pass per episode, once the buffer holds more than a minibatch.
        if len(agent.memory)>batch_size:
            agent.train(batch_size)
finally:
    # Always release the render window, even if training is interrupted or raises.
    env.close()

W0822 18:15:38.137417 17120 deprecation_wrapper.py:119] From C:\Users\Hardik Kharbanda\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:2741: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W0822 18:15:38.137417 17120 deprecation_wrapper.py:119] From C:\Users\Hardik Kharbanda\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.



Game episode: 0/500  High score : 42  Exploration rate : 1.0
Game episode: 1/500  High score : 26  Exploration rate : 0.99
Game episode: 2/500  High score : 15  Exploration rate : 0.99
Game episode: 3/500  High score : 19  Exploration rate : 0.99
Game episode: 4/500  High score : 20  Exploration rate : 0.98
Game episode: 5/500  High score : 17  Exploration rate : 0.98
Game episode: 6/500  High score : 14  Exploration rate : 0.97
Game episode: 7/500  High score : 41  Exploration rate : 0.97
Game episode: 8/500  High score : 20  Exploration rate : 0.96
Game episode: 9/500  High score : 18  Exploration rate : 0.96
Game episode: 10/500  High score : 14  Exploration rate : 0.95
Game episode: 11/500  High score : 10  Exploration rate : 0.95
Game episode: 12/500  High score : 16  Exploration rate : 0.94
Game episode: 13/500  High score : 12  Exploration rate : 0.94
Game episode: 14/500  High score : 8  Exploration rate : 0.93
Game episode: 15/500  High score : 21  Exploration rate : 0.93
Game

Game episode: 131/500  High score : 63  Exploration rate : 0.52
Game episode: 132/500  High score : 17  Exploration rate : 0.52
Game episode: 133/500  High score : 21  Exploration rate : 0.51
Game episode: 134/500  High score : 15  Exploration rate : 0.51
Game episode: 135/500  High score : 12  Exploration rate : 0.51
Game episode: 136/500  High score : 10  Exploration rate : 0.51
Game episode: 137/500  High score : 12  Exploration rate : 0.5
Game episode: 138/500  High score : 13  Exploration rate : 0.5
Game episode: 139/500  High score : 10  Exploration rate : 0.5
Game episode: 140/500  High score : 8  Exploration rate : 0.5
Game episode: 141/500  High score : 11  Exploration rate : 0.49
Game episode: 142/500  High score : 11  Exploration rate : 0.49
Game episode: 143/500  High score : 12  Exploration rate : 0.49
Game episode: 144/500  High score : 35  Exploration rate : 0.49
Game episode: 145/500  High score : 32  Exploration rate : 0.48
Game episode: 146/500  High score : 14  Explo

Game episode: 260/500  High score : 45  Exploration rate : 0.27
Game episode: 261/500  High score : 19  Exploration rate : 0.27
Game episode: 262/500  High score : 14  Exploration rate : 0.27
Game episode: 263/500  High score : 37  Exploration rate : 0.27
Game episode: 264/500  High score : 37  Exploration rate : 0.27
Game episode: 265/500  High score : 31  Exploration rate : 0.26
Game episode: 266/500  High score : 20  Exploration rate : 0.26
Game episode: 267/500  High score : 19  Exploration rate : 0.26
Game episode: 268/500  High score : 18  Exploration rate : 0.26
Game episode: 269/500  High score : 15  Exploration rate : 0.26
Game episode: 270/500  High score : 13  Exploration rate : 0.26
Game episode: 271/500  High score : 14  Exploration rate : 0.26
Game episode: 272/500  High score : 15  Exploration rate : 0.26
Game episode: 273/500  High score : 18  Exploration rate : 0.25
Game episode: 274/500  High score : 19  Exploration rate : 0.25
Game episode: 275/500  High score : 54  

Game episode: 388/500  High score : 75  Exploration rate : 0.14
Game episode: 389/500  High score : 90  Exploration rate : 0.14
Game episode: 390/500  High score : 45  Exploration rate : 0.14
Game episode: 391/500  High score : 123  Exploration rate : 0.14
Game episode: 392/500  High score : 193  Exploration rate : 0.14
Game episode: 393/500  High score : 199  Exploration rate : 0.14
Game episode: 394/500  High score : 74  Exploration rate : 0.14
Game episode: 395/500  High score : 40  Exploration rate : 0.14
Game episode: 396/500  High score : 43  Exploration rate : 0.14
Game episode: 397/500  High score : 127  Exploration rate : 0.14
Game episode: 398/500  High score : 94  Exploration rate : 0.14
Game episode: 399/500  High score : 120  Exploration rate : 0.14
Game episode: 400/500  High score : 100  Exploration rate : 0.13
Game episode: 401/500  High score : 199  Exploration rate : 0.13
Game episode: 402/500  High score : 85  Exploration rate : 0.13
Game episode: 403/500  High score