In [1]:
import gym
import numpy as np
import random 

In [2]:
from collections import deque

In [3]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [7]:
class DQNAgent:
    
    def __init__(self , state_size , action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen = 2000)
        self.gamma = 0.95
        self.epsilon = 1.0  #exploration
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.model = self.build_model()
        
    def build_model(self):
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size , activation ='relu'))
        model.add(Dense(24, activation = 'relu'))
        model.add(Dense(self.action_size, activation = "linear"))
        model.compile(loss= "mse" , optimizer = Adam(lr = self.learning_rate))
        return model
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])
    
    def remember(self , state , action , reward , next_state , done):
        self.memory.append((state , action , reward , next_state , done))
    
    def replay(self , batch_size):
        minibatch = random.sample(self.memory , batch_size)
        for(state , action , reward , next_state , done) in minibatch:
            target = reward
            if not done:
                q_values = self.model.predict(next_state)[0]
                target += self.gamma * np.max(q_values)
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state , target_f , epochs =1 , verbose = 0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
        def load(self , name):
            self.model.load_weights(name)
            
        def save(self,name):
            self.model.save_weights(name)

In [8]:
EPISODES = 30 

env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size , action_size)

done = False
batch_size = 32

for e in range(EPISODES):
    state = env.reset()
    state = state.reshape(1 , state_size)
    
    for time in range(500):
        #env.render()
        action = agent.act(state)
        next_state , reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = next_state.reshape(1, state_size)
        agent.remember(state , action , reward , next_state , done)
        state = next_state
        if done or time==499:
            print(f"Ep: {e}/{EPISODES} Score:{time} Epsilon:{agent.epsilon}")
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
    #env.close()      



Ep: 0/30 Score:18 Epsilon:1.0

Ep: 1/30 Score:27 Epsilon:0.9322301194154049
Ep: 2/30 Score:23 Epsilon:0.8307187014821328
Ep: 3/30 Score:25 Epsilon:0.7328768546436799
Ep: 4/30 Score:96 Epsilon:0.4529463432347434
Ep: 5/30 Score:34 Epsilon:0.3819719776053028
Ep: 6/30 Score:27 Epsilon:0.33362200135903064
Ep: 7/30 Score:31 Epsilon:0.285607880564032
Ep: 8/30 Score:32 Epsilon:0.24328132378095624
Ep: 9/30 Score:34 Epsilon:0.20516038984972615
Ep: 10/30 Score:56 Epsilon:0.1549480222912372
Ep: 11/30 Score:36 Epsilon:0.12936504510050365
Ep: 12/30 Score:78 Epsilon:0.08750185146499175
Ep: 13/30 Score:69 Epsilon:0.06191698689958447
Ep: 14/30 Score:233 Epsilon:0.019256955536244666
Ep: 15/30 Score:226 Epsilon:0.00998645168764533
Ep: 16/30 Score:177 Epsilon:0.00998645168764533
Ep: 17/30 Score:358 Epsilon:0.00998645168764533
Ep: 18/30 Score:234 Epsilon:0.00998645168764533
Ep: 19/30 Score:158 Epsilon:0.00998645168764533
Ep: 20/30 Score:211 Epsilon:0.00998645168764533
Ep: 21/30 Score:193 Epsilon:0.00998645