In [1]:
import gym
from tensorflow.keras.models import Sequential, save_model, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np
import random
from collections import deque

In [15]:
class DqnAgent:
    """Deep Q-learning agent with a replay buffer and an epsilon-greedy policy.

    The Q-function is approximated by a small fully connected Keras network
    that maps a state vector to one Q-value per action.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its Q-network.

        Args:
            state_size: Length of the state vector fed to the network.
            action_size: Number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
        self.gamma = 0.95                 # discount factor for future reward
        self.epsilon = 1                  # current exploration rate
        self.epsilon_min = 0.01           # floor for the exploration rate
        self.epsilon_decay = 0.995        # multiplicative decay per replay()
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (two hidden ReLU layers, linear output)."""
        model = Sequential()
        model.add(Dense(24, input_shape=(self.state_size,), activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported argument name; the old `lr` alias
        # is deprecated and rejected by newer Keras releases.
        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, nextstate, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, nextstate, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        With probability `epsilon` a uniformly random action is returned,
        otherwise the index of the largest predicted Q-value for `state`.
        `state` is passed straight to `model.predict`, so it must already
        carry a leading batch dimension.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Fit the network on `batch_size` transitions sampled from memory.

        Each sampled transition is fitted individually: the target for the
        taken action is `reward` for terminal transitions, otherwise
        `reward + gamma * max(Q(nextstate))`. Afterwards epsilon is decayed
        once, never dropping below `epsilon_min`.

        Does nothing when the buffer holds fewer than `batch_size`
        transitions (random.sample would raise ValueError in that case).
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, nextstate, done in minibatch:
            target = reward
            if not done:
                next_q = self.model.predict(nextstate, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)
            # Only the taken action's Q-value is moved towards the target;
            # the other outputs keep their current predictions.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file `name`."""
        self.model.save_weights(name)
                        
        
        
        
        

In [2]:
# Create the Gym environment that the cells below interact with.
ENV_ID = 'CartPole-v1'
env = gym.make(ENV_ID)
#output_file = open('cartpole_v1_output.csv','w+')

In [4]:
# Smoke test: reset the environment and take a single step with action 0.
# The value returned by step() is displayed as the cell output.
env.reset()
probe_action = 0
env.step(probe_action)

(array([ 0.03113634, -0.16653722, -0.04955756,  0.23700106]), 1.0, False, {})

In [7]:
# Network dimensions are read from the environment's spaces:
# first entry of the observation shape and the number of discrete actions.
observation_shape = env.observation_space.shape
state_size = observation_shape[0]
action_size = env.action_space.n

In [17]:
# Instantiate the agent with the dimensions taken from the environment.
agent = DqnAgent(state_size=state_size, action_size=action_size)

In [6]:
# Run configuration and shared loop state.
EPISODES = 30    # number of episodes the training loop iterates over
batch_size = 32  # transitions sampled per agent.replay() call
count = 0
done = False

In [None]:
# Train the agent: play EPISODES episodes of at most MAX_STEPS_PER_EPISODE
# steps, storing every transition and replaying a minibatch after each
# non-terminal step once the buffer holds more than batch_size transitions.
# One "episode, step, epsilon" row is logged per step.
MAX_STEPS_PER_EPISODE = 50
LOG_PATH = 'cartpole_v1_output.csv'

# The log file is opened here (and closed by the context manager) so this
# cell does not depend on an `output_file` left over from an earlier run.
with open(LOG_PATH, 'w') as output_file:
    for e in range(EPISODES):
        state = np.reshape(env.reset(), [1, state_size])
        for step in range(MAX_STEPS_PER_EPISODE):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            output_file.write("{}, {}, {}\n".format(e, step, agent.epsilon))
            output_file.flush()  # keep the file readable while training runs
            if done:
                print('done')
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)

    

done
done
done
done
done
done


In [3]:
# Load a previously saved Keras model from disk.
SAVED_MODEL_PATH = 'cartpole-dqn2.h5'
mymodel = load_model(SAVED_MODEL_PATH)

In [16]:
# Render one episode (capped at 350 steps) driven by the loaded model:
# at every step the action with the largest predicted value is taken.
state1 = env.reset()
env.render()
state1 = state1.reshape((1, -1))

for step in range(350):
    act_values = mymodel.predict(state1)
    action = np.argmax(act_values[0])
    next_state, reward, done, info = env.step(action)
    env.render()
    state1 = next_state.reshape((1, -1))
    if done:
        # Stepping an environment after it reported done is undefined
        # until reset() is called, so the rollout stops here.
        break


In [19]:
# Baseline: render one episode (capped at 350 steps) with uniformly random
# actions, for visual comparison with the model-driven rollout.
state1 = env.reset()
env.render()
state1 = state1.reshape((1, -1))

for step in range(350):
    action = random.randrange(2)
    next_state, reward, done, info = env.step(action)
    env.render()
    state1 = next_state.reshape((1, -1))
    if done:
        # Stepping an environment after it reported done is undefined
        # until reset() is called, so the rollout stops here.
        break