In [54]:
import numpy as np
import sys
import random 
import gym
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from collections import deque
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
import os

In [43]:
# Build the CartPole environment that the agent interacts with.
env = gym.make('CartPole-v0')

In [44]:
# Problem dimensions, read from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Training hyperparameters.
batch_size = 32
n_episodes = 1000

# Echo the number of actions, then the observation size.
print(action_size)
print(state_size)

2
4


In [45]:
# Directory (relative to the working directory) used for saved weights.
output_dir = 'model_output/cartpole'

In [46]:
# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory already exists and avoids the race between
# a separate existence check and the creation call.
os.makedirs(output_dir, exist_ok=True)

# Define DQNAgent

In [47]:
class DQNAgent:
    """Q-learning agent backed by a small dense Keras network.

    The agent picks actions epsilon-greedily, stores transitions in a bounded
    replay memory, and fits the network on randomly sampled transitions.
    """

    def __init__(self, state_size, action_size, gamma=0.95, epsilon=1.0,
                 epsilon_decay=0.995, epsilon_min=0.01, learning_rate=0.001,
                 memory_size=2000):
        """Store hyperparameters and build the Q-network.

        Args:
            state_size: Number of input features of the network.
            action_size: Number of discrete actions (network outputs).
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Initial probability of taking a random action.
            epsilon_decay: Factor epsilon is multiplied by after each replay.
            epsilon_min: Epsilon stops decaying once it is no longer above this.
            learning_rate: Learning rate handed to the Adam optimizer.
            memory_size: Maximum number of transitions kept; once full, the
                oldest entries are discarded (deque ``maxlen`` behaviour).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        self.model = self.build_model()

    def build_model(self):
        """Return a compiled network: two 24-unit ReLU layers and a linear output per action."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        # `lr` is a deprecated alias that newer Keras releases reject;
        # `learning_rate` is the supported keyword.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest predicted value for the
        first row of ``state``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps per-call progress bars out of the notebook output.
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Fit the network on ``batch_size`` randomly sampled transitions.

        Does nothing when fewer than ``batch_size`` transitions are stored
        (``random.sample`` would otherwise raise ``ValueError``). After the
        updates, epsilon is decayed once while it is above ``epsilon_min``.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            # Terminal transitions use the raw reward; otherwise add the
            # discounted best predicted value of the next state.
            target = reward
            if not done:
                next_values = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_values)

            # Only the taken action's output is moved toward the target; the
            # other outputs keep their current predictions.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Write network weights to the file ``name``."""
        self.model.save_weights(name)

In [48]:
# Instantiate the agent for this environment's observation and action sizes.
agent = DQNAgent(state_size, action_size)

In [49]:
done = False

for e in range(n_episodes):

    # This loop relies on reset() returning only the observation and step()
    # returning a 4-tuple, as unpacked below.
    state = env.reset()
    # Give the observation a leading batch axis of 1 for the network.
    state = np.reshape(state, [1, state_size])

    # `t` counts steps survived in this episode and is reported as the score.
    for t in range(5000):

        action = agent.act(state)

        next_state, reward, done, _ = env.step(action)

        # Replace the reward of the step that ends the episode with a penalty.
        reward = reward if not done else -10

        next_state = np.reshape(next_state, [1, state_size])

        agent.remember(state, action, reward, next_state, done)

        state = next_state

        if done:
            print("episode: {}/{},score:{},e:{:.2}".format(e, n_episodes, t, agent.epsilon))
            break

    # Train once per episode, as soon as memory holds more than one batch.
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

    # Checkpoint every 50 episodes. os.path.join puts the file inside
    # output_dir; plain string concatenation dropped the path separator and
    # wrote "cartpoleweights_XXXX.hdf5" next to the directory instead.
    if e % 50 == 0:
        agent.save(os.path.join(output_dir, "weights_" + '{:04d}'.format(e) + ".hdf5"))
            

episode: 0/1000,score:37,e:1.0
episode: 1/1000,score:17,e:0.99
episode: 2/1000,score:28,e:0.99
episode: 3/1000,score:13,e:0.99
episode: 4/1000,score:26,e:0.98
episode: 5/1000,score:9,e:0.98
episode: 6/1000,score:17,e:0.97
episode: 7/1000,score:68,e:0.97
episode: 8/1000,score:24,e:0.96
episode: 9/1000,score:10,e:0.96
episode: 10/1000,score:26,e:0.95
episode: 11/1000,score:17,e:0.95
episode: 12/1000,score:14,e:0.94
episode: 13/1000,score:14,e:0.94
episode: 14/1000,score:19,e:0.93
episode: 15/1000,score:11,e:0.93
episode: 16/1000,score:21,e:0.92
episode: 17/1000,score:12,e:0.92
episode: 18/1000,score:18,e:0.91
episode: 19/1000,score:34,e:0.91
episode: 20/1000,score:17,e:0.9
episode: 21/1000,score:11,e:0.9
episode: 22/1000,score:8,e:0.9
episode: 23/1000,score:39,e:0.89
episode: 24/1000,score:20,e:0.89
episode: 25/1000,score:15,e:0.88
episode: 26/1000,score:11,e:0.88
episode: 27/1000,score:13,e:0.87
episode: 28/1000,score:72,e:0.87
episode: 29/1000,score:13,e:0.86
episode: 30/1000,score:11,

In [50]:
# Display the current parameters of the agent's network (last expression of the cell).
agent.model.get_weights()

[array([[-0.5136269 ,  0.22354376, -0.356346  , -0.15095195, -0.40293092,
          0.05286238, -0.1995613 ,  0.04605887,  0.25792813, -0.18895665,
         -0.20626788,  0.43274203, -0.1310055 ,  0.37788224, -0.2164606 ,
         -0.62478834, -0.06008135, -0.2726493 ,  0.14089048, -0.23656699,
         -0.04926095,  0.7352329 , -0.04172646,  0.16454843],
        [ 0.47146812,  0.34411976,  0.34479463,  0.18039586,  0.40390918,
         -0.08785699, -0.39814544,  0.2204008 , -0.24865748,  0.01038587,
         -0.28036216,  0.07590558,  0.2094001 , -0.19335453, -0.05573572,
         -0.33591792,  0.21912499,  0.25236484, -0.3947551 ,  0.29363343,
         -0.11265365,  0.467672  , -0.27295554, -0.2536482 ],
        [ 1.1822658 ,  0.56663877,  0.17875125,  0.25129008,  0.09292689,
         -0.27758905, -0.43143892,  0.06589034,  0.2697071 , -0.33226836,
         -0.50968903,  0.69401044,  0.33509457,  1.0840937 , -0.29785174,
          0.94343865, -0.23930253, -0.14689013, -0.08736755,  

In [51]:
# Save the agent's network to a file in the working directory.
# NOTE(review): despite "weights" in the file name, this calls model.save rather
# than save_weights — presumably the whole model is stored; confirm against Keras docs.
agent.model.save("cartpole_weights.h5")

In [52]:
# Save the agent's network again under a second file name; this is the file
# that load_model reads back later in the notebook.
agent.model.save('cartpole_weights_V2.h5')

In [55]:
# Read the saved model back from disk into a separate object.
new_model = load_model('cartpole_weights_V2.h5')

# Print the layer-by-layer summary of the restored model.
new_model.summary()

In [56]:
# Print the restored model's layer summary (repeats the call made in the loading cell).
new_model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 24)                120       
_________________________________________________________________
dense_31 (Dense)             (None, 24)                600       
_________________________________________________________________
dense_32 (Dense)             (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


In [58]:
# Display the restored model's parameters, for comparison with the agent's weights shown earlier.
new_model.get_weights()

[array([[-0.5136269 ,  0.22354376, -0.356346  , -0.15095195, -0.40293092,
          0.05286238, -0.1995613 ,  0.04605887,  0.25792813, -0.18895665,
         -0.20626788,  0.43274203, -0.1310055 ,  0.37788224, -0.2164606 ,
         -0.62478834, -0.06008135, -0.2726493 ,  0.14089048, -0.23656699,
         -0.04926095,  0.7352329 , -0.04172646,  0.16454843],
        [ 0.47146812,  0.34411976,  0.34479463,  0.18039586,  0.40390918,
         -0.08785699, -0.39814544,  0.2204008 , -0.24865748,  0.01038587,
         -0.28036216,  0.07590558,  0.2094001 , -0.19335453, -0.05573572,
         -0.33591792,  0.21912499,  0.25236484, -0.3947551 ,  0.29363343,
         -0.11265365,  0.467672  , -0.27295554, -0.2536482 ],
        [ 1.1822658 ,  0.56663877,  0.17875125,  0.25129008,  0.09292689,
         -0.27758905, -0.43143892,  0.06589034,  0.2697071 , -0.33226836,
         -0.50968903,  0.69401044,  0.33509457,  1.0840937 , -0.29785174,
          0.94343865, -0.23930253, -0.14689013, -0.08736755,  