In [21]:
# DQN
import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gymnasium as gym
import os
import tensorflow as tf

%matplotlib inline

In [22]:
"""
use the dense to generate the action
"""


class Network(keras.Model):
    def __init__(self, actionNum, stateShape, learningRate):
        super(Network, self).__init__()

        self.model = keras.Sequential(
            [
                keras.layers.Dense(32, input_shape=stateShape),
                keras.layers.LeakyReLU(),
                keras.layers.Dense(64),
                keras.layers.LeakyReLU(),
                keras.layers.Dense(actionNum),
            ]
        )

        self.model.compile(
            loss = 'mse',
            optimizer = keras.optimizers.Adam(learningRate)
        )

        self.lossFunc = keras.losses.mean_squared_error
        self.optimizer = keras.optimizers.Adam(learningRate)

    def call(self, state):
        return self.model(state, training=False)

    # @tf.function
    def Update(self, lastState, nextState, targetNetwork, reward, gamma):
        with tf.GradientTape() as tape:
            evaluateValue = tf.reduce_max(self.model(lastState, training=True).numpy(), axis = 1)
            targetValue = targetNetwork(nextState, training=False)

            targetValue = reward + gamma * np.max(targetValue.numpy(), axis=1)
            print(evaluateValue.shape, targetValue.shape)
            # //TODO 修改梯度下降法
            loss = self.lossFunc(evaluateValue, targetValue)
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        return sum(loss)

    def CopyVariable(self, network):
        """
        param:
            network:    the other network model
                type:   Sequencial
        func:
            Copy the variable of other network model to self
        """
        for selfLayer, otherLayer in zip(self.model.layers, network.layers):
            selfLayer.set_weights(otherLayer.get_weights())


class DQN:
    def __init__(
        self,
        actionNum,
        stateShape,
        epsilon,
        gamma,
        batchSize,
        poolSize,
        learningRate,
        environment,
        startCount,
        learningCut,
        updateTargetCut,
        savePath,
        saveName,
    ):
        """
        param:
            actionNum:          the number of action
            stateShape:         the shape of state
            epsilon:            the parameter which is used in e-Greedy
            gamma:              used in the loss of Q Max
            batchSize:          the batch size
            poolSize:           the size of memory pool
            learningRate:       for optimize the network
            environment:        the game which network will learn to play
            startCount:         the step when network start to update
            learningCut:        after one episode's step reach to the startCount, every learningCut we will update the parameter of network
            updateTargetCut:  when reach this number, copy the weight of ChooseActionNetwork to the TargetNetwork

            savePath:           the dir model will be saved
            saveName:           the name model will be saved
        """

        self.environment = environment

        self.actionNum = actionNum
        self.stateShape = stateShape
        self.epsilon = epsilon
        self.gamma = gamma
        self.batchSize = batchSize
        self.poolSize = poolSize

        self.chooseActionNet = Network(actionNum, stateShape, learningRate)
        self.targetNetwork = Network(actionNum, stateShape, learningRate)

        self.memoryPool = {
            "state": [],
            "action": [],
            "reward": [],
            "nextState": [],
        }

        # self.memoryPool = []
        self.memoryLength = 0

        self.startCount = startCount
        self.learningCut = learningCut
        self.updateTargetCut = updateTargetCut

        self.savePath = savePath
        self.saveName = saveName

        # to store the rewards
        self.rewardsStore = []

        # if not os.path.exists(savePath):
        #     os.mkdir(savePath)

    def ChooseAction(self, state):
        if np.random.uniform() > self.epsilon:
            return np.random.randint(self.actionNum)

        predictValue = self.chooseActionNet.call(state)
        return np.argmax(predictValue)

    def StoreMemory(self, lastState, action, reward, nextState):
        self.memoryPool["state"].append(np.array(lastState))
        self.memoryPool["action"].append(np.array(action))
        self.memoryPool["reward"].append(np.array(reward))
        self.memoryPool["nextState"].append(np.array(nextState))

        

        if self.memoryLength > self.poolSize:
            self.memoryPool["state"].pop(0)
            self.memoryPool["action"].pop(0)
            self.memoryPool["reward"].pop(0)
            self.memoryPool["nextState"].pop(0)
        else:
            self.memoryLength += 1
        # self.memoryPool.append(
        #     (lastState, action, reward, nextState)
        # )

    def ChooseMemory(self):
        """
        func:
            Select a batchSize of observations from the memoryPool
        return:
            laststate
            action
            reward
            nextState
        """

        # 生成需要的 index
        index = np.random.choice(self.memoryLength - 1, size = self.batchSize)
        laststate = []
        action = []
        reward = []
        nextState = []

        for idx in index:
            laststate.append(self.memoryPool["state"][idx])
            action.append(self.memoryPool["action"][idx])
            reward.append(self.memoryPool["reward"][idx])
            nextState.append(self.memoryPool["nextState"][idx])

        return (
            np.array(laststate),
            np.array(action),
            np.array(reward),
            np.array(nextState),
        )

    def UpdateNetwork(self):
        """
        param:

        func:
            use the memory to make the batch to feed the network
        """

        lastState, action, reward, nextState = self.ChooseMemory()

        # history = self.chooseActionNet.model.fit(evaluation, target)
        # loss = self.chooseActionNet.Update(
        #     lastState, nextState, self.targetNetwork.model, reward, self.gamma
        # )

        targetValue = self.chooseActionNet.model.predict(lastState, verbose = 0)

        target = self.targetNetwork.model.predict(nextState)
        targetValue[range(self.batchSize), action] = reward + self.gamma * np.max(target, axis = 1)

        history = self.chooseActionNet.model.fit(lastState, targetValue, verbose = 0)

        return sum(history.history['loss']) / float(self.batchSize)

    def CopyNetwork(self):
        """
        while copies the variable, this function will store the weight to the disk
        """

        self.targetNetwork.CopyVariable(self.chooseActionNet.model)

        # Save Model
        self.targetNetwork.model.save_weights(self.savePath + self.saveName)

    def Train(self):
        isDone = False
        lastState, _ = self.environment.reset()
        learningCount = 0
        afterCount = 0

        rewardSum = 0
        loss = 0
        while not isDone:
            action = self.ChooseAction(np.array([lastState]))

            nextState, reward, isDone, _, _ = self.environment.step(action)

            self.StoreMemory(lastState, action, reward, nextState)

            if learningCount > self.startCount:
                if afterCount % self.learningCut == 0:
                    loss += self.UpdateNetwork()

                if afterCount % self.updateTargetCut == 0:
                    self.CopyNetwork()

                afterCount += 1

            lastState = nextState
            learningCount += 1
            rewardSum += reward

        self.rewardsStore.append(rewardSum)
        return rewardSum, learningCount, loss / learningCount

In [23]:
# define the super parameter
env = gym.make("CartPole-v1")
ACTIONNUM = env.action_space.n
STATESHAPE = env.observation_space.shape
EPSILON = 0.7
GAMMA = 0.8
STARTCOUNT = 10
LEARNINGCUT = 5
UPDATETARGETCUT = 10
BATCHSIZE = 16
POOLSIZE = 200
LEARNINGRATE = 0.3
SAVEPATH = "../SavedModel/RL/DQN/"
SAVENAME = "DQN1weight.h5"

EPOCHS = 1000

print(STATESHAPE)

(4,)


In [24]:
# use the model
dqnModel = DQN(
    ACTIONNUM,
    STATESHAPE,
    EPSILON,
    GAMMA,
    BATCHSIZE,
    POOLSIZE,
    LEARNINGRATE,
    env,
    STARTCOUNT,
    LEARNINGCUT,
    UPDATETARGETCUT,
    SAVEPATH,
    SAVENAME,
)

for i in range(1000):
    reward, step, loss = dqnModel.Train()
    print("round: %d, reward: %.2f step: %d loss: %.2f" % (i + 1, reward, step, loss))

round: 1, reward: 48.00 step: 48 loss: 0.30
round: 2, reward: 40.00 step: 40 loss: 0.22
round: 3, reward: 9.00 step: 9 loss: 0.00
round: 4, reward: 9.00 step: 9 loss: 0.00
round: 5, reward: 11.00 step: 11 loss: 0.00
round: 6, reward: 10.00 step: 10 loss: 0.00
round: 7, reward: 14.00 step: 14 loss: 0.06
round: 8, reward: 14.00 step: 14 loss: 0.45
round: 9, reward: 10.00 step: 10 loss: 0.00
round: 10, reward: 9.00 step: 9 loss: 0.00
round: 11, reward: 13.00 step: 13 loss: 0.05
round: 12, reward: 12.00 step: 12 loss: 0.13
round: 13, reward: 9.00 step: 9 loss: 0.00
round: 14, reward: 9.00 step: 9 loss: 0.00
round: 15, reward: 13.00 step: 13 loss: 0.61
round: 16, reward: 10.00 step: 10 loss: 0.00
round: 17, reward: 13.00 step: 13 loss: 0.30
round: 18, reward: 31.00 step: 31 loss: 1.16
round: 19, reward: 10.00 step: 10 loss: 0.00
round: 20, reward: 11.00 step: 11 loss: 0.00
round: 21, reward: 8.00 step: 8 loss: 0.00
round: 22, reward: 11.00 step: 11 loss: 0.00
round: 23, reward: 12.00 step: 

In [25]:
dqnModel.rewardsStore

[48.0,
 40.0,
 9.0,
 9.0,
 11.0,
 10.0,
 14.0,
 14.0,
 10.0,
 9.0,
 13.0,
 12.0,
 9.0,
 9.0,
 13.0,
 10.0,
 13.0,
 31.0,
 10.0,
 11.0,
 8.0,
 11.0,
 12.0,
 11.0,
 10.0,
 12.0,
 18.0,
 20.0,
 9.0,
 11.0,
 9.0,
 14.0,
 11.0,
 9.0,
 10.0,
 13.0,
 10.0,
 9.0,
 9.0,
 9.0,
 8.0,
 9.0,
 11.0,
 9.0,
 13.0,
 11.0,
 12.0,
 16.0,
 25.0,
 12.0,
 11.0,
 13.0,
 11.0,
 11.0,
 12.0,
 17.0,
 12.0,
 13.0,
 11.0,
 12.0,
 12.0,
 14.0,
 14.0,
 16.0,
 20.0,
 9.0,
 11.0,
 12.0,
 10.0,
 9.0,
 9.0,
 10.0,
 28.0,
 11.0,
 14.0,
 13.0,
 11.0,
 13.0,
 12.0,
 17.0,
 12.0,
 9.0,
 13.0,
 12.0,
 11.0,
 10.0,
 12.0,
 10.0,
 8.0,
 11.0,
 9.0,
 13.0,
 14.0,
 8.0,
 9.0,
 13.0,
 18.0,
 13.0,
 40.0,
 13.0,
 9.0,
 12.0,
 9.0,
 16.0,
 10.0,
 9.0,
 9.0,
 12.0,
 12.0,
 13.0,
 9.0,
 10.0,
 13.0,
 14.0,
 9.0,
 11.0,
 11.0,
 10.0,
 11.0,
 10.0,
 11.0,
 9.0,
 11.0,
 10.0,
 10.0,
 11.0,
 14.0,
 14.0,
 8.0,
 9.0,
 10.0,
 10.0,
 10.0,
 10.0,
 13.0,
 9.0,
 10.0,
 10.0,
 12.0,
 9.0,
 10.0,
 8.0,
 11.0,
 10.0,
 10.0,
 10.0,
 10.0,
 9.0,
