In [1]:
import numpy as np
import gym
import torch.nn as nn
import torch as t
from torch.nn import functional as F
import matplotlib.pyplot as plt
import os

In [None]:
# Per-sample cross-entropy (reduction='none') so that every time step's loss
# can be weighted by its own return before averaging.
criterion = nn.CrossEntropyLoss(reduction = 'none')
# Tolerate more than one OpenMP runtime being loaded in the process.
# NOTE(review): this is normally set before the libraries that load OpenMP are
# imported - confirm it still takes effect when set here.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
dicount_factor = 0.99  # discount factor (gamma) applied to future rewards
eplison = 0.1  # adds randomness to action selection (not referenced elsewhere in this cell)
lr = 0.02  # Adam learning rate
# Policy gradient has high variance, so seed every random source that is used:
# NumPy drives the action sampling, torch drives the weight initialisation and
# the environment has its own generator.
SEED = 1
np.random.seed(SEED)
t.manual_seed(SEED)
env = gym.make("CartPole-v0")
env.seed(SEED)     # reproducible, general Policy gradient has high variance
# Work on the raw environment without the gym wrappers
# (presumably to drop the wrapper-imposed episode step limit - confirm).
env = env.unwrapped
batch_size = 1  # not referenced elsewhere in this cell; one episode is used per update
epochs = 1000  # number of episodes / parameter updates

class Agent(nn.Module):
    """Two-layer fully connected policy network.

    Maps an observation to a probability distribution over actions. The
    default sizes (4 inputs, 10 hidden units, 2 actions) reproduce the
    original hard-coded architecture.

    Args:
        n_features: size of one observation vector.
        n_hidden: number of hidden units.
        n_actions: number of discrete actions.
    """

    def __init__(self, n_features=4, n_hidden=10, n_actions=2):

        super(Agent, self).__init__()

        # Two fully connected layers are enough here. Each layer is created
        # and initialised before the next one so the order in which random
        # numbers are drawn matches the original implementation.
        self.linear1 = self._make_linear(n_features, n_hidden)
        self.linear2 = self._make_linear(n_hidden, n_actions)

    @staticmethod
    def _make_linear(in_features, out_features):
        """Create a linear layer with N(0, 0.3) weights and constant 0.1 bias."""
        layer = nn.Linear(in_features, out_features)
        nn.init.normal_(layer.weight, 0, 0.3)
        nn.init.constant_(layer.bias, 0.1)
        return layer

    def forward(self, x):
        """Compute action probabilities and raw logits.

        Args:
            x: observations as a NumPy array or a tensor, either batched
               ``(batch, n_features)`` or a single ``(n_features,)`` vector.
               Any numeric dtype is accepted; it is converted to float32.

        Returns:
            ``(prob, out)``: the softmax probabilities over the last axis and
            the pre-softmax logits, both with the same leading shape as ``x``.
        """
        # as_tensor accepts arrays and tensors alike (from_numpy rejected tensors).
        out = t.as_tensor(x, dtype=t.float32)

        out = self.linear1(out)
        out = t.tanh(out)

        out = self.linear2(out)

        # The probabilities are what the caller samples actions from.
        # dim=-1 equals dim=1 for batched input and also supports 1-D input.
        prob = F.softmax(out, dim=-1)

        # prob: action distribution after softmax; out: the logits before it.
        return prob, out

def choose_action(prob):
    """Sample one action index according to the policy's probabilities.

    Args:
        prob: tensor of action probabilities with a leading batch axis; only
              the first row ``prob[0]`` is used.

    Returns:
        The sampled action index in ``[0, number of actions)``.
    """
    # Detach so a tensor that is part of an autograd graph can be converted.
    action_probs = prob[0].detach().cpu().numpy()

    # Derive the number of actions from the distribution instead of assuming 2.
    action = np.random.choice(a = len(action_probs), p = action_probs)

    return action
          
def get_one_batch(agent, render=True):
    """Roll out one complete episode with the current policy.

    Uses the module-level ``env`` through the classic gym interface
    (``reset()`` returns the observation, ``step()`` returns a 4-tuple).

    Args:
        agent: policy network; called with a ``(1, n_features)`` NumPy array
               and expected to return ``(prob, logits)``.
        render: draw the environment at every step (default, as before).
                Pass ``False`` to skip rendering.

    Returns:
        ``(actions, observations, rewards)`` where ``actions`` and ``rewards``
        are lists with one entry per step and ``observations`` is a NumPy
        array stacking the per-step observations along axis 0.
    """
    reward_an_episode = []
    observation_an_episode = []
    action_an_episode = []
    observation = env.reset()
    done = False

    while not done:

        if render:
            env.render()
        # Add a leading batch axis so the network sees a (1, n_features) input.
        observation = np.expand_dims(observation, axis = 0)
        # Only the sampled action is needed during the rollout; the forward
        # pass is repeated with gradients enabled when the loss is computed.
        with t.no_grad():
            prob, _ = agent(observation)
        observation_an_episode.append(observation)
        action = choose_action(prob)
        action_an_episode.append(action)
        observation, reward, done, info = env.step(action)

        reward_an_episode.append(reward)

    # Concatenate the per-step (1, n_features) arrays into one array.
    return action_an_episode, np.concatenate(observation_an_episode, axis = 0), reward_an_episode
    

def _discounted_returns(rewards, gamma):
    """Return, for every step i, sum over j >= i of gamma**(j - i) * rewards[j]."""
    returns = [0.0] * len(rewards)
    running = 0.0
    # Accumulate backwards so every return is built from the one after it.
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns


def learn():
    """Train a fresh ``Agent`` with the REINFORCE policy gradient and plot progress.

    Each of the ``epochs`` iterations:
      1. samples one complete episode with the current policy (Monte Carlo),
      2. computes the discounted return of every step and standardises it,
      3. weights the per-step cross-entropy between the logits and the action
         actually taken (i.e. -log pi(a|s)) by that return,
      4. takes one Adam step on the mean of the weighted losses.

    Reads the module-level ``epochs``, ``lr``, ``dicount_factor`` and
    ``criterion``. Plots the per-episode loss and total reward; returns None.
    """
    # Create the network instance.
    agent = Agent()

    # Create the optimiser once: re-creating Adam every episode would throw
    # away its running moment estimates.
    optim = t.optim.Adam(agent.parameters(), lr = lr)

    train_loss = []
    train_reward = []
    agent.train()
    std_eps = 1e-8  # keeps the normalisation finite if the returns have zero spread

    for _ in range(epochs):

        # One finished episode is the training batch for this update.
        actions, observations, rewards = get_one_batch(agent)
        actions = t.tensor(actions, dtype = t.long)
        train_reward.append(sum(rewards))

        # The environment reports one reward per action; convert these into
        # discounted returns as seen from each step.
        acc_reward = t.tensor(_discounted_returns(rewards, dicount_factor), dtype = t.float32)

        # Standardise the returns to zero mean and unit variance. The std of
        # a single value is NaN, so a one-step episode is only centred.
        acc_reward = acc_reward - acc_reward.mean()
        if acc_reward.numel() > 1:
            acc_reward = acc_reward / (acc_reward.std() + std_eps)

        # Maximising log_p * R is the same as minimising -(log_p * R);
        # cross-entropy against the taken action is exactly -log_p.
        prob, logits = agent(observations)
        neg_log_prob = criterion(logits, actions)
        loss = (neg_log_prob * acc_reward).mean()

        # Keep a plain float: storing the tensor would retain its autograd
        # graph and cannot be plotted.
        train_loss.append(loss.item())

        optim.zero_grad()
        loss.backward()
        optim.step()

    plt.plot(train_loss, label = "loss")
    plt.plot(train_reward, label = "episode reward")
    plt.xlabel("episode")
    plt.legend()

if __name__ == '__main__':
    try:
        learn()
    finally:
        # Close the environment even when training is interrupted or fails.
        env.close()



In [12]:
class Agent2(nn.Module):
    """Two-layer fully connected policy network (same architecture as ``Agent``).

    Maps an observation to a probability distribution over actions. The
    default sizes (4 inputs, 10 hidden units, 2 actions) reproduce the
    original hard-coded architecture.

    Args:
        n_features: size of one observation vector.
        n_hidden: number of hidden units.
        n_actions: number of discrete actions.
    """

    def __init__(self, n_features=4, n_hidden=10, n_actions=2):

        super(Agent2, self).__init__()

        # Two fully connected layers are enough here. Each layer is created
        # and initialised before the next one so the order in which random
        # numbers are drawn matches the original implementation.
        self.linear1 = self._make_linear(n_features, n_hidden)
        self.linear2 = self._make_linear(n_hidden, n_actions)

    @staticmethod
    def _make_linear(in_features, out_features):
        """Create a linear layer with N(0, 0.3) weights and constant 0.1 bias."""
        layer = nn.Linear(in_features, out_features)
        nn.init.normal_(layer.weight, 0, 0.3)
        nn.init.constant_(layer.bias, 0.1)
        return layer

    def forward(self, x):
        """Compute action probabilities and raw logits.

        Args:
            x: observations as a NumPy array or a tensor, either batched
               ``(batch, n_features)`` or a single ``(n_features,)`` vector.
               Any numeric dtype is accepted; it is converted to float32.

        Returns:
            ``(prob, out)``: the softmax probabilities over the last axis and
            the pre-softmax logits, both with the same leading shape as ``x``.
        """
        # as_tensor accepts arrays and tensors alike (from_numpy rejected tensors).
        out = t.as_tensor(x, dtype=t.float32)

        out = self.linear1(out)
        out = t.tanh(out)

        out = self.linear2(out)

        # The probabilities are what the caller samples actions from.
        # dim=-1 equals dim=1 for batched input and also supports 1-D input.
        prob = F.softmax(out, dim=-1)

        return prob, out
# Build a network and look at how its parameters are exposed:
# `parameters` itself is a bound method, calling it yields a generator, and
# materialising that generator lists every weight and bias tensor.
av = Agent2()
print(type(av.parameters), type(av.parameters()), sep="\n")
print([*av.parameters()])

<class 'method'>
<class 'generator'>
[Parameter containing:
tensor([[ 0.6110, -0.1681, -0.1035, -0.0988],
        [-0.1482,  0.1996, -0.1390, -0.2136],
        [ 0.0256,  0.1145, -0.4795,  0.2283],
        [-0.3503,  0.0462, -0.2296, -0.3252],
        [-0.0177,  0.1209,  0.2520,  0.1926],
        [-0.0536,  0.0241, -0.0305, -0.1277],
        [-0.0820, -0.1940,  0.0118, -0.0192],
        [-0.3222, -0.0980,  0.2215,  0.0351],
        [ 0.4188,  0.1760,  0.0363, -0.1564],
        [-0.0116, -0.3528, -0.0363, -0.1773]], requires_grad=True), Parameter containing:
tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000], requires_grad=True), Parameter containing:
tensor([[-0.3352,  0.1983, -0.3473,  0.0352,  0.3200,  0.1734, -0.2385,  0.6449,
         -0.3146,  0.1317],
        [ 0.2413,  0.2805,  0.5875, -0.1457,  0.0265,  0.0962,  0.0038, -0.1957,
          0.0339,  0.0359]], requires_grad=True), Parameter containing:
tensor([0.1000, 0.1000], requires_

In [8]:
import torch
# Standardisation walk-through on a tiny tensor: subtracting the mean leaves
# the standard deviation unchanged, and dividing by it afterwards gives a
# tensor with zero mean and unit standard deviation.
a = torch.tensor([1.,2.,3.,4.])
print(a.mean(), a.std(), sep="\n")

# Step 1: centre the values.
b = a.sub(a.mean())
print(b, b.std(), sep="\n")

# Step 2: rescale to unit standard deviation.
b = b.div(b.std())
print(b, b.mean(), b.std(), sep="\n")

tensor(2.5000)
tensor(1.2910)
tensor([-1.5000, -0.5000,  0.5000,  1.5000])
tensor(1.2910)
tensor([-1.1619, -0.3873,  0.3873,  1.1619])
tensor(0.)
tensor(1.)
