### IAR PROJET
Hu Ruohui 

In [30]:
import gym
import argparse
import random
import numpy as np
import matplotlib.pyplot as plt
import os.path
import torch as T
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from easydict import EasyDict as edict


## Class Bandit

In [47]:
class TwoArmbandit():
    def __init__(self):
        self.state = 0
        self.num_actions = 2
        self.reset()
    
    #set the baseline probability of reward for action a.
    #sampling from a uniform Benoulli distribution and held fix for the entire episode
    def reset(self):
        self.timestep = 0 
        self.nb_al = 0
        self.nb_ar = 0
        self.nb = [0,0]
        
        #print("timestepmax",self.timestepmax)
        variance = np.random.uniform(0,.1)
        self.baseline_prob = [variance,0.5-variance]
        #print("baseline prob",self.baseline_prob)
        return self.state

    ##get action from the network
    def pullArm(self,action,prev_actions):
        self.timestep += 1    
        p_action_init = self.baseline_prob[action]
        p_action = p_action_init   
        #print("action",action)
        #print("prev_actions",prev_actions)        
        if action == 0 and len(prev_actions)!=0 :
            if prev_actions[-1] == 0: 
                self.nb_al+=1
                p_action = 1 - np.power((1-p_action),self.nb_al +1)
                #print("nb_al",self.nb_al)
            else:
                self.nb_al = 0 
                #print("nb_al",self.nb_al)
            reward = random.choices([1,0],weights=[p_action,1-p_action])[0]
            #print("reward_proba",p_action,"reward",reward)
        elif action == 1 and len(prev_actions)!=0: 
            if prev_actions[-1] == 1: 
                self.nb_ar+=1
                p_action = 1 - np.power((1-p_action),self.nb_ar +1)
                #print("nb_ar",self.nb_ar)
            else:
                self.nb_ar = 0 
                #print("nb_ar",self.nb_ar)
            reward = random.choices([1,0],weights=[p_action,1-p_action])[0]
            #print("reward_proba",p_action,"reward",reward)
        else:
            if action == 0 : 
                self.nb_al+=1
                reward = random.choices([1,0],weights=[p_action,1-p_action])[0]
                #print("reward_proba",p_action,"reward",reward)
            else :
                self.nb_ar+=1
                reward = random.choices([1,0],weights=[p_action,1-p_action])[0]
                #print("reward_proba",p_action,"reward",reward)
        #print("timestep",self.timestep,"action",action)
        if self.timestep > self.timestepmax: 
            #print("nombre",self.nb_al,self.nb_ar)
            done = True
        else: done = False
        return  self.state,reward,done       

In [48]:
# ActorNet LSTM
class ActorNetwork(nn.Module):

    def __init__(self,in_size,hidden_size,out_size):
        super(ActorNetwork, self).__init__()
        self.lstm = nn.LSTM(in_size, hidden_size, batch_first = True)
        self.fc = nn.Linear(hidden_size,out_size)

    def forward(self, x, hidden):
        x, hidden = self.lstm(x, hidden)
        x = self.fc(x)
        x = F.log_softmax(x,2)  # log(softmax(x))
        return x, hidden

class ValueNetwork(nn.Module):
    def __init__(self,in_size,hidden_size,out_size):
        super(ValueNetwork, self).__init__()
        self.lstm = nn.LSTM(in_size, hidden_size, batch_first = True)
        self.fc = nn.Linear(hidden_size,out_size)

    def forward(self,x, hidden):
        x, hidden = self.lstm(x, hidden)
        x = self.fc(x)
        return x, hidden

In [49]:
env = TwoArmbandit()
config = edict()
config.Train = edict({
    "env_name":"TwoArmbandit",
    "action_dim":2,
    "bactch_size":10,
    "lr" : 0.001,
    "gamma":0.75,
    "max_num_step":random.randint(50,100),
    "num_episodes":10,
    "hidden_size":48,
    "seed":88
})

def roll_out(actor_network,env,episode_len,value_network,init_state):
    states = []
    actions = []
    rewards = []
    is_done = False
    final_reward = 0
    state = init_state # 初始状态
    a_hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0) # 初始化隐状态
    a_cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0)
    c_hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0)
    c_cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0)

    for j in range(config.Train.max_num_step):
        states.append(state)
        log_softmax_action, (a_hx,a_cx) = actor_network(Variable(T.Tensor([state]).unsqueeze(0)), (a_hx,a_cx))
        # from torch.distributions import Categorical
        softmax_action = T.exp(log_softmax_action) # 对数softmax取指数，保证大于0
        action = np.random.choice(config.Train.action_dim,p=softmax_action.cpu().data.numpy()[0][0])
    
        one_hot_action = [int(k == action) for k in range(config.Train.action_dim)]
        
        reward,done,_ = env.pull(action)
        next_state = np.delete(next_state, 1)
        #fix_reward = -10 if done else 1
        
        actions.append(one_hot_action)
        rewards.append(reward)
        final_state = next_state 
        state = next_state
        if done:
            is_done = True
            state = env.reset()
            state = np.delete(state,1)
            a_hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
            a_cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
            c_hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
            c_cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
            # 打印episode总分
            print(j+1)
            break
    
    return actions,rewards,final_reward


In [50]:
def discount_reward(r, gamma,final_r):
    '''
    r:          list
    final_r:    scalar
    '''
    discounted_r = np.zeros_like(r)
    running_add = final_r
    for t in reversed(range(0, len(r))):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r


In [51]:
   # 初始化价值网络
init_state = env.reset()
value_network = ValueNetwork(in_size=4,hidden_size=config.Train.hidden_size, out_size=1)
value_network_optim = T.optim.Adam(value_network.parameters(),lr=0.005)

    # 初始化动作网络
actor_network = ActorNetwork(in_size=4,hidden_size=config.Train.hidden_size, out_size=config.Train.action_dim)
actor_network_optim = T.optim.Adam(actor_network.parameters(),lr = 0.001)


In [52]:
steps =[]
task_episodes =[]
test_results =[]

for episode in range(config.Train.num_episodes):
    # 完成一轮rollout
    states,actions,rewards,final_r,current_state = roll_out(actor_network,env,config.Train.max_num_step,value_network,init_state)
    #states.shape = [epi_len,3],list
    
    # rollout结束后的初态
    init_state = current_state
    actions_var = Variable(T.Tensor(actions).view(-1,config.Train.action_dim)).unsqueeze(0)
    states_var = Variable(T.Tensor(states).view(-1,STATE_DIM=0)).unsqueeze(0)

    # 训练动作网络
    a_hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
    a_cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
    c_hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
    c_cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
    
    actor_network_optim.zero_grad()
    # print(states_var.unsqueeze(0).size())
    log_softmax_actions, (a_hx,a_cx) = actor_network(states_var, (a_hx,a_cx))
    vs, (c_hx,c_cx) = value_network(states_var, (c_hx,c_cx)) # 给出状态价值估计
    vs.detach()    # 不参与求梯度
    
    # 计算Q(s,a)和Advantage函数
    qs = Variable(T.Tensor(discount_reward(rewards,config.train.gamma,final_r)))
    qs = qs.view(1, -1, 1)
    advantages = qs - vs
    # print('adv,',advantages.shape)
    # log_softmax_actions * actions_var是利用独热编码特性取出对应action的对数概率
    actor_network_loss = - T.mean(T.sum(log_softmax_actions*actions_var,1)* advantages)
    actor_network_loss.backward()
    T.nn.utils.clip_grad_norm(actor_network.parameters(),0.5)
    actor_network_optim.step()

    # 训练价值网络
    value_network_optim.zero_grad()
    target_values = qs
    a_hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
    a_cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
    c_hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
    c_cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
    values, (c_hx,c_cx) = value_network(states_var, (c_hx,c_cx))

    criterion = nn.MSELoss()
    value_network_loss = criterion(values,target_values)
    value_network_loss.backward()
    T.nn.utils.clip_grad_norm(value_network.parameters(),0.5)
    value_network_optim.step()

    # Testing
    if (episode + 1) % 50== 0:
            result = 0
            test_task = TwoArmbandit()
            for test_epi in range(10):       # 测试10个episode
                state = test_task.reset()
                state = np.delete(state,1)
                
                a_hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
                a_cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
                c_hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
                c_cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0);
                
                for test_step in range(500): # 每个episode最长500frame
                    
                    log_softmax_actions, (a_hx,a_cx) = actor_network(Variable(T.Tensor([state]).view(1,1,3)), (a_hx,a_cx))
                    softmax_action = T.exp(log_softmax_actions)
                    
                    #print(softmax_action.data)
                    action = np.argmax(softmax_action.data.numpy()[0])
                    next_state,reward,done,_ = test_task.step(action)
                    next_state = np.delete(next_state,1)
                    
                    result += reward
                    state = next_state
                    if done:
                        break
            print("episode:",episode+1,"test result:",result/10.0)
            steps.append(episode+1)
            test_results.append(result/10)
plt.plot(steps,test_results)
plt.savefig('training_score.png')


RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors

In [None]:


T.manual_seed(config.Train.seed)
np.random.seed(config.Train.seed)

dir = 'ckpt_' + config.Train.env_name
if not os.path.exists(dir):
    os.mkdir(dir)

log_reward = []
for i in range(config.Train.num_episodes):
   
    action_prob = T.Tensor([env.reset()])
    print("baseline proba",action_prob)
    rewards = []
    log_prob = []
    hx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0)
    cx = T.zeros(config.Train.hidden_size).unsqueeze(0).unsqueeze(0)
    print(hx.shape)
    #Model_ = A2C_LSTM(config,input_dim=4,num_actions=2)





baseline proba tensor([[0.0648, 0.4352]])
torch.Size([1, 1, 48])
baseline proba tensor([[0.0507, 0.4493]])
torch.Size([1, 1, 48])
baseline proba tensor([[0.0528, 0.4472]])
torch.Size([1, 1, 48])
baseline proba tensor([[0.0896, 0.4104]])
torch.Size([1, 1, 48])
baseline proba tensor([[0.0700, 0.4300]])
torch.Size([1, 1, 48])
baseline proba tensor([[0.0714, 0.4286]])
torch.Size([1, 1, 48])
baseline proba tensor([[0.0717, 0.4283]])
torch.Size([1, 1, 48])
baseline proba tensor([[0.0223, 0.4777]])
torch.Size([1, 1, 48])
baseline proba tensor([[0.0175, 0.4825]])
torch.Size([1, 1, 48])
baseline proba tensor([[0.0457, 0.4543]])
torch.Size([1, 1, 48])


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

BATCH = 100;
TIMESTEP = 100;
HIDDEN = 40;
EPOCHS = 100;

class rnn_wrapper(nn.Module):

    def __init__(self):
        super(rnn_wrapper, self).__init__()
        self.lstm = nn.LSTM(input_size=HIDDEN, hidden_size=HIDDEN, batch_first = True)  # 输入维度是3, 输出维度也是3
        self.fc = nn.Linear(HIDDEN,1)

    def forward(self, x, hidden):
        x, hidden = self.lstm(x, hidden)
        x = self.fc(x)
        return x, hidden



torch.manual_seed(1)

#---------------train procedure---------------------------
#---------------feed lstm by batch-------------------
hidden_init = (torch.randn(1, BATCH, HIDDEN), torch.randn(1, BATCH, HIDDEN))
hidden = hidden_init
# print('Hidden:',hidden)

rnn = rnn_wrapper()

loss_function=nn.MSELoss()
optimizer=optim.SGD(rnn.parameters(), lr=0.1)

# target construct
q = [(i*np.ones((BATCH,1))).tolist() for i in range(TIMESTEP)]
target = torch.FloatTensor(q).transpose(0,1)

# input construct
k = np.random.randn(TIMESTEP,HIDDEN).tolist()
inn = torch.FloatTensor([ k for i in range(BATCH)])

for epoch in range(EPOCHS):
    rnn.zero_grad()
    hidden = hidden_init

    out, hidden = rnn(inn, hidden)

    loss=loss_function(out, target)
    print('Loss:',loss.item())
    loss.backward()
    
    optimizer.step()


Loss: 3303.461669921875
Loss: 2389.26806640625
Loss: 797.8804931640625
Loss: 1315.4296875
Loss: 1078.1566162109375
Loss: 1489.62451171875
Loss: 702.7447509765625
Loss: 1237.941162109375
Loss: 461.96258544921875
Loss: 840.1531982421875
Loss: 1061.1978759765625
Loss: 1644.6976318359375
Loss: 413.14691162109375
Loss: 240.18040466308594
Loss: 428.4512939453125
Loss: 881.3837280273438
Loss: 669.3056640625
Loss: 203.59217834472656
Loss: 127.71015167236328
Loss: 87.14710998535156
Loss: 68.89632415771484
Loss: 56.17485809326172
Loss: 50.859161376953125
Loss: 43.236778259277344
Loss: 37.44258499145508
Loss: 33.602603912353516
Loss: 31.826881408691406
Loss: 26.015975952148438
Loss: 22.729454040527344
Loss: 20.475494384765625
Loss: 18.518329620361328
Loss: 16.794355392456055
Loss: 15.676040649414062
Loss: 14.614242553710938
Loss: 13.183671951293945
Loss: 12.1857328414917
Loss: 11.32094955444336
Loss: 10.569906234741211
Loss: 9.632092475891113
Loss: 8.44007682800293
Loss: 7.972111701965332
Loss: 7

In [None]:

#---------------test procedure---------------------------
#---------------feed lstm per timestep-------------------
hx,cx = hidden
hx = hidden[0].select(1,0).unsqueeze(0);
cx = hidden[1].select(1,0).unsqueeze(0);
testin = inn.select(0,0);
print(testin)

for i in range(TIMESTEP):
    out, (hx,cx) = rnn(testin.select(0,i).view(1,1,-1), (hx,cx))
    print(out)

In [2]:
import gym

In [7]:
task = gym.make("CartPole-v0")
init_state = task.reset()
init_state = np.delete(init_state,1)

In [8]:
print(init_state)

[-0.04655312 -0.03280112 -0.03758502]


In [10]:
import gym 
import numpy as np
 
NUM_PROCESSES = 16
 
envs = [gym.make('Breakout-v0') for i in range(NUM_PROCESSES)]
 
obs = [env.reset() for env in envs]
obs = np.array(obs)


A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]


In [11]:
print(action_np)

NameError: name 'action_np' is not defined

In [1]:
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import A2C

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = A2C(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=20)
model.save("a2c_cartpole")

del model # remove to demonstrate saving and loading

model = A2C.load("a2c_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

2023-01-05 01:25:56.928980: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


ModuleNotFoundError: No module named 'tensorflow.contrib'

In [2]:
import gym
from stable_baselines3 import A2C

env = gym.make('CartPole-v1')

model = A2C('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
      obs = env.reset()

: 

: 

In [13]:
pip install stable_baselines

Collecting stable_baselines
  Downloading stable_baselines-2.10.2-py3-none-any.whl (240 kB)
[K     |████████████████████████████████| 240 kB 7.9 MB/s eta 0:00:01
Collecting pyglet>=1.4.0
  Downloading pyglet-2.0.3-py3-none-any.whl (968 kB)
[K     |████████████████████████████████| 968 kB 54.1 MB/s eta 0:00:01
Installing collected packages: pyglet, stable-baselines
Successfully installed pyglet-2.0.3 stable-baselines-2.10.2
Note: you may need to restart the kernel to use updated packages.
