In [1]:
import chainer 
import chainer.functions as F 
import chainer.links as L 
import chainerrl
import gym
import numpy as np
from gym import spaces
from arm import Arm

In [2]:
# Build the arm environment (Arm is imported from the local `arm` module).
# NOTE(review): presumably 3 joints with unit link lengths -- confirm against Arm.
env = Arm(3, [1.0, 1.0, 1.0])

# Look at the environment's current state vector and its table of actions
# before the first reset.
print('observation space:', env.state)
print('action space:', env.actions)

# Start an episode and show the observation returned by reset().
obs = env.reset()
# env.render()  # rendering left disabled
print('initial observation:', obs)

# Take a single randomly chosen action and report the resulting transition.
action = env.random_action()
print(action)
state, r, done = env.step(action)
print('next observation:', state)
print('reward:', r)

# Number of rows in the action table; displayed as this cell's output.
env.actions.shape[0]

observation space: [0.         0.         0.         2.77425477 2.64038567]
action space: [[ 0.00872665  0.          0.        ]
 [ 0.          0.00872665  0.        ]
 [ 0.          0.          0.00872665]
 [-0.00872665  0.          0.        ]
 [ 0.         -0.00872665  0.        ]
 [ 0.          0.         -0.00872665]]
initial observation: [0.         0.         0.         1.12480337 2.73100807]
5
next observation: [ 0.          0.         -0.00872665  1.12480337  2.73100807]
reward: -10.974767486231736


6

In [3]:
class QFunction(chainer.Chain):
    """Fully connected Q-network producing one value per discrete action.

    The network has three tanh-activated hidden layers of widths
    ``n_hidden_channels``, ``2 * n_hidden_channels`` and
    ``2 * n_hidden_channels``, followed by a linear output layer with
    ``n_actions`` units.
    """

    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        """Create the four linear layers.

        Args:
            obs_size: Length of the observation vector fed to the first layer.
            n_actions: Number of output units (one per discrete action).
            n_hidden_channels: Width of the first hidden layer; the two
                following hidden layers are twice as wide.
        """
        super().__init__()
        wide = n_hidden_channels * 2
        # Links must be registered inside init_scope; creation order is kept
        # as l0, l1, l2, l3.
        with self.init_scope():
            self.l0 = L.Linear(obs_size, n_hidden_channels)
            self.l1 = L.Linear(n_hidden_channels, wide)
            self.l2 = L.Linear(wide, wide)
            self.l3 = L.Linear(wide, n_actions)

    def __call__(self, x, test=False):
        """Run the network on ``x`` and wrap the result for ChainerRL.

        Args:
            x: Input batch passed to the first linear layer.
            test: Accepted for interface compatibility; not used here.

        Returns:
            chainerrl.action_value.DiscreteActionValue built from the output
            of the final linear layer.
        """
        hidden = x
        for layer in (self.l0, self.l1, self.l2):
            hidden = F.tanh(layer(hidden))
        q_values = self.l3(hidden)
        return chainerrl.action_value.DiscreteActionValue(q_values)

# Size the network from the environment: input width is the length of the
# state vector, output width is the number of rows in the action table.
obs_size = env.state.shape[0]
n_actions = env.actions.shape[0]
q_func = QFunction(obs_size=obs_size, n_actions=n_actions)

In [4]:
# Adam optimizer with eps set explicitly to 1e-2, attached to the Q-network.
optimizer = chainer.optimizers.Adam(eps=1e-2)
# Left as the trailing expression so the cell displays the configured optimizer.
optimizer.setup(q_func)

<chainer.optimizers.adam.Adam at 0x11f0f6110>

In [7]:
# Discount factor handed to the agent.
gamma = 0.99

# Epsilon-greedy exploration with a fixed epsilon; random actions are drawn
# from the environment's `action_space_d`.
explorer = chainerrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.2,
    random_action_func=env.action_space_d.sample,
)

# Experience replay storage.
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)


def phi(x):
    """Cast an observation to float32 (no copy if it already is float32)."""
    return x.astype(np.float32, copy=False)


agent = chainerrl.agents.DoubleDQN(
    q_func,
    optimizer,
    replay_buffer,
    gamma,
    explorer,
    replay_start_size=500,
    update_interval=1,
    target_update_interval=100,
    phi=phi,
)

n_episodes = 10000
max_episode_len = 1000
for i in range(1, n_episodes + 1):
    # Fresh episode: initial observation, no reward yet, not terminated.
    obs, reward, done = env.reset(), 0, False
    # R accumulates the episode return; t counts the steps taken.
    R, t = 0, 0
    while not (done or t >= max_episode_len):
        # Call env.render() here to watch the behaviour.
        action = agent.act_and_train(obs, reward)
        obs, reward, done = env.step(action)
        R += reward
        t += 1
    # Progress report every 10 episodes, printed before the episode is closed.
    if i % 10 == 0:
        print('episode:', i, 'R:', R, 'statistics:', agent.get_statistics())
    agent.stop_episode_and_train(obs, reward, done)
    # Checkpoint every 100 episodes into a directory named after the episode.
    if i % 100 == 0:
        agent.save("agent_" + str(i))

print('Finished.')








episode: 10 R: -650.6755386920598 statistics: [('average_q', -36.485194439197336), ('average_loss', 0.32358053795664415), ('n_updates', 83758)]
episode: 20 R: -61.799300408341566 statistics: [('average_q', -34.90047679719849), ('average_loss', 0.526691046857662), ('n_updates', 92770)]
episode: 30 R: -1799.2692197392084 statistics: [('average_q', -44.946415915292945), ('average_loss', 0.45843705064452006), ('n_updates', 102360)]


In [33]:
import serial

In [34]:
# Open the USB serial device at 115200 baud with timeout=1.
# NOTE(review): the device path is machine-specific -- consider making it configurable.
ser = serial.Serial(
    "/dev/tty.usbmodem143401",
    baudrate=115200,
    timeout=1,
)

In [35]:
# Collect the lines currently readable from the serial port into a list.
data = ser.readlines()

In [36]:
# Show the list of lines read from the serial port (stored in `data`).
print(data)

[]


In [39]:
# Close the serial port opened earlier as `ser`.
ser.close()

In [40]:
# NOTE(review): this calls close() on the same `ser` again right after the
# previous cell already closed it; looks redundant -- confirm and remove.
ser.close()