In [11]:
import numpy as np
from gym import Env as GymEnv
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [12]:
learning_rate = 0.0002
gamma = 0.98
class PIDpolicy(nn.Module):
    def __init__(self):
        super(PIDpolicy, self).__init__()
        self.data = []
        
        self.fc1 = nn.Linear(4,128)
        self.fc2 = nn.Linear(128,3)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        
        return x
    
    def put_data(self, item):
        self.data.append(item)
    
    def train_net(self):
        R = 0
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]:
            R = r+gamma*R
            loss = -torch.log(prob)*R
            loss.backward()
        self.optimizer.step()
        self.data = []

In [13]:
class BasicAgent(object):
    def __init__(self, action_space):
        self.action_space = action_space
    def initialize(self, state):
        pass
    def pretraining_act(self, state): # training 전 데이터 모으기용
        return self.action_space.sample()
    def training_acct(self, state): # training용
        return self.action_space.sample()
    def solving_act(self, state): # test용
        return self.action_space.sample()
    def pretraining_react(self, state, reward): #tr
        pass
    def training_react(self, state, reward):
        pass
    def solving_react(self, state, reward):
        pass  

In [14]:
class PIDControlAgent(BasicAgent):
    def __init__(self, action_space, fs, kp=1.2, ki=1.0, kd=0.001, set_angle=0):
        # action_space : gym.spaces : 가능한 액션
        # fs : sampling frequency. (Hz) == 50.
        # kp : gain of proportional controller
        # ki : gain of integral controller
        # kd : derivative controller
        super(PIDControlAgent, self).__init__(action_space)
        self.kp = kp
        self.ki = ki
        self.kd = kd
        
        self.set_angle = set_angle #원하는 각도 : 0도가 이상적
        self.tau = 1.0/fs
        
        self.p_term = 0.0
        self.i_term = 0.0
        self.d_term = 0.0
        
        self.p_o=0
        self.i_o=0
        self.d_o=0

        # cache
        self.output = 0.0
        self.err_prev = 0.0
                
    def update(self, v_in, v_fb):
        # v_in : input command : 원하는 각도
        # v_fb : feedback from observer : 현재 각도
        # output : output command??
        # u(t) = K_p e(t) + K_i \int_{0}^{t} e(t)dt + K_d {de}/{dt}
        err = v_in - v_fb # 0 - 현재각
        
        #Ziegler–Nichols method
        self.p_term = err
        self.i_term += err*self.tau
        self.d_term = (err - self.err_prev)*self.tau
        self.output = self.kp*self.p_term + self.ki*self.i_term + self.kd*self.d_term
        
        self.err_prev = err
        
        return self.output
        
    def choose_action(self, val):
        if val >= 0:
            action = 0
        else:
            action = 1
        return action
    
    def solving_act(self, state):
        output = self.update(self.set_angle, state[2])
        temp = self.choose_action(output)
        self.action = temp
        return self.action

In [15]:
class BasicSolver(object):
    def __init__(self, env=None, agent=None, policy=None,
                 skip_pretraining=False,
                 skip_training=False,
                 skip_solving=False):
        self.env = env
        self.agent = agent
        self.policy = policy
    def pretrain(self):
        pass
    def train(self):
        pass
    def solve(self):
        pass
    def run(self):
        self.solve()
    def terminate(self):
        self.env.close()

In [16]:
class CartPoleSolver(BasicSolver):
    def __init__(self,
                 solving_episodes=10,
                 max_steps = 200,
                 render_when_sovling=True,
                 **kwargs):
        super(CartPoleSolver, self).__init__(**kwargs)

        self.solving_episodes = solving_episodes
        self.max_steps = max_steps

        # flags control for rendering
        self.rws = render_when_sovling

    def solve(self):
        state = self.env.reset()
        self.agent.initialize(state)

        for i in range(100):
            total_reward = 0
            done = False

            while not done:

                action = self.agent.solving_act(state)
                state, reward, done, info = self.env.step(action)
                k = self.policy(torch.from_numpy(state).float())
                self.policy.put_data((reward, k))
                kp = k[0].detach().numpy()
                ki = k[1].detach().numpy()
                kd = k[2].detach().numpy()
                self.agent.kp = kp
                self.agent.ki = ki
                self.agent.kd = kd

                total_reward += reward

                if done:
                    print(f'Kp:{self.agent.kp}, Ki:{self.agent.ki}, Kd:{self.agent.kd}')
                    print('Episode: {}'.format(i),
                          'Total reward: {}'.format(total_reward))
                    self.env.reset()
        self.env.close()


In [17]:
def pid_control_solver():
    env = gym.make('CartPole-v0')
    # NOTE: kp, ki, kd are tuned manually, they are not the optimal parameter
    # for this PID controller
    kp = np.random.randint(10)
    ki = np.random.randint(10)
    kp = np.random.randint(10)
    print('initial kp,ki,kp', kp,ki,kp)
    policy = PIDpolicy()
    
    agent = PIDControlAgent(env.action_space, 
                        env.metadata['video.frames_per_second'],
                        kp=kp, ki=ki, kd=kp)
    # NOTE: pretraining and training stage is not required for this solver
    solver = CartPoleSolver(env=env, agent=agent, policy=policy,
                            skip_pretraining=True,
                            skip_training=True)
    solver.run()

In [18]:
env = gym.make('CartPole-v0')

In [19]:
def main():
    pid_control_solver()

In [20]:
if __name__ == '__main__':
    main()

initial kp,ki,kp 6 0 6
Kp:0.10996149480342865, Ki:0.0, Kd:0.0
Episode: 0 Total reward: 25.0
Kp:0.11780586838722229, Ki:0.0, Kd:0.0
Episode: 1 Total reward: 33.0
Kp:0.13064280152320862, Ki:0.017969347536563873, Kd:0.09380712360143661
Episode: 2 Total reward: 38.0
Kp:0.13233041763305664, Ki:0.0, Kd:0.0
Episode: 3 Total reward: 30.0
Kp:0.197168231010437, Ki:0.0, Kd:0.0
Episode: 4 Total reward: 36.0
Kp:0.14019744098186493, Ki:0.0, Kd:0.1048874706029892
Episode: 5 Total reward: 41.0
Kp:0.12880030274391174, Ki:0.06816282868385315, Kd:0.10691555589437485
Episode: 6 Total reward: 42.0
Kp:0.16730572283267975, Ki:0.06869206577539444, Kd:0.10881363600492477
Episode: 7 Total reward: 40.0
Kp:0.11144493520259857, Ki:0.0, Kd:0.08417078852653503
Episode: 8 Total reward: 22.0
Kp:0.19330760836601257, Ki:0.0, Kd:0.0
Episode: 9 Total reward: 28.0
Kp:0.17596425116062164, Ki:0.06814566999673843, Kd:0.10619363933801651
Episode: 10 Total reward: 38.0
Kp:0.13123109936714172, Ki:0.0354435071349144, Kd:0.0965417