In [44]:
import numpy as np
from gym import Env as GymEnv
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [45]:
learning_rate = 0.0002  # step size handed to the Adam optimizer below
gamma = 0.98  # discount factor applied to future rewards in train_net


class PIDpolicy(nn.Module):
    """Small two-layer network mapping a 4-value input to 3 non-negative outputs.

    The network owns its own Adam optimizer and an episode buffer (`data`)
    of `(reward, output)` pairs that `train_net` consumes.

    NOTE(review): `forward` ends in a ReLU, so an output element can be
    exactly 0 and `torch.log` of it is -inf inside `train_net`; the outputs
    are also not normalised probabilities. Confirm this REINFORCE-style loss
    is what is intended.
    """

    def __init__(self):
        super().__init__()

        # Layers are created in this order (fc1, then fc2) on purpose: the
        # random weight initialisation depends on creation order.
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 3)

        # Episode buffer of (reward, network_output) tuples.
        self.data = []
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Return ReLU(fc2(ReLU(fc1(x))))."""
        hidden = F.relu(self.fc1(x))
        return F.relu(self.fc2(hidden))

    def put_data(self, item):
        """Append one `(reward, output_tensor)` pair to the episode buffer."""
        self.data.append(item)

    def train_net(self):
        """Run one optimizer step over the buffered episode, then clear it.

        Walks the buffer from the last step to the first, maintaining the
        discounted return, and accumulates the gradient of
        mean(-log(output) * return) for every step before a single
        optimizer step.
        """
        self.optimizer.zero_grad()
        discounted_return = 0
        for reward, output in reversed(self.data):
            discounted_return = reward + gamma * discounted_return
            step_loss = (-torch.log(output) * discounted_return).mean()
            # Gradients accumulate across steps; the update happens once below.
            step_loss.backward()
        self.optimizer.step()
        self.data = []

In [46]:
class BasicAgent(object):
    """Baseline agent that samples every action at random from `action_space`.

    Subclasses override the `*_act` hooks to choose actions and the
    `*_react` hooks to consume the resulting state/reward. One pair of hooks
    exists per stage: pretraining (data collection before training),
    training, and solving (evaluation).
    """

    def __init__(self, action_space):
        # action_space must provide a `sample()` method.
        self.action_space = action_space

    def initialize(self, state):
        """Hook receiving a starting state; does nothing here."""
        pass

    def pretraining_act(self, state):
        """Action used while collecting data before training (random here)."""
        return self.action_space.sample()

    def training_act(self, state):
        """Action used during training (random here)."""
        return self.action_space.sample()

    def training_acct(self, state):
        """Backward-compatible alias for `training_act` (original misspelled name)."""
        return self.training_act(state)

    def solving_act(self, state):
        """Action used at test/solving time (random here)."""
        return self.action_space.sample()

    def pretraining_react(self, state, reward):
        """Hook receiving state and reward during pretraining; no-op here."""
        pass

    def training_react(self, state, reward):
        """Hook receiving state and reward during training; no-op here."""
        pass

    def solving_react(self, state, reward):
        """Hook receiving state and reward during solving; no-op here."""
        pass

In [47]:
class PIDControlAgent(BasicAgent):
    """Agent that turns a discrete-time PID controller output into a binary action.

    The controller tracks `set_angle` using `state[2]` as feedback
    (presumably the pole angle of the CartPole observation; verify against
    the environment) and maps the sign of the controller output to action
    0 or 1.
    """

    def __init__(self, action_space, fs, kp=1.2, ki=1.0, kd=0.001, set_angle=0):
        """
        action_space : gym.spaces object describing the available actions
        fs           : sampling frequency in Hz (original note: 50); must be non-zero
        kp           : gain of the proportional term
        ki           : gain of the integral term
        kd           : gain of the derivative term
        set_angle    : desired angle (original note: 0 is the ideal value)
        """
        super(PIDControlAgent, self).__init__(action_space)
        self.kp = kp
        self.ki = ki
        self.kd = kd

        self.set_angle = set_angle
        self.tau = 1.0 / fs  # sampling period in seconds

        # Last action returned by solving_act; None until the first call.
        self.action = None

        self._reset_controller()

    def _reset_controller(self):
        """Zero the PID terms and the cached output / previous error."""
        self.p_term = 0.0
        self.i_term = 0.0
        self.d_term = 0.0

        # cache
        self.output = 0.0
        self.err_prev = 0.0

    def initialize(self, state):
        """Clear accumulated controller state so a new run starts from zero."""
        self._reset_controller()

    def update(self, v_in, v_fb):
        """Advance the controller by one sample and return its output.

        v_in : commanded value (the desired angle)
        v_fb : measured feedback (the current angle)

        Implements u(t) = K_p e(t) + K_i \\int_0^t e(t) dt + K_d de/dt with a
        rectangular integral and a backward-difference derivative.
        """
        err = v_in - v_fb  # desired minus measured

        self.p_term = err
        self.i_term += err * self.tau
        # de/dt is the error change divided by the sampling period
        # (multiplying by tau, as before, shrank the term by a factor tau**2).
        self.d_term = (err - self.err_prev) / self.tau
        self.output = (self.kp * self.p_term
                       + self.ki * self.i_term
                       + self.kd * self.d_term)

        self.err_prev = err

        return self.output

    def choose_action(self, val):
        """Map the controller output to an action: 0 when val >= 0, otherwise 1."""
        return 0 if val >= 0 else 1

    def solving_act(self, state):
        """Run one controller update on `state[2]` and return the chosen action."""
        output = self.update(self.set_angle, state[2])
        self.action = self.choose_action(output)
        return self.action

In [48]:
class BasicSolver(object):
    """Base class wiring an environment, an agent and a policy into stages.

    Subclasses override `pretrain`, `train` and/or `solve`; `run` executes
    the stages in that order, leaving out any stage whose `skip_*` flag was
    set at construction time.
    """

    def __init__(self, env=None, agent=None, policy=None,
                 skip_pretraining=False,
                 skip_training=False,
                 skip_solving=False):
        self.env = env
        self.agent = agent
        self.policy = policy

        # Stage switches consulted by run(); previously accepted but dropped.
        self.skip_pretraining = skip_pretraining
        self.skip_training = skip_training
        self.skip_solving = skip_solving

    def pretrain(self):
        """Pretraining stage hook; no-op in the base class."""
        pass

    def train(self):
        """Training stage hook; no-op in the base class."""
        pass

    def solve(self):
        """Solving stage hook; no-op in the base class."""
        pass

    def run(self):
        """Run pretrain, train and solve in order, honouring the skip flags."""
        if not self.skip_pretraining:
            self.pretrain()
        if not self.skip_training:
            self.train()
        if not self.skip_solving:
            self.solve()

    def terminate(self):
        """Close the environment if one was supplied."""
        if self.env is not None:
            self.env.close()

In [49]:
class CartPoleSolver(BasicSolver):
    """Solver that runs episodes in which a policy network retunes the agent's PID gains.

    After every environment step the policy is evaluated on the new state;
    its three outputs become the agent's kp, ki and kd, and the
    (reward, output) pair is buffered. The policy is trained once at the end
    of each episode.
    """

    def __init__(self,
                 solving_episodes=10,
                 max_steps=200,
                 render_when_sovling=True,
                 **kwargs):
        """
        solving_episodes    : number of episodes `solve` runs
        max_steps           : upper bound on steps per episode
        render_when_sovling : stored as `self.rws`; `solve` does not render
        **kwargs            : forwarded to BasicSolver (env, agent, policy, skip flags)
        """
        super(CartPoleSolver, self).__init__(**kwargs)

        self.solving_episodes = solving_episodes
        self.max_steps = max_steps

        # flags control for rendering
        self.rws = render_when_sovling

    def solve(self):
        """Run `solving_episodes` episodes, training the policy after each, then close the env."""
        for episode in range(self.solving_episodes):
            # Start each episode from the state reset() actually returns and
            # let the agent clear any per-episode state it keeps.
            state = self.env.reset()
            self.agent.initialize(state)

            total_reward = 0
            done = False
            steps = 0

            while not done and steps < self.max_steps:
                action = self.agent.solving_act(state)
                state, reward, done, info = self.env.step(action)
                steps += 1

                gains = self.policy(torch.from_numpy(state).float())
                self.policy.put_data((reward, gains))

                # Hand the agent plain floats, detached from the autograd graph.
                kp, ki, kd = gains.detach().tolist()
                self.agent.kp = kp
                self.agent.ki = ki
                self.agent.kd = kd

                total_reward += reward

            # Episode finished (terminal state or step limit): learn from it.
            self.policy.train_net()
            print(f'Kp:{self.agent.kp}, Ki:{self.agent.ki}, Kd:{self.agent.kd}')
            print('Episode: {}'.format(episode),
                  'Total reward: {}'.format(total_reward))
        self.env.close()


In [50]:
def pid_control_solver():
    """Build the CartPole environment, policy, PID agent and solver, then run it."""
    env = gym.make('CartPole-v0')
    # NOTE: the initial gains are random integers in [0, 10), not tuned values;
    # CartPoleSolver.solve replaces them with the policy output after each step.
    kp = np.random.randint(10)
    ki = np.random.randint(10)
    kd = np.random.randint(10)
    print('initial kp,ki,kd', kp, ki, kd)
    policy = PIDpolicy()

    agent = PIDControlAgent(env.action_space,
                            env.metadata['video.frames_per_second'],
                            kp=kp, ki=ki, kd=kd)
    # NOTE: pretraining and training stage is not required for this solver.
    # solving_episodes is passed explicitly so the run stays at 100 episodes.
    solver = CartPoleSolver(env=env, agent=agent, policy=policy,
                            solving_episodes=100,
                            skip_pretraining=True,
                            skip_training=True)
    solver.run()

In [51]:
def main():
    """Entry point: run the CartPole PID-control experiment."""
    pid_control_solver()

In [None]:
# Run the experiment when this code is executed as the main module
# (which includes executing this notebook cell).
if __name__ == '__main__':
    main()

initial kp,ki,kp 1 8 1
Kp:0.0, Ki:0.0, Kd:0.6038234829902649
Episode: 0 Total reward: 200.0
Kp:0.0, Ki:0.0, Kd:0.505026638507843
Episode: 1 Total reward: 200.0
Kp:0.0538405179977417, Ki:0.0, Kd:0.46948158740997314
Episode: 2 Total reward: 200.0
Kp:0.0, Ki:0.0, Kd:0.7927125096321106
Episode: 3 Total reward: 190.0
Kp:0.060383278876543045, Ki:0.0, Kd:0.4833754897117615
Episode: 4 Total reward: 200.0
Kp:0.0, Ki:0.0, Kd:0.3643105626106262
Episode: 5 Total reward: 200.0
Kp:0.10099588334560394, Ki:0.0, Kd:0.5354748368263245
Episode: 6 Total reward: 200.0
Kp:0.0, Ki:0.0, Kd:0.7578005194664001
Episode: 7 Total reward: 200.0
Kp:0.0, Ki:0.0, Kd:0.7709106802940369
Episode: 8 Total reward: 200.0
Kp:0.0, Ki:0.0, Kd:0.8895102739334106
Episode: 9 Total reward: 132.0
Kp:0.04670775681734085, Ki:0.0, Kd:0.5311923623085022
Episode: 10 Total reward: 200.0
Kp:0.0, Ki:0.0, Kd:0.7372403740882874
Episode: 11 Total reward: 200.0
Kp:0.03812767565250397, Ki:0.0, Kd:0.44594040513038635
Episode: 12 Total reward: 20