In [1]:
from Dynamics import get_next_state, state_to_coords, get_energy
from Environment import DoublePendulumEnv, normalize_angle
from PPO.Proximal_Policy_Optimization import PPO, unscaled_action
from PPO.train import train

In [2]:
from torch import nn
import torch
from torch.utils.tensorboard import SummaryWriter
import gym
from torch.distributions import Normal
from gym.spaces import Box
import random
from IPython.display import clear_output
from torch.distributions import MultivariateNormal
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [4]:
# Reference initial state, laid out as [x, phi, theta, dx, dphi, dtheta]:
# cart at the origin, both pole angles at pi/2 (the angle the reward function
# targets), and all velocities zero.
state0 = np.array([0, np.pi/2, np.pi/2,
                   0, 0, 0])

# Working copy of the initial state. A copy (rather than a second name for the
# same array) keeps `state0` intact when `state` is later modified in place.
state = state0.copy()

# Maximum random perturbation (in radians) of the initial pole angles.
# Alternative used during experiments: 3 degrees, i.e. 3 * 2 * np.pi / 360.
max_initial_angle = 0

# Environment

In [8]:
class DoublePendulumEnv(gym.Env):
    """Gym-style environment for a double pendulum mounted on a cart.

    The state vector is ``[x, phi, theta, dx, dphi, dtheta]``. Time stepping is
    delegated to ``get_next_state`` and the conversion to Cartesian coordinates
    to ``state_to_coords``.

    Args:
        init_state: Initial state vector (6 values). It is copied, so the
            caller's array is never modified by the environment.
        dt: Integration time step passed to ``get_next_state``.
        max_initial_angle: Half-width (radians) of the uniform perturbation
            applied to both pole angles on every ``reset``.
    """

    def __init__(self, init_state, dt=0.02, max_initial_angle = 3 * 2 * np.pi / 360):
        # Explicit shape/dtype: inferring the shape from scalar bounds is not
        # supported consistently across gym versions.
        self.action_space = Box(low=-500, high=500, shape=(1,), dtype=np.float32)
        # Kept as a plain integer (state dimension), not a gym Space.
        self.observation_space = 6
        # Private float copies so that reset() cannot mutate the caller's array
        # and so that angle perturbations are not truncated by an integer dtype.
        self.init_state = np.array(init_state, dtype=float)
        self.state = self.init_state.copy()
        self.dt = dt
        print('Environment initialized')
        # Coordinates of the initial configuration; the reward compares against them.
        self.init_coords = state_to_coords(self.init_state)
        self.max_initial_angle = max_initial_angle

    def _take_action(self, action):
        """Advance the stored state by one time step under ``action``."""
        self.state = get_next_state(self.state, action, self.dt)

    def _reward_function(self, done):
        """Compute the reward for the current state.

        Reward components, in the order they are applied:

        1. Angle reward: if pole 1 is strictly between 87 and 93 degrees the
           agent receives a positive reward based on its deviation from 90
           degrees; if pole 2 is additionally strictly between 80 and 100
           degrees a second, similar term is added. The sum is then doubled.
           Otherwise the reward is -10 and the episode terminates.
        2. Track limit: if the cart position is outside the open interval
           (-2, 2) a penalty of 50 is subtracted and the episode terminates.
        3. Height term: the change of the second coordinate of points 1 and 2
           (as returned by ``state_to_coords``) relative to the initial
           configuration is added, with point 2 weighted twice.

        Args:
            done: Incoming termination flag.

        Returns:
            tuple: ``(reward, done)``.
        """
        state = self.state
        reward = 0

        # --- 1. angle reward -------------------------------------------------
        normalized_angle_1 = np.degrees(normalize_angle(state[1]))
        normalized_angle_2 = np.degrees(normalize_angle(state[2]))

        if normalized_angle_1 > 87 and normalized_angle_1 < 93:
            # NOTE(review): the deviation is signed, so angles above 90 degrees
            # score slightly more than 1 -- confirm whether abs() was intended.
            reward = 1 - (90 - normalized_angle_1) * 0.01
            if normalized_angle_2 > 80 and normalized_angle_2 < 100:
                # NOTE(review): `reward += reward + ...` doubles the pole-1 term
                # before adding the pole-2 term -- confirm this is intended.
                reward += reward + 1 - (90 - normalized_angle_2) * 0.01
            reward *= 2
        else:
            reward = -10
            done = True

        # --- 2. track limit --------------------------------------------------
        if not (state[0] < 2 and state[0] > -2):
            # Penalise leaving the track. (Previously `reward -= -50`, which
            # *added* 50 and therefore rewarded running off the track.)
            reward -= 50
            done = True

        # --- 3. height term --------------------------------------------------
        state_coords = state_to_coords(state)
        dist_rew = -(state_coords[1][1] - self.init_coords[1][1]) \
                   - (state_coords[1][2] - self.init_coords[1][2]) * 2
        reward -= dist_rew

        return reward, done

    def step(self, action):
        """Apply ``action`` for one time step.

        observation -  [x,phi,theta,dx,dphi,dtheta]
        Num     Observation               Min                     Max
        0       Cart Position             -2 m                    2 m
        1       Pole1 Angle               -pi                     +pi
        2       Pole2 Angle               -pi                     +pi
        3       Cart Velocity             -Inf                    Inf
        4       Pole1 Angular Velocity    -Inf                    Inf
        5       Pole2 Angular Velocity    -Inf                    Inf

        The cart-position limits are the ones enforced by the reward function,
        which terminates the episode outside (-2, 2).

        Returns:
            tuple: ``(observation, reward, done, info)`` where ``observation``
            is a fresh array copy of the state and ``info`` is an empty dict.
        """
        done = False
        info = {}
        self._take_action(action)

        reward, done = self._reward_function(done)
        return np.array(self.state), reward, done, info

    def render(self):
        """Show an animation using the notebook-level figure and callbacks.

        Relies on the notebook globals ``fig``, ``animate`` and ``init``; they
        must be defined before this method is called.
        """
        # Imported locally because the notebook never imports it at the top.
        from matplotlib import animation

        ani = animation.FuncAnimation(fig, animate, frames=300,
                                      interval=20, blit=True, init_func=init)
        plt.show()

    def reset(self):
        """Reset the environment and return the initial observation.

        The state is restored from a copy of the stored initial state, and both
        pole angles are set to pi/2 plus an independent uniform perturbation in
        ``[-max_initial_angle, max_initial_angle]``.
        """
        self.rew_sum = 0
        # Copy so that the perturbation below never leaks into init_state.
        self.state = self.init_state.copy()
        self.state[1] = np.pi/2 + np.random.uniform(-self.max_initial_angle, self.max_initial_angle)
        self.state[2] = np.pi/2 + np.random.uniform(-self.max_initial_angle, self.max_initial_angle)

        return np.array(self.state)

In [9]:
def train():
    """Train a PPO agent on the double inverted pendulum environment.

    Builds the environment from the notebook-level ``state``, creates a PPO
    agent, and runs the sampling/update loop until the timestep budget is
    exhausted. Every 10th episode's reward is logged to TensorBoard
    (``PPO_plots``) and model checkpoints are written to ``PPO2_Trained/``.

    Returns:
        tuple: ``(plot_episode, plot_reward)`` -- the indices of every 10th
        episode and the total reward collected in each of them.
    """
    print("============================================================================================")
    # Directory for the TensorBoard event files.
    directory_plots = "PPO_plots"
    os.makedirs(directory_plots, exist_ok=True)

    writer = SummaryWriter(log_dir = directory_plots )

    # Maximum number of timesteps per episode; the environment is reset afterwards.
    max_ep_len = 400
    # The training phase samples and updates for 1 million timesteps.
    max_training_steps = int(1e6)

    # To check ongoing progress, the average reward is printed every 10_000 timesteps.
    print_freq = 10_000

    # Model parameters are saved every 100_000 timesteps.
    save_model_freq = int(1e5)

    action_std = 0.2                                    # Initial standard deviation.
    action_std_decay_rate = 0.1                         # Decay rate of standard deviation.
    min_action_std = 0.1                                # Threshold standard deviation.
    action_std_decay_freq = int(2e5)                    # Decay the standard deviation every 200_000 timesteps.

    update_timestep = 2000                              # Call agent.update() every 2000 timesteps.
    K_epochs = 100                                      # PPO K epochs (passed to the agent).
    eps_clip = 0.2                                      # Clip range for surrogate loss function.
    gamma = 0.99                                        # Discount factor.

    lr_actor = 3e-3                                     # Learning rate for optimizer of actor network.
    lr_critic = 0.001                                   # Learning rate for optimizer of critic network.
    env_name = 'DoubleInvPendulum'
    print("Training Environment:" + env_name)
    # NOTE(review): the notebook-level `max_initial_angle` is not forwarded
    # here, so the environment's default perturbation (3 degrees) applies --
    # confirm this is intended.
    env = DoublePendulumEnv(init_state = state, dt = 0.02)

    observation_shape = 6  # Observation shape
    action_shape = 1       # Action shape

    # Directory that stores the model parameters during and after training.
    directory = "PPO2_Trained"
    os.makedirs(directory, exist_ok=True)
    directory = directory + '/'

    checkpoint_path = directory + "PPO2_{}.pth".format(env_name)
    print("save checkpoint path : " + checkpoint_path)
    print("--------------------------------------------------------------------------------------------")
    print("max training timesteps : ", max_training_steps)
    print("max timesteps per episode : ", max_ep_len)
    print("model saving frequency : " + str(save_model_freq) + " timesteps")
    print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")
    print("--------------------------------------------------------------------------------------------")
    print("state space dimension : ", observation_shape)
    print("action space dimension : ", action_shape)
    print("--------------------------------------------------------------------------------------------")
    print("Initializing a continuous action space policy")
    print("--------------------------------------------------------------------------------------------")
    print("starting std of action distribution : ", action_std)
    print("decay rate of std of action distribution : ", action_std_decay_rate)
    print("minimum std of action distribution : ", min_action_std)
    print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")

    print("--------------------------------------------------------------------------------------------")
    print("PPO update frequency : " + str(update_timestep) + " timesteps")
    print("PPO K epochs : ", K_epochs)
    print("PPO epsilon clip : ", eps_clip)
    print("discount factor (gamma) : ", gamma)
    print("--------------------------------------------------------------------------------------------")
    print("optimizer learning rate actor : ", lr_actor)
    print("optimizer learning rate critic : ", lr_critic)

    print("============================================================================================")

    agent = PPO(observation_shape,
                action_shape,
                lr_actor,
                lr_critic,
                gamma,
                K_epochs,
                eps_clip,
                action_std)

    print("Starting the Training")
    print("============================================================================================")

    # Accumulators for the periodic progress report.
    print_running_reward = 0
    print_running_episodes = 0

    time_step = 0
    i_episode = 0
    counter = 0          # Number of checkpoints written so far.

    plot_episode = []
    plot_reward = []

    try:
        while time_step <= max_training_steps:
            obs = env.reset()
            current_ep_reward = 0
            for t in range(1, max_ep_len + 1):
                action = agent.select_action(obs)                           # Get action under old_policy given state.
                action = unscaled_action(action)                            # Unscale the action.
                obs, reward, done, _ = env.step(action)                     # Apply the action to environment.

                # Store reward and done flag in the agent's buffer for the update phase.
                agent.buffer.rewards.append(reward)
                agent.buffer.dones.append(done)

                time_step += 1
                current_ep_reward += reward

                if time_step % update_timestep == 0:
                    # Perform updates using sampled data.
                    agent.update()

                if time_step % action_std_decay_freq == 0:
                    # Decay the action standard deviation.
                    agent.decay_action_std(action_std_decay_rate, min_action_std)

                if time_step % print_freq == 0:
                    # Average reward over the episodes finished since the last report.
                    # max(..., 1) guards against a division by zero when no episode
                    # has finished in the reporting window.
                    print_avg_reward = print_running_reward / max(print_running_episodes, 1)
                    print_avg_reward = round(print_avg_reward, 2)

                    print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

                    print_running_reward = 0
                    print_running_episodes = 0

                if time_step % save_model_freq == 0:
                    # Save the latest parameters for the test phase and for tracking performance.
                    print("--------------------------------------------------------------------------------------------")
                    print("Saving model at : " + checkpoint_path)
                    agent.save(checkpoint_path)
                    print("Model saved")
                    counter += 1
                    # Additionally keep a numbered snapshot for intermediate evaluation.
                    inter_checkpoint = directory + "PPO_{}_{}00K.pth".format(env_name, counter)
                    print("--------------------------------------------------------------------------------------------")
                    print("Model parameters to check for intermediate performance saving:.")
                    print("saving model at : " + inter_checkpoint)
                    agent.save(inter_checkpoint)
                    if counter == 10:
                        # 10 snapshots x 100K timesteps = 1M (previously printed "10M").
                        print("Intermediate model saved for 1M")
                    else:
                        print(f"Intermediate model saved for {counter}00K")
                    print("--------------------------------------------------------------------------------------------")

                if done:
                    break

            print_running_reward += current_ep_reward

            # Record the reward of every 10th episode.
            if i_episode % 10 == 0:
                writer.add_scalar("Episode reward", current_ep_reward,i_episode)
                plot_episode.append(i_episode)
                plot_reward.append(current_ep_reward)

            print_running_episodes += 1

            i_episode += 1
    finally:
        # Flush and close the TensorBoard writer and release the environment,
        # even when training is interrupted (e.g. from the notebook).
        writer.close()
        env.close()

    print("============================================================================================")
    print("Training Finished")

    return plot_episode, plot_reward

In [None]:

# Run the training loop; train() returns the indices of every 10th episode and
# the total reward of each of those episodes (used by the plot cell below).
n_episode, reward_episode = train()

Training Environment:DoubleInvPendulum
Environment initialized
save checkpoint path : PPO2_Trained/PPO2_DoubleInvPendulum.pth
--------------------------------------------------------------------------------------------
max training timesteps :  1000000
max timesteps per episode :  400
model saving frequency : 100000 timesteps
printing average reward over episodes in last : 10000 timesteps
--------------------------------------------------------------------------------------------
state space dimension :  6
action space dimension :  1
--------------------------------------------------------------------------------------------
Initializing a continuous action space policy
--------------------------------------------------------------------------------------------
starting std of action distribution :  0.2
decay rate of std of action distribution :  0.1
minimum std of action distribution :  0.1
decay frequency of std of action distribution : 200000 timesteps
------------------------------

In [None]:
%load_ext tensorboard
%tensorboard --logdir='PPO_plots'

In [None]:
# Learning curve: total reward of every 10th training episode.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(n_episode, reward_episode)
# Title and both axis labels share the same font size.
for set_label, caption in (
    (ax.set_title, 'Total number of episodes vs rewards per episode'),
    (ax.set_xlabel, 'Episode'),
    (ax.set_ylabel, 'Reward'),
):
    set_label(caption, fontsize=20)
plt.show()

In [None]:
%matplotlib widget

from IPython.display import HTML
from matplotlib import animation  # required by FuncAnimation below; not imported elsewhere in the notebook

# Roll out from a private copy so the reference initial state stays untouched.
state = state0.copy()

dt = 0.1
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal', autoscale_on=False,
                     xlim=(-2, 2), ylim=(-2, 2))
ax.grid()

line, = ax.plot([], [], 'o-', lw=2)
energy_text = ax.text(0.02, 0.90, '', transform=ax.transAxes)

def init():
    """Clear the animated artists and return them (required for blitting)."""
    line.set_data([], [])
    energy_text.set_text('')
    # With blit=True the init function must return a sequence of artists.
    return line, energy_text


def animate(i):
    """Advance the simulation by one step under the agent's action and redraw."""
    global state, dt
    state_t = torch.FloatTensor(state)
    # NOTE(review): `agent` is not defined at notebook scope in the visible
    # cells (train() keeps it local) -- it must be created/loaded before this
    # cell runs. Training also passes the action through unscaled_action();
    # confirm whether the same is needed here.
    u = agent.select_action(state_t)
    state = get_next_state(state,u[0].detach().numpy(),dt)
    XY = state_to_coords(state)
    en = get_energy(state)

    line.set_data(XY[0],XY[1])
    energy_text.set_text(f'energy = {en}')
    # Return every artist that changed so that blitting redraws the text too.
    return line, energy_text

ani = animation.FuncAnimation(fig, animate, frames=100,
                             interval=10, blit=True, init_func=init)

HTML(ani.to_jshtml())