In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated (since IPython 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import gym
from gym import wrappers
import os
import pybullet_envs
import matplotlib.pyplot as plt
from collections import deque
import random
import pickle
# Torch device shared by every network and tensor in this notebook:
# the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
class Replay:
    """Fixed-capacity experience replay memory.

    Transitions are stored as ``(next_state, reward, done, state, action)``
    tuples in a deque; once ``max_memory`` transitions are held, appending a
    new one discards the oldest.
    """

    def __init__(self, max_memory=1_000_000):
        self.memory = deque(maxlen=max_memory)

    def add_memory(self, trans):
        """Append one ``(next_state, reward, done, state, action)`` transition."""
        self.memory.append(trans)

    def sample(self, batch_size):
        """Draw a random batch of stored transitions, without replacement.

        The batch is drawn with ``random.sample``, so reproducibility depends
        on seeding Python's ``random`` module (NumPy's generator is not used).

        Args:
            batch_size: Number of transitions requested. When fewer are stored,
                every stored transition is returned instead.

        Returns:
            Tuple ``(next_states, rewards, dones, states, actions)`` of NumPy
            arrays stacked along the first axis; ``rewards`` and ``dones`` are
            reshaped to column vectors of shape ``(batch, 1)``.

        Raises:
            ValueError: If the memory is empty or ``batch_size`` is negative.
        """
        if not self.memory:
            raise ValueError("cannot sample from an empty replay memory")

        # random.sample raises if asked for more items than are stored.
        batch = random.sample(self.memory, min(batch_size, len(self.memory)))

        # Split the batch into one list per transition field.
        batch_next_states = [trans[0] for trans in batch]
        batch_rewards = [trans[1] for trans in batch]
        batch_dones = [trans[2] for trans in batch]
        batch_states = [trans[3] for trans in batch]
        batch_actions = [trans[4] for trans in batch]

        # np.array stacks each field directly; no per-item np.array(..., copy=False),
        # which raises under NumPy 2 whenever a copy is unavoidable.
        return (
            np.array(batch_next_states),
            np.array(batch_rewards).reshape(-1, 1),
            np.array(batch_dones).reshape(-1, 1),
            np.array(batch_states),
            np.array(batch_actions),
        )

In [None]:
class Actor(nn.Module):
    """Policy network mapping a state to an action scaled by ``max_action``."""

    def __init__(self, state_size, action_size, max_action):
        super().__init__()
        # Two hidden layers (400 and 300 units) followed by the action head.
        self.layer1 = nn.Linear(state_size, 400)
        self.layer2 = nn.Linear(400, 300)
        self.layer3 = nn.Linear(300, action_size)
        self.max_action = max_action

    def forward(self, x):
        """Return ``max_action * tanh(head)`` for the input batch ``x``."""
        hidden = torch.relu(self.layer1(x))
        hidden = torch.relu(self.layer2(hidden))
        squashed = torch.tanh(self.layer3(hidden))
        return self.max_action * squashed

In [None]:
class Critic(nn.Module):
    """Pair of independent Q-networks evaluated on the same (state, action) input."""

    def __init__(self, state_size, action_size):
        super().__init__()
        in_features = state_size + action_size

        # First Q-network.
        self.layer11 = nn.Linear(in_features, 400)
        self.layer12 = nn.Linear(400, 300)
        self.layer13 = nn.Linear(300, 1)

        # Second Q-network.
        self.layer21 = nn.Linear(in_features, 400)
        self.layer22 = nn.Linear(400, 300)
        self.layer23 = nn.Linear(300, 1)

    def forward(self, x1, x2):
        """Return both Q estimates for ``x1`` and ``x2`` concatenated along dim 1."""
        joint = torch.cat((x1, x2), dim=1)

        q_first = torch.relu(self.layer11(joint))
        q_first = torch.relu(self.layer12(q_first))
        q_first = self.layer13(q_first)

        q_second = torch.relu(self.layer21(joint))
        q_second = torch.relu(self.layer22(q_second))
        q_second = self.layer23(q_second)

        return q_first, q_second

    def Q1(self, x1, x2):
        """Return only the first network's Q estimate (same input as ``forward``)."""
        joint = torch.cat((x1, x2), dim=1)
        hidden = torch.relu(self.layer11(joint))
        hidden = torch.relu(self.layer12(hidden))
        return self.layer13(hidden)

In [None]:
class TD3:
    """TD3 agent: an actor and a twin critic, each with a target copy.

    The critic is trained on every iteration; the actor and both target
    networks are updated every ``policy_freq`` iterations.
    """

    def __init__(self, state_size, action_size, max_action):
        self.actor = Actor(state_size, action_size, max_action).to(device)
        self.actor_target = Actor(state_size, action_size, max_action).to(device)
        # Targets start as exact copies of the online networks.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        self.critic = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.max_action = max_action
        self.discount = 0.99      # reward discount factor
        self.batch_size = 100     # transitions sampled per training iteration
        self.tau = 0.005          # soft-update rate for the target networks
        self.policy_noise = 0.2   # std of the noise added to target actions
        self.noise_clip = 0.5     # clamp range for that noise
        self.policy_freq = 2      # actor/target update period, in iterations

    def action(self, state):
        """Return the actor's action for one state as a flat NumPy array.

        Args:
            state: NumPy array holding a single state; it is reshaped to a
                batch of one before being passed to the actor.
        """
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def learn(self, iterations, replay_buffer=None):
        """Run ``iterations`` training steps on batches from the replay memory.

        Args:
            iterations: Number of critic updates to perform.
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(next_states, rewards, dones, states, actions)`` arrays.
                Defaults to the notebook-level ``replay`` memory.
        """
        if replay_buffer is None:
            # Backward-compatible default: the module-level replay memory.
            replay_buffer = replay

        for it in range(iterations):
            batch_next_state, batch_reward, batch_done, batch_state, batch_action = replay_buffer.sample(self.batch_size)

            next_state = torch.Tensor(batch_next_state).to(device)
            reward = torch.Tensor(batch_reward).to(device)
            done = torch.Tensor(batch_done).to(device)
            state = torch.Tensor(batch_state).to(device)
            action = torch.Tensor(batch_action).to(device)

            # The bootstrap target is a constant for the critic loss, so it is
            # computed without tracking gradients through the target networks.
            with torch.no_grad():
                # Clipped Gaussian noise on the target action.
                noise = (torch.randn_like(action) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
                next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

                # Use the smaller of the two target Q estimates.
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)
                target_Q = torch.min(target_Q1, target_Q2)
                # (1 - done) removes the bootstrap term for terminal transitions.
                target_Q = reward + (1 - done) * self.discount * target_Q

            current_Q1, current_Q2 = self.critic(state, action)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Delayed update of the actor and of both target networks.
            if it % self.policy_freq == 0:
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                self._soft_update(self.actor, self.actor_target)
                self._soft_update(self.critic, self.critic_target)

    def _soft_update(self, source, target):
        """Blend ``source`` parameters into ``target``: tau * source + (1 - tau) * target."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def save(self, filename, directory):
        """Save the actor and critic weights under ``directory`` (targets and optimizers are not saved)."""
        torch.save(self.actor.state_dict(), f'{directory}/{filename}_actor.pth')
        torch.save(self.critic.state_dict(), f'{directory}/{filename}_critic.pth')

    def load(self, filename, directory):
        """Load actor and critic weights saved by ``save`` and re-sync the targets.

        ``map_location=device`` lets a checkpoint written on a GPU be loaded on
        a CPU-only machine (and vice versa). Optimizer state is not restored.
        """
        self.actor.load_state_dict(torch.load(f'{directory}/{filename}_actor.pth', map_location=device))
        self.critic.load_state_dict(torch.load(f'{directory}/{filename}_critic.pth', map_location=device))
        # Without this the targets would keep their pre-load weights.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

In [None]:
def evaluate_policy(policy, eval_episodes=10, eval_env=None):
    """Run the policy without exploration noise and report the mean episode reward.

    Args:
        policy: Object exposing ``action(state)``, called with the current
            state converted to a NumPy array.
        eval_episodes: Number of episodes to average over.
        eval_env: Environment to evaluate on. It must provide ``reset()``,
            ``step(action)`` returning ``(state, reward, done, info)``, and
            ``spec.max_episode_steps``. Defaults to the notebook-level ``env``.

    Returns:
        The total reward collected divided by ``eval_episodes``.

    Raises:
        ZeroDivisionError: If ``eval_episodes`` is 0.
    """
    # Fall back to the module-level environment so existing calls keep working.
    environment = env if eval_env is None else eval_env

    total_reward = 0
    for _ in range(eval_episodes):
        state = environment.reset()
        for _ in range(environment.spec.max_episode_steps):
            action = policy.action(np.array(state))
            state, reward, done, info = environment.step(action)
            total_reward += reward
            if done:
                break

    avg_reward = total_reward / eval_episodes
    # Leave the environment reset after evaluation; it may be shared with training.
    environment.reset()
    print ("---------------------------------------")
    print (f"Policy Evaluation | Average Reward= {int(avg_reward)}")
    print ("---------------------------------------")
    return avg_reward

In [None]:
# --- Environment and output locations ---
env_name='BipedalWalker-v3'
env = gym.make(env_name)

folder=f'TD3_{env_name}'

save_models = True
save_video = False

# exist_ok=True makes re-running this cell safe when the folders already exist.
os.makedirs(f"./{folder}/results", exist_ok=True)
if save_models:
    os.makedirs(f"./{folder}/pytorch_models", exist_ok=True)
if save_video:
    os.makedirs(f"./{folder}/video", exist_ok=True)

    env = wrappers.Monitor(env, f"./{folder}/video", force = False)
    env.reset()

# --- Seeding ---
# Every random source used by the run is seeded: the environment, its action
# space (env.action_space.sample() during warm-up), torch, NumPy, and Python's
# `random` module (Replay.sample draws its batches with random.sample).
seed=0
env.seed(seed)
env.action_space.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# --- Problem dimensions read from the environment ---
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# --- Training schedule ---
max_episode_steps = env.spec.max_episode_steps
max_episode=1100       # number of training episodes
expl_noise = 0.1       # std of the Gaussian noise added to policy actions
change_action = 100    # episodes of random actions before the policy is used

# --- Agent, replay memory and bookkeeping ---
policy = TD3(state_size, action_size, max_action)
replay = Replay()
evaluations = []
reward_lst=[]
rewards=0

if __name__ == '__main__':
    for episode in range(max_episode):
        state = env.reset()
        episode_reward=0

        for step in range(max_episode_steps):
            if episode < change_action:
                # Warm-up phase: random actions fill the replay memory.
                action = env.action_space.sample()
            else:
                # Policy action plus Gaussian exploration noise, clipped to the valid range.
                action = policy.action(np.array(state))
                action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0]))
                action = action.clip(env.action_space.low, env.action_space.high)

            new_state, reward, done, info = env.step(action)

            # `step` is a zero-based index; the number of steps taken is one more.
            timesteps = step + 1
            time_limit_reached = timesteps == max_episode_steps

            # An episode cut off by the step limit is not a real terminal state, so
            # it is stored with done=0 to keep the critic bootstrapping from
            # new_state. If the wrapper reports 'TimeLimit.truncated', that flag
            # separates a pure cut-off from a genuine termination on the last step.
            truncated = time_limit_reached and info.get('TimeLimit.truncated', True)
            done_flag = 0.0 if truncated else float(done)

            replay.add_memory((new_state, reward, done_flag, state, action))

            episode_reward += reward
            state = new_state

            if done or time_limit_reached:
                if episode != 0:
                    print(f"Episode: {episode:4} | Timesteps: {timesteps:4} | Reward: {int(episode_reward):4}")
                    # Train only once the memory holds at least one full batch.
                    if len(replay.memory) >= policy.batch_size:
                        policy.learn(timesteps)

                if episode%50==0:
                    evaluations.append(evaluate_policy(policy))
                    # The model directory is only created when save_models is set.
                    if save_models:
                        policy.save(env_name, directory=f"./{folder}/pytorch_models")
                    np.save(f"./{folder}/results/{env_name}", evaluations)
                break

        reward_lst.append(int(episode_reward))

    env.close()

    # Persist the per-episode rewards for later plotting.
    with open(f"./{folder}/reward_lst.txt", "wb") as fp:
        pickle.dump(reward_lst, fp)

In [None]:
# # save video
# # policy.load(env_name, directory=f"./{folder}/pytorch_models")
# env = wrappers.Monitor(env, f"./{folder}/video", force = True)
# env.reset()
# evaluate_policy(policy)
# env.close()

In [None]:
# # load reward list
# with open(f"./{folder}/reward_lst.txt", "rb") as fp:
#     reward_lst = pickle.load(fp)

In [None]:
# Rolling mean of the episode rewards over the last `roll_window` episodes.
# The same constant drives the deque and the legend so the label cannot drift
# from the window actually plotted.
roll_window = 100
reward_roll_lst=[]
reward_roll=deque(maxlen=roll_window)
for i in reward_lst:
    reward_roll.append(i)
    reward_roll_lst.append(np.mean(reward_roll))

episodes = list(range(len(reward_lst)))

fig= plt.figure(figsize=(12,8))
plt.plot(episodes, reward_lst, color='blue', alpha=0.5)
plt.plot(episodes, reward_roll_lst, color='red', alpha=0.8)
# Constant reference line at a reward of 300.
plt.plot(episodes, [300]*len(reward_roll_lst), color='green', alpha=0.6)

plt.title(f'{env_name} reward using TD3', fontsize=16)
plt.xlabel('Episode', fontsize=16)
plt.ylabel('Reward', fontsize=16)
# One legend entry per plotted line, in plotting order.
plt.legend(('Reward', f'Rolling Reward ({roll_window})', 'Reward = 300'), fontsize=14)
