In [1]:
import sys
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import animation, rc
from IPython.display import Math, HTML
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.normal import Normal
import copy
import pandas as pd
import json,os
!pip install gymnasium[mujoco]
import gymnasium as gym

Collecting mujoco>=2.3.3 (from gymnasium[mujoco])
  Downloading mujoco-3.1.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting pyopengl (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading PyOpenGL-3.1.7-py3-none-any.whl.metadata (3.2 kB)
Downloading mujoco-3.1.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (211 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/21

In [2]:
class ReplayBuffer():
    """Fixed-capacity ring buffer of (state, action, reward, next state, done) transitions.

    Storage is pre-allocated as zero-filled NumPy arrays. Once ``mem_size``
    transitions have been written, new writes wrap around and overwrite the
    oldest slots.
    """

    def __init__(self, max_size, input_shape, n_actions):
        """Allocate storage.

        Args:
            max_size: number of transitions the buffer can hold.
            input_shape: iterable giving the shape of one observation.
            n_actions: length of one action vector.
        """
        self.mem_size = max_size
        self.mem_cntr = 0  # total number of writes so far (keeps growing past mem_size)
        observation_block = (self.mem_size, *input_shape)
        self.state_memory = np.zeros(observation_block)
        self.new_state_memory = np.zeros(observation_block)
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, wrapping around when full."""
        slot = self.mem_cntr % self.mem_size
        writes = (
            (self.state_memory, state),
            (self.action_memory, action),
            (self.reward_memory, reward),
            (self.new_state_memory, state_),
            (self.terminal_memory, done),
        )
        for storage, value in writes:
            storage[slot] = value
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` stored transitions (indices drawn via ``np.random.choice``).

        Only slots that have been written at least once are eligible.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of arrays
            indexed by the same random indices.
        """
        filled = min(self.mem_cntr, self.mem_size)
        picks = np.random.choice(filled, batch_size)
        sources = (
            self.state_memory,
            self.action_memory,
            self.reward_memory,
            self.new_state_memory,
            self.terminal_memory,
        )
        return tuple(storage[picks] for storage in sources)

In [3]:
class CriticNetwork(nn.Module):
    """Two-hidden-layer MLP that maps a (state, action) pair to a single value.

    The state and action are concatenated along dim 1 before the first layer,
    so both inputs are expected to be 2-D batches.
    """

    def __init__(self, input_dims, fc1_dims, fc2_dims, n_actions):
        """Build the layers.

        Args:
            input_dims: observation shape; only ``input_dims[0]`` is used.
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            n_actions: length of the action vector appended to the state.
        """
        super().__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        joint_width = self.input_dims[0] + n_actions
        self.fc1 = nn.Linear(joint_width, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.q = nn.Linear(self.fc2_dims, 1)

    def forward(self, state, action):
        """Return a ``(batch, 1)`` tensor of values for the given state/action batch."""
        joint = T.cat([state, action], dim=1)
        hidden = F.relu(self.fc1(joint))
        hidden = F.relu(self.fc2(hidden))
        return self.q(hidden)

In [4]:
class ActorNetwork(nn.Module):
    """Layer-normalised MLP with two output heads, ``mu`` and ``sigma``.

    ``mu`` is squashed with tanh and scaled by ``max_action``. ``sigma`` is a
    sigmoid output clamped to ``[0.1 * max_action, max_action]``.
    """

    def __init__(self, input_dims, fc1_dims, fc2_dims,n_actions,max_action):
        """Build the layers.

        Args:
            input_dims: observation shape, unpacked into ``nn.Linear``.
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            n_actions: size of each output head.
            max_action: scale applied to ``mu`` and to the ``sigma`` clamp bounds.
        """
        super().__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.max_action = max_action

        # Trunk: Linear -> LayerNorm -> ReLU, twice.
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.ln1 = nn.LayerNorm(self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.ln2 = nn.LayerNorm(self.fc2_dims)

        # Output heads (sigma is created before mu).
        self.sigma = nn.Linear(self.fc2_dims, self.n_actions)
        self.mu = nn.Linear(self.fc2_dims, self.n_actions)

    def forward(self, state):
        """Return the pair ``(mu, sigma)`` for the given batch of states."""
        features = F.relu(self.ln1(self.fc1(state)))
        features = F.relu(self.ln2(self.fc2(features)))

        mu = self.max_action * T.tanh(self.mu(features))
        sigma_floor = 0.1*self.max_action
        sigma_ceiling = 1*self.max_action
        sigma = F.sigmoid(self.sigma(features)).clamp(min=sigma_floor, max=sigma_ceiling)
        return mu, sigma

In [5]:
class Agent():
    """Actor-critic agent with twin critics, target networks and a delayed actor update.

    The actor outputs a mean action and a per-dimension scale (``sigma``) that
    is used as the exploration-noise standard deviation in ``choose_action``.
    Two critics are trained against the minimum of two target critics; the
    actor and all target networks are updated every ``actor_update_interval``
    critic updates.

    Only element 0 of ``action_space_high`` / ``action_space_low`` is used, so
    a single bound is applied to every action dimension.
    """

    def __init__(self, alpha, beta, input_dims, tau, action_space_high,action_space_low,
            gamma=0.99, actor_update_interval=2,n_actions=2, max_size=1000000, layer1_size=400,
            layer2_size=300, batch_size=100):
        """Create networks, optimizers and the replay buffer.

        Args:
            alpha: learning rate of the actor optimizer.
            beta: learning rate of both critic optimizers.
            input_dims: observation shape.
            tau: interpolation factor for the soft target update.
            action_space_high: upper action bounds (element 0 is used).
            action_space_low: lower action bounds (element 0 is used).
            gamma: discount factor.
            actor_update_interval: number of critic updates per actor/target update.
            n_actions: length of the action vector.
            max_size: replay buffer capacity.
            layer1_size: width of the first hidden layer of every network.
            layer2_size: width of the second hidden layer of every network.
            batch_size: minibatch size used by ``learn``.
        """
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        # Every network is moved to self.device so that it lives where learn()
        # and choose_action() place their input tensors.
        self.actor = ActorNetwork(input_dims, layer1_size,layer2_size, n_actions,action_space_high[0]).to(self.device)
        self.actor_optimizer=optim.Adam(params=self.actor.parameters(), lr=alpha)
        self.critic_1 = CriticNetwork(input_dims, layer1_size,layer2_size,n_actions).to(self.device)
        self.critic_1_optimizer=optim.Adam(params=self.critic_1.parameters(), lr=beta)
        self.critic_2 = CriticNetwork(input_dims, layer1_size,layer2_size,n_actions).to(self.device)
        self.critic_2_optimizer=optim.Adam(params=self.critic_2.parameters(), lr=beta)

        self.target_actor = ActorNetwork(input_dims, layer1_size,layer2_size, n_actions,action_space_high[0]).to(self.device)
        self.target_critic_1 = CriticNetwork(input_dims, layer1_size,layer2_size,n_actions).to(self.device)
        self.target_critic_2 = CriticNetwork(input_dims, layer1_size,layer2_size,n_actions).to(self.device)

        self.gamma = gamma
        self.tau = tau
        self.action_space_high = action_space_high
        self.action_space_low = action_space_low
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.update_actor_iter =  actor_update_interval
        self.learn_step_cntr = 0
        self.time_step = 0
        # tau=1 makes the targets exact copies of the online networks.
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        """Return ``(noisy_action, mean_action)`` as NumPy arrays for one observation.

        The noise is ``N(0, 1) * sigma`` clamped to half the upper action
        bound; the noisy action is then clamped to the action bounds. The mean
        action is returned unclamped by this method.
        """
        state = T.tensor(observation, dtype=T.float).to(self.device)
        self.actor.eval()
        with T.no_grad():
            mu, sigma = self.actor(state)
        self.actor.train()
        noise_limit = 0.5*self.action_space_high[0]
        noise = (T.randn_like(mu) * sigma).clamp(-noise_limit, noise_limit)
        action = T.clamp((mu + noise), self.action_space_low[0], self.action_space_high[0])
        # .cpu() is a no-op on CPU and is required before .numpy() on CUDA.
        return action.cpu().numpy(), mu.cpu().numpy()

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        """Run one critic update and, every ``update_actor_iter`` calls, one actor/target update.

        Returns immediately (without counting a step) while the buffer holds
        fewer than ``batch_size`` transitions.
        """
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
                self.memory.sample_buffer(self.batch_size)
        reward = T.tensor(reward, dtype=T.float).to(self.device)
        done = T.tensor(done, dtype=T.float).to(self.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.device)
        state = T.tensor(state, dtype=T.float).to(self.device)
        action = T.tensor(action, dtype=T.float).to(self.device)

        high = self.action_space_high[0]
        low = self.action_space_low[0]

        # Bootstrapped target: smoothed target action, minimum of the two target critics.
        with T.no_grad():
            mu, sigma = self.target_actor(state_)
            noise = (T.randn_like(action) * (0.2*high)).clamp(-0.5*high, 0.5*high)
            target_actions = (mu + noise).clamp(low, high)
            # This instance's own networks are used here (not a module-level
            # `agent`), so learn() works for any Agent object.
            q1_ = T.squeeze(self.target_critic_1(state_, target_actions))
            q2_ = T.squeeze(self.target_critic_2(state_, target_actions))
            critic_value_ = T.min(q1_, q2_)
            target = reward + self.gamma * (1 - done) * (critic_value_)

        q1 = T.squeeze(self.critic_1(state, action))
        q2 = T.squeeze(self.critic_2(state, action))

        self.critic_1_optimizer.zero_grad()
        self.critic_2_optimizer.zero_grad()
        q1_loss = F.mse_loss(q1,target)
        q2_loss = F.mse_loss(q2,target)
        q1_loss.backward()
        q2_loss.backward()
        self.critic_1_optimizer.step()
        self.critic_2_optimizer.step()

        self.learn_step_cntr += 1
        if self.learn_step_cntr % self.update_actor_iter != 0:
            return

        self.actor_optimizer.zero_grad()
        mean, std = self.actor(state)
        action_distribution=T.distributions.Normal(mean.detach(), std)
        actor_min_Q_loss = T.min(self.critic_1(state, mean),self.critic_2(state, mean))
        # log_prob is evaluated at the distribution's own (detached) mean, so its
        # quadratic term is zero and that term's gradient reaches the actor only
        # through std; the trailing "- Q" term carries the gradient through mean.
        # NOTE(review): the sum runs over axis 0 (the minibatch axis) and the
        # result is then broadcast against the critic output - confirm this
        # reduction is the intended one.
        actor_mu_loss = T.mean(T.sum(- action_distribution.log_prob(mean) * actor_min_Q_loss.detach(),axis=0) - actor_min_Q_loss)
        actor_mu_loss.backward()
        self.actor_optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        """Blend online parameters into the targets: ``target = tau*online + (1-tau)*target``.

        Args:
            tau: interpolation factor; defaults to ``self.tau``. ``tau=1``
                copies the online networks into the targets.
        """
        if tau is None:
            tau = self.tau

        network_pairs = (
            (self.critic_1, self.target_critic_1),
            (self.critic_2, self.target_critic_2),
            (self.actor, self.target_actor),
        )
        # In-place update under no_grad: no autograd graph and no temporary state dicts.
        with T.no_grad():
            for online_net, target_net in network_pairs:
                for online_p, target_p in zip(online_net.parameters(), target_net.parameters()):
                    target_p.copy_(tau*online_p + (1-tau)*target_p)

In [6]:
# Label for this run; it is written into variant.json / progress.csv and is part of the output directory path.
algorithm_name="TDS"
# Gymnasium environment id passed to gym.make; the quoted ids in the trailing comment are alternatives left by the author.
enviroment_name='Ant-v4'#'Pendulum-v1'#'MountainCarContinuous-v0'
# Seed applied to the action space, torch, numpy and env.reset in the training code below.
seed=4
# Number of initial environment steps that use env.action_space.sample() instead of the agent's own action.
start_timesteps=10000
def policy_evaluation(agent, enviroment_name,episodes=10):
    """Run ``episodes`` rollouts in a fresh environment and return the mean episode return.

    The second value returned by ``agent.choose_action`` (the mean action) is
    the one sent to the environment; the noisy action is ignored.

    Args:
        agent: object exposing ``choose_action(observation) -> (noisy, mean)``.
        enviroment_name: environment id passed to ``gym.make``.
        episodes: number of evaluation episodes.

    Returns:
        Sum of all rewards collected, divided by ``episodes``.
    """
    evaluation_env = gym.make(enviroment_name)
    average_reward = 0.
    try:
        for _episode in range(episodes):
            state, _info = evaluation_env.reset()
            done=False
            truncuated=False
            while (not done) and (not truncuated):
                _noisy_action, action = agent.choose_action(np.array(state))
                state, reward, done, truncuated, _info = evaluation_env.step(action)
                average_reward += reward
    finally:
        # A new environment is created on every call, so release it here
        # instead of leaving one open per evaluation.
        evaluation_env.close()
    average_reward /= episodes
    return average_reward

# --- Environment, seeding and agent construction ---
env = gym.make(enviroment_name)
env.action_space.seed(seed)
T.manual_seed(seed)
np.random.seed(seed)
agent = Agent(alpha=3e-4, beta=3e-4, 
            input_dims=env.observation_space.shape, tau=0.005,
            action_space_high=env.action_space.high,action_space_low=env.action_space.low,batch_size=100, layer1_size=256, layer2_size=256,
            n_actions=env.action_space.shape[0])

eval_freq=5000          # environment steps between two policy evaluations
max_timesteps=1000000   # training stops after the episode in which this step count is exceeded

# evaluations[0] is measured before any training step; entry i is taken at step i*eval_freq.
evaluations = [policy_evaluation(agent,enviroment_name)]
average_rewards=[]
total_rewards=[]
steps=0

# --- Training loop ---
for ep in range(1,10000000000):
    done=False
    # Seed the environment on the first reset only. Passing the same seed to
    # every reset would re-initialise its RNG and start every episode from the
    # identical initial state.
    state,_=env.reset(seed=seed if ep==1 else None)
    rewards=0
    episode_timesteps=0
    truncuated=False
    while (not done) and (not truncuated):
        episode_timesteps+=1
        if steps < start_timesteps:
            action = env.action_space.sample()
        else:
            action,_=agent.choose_action(state)
        # NOTE(review): learn() also runs during the random-action warm-up
        # phase (as soon as the buffer holds one batch) - confirm this is intended.
        agent.learn()
        state_,reward,done,truncuated,info=env.step(action)
        # Only `done` (termination) is stored, so a time-limit truncation is not
        # treated as a terminal state by the bootstrapped target.
        agent.remember(state,action,reward,state_,done)
        rewards+=reward
        steps+=1
        state=state_
        if(steps%eval_freq)==0:
            evaluation_reward=policy_evaluation(agent, enviroment_name)
            evaluations.append(evaluation_reward)
            print(f"Evaluation over {10} episodes: {evaluation_reward:.3f}  step{steps}")
    total_rewards.append(rewards)
    average_rewards.append(sum(total_rewards)/len(total_rewards))
    if(steps>max_timesteps):
        break
    if (ep%200==0):
        print(f"episode: {ep}   reward: {rewards}  m :{sum(total_rewards[-100:])/len(total_rewards[-100:])} t {average_rewards[-1]}:{steps}    steps so far:{steps}")

env.close()

# --- Persist run metadata and the evaluation curve ---
variant = dict(algorithm=algorithm_name,env=enviroment_name,)
os.makedirs(f'./data/{enviroment_name}/{algorithm_name}/seed{seed}', exist_ok=True)
with open(f'./data/{enviroment_name}/{algorithm_name}/seed{seed}/variant.json', 'w') as outfile:
    json.dump(variant,outfile)
data = np.array(evaluations)
df = pd.DataFrame(data=data,columns=["Average Return"]).reset_index()
df['Timesteps'] = df['index'] * eval_freq
df['env'] = enviroment_name
df['algorithm_name'] = algorithm_name
df.to_csv(f'./data/{enviroment_name}/{algorithm_name}/seed{seed}/progress.csv', index = False)

Evaluation over 10 episodes: -6.488  step5000
Evaluation over 10 episodes: -87.289  step10000
Evaluation over 10 episodes: 175.387  step15000
Evaluation over 10 episodes: 592.834  step20000
Evaluation over 10 episodes: 720.179  step25000
Evaluation over 10 episodes: 805.990  step30000
Evaluation over 10 episodes: 513.073  step35000
Evaluation over 10 episodes: 822.330  step40000
Evaluation over 10 episodes: 675.054  step45000
Evaluation over 10 episodes: 827.230  step50000
Evaluation over 10 episodes: 702.396  step55000
Evaluation over 10 episodes: 642.801  step60000
Evaluation over 10 episodes: 739.908  step65000
Evaluation over 10 episodes: 646.275  step70000
Evaluation over 10 episodes: 892.423  step75000
Evaluation over 10 episodes: 885.381  step80000
Evaluation over 10 episodes: 918.244  step85000
Evaluation over 10 episodes: 724.387  step90000
Evaluation over 10 episodes: 886.012  step95000
Evaluation over 10 episodes: 661.868  step100000
Evaluation over 10 episodes: 925.630  ste