In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
from collections import deque
import random
import copy

from tqdm import tqdm
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Show the selected device and the library versions as the cell output.
DEVICE, gym.__version__, torch.__version__, np.__version__

(device(type='cuda'), '1.1.1', '2.6.0+cu118', '1.24.4')

In [None]:
# Probe the environment once to read the observation/action space sizes and
# bounds that the networks below are built from.
# No render_mode is requested: this cell only inspects the spaces, so opening
# a viewer window is unnecessary (and fails on headless machines).
env = gym.make("Hopper-v4")
try:
    observation, information = env.reset()
    state_dimension = env.observation_space.shape[0]
    action_dimension = env.action_space.shape[0]
    action_high = env.action_space.high
    action_low = env.action_space.low
    env_info = {
        "observation_dimension": state_dimension,
        "action_dimension": action_dimension,
        "action_range": (action_low, action_high),
    }
    # Scalar bound taken from the first action component; the actor scales
    # its tanh output by this single value.
    action_bound = float(env.action_space.high[0])
finally:
    # Release the simulator even if one of the lookups above raises.
    env.close()
print("Environment initialized successfully!")
env_info, action_bound

  logger.deprecation(


Environment initialized successfully!


({'observation_dimension': 11,
  'action_dimension': 3,
  'action_range': (array([-1., -1., -1.], dtype=float32),
   array([1., 1., 1.], dtype=float32))},
 1.0)

In [4]:
# Display the info dict that env.reset() returned in the setup cell.
information

{}

### Actor Net

In [5]:
class Actor(nn.Module):
    """Deterministic policy network mapping a state batch to bounded actions.

    Three fully connected layers (state_dim -> 400 -> 300 -> action_dim) with
    ReLU activations; the output is squashed by tanh and scaled by
    ``max_action``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        # Layer attribute names are kept as l1..l3 so state_dict keys stay stable.
        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, state):
        """Return ``max_action * tanh(l3(relu(l2(relu(l1(state))))))``."""
        hidden = torch.relu(self.l1(state))
        hidden = torch.relu(self.l2(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed

### Critic Net

In [None]:
class Critic(nn.Module):
    """Twin Q-networks evaluated on the concatenated (state, action) input.

    Layers l1-l3 form the first Q head and l4-l6 the second; both heads have
    the same shape (input -> hidden_units_l1 -> hidden_units_l2 -> 1).
    """

    def __init__(self, state_dim, action_dim, hidden_units_l1=400, hidden_units_l2=300):
        super().__init__()
        input_dim = state_dim + action_dim

        # First Q head.
        self.l1 = nn.Linear(input_dim, hidden_units_l1)
        self.l2 = nn.Linear(hidden_units_l1, hidden_units_l2)
        self.l3 = nn.Linear(hidden_units_l2, 1)

        # Second Q head.
        self.l4 = nn.Linear(input_dim, hidden_units_l1)
        self.l5 = nn.Linear(hidden_units_l1, hidden_units_l2)
        self.l6 = nn.Linear(hidden_units_l2, 1)

    def forward(self, state, action):
        """Return the pair ``(q1, q2)`` produced by the two heads."""
        joint = torch.cat((state, action), dim=1)

        first = torch.relu(self.l2(torch.relu(self.l1(joint))))
        second = torch.relu(self.l5(torch.relu(self.l4(joint))))

        return self.l3(first), self.l6(second)

    def Q1(self, state, action):
        """Return only the first head's estimate (used for the actor loss)."""
        joint = torch.cat((state, action), dim=1)
        first = torch.relu(self.l2(torch.relu(self.l1(joint))))
        return self.l3(first)

### Replay Buffer

In [7]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, max_size=1_000_000):
        # A bounded deque silently evicts the oldest transition once full.
        self.buffer = deque(maxlen=max_size)

    def push(self, transition):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions without replacement.

        Returns five float tensors on ``DEVICE`` in the order
        (state, action, reward, next_state, done); reward and done get a
        trailing dimension of size 1.
        """
        drawn = random.sample(self.buffer, batch_size)
        columns = [np.stack(column) for column in zip(*drawn)]
        state, action, reward, next_state, done = columns

        def as_device_tensor(array):
            return torch.FloatTensor(array).to(DEVICE)

        return (
            as_device_tensor(state),
            as_device_tensor(action),
            torch.FloatTensor(reward).unsqueeze(1).to(DEVICE),
            as_device_tensor(next_state),
            torch.FloatTensor(done).unsqueeze(1).to(DEVICE),
        )

    def __len__(self):
        return len(self.buffer)

### TD3 
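- Target networks: slowly moving copies of the actor and critic used to compute the bootstrap target; updating them only gradually keeps training stable.
- Actor: takes in the state and outputs an action.
- Critic: takes in a state and the action taken in that state, and outputs two Q-value estimates; the smaller of the two is used for the target.
- Polyak averaging: the target parameters track the online ones via $\theta' \leftarrow \tau\,\theta + (1-\tau)\,\theta'$.
- Delayed policy updates: the actor(s) and all target networks are updated only every `policy_delay` critic updates.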

In [None]:
"""
For documentation:

"""
class TD3_1:
    """Single-actor TD3 agent (twin critic, delayed policy updates).

    Target actions are taken directly from the target actor; no
    target-policy smoothing noise is added.

    Args:
        state_dim: Size of the observation vector.
        action_dim: Size of the action vector.
        max_action: Scalar bound the actor's tanh output is scaled by.
    """

    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(DEVICE)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(DEVICE)
        # Targets start as exact copies of the online networks.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-3)

        self.critic = Critic(state_dim, action_dim).to(DEVICE)
        self.critic_target = Critic(state_dim, action_dim).to(DEVICE)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.max_action = max_action
        self.total_iterations = 0

    def select_action(self, state):
        """Return the actor's deterministic action for one state.

        Args:
            state: NumPy array holding a single observation.

        Returns:
            Flat NumPy array with the action.
        """
        state = torch.FloatTensor(state.reshape(1, -1)).to(DEVICE)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    @staticmethod
    def _soft_update(online, target, tau):
        """Polyak-average ``online`` parameters into ``target`` in place."""
        with torch.no_grad():
            for param, target_param in zip(online.parameters(), target.parameters()):
                target_param.copy_(tau * param + (1 - tau) * target_param)

    def train(self, replay_buffer, batch_size=256, gamma=0.99, tau=0.005, policy_delay=2):
        """Run one critic update and, every ``policy_delay`` calls, one actor update.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns the
                tensors (state, action, reward, next_state, done).
            batch_size: Number of transitions sampled per update.
            gamma: Discount applied to the bootstrapped target value. If the
                buffer stores n-step returns, the caller should pass the
                discount raised to the n-th power.
            tau: Polyak averaging coefficient for the target networks.
            policy_delay: Actor and target networks are updated once every
                this many calls.
        """
        self.total_iterations += 1

        state, action, reward, next_state, done = replay_buffer.sample(batch_size)

        # The target is a constant w.r.t. the critic loss, so compute it
        # without building an autograd graph through the target networks.
        with torch.no_grad():
            target_action = self.actor_target(next_state)
            target_Q1, target_Q2 = self.critic_target(next_state, target_action)
            # Clipped double-Q: bootstrap from the smaller estimate.
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * gamma * target_Q

        # Current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_iterations % policy_delay == 0:
            # Ascend the first critic head's estimate of the actor's own action.
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Polyak averaging of both target networks.
            self._soft_update(self.critic, self.critic_target, tau)
            self._soft_update(self.actor, self.actor_target, tau)

class TD3:
    """TD3 variant with two actors sharing one twin critic.

    The bootstrap target uses the average of the two target actors' actions.
    Both actors are trained against the critic's first head, but only
    ``actor1`` is used by ``select_action``.

    Args:
        state_dim: Size of the observation vector.
        action_dim: Size of the action vector.
        max_action: Scalar bound the actors' tanh output is scaled by.
    """

    def __init__(self, state_dim, action_dim, max_action):
        self.actor1 = Actor(state_dim, action_dim, max_action).to(DEVICE)
        self.actor1_target = Actor(state_dim, action_dim, max_action).to(DEVICE)
        # Targets start as exact copies of the online networks.
        self.actor1_target.load_state_dict(self.actor1.state_dict())
        self.actor1_optimizer = optim.Adam(self.actor1.parameters(), lr=1e-3)

        self.actor2 = Actor(state_dim, action_dim, max_action).to(DEVICE)
        self.actor2_target = Actor(state_dim, action_dim, max_action).to(DEVICE)
        self.actor2_target.load_state_dict(self.actor2.state_dict())
        self.actor2_optimizer = optim.Adam(self.actor2.parameters(), lr=1e-3)

        self.critic = Critic(state_dim, action_dim).to(DEVICE)
        self.critic_target = Critic(state_dim, action_dim).to(DEVICE)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.max_action = max_action
        self.total_iterations = 0

    def select_action(self, state):
        """Return ``actor1``'s deterministic action for one state.

        Args:
            state: NumPy array holding a single observation.

        Returns:
            Flat NumPy array with the action.
        """
        state = torch.FloatTensor(state.reshape(1, -1)).to(DEVICE)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            return self.actor1(state).cpu().numpy().flatten()

    @staticmethod
    def _soft_update(online, target, tau):
        """Polyak-average ``online`` parameters into ``target`` in place."""
        with torch.no_grad():
            for param, target_param in zip(online.parameters(), target.parameters()):
                target_param.copy_(tau * param + (1 - tau) * target_param)

    def _actor_step(self, actor, optimizer, state):
        """Take one gradient step that raises the critic's Q1 for ``actor``'s actions."""
        actor_loss = -self.critic.Q1(state, actor(state)).mean()
        optimizer.zero_grad()
        actor_loss.backward()
        optimizer.step()

    def train(self, replay_buffer, batch_size=256, gamma=0.99, tau=0.005, policy_delay=2):
        """Run one critic update and, every ``policy_delay`` calls, the actor updates.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns the
                tensors (state, action, reward, next_state, done).
            batch_size: Number of transitions sampled per update.
            gamma: Discount applied to the bootstrapped target value. If the
                buffer stores n-step returns, the caller should pass the
                discount raised to the n-th power.
            tau: Polyak averaging coefficient for the target networks.
            policy_delay: Actors and target networks are updated once every
                this many calls.
        """
        self.total_iterations += 1

        state, action, reward, next_state, done = replay_buffer.sample(batch_size)

        # The target is a constant w.r.t. the critic loss, so compute it
        # without building an autograd graph through the target networks.
        with torch.no_grad():
            target_action1 = self.actor1_target(next_state)
            target_action2 = self.actor2_target(next_state)
            target_action = (target_action1 + target_action2) / 2

            target_Q1, target_Q2 = self.critic_target(next_state, target_action)
            # Clipped double-Q: bootstrap from the smaller estimate.
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * gamma * target_Q

        current_Q1, current_Q2 = self.critic(state, action)

        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_iterations % policy_delay == 0:
            self._actor_step(self.actor1, self.actor1_optimizer, state)
            self._actor_step(self.actor2, self.actor2_optimizer, state)

            # Polyak averaging of every target network.
            self._soft_update(self.critic, self.critic_target, tau)
            self._soft_update(self.actor1, self.actor1_target, tau)
            self._soft_update(self.actor2, self.actor2_target, tau)

### N-Step Replay Buffer

In [9]:
class NStepReplayBuffer_1:
    """Replay buffer that stores n-step transitions built from a sliding window."""

    def __init__(self, max_size=1_000_000, n_step=3, gamma=0.99):
        self.buffer = deque(maxlen=max_size)
        # Sliding window holding the most recent ``n_step`` raw transitions.
        self.n_step_buffer = deque(maxlen=n_step)
        self.n_step = n_step
        self.gamma = gamma

    def _get_n_step_transition(self):
        """Collapse the current window into one (s, a, R, s', done) tuple.

        Walks the window from newest to oldest, accumulating the discounted
        return; whenever a transition is flagged done, the return is cut
        there and that transition's next_state/done are used instead.
        """
        window = self.n_step_buffer
        newest = window[-1]
        next_state, done = newest[3], newest[4]
        R = 0
        for transition in reversed(window):
            r, ns, d = transition[2], transition[3], transition[4]
            R = r + self.gamma * R * (1 - d)
            if d:
                next_state, done = ns, d
        oldest = window[0]
        return oldest[0], oldest[1], R, next_state, done

    def push(self, transition):
        """Add a raw transition; emit an n-step transition once the window is full."""
        self.n_step_buffer.append(transition)
        if len(self.n_step_buffer) != self.n_step:
            return
        self.buffer.append(self._get_n_step_transition())

    def sample(self, batch_size):
        """Uniformly sample a batch and return it as five float tensors on DEVICE."""
        drawn = random.sample(self.buffer, batch_size)
        columns = [np.stack(column) for column in zip(*drawn)]
        state, action, reward, next_state, done = columns
        return (
            torch.FloatTensor(state).to(DEVICE),
            torch.FloatTensor(action).to(DEVICE),
            torch.FloatTensor(reward).unsqueeze(1).to(DEVICE),
            torch.FloatTensor(next_state).to(DEVICE),
            torch.FloatTensor(done).unsqueeze(1).to(DEVICE),
        )

    def __len__(self):
        return len(self.buffer)

class NStepReplayBuffer:
    """Replay buffer that stores n-step transitions built from a sliding window."""

    def __init__(self, n_step=3, gamma=0.99, capacity=1_000_000):
        self.n_step = n_step
        self.gamma = gamma
        self.buffer = deque(maxlen=capacity)
        # Sliding window holding the most recent ``n_step`` raw transitions.
        self.n_step_buffer = deque(maxlen=n_step)

    def _get_n_step_info(self):
        """Collapse the current window into one (s, a, R, s', done) tuple.

        Starts from the newest transition's reward/next_state/done and folds
        in the older ones from newest to oldest; a done flag on an older
        transition cuts the return there and replaces next_state/done.
        """
        window = list(self.n_step_buffer)
        state, action = window[0][:2]
        reward, next_state, done = window[-1][2:]
        for position in range(len(window) - 2, -1, -1):
            r, n_s, d = window[position][2:]
            reward = r + self.gamma * reward * (1 - d)
            if d:
                next_state, done = n_s, d
        return state, action, reward, next_state, done

    def push(self, transition):
        """Add a raw transition; emit an n-step transition once the window is full."""
        self.n_step_buffer.append(transition)
        if len(self.n_step_buffer) != self.n_step:
            return
        self.buffer.append(self._get_n_step_info())

    def sample(self, batch_size):
        """Uniformly sample a batch and return it as five float tensors on DEVICE."""
        drawn = random.sample(self.buffer, batch_size)
        columns = [np.stack(column) for column in zip(*drawn)]
        state, action, reward, next_state, done = columns
        return (
            torch.FloatTensor(state).to(DEVICE),
            torch.FloatTensor(action).to(DEVICE),
            torch.FloatTensor(reward).unsqueeze(1).to(DEVICE),
            torch.FloatTensor(next_state).to(DEVICE),
            torch.FloatTensor(done).unsqueeze(1).to(DEVICE),
        )

    def __len__(self):
        return len(self.buffer)

In [None]:
# Timestep-budgeted TD3 training on Hopper with periodic evaluation.
env_name = "Hopper-v4"
MAX_TIMESTEPS = 1_000_000
START_TIMESTEPS = 25_000   # purely random actions before this many steps
EVAL_FREQ = 5_000          # evaluate every EVAL_FREQ environment steps
EVAL_EPISODES = 10
BATCH_SIZE = 256
DISCOUNT = 0.99
TAU = 0.005
POLICY_DELAY = 2
EXPLORATION_NOISE = 0.1
N_STEP = 3
# The buffer stores N_STEP-step returns, so the value bootstrapped from the
# state reached N_STEP steps later must be discounted by DISCOUNT ** N_STEP.
BOOTSTRAP_DISCOUNT = DISCOUNT ** N_STEP

# Define training loop
agent = TD3(state_dimension, action_dimension, action_bound)
replay_buffer = NStepReplayBuffer(n_step=N_STEP, gamma=DISCOUNT)

# One environment for training and a separate one for evaluation, so that
# evaluation episodes never reset or step the episode being trained on.
env = gym.make(env_name)
eval_env = gym.make(env_name)

timesteps = 0
episode_num = 0
try:
    while timesteps < MAX_TIMESTEPS:
        state, _ = env.reset()
        episode_reward = 0
        done = False
        while not done:
            if timesteps < START_TIMESTEPS:
                action = env.action_space.sample()
            else:
                action = agent.select_action(np.array(state))
                action = (action + np.random.normal(0, EXPLORATION_NOISE, size=action_dimension)).clip(-action_bound, action_bound)

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Only a true termination may stop bootstrapping; a time-limit
            # truncation is not a terminal state of the task.
            replay_buffer.push((state, action, reward, next_state, float(terminated)))
            if truncated and not terminated:
                # No done flag marks this episode boundary, so drop the
                # partial window to keep n-step returns within one episode.
                replay_buffer.n_step_buffer.clear()
            state = next_state
            episode_reward += reward
            timesteps += 1

            if len(replay_buffer) > BATCH_SIZE:
                agent.train(replay_buffer, BATCH_SIZE, BOOTSTRAP_DISCOUNT, TAU, POLICY_DELAY)

            if timesteps % EVAL_FREQ == 0:
                avg_reward = 0.0
                for _ in range(EVAL_EPISODES):
                    eval_state, _ = eval_env.reset()
                    eval_done = False
                    while not eval_done:
                        # Evaluate the deterministic policy (no exploration noise).
                        eval_action = agent.select_action(np.array(eval_state))
                        eval_state, eval_reward, eval_terminated, eval_truncated, _ = eval_env.step(eval_action)
                        eval_done = eval_terminated or eval_truncated
                        avg_reward += eval_reward
                avg_reward /= EVAL_EPISODES
                print(f"Timestep: {timesteps}, Average Reward: {avg_reward:.2f}")

        print(f"Episode {episode_num} ended with reward: {episode_reward:.2f}")
        episode_num += 1
finally:
    # Release both simulators even when the run is interrupted.
    env.close()
    eval_env.close()

  logger.deprecation(


Episode 0 ended with reward: 10.62
Episode 1 ended with reward: 14.28
Episode 2 ended with reward: 14.51
Episode 3 ended with reward: 18.97
Episode 4 ended with reward: 7.31
Episode 5 ended with reward: 15.83
Episode 6 ended with reward: 8.45
Episode 7 ended with reward: 10.13
Episode 8 ended with reward: 13.50
Episode 9 ended with reward: 22.85
Episode 10 ended with reward: 12.58
Episode 11 ended with reward: 19.81
Episode 12 ended with reward: 8.02
Episode 13 ended with reward: 8.65
Episode 14 ended with reward: 9.16
Episode 15 ended with reward: 26.05
Episode 16 ended with reward: 6.72
Episode 17 ended with reward: 39.60
Episode 18 ended with reward: 7.10
Episode 19 ended with reward: 13.55
Episode 20 ended with reward: 11.95
Episode 21 ended with reward: 5.52
Episode 22 ended with reward: 9.88
Episode 23 ended with reward: 35.92
Episode 24 ended with reward: 9.78
Episode 25 ended with reward: 29.32
Episode 26 ended with reward: 9.12
Episode 27 ended with reward: 19.30
Episode 28 en

: 

In [None]:
# Episode-budgeted TD3 training with on-screen rendering.
# NOTE(review): render_mode="human" draws every step, which slows training
# considerably; drop it if the viewer is not needed.
env = gym.make("Hopper-v4", render_mode="human")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

n_step = 3
discount = 0.99

td3 = TD3(state_dim, action_dim, max_action)
replay_buffer = NStepReplayBuffer(n_step=n_step, gamma=discount)

episodes = 1000
max_steps = 1000
batch_size = 256
exploration_noise = 0.1
start_timesteps = 25000   # purely random actions before this many steps

total_timesteps = 0

try:
    for episode in range(episodes):
        state, _ = env.reset()
        episode_reward = 0
        for step in range(max_steps):
            if total_timesteps < start_timesteps:
                action = env.action_space.sample()
            else:
                action = td3.select_action(np.array(state))
                action += np.random.normal(0, exploration_noise, size=action_dim)
                action = np.clip(action, -max_action, max_action)

            # ``done`` is the environment's terminated flag here; truncation
            # is deliberately not stored as a terminal transition.
            next_state, reward, done, truncated, _ = env.step(action)
            replay_buffer.push((state, action, reward, next_state, float(done)))

            state = next_state
            episode_reward += reward
            total_timesteps += 1

            if total_timesteps >= start_timesteps:
                # The buffer holds n_step-step returns, so the bootstrapped
                # value must be discounted by discount ** n_step.
                td3.train(replay_buffer, batch_size, gamma=discount ** n_step)

            if done or truncated:
                break

        if not done:
            # The episode ended without a terminal flag (time limit or
            # max_steps). Drop the partial window so n-step returns never
            # combine rewards from two different episodes.
            replay_buffer.n_step_buffer.clear()

        print(f"Episode {episode} | Total Steps {total_timesteps} | Episode Reward {episode_reward:.2f}")
finally:
    # Close the viewer/simulator even when the run is interrupted.
    env.close()


Episode 0 | Total Steps 23 | Episode Reward 11.50
Episode 1 | Total Steps 37 | Episode Reward 12.12
Episode 2 | Total Steps 52 | Episode Reward 12.48
Episode 3 | Total Steps 76 | Episode Reward 25.84
Episode 4 | Total Steps 102 | Episode Reward 23.27
Episode 5 | Total Steps 114 | Episode Reward 7.80
Episode 6 | Total Steps 125 | Episode Reward 9.41
Episode 7 | Total Steps 136 | Episode Reward 8.97
Episode 8 | Total Steps 146 | Episode Reward 7.87
Episode 9 | Total Steps 155 | Episode Reward 6.49
Episode 10 | Total Steps 171 | Episode Reward 13.86
Episode 11 | Total Steps 215 | Episode Reward 21.24
Episode 12 | Total Steps 258 | Episode Reward 44.50
Episode 13 | Total Steps 286 | Episode Reward 26.36
Episode 14 | Total Steps 296 | Episode Reward 6.47
Episode 15 | Total Steps 319 | Episode Reward 17.39
Episode 16 | Total Steps 335 | Episode Reward 11.95
Episode 17 | Total Steps 348 | Episode Reward 10.22
Episode 18 | Total Steps 382 | Episode Reward 35.04
Episode 19 | Total Steps 398 | E

: 

In [11]:
def check_termination_flag(file_path="TFlag.txt"):
    """Read a flag file and report whether the process should stop.

    Returns:
        True when the file contains '0', contains anything other than '1',
        or cannot be read for a reason other than being absent.
        False when the file contains '1' or does not exist.
    """
    try:
        with open(file_path, 'r') as handle:
            flag = handle.read().strip()

        if flag == '1':
            return False

        if flag == '0':
            print("Received Termination Signal. Terminating...")
        else:
            print(f"Unexpected content in file: {flag}. Terminating Process...")
        return True

    except FileNotFoundError:
        # A missing flag file is treated as "keep running".
        print(f"File not found: {file_path}")
        return False
    except Exception as err:
        print(f"Error reading {err}. Terminating Process...")
        return True

In [None]:
# Timestep-budgeted TD3 training followed by a post-training evaluation.
env_name = "Hopper-v4"
MAX_TIMESTEPS = 1_000_000
START_TIMESTEPS = 25_000   # purely random actions before this many steps
BATCH_SIZE = 256
DISCOUNT = 0.99
TAU = 0.005
POLICY_DELAY = 2
EXPLORATION_NOISE = 0.1
N_STEP = 3
EVAL_EPISODES = 10
# The buffer stores N_STEP-step returns, so the value bootstrapped from the
# state reached N_STEP steps later must be discounted by DISCOUNT ** N_STEP.
BOOTSTRAP_DISCOUNT = DISCOUNT ** N_STEP

# Environment setup
# NOTE(review): render_mode="human" draws every step, which slows a
# 1M-step run considerably; drop it if the viewer is not needed.
env = gym.make(env_name, render_mode="human")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = float(env.action_space.high[0])

# Define training loop
agent = TD3(state_dim, action_dim, action_bound)
replay_buffer = NStepReplayBuffer(n_step=N_STEP, gamma=DISCOUNT)

timesteps = 0
episode_num = 0

try:
    state, _ = env.reset()

    while timesteps < MAX_TIMESTEPS:
        episode_reward = 0
        done = False
        while not done:
            if timesteps < START_TIMESTEPS:
                action = env.action_space.sample()
            else:
                action = agent.select_action(np.array(state))
                action = (action + np.random.normal(0, EXPLORATION_NOISE, size=action_dim)).clip(-action_bound, action_bound)

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Only a true termination may stop bootstrapping; a time-limit
            # truncation is not a terminal state of the task.
            replay_buffer.push((state, action, reward, next_state, float(terminated)))
            if truncated and not terminated:
                # No done flag marks this episode boundary, so drop the
                # partial window to keep n-step returns within one episode.
                replay_buffer.n_step_buffer.clear()
            state = next_state
            episode_reward += reward
            timesteps += 1

            if len(replay_buffer) > BATCH_SIZE:
                agent.train(replay_buffer, BATCH_SIZE, BOOTSTRAP_DISCOUNT, TAU, POLICY_DELAY)

        print(f"Episode {episode_num} ended with reward: {episode_reward:.2f}")

        episode_num += 1
        state, _ = env.reset()

    # Post-training evaluation with the deterministic policy (no noise).
    total_eval_reward = 0.0
    for ep in range(EVAL_EPISODES):
        state, _ = env.reset()
        done = False
        episode_reward = 0
        while not done:
            action = agent.select_action(np.array(state))
            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward
        total_eval_reward += episode_reward
        print(f"Evaluation Episode {ep} | Reward: {episode_reward:.2f}")

    avg_reward = total_eval_reward / EVAL_EPISODES
    print(f"\nAverage Evaluation Reward over {EVAL_EPISODES} episodes: {avg_reward:.2f}")
finally:
    # Close the viewer/simulator even when the run is interrupted.
    env.close()

Episode 0 ended with reward: 15.39
Episode 1 ended with reward: 18.10
Episode 2 ended with reward: 6.79
Episode 3 ended with reward: 36.16
Episode 4 ended with reward: 27.33
Episode 5 ended with reward: 16.41
Episode 6 ended with reward: 55.94
Episode 7 ended with reward: 19.36
Episode 8 ended with reward: 25.30
Episode 9 ended with reward: 19.63
Episode 10 ended with reward: 10.82
Episode 11 ended with reward: 11.62
Episode 12 ended with reward: 5.11
Episode 13 ended with reward: 11.76
Episode 14 ended with reward: 20.92
Episode 15 ended with reward: 24.73
Episode 16 ended with reward: 24.02
Episode 17 ended with reward: 19.31
Episode 18 ended with reward: 11.81
Episode 19 ended with reward: 13.22
Episode 20 ended with reward: 10.27
Episode 21 ended with reward: 10.08
Episode 22 ended with reward: 36.95
Episode 23 ended with reward: 43.49
Episode 24 ended with reward: 18.71
Episode 25 ended with reward: 12.45
Episode 26 ended with reward: 9.04
Episode 27 ended with reward: 32.49
Episo

c:\Users\91748\.conda\envs\rl_env\Lib\site-packages\glfw\__init__.py:917: GLFWError: (65537) b'The GLFW library is not initialized'


KeyboardInterrupt: 

## Training Loop V2

### Loggers

In [13]:
def log(episode, episode_reward, timesteps):
    """Append one 'Episode/Reward/Timesteps' record to training_log.txt."""
    record = f"Episode: {episode}, Reward: {episode_reward}, Timesteps: {timesteps}\n"
    with open("training_log.txt", "a") as log_file:
        log_file.write(record)

In [None]:
# Episode-budgeted TD3 training that logs the running episode return at
# every environment step and saves the collected values at the end.
env_name = "Hopper-v4"
START_TIMESTEPS = 25_000   # purely random actions before this many steps
BATCH_SIZE = 256
DISCOUNT = 0.99
TAU = 0.005
POLICY_DELAY = 2
EXPLORATION_NOISE = 0.1
N_STEP = 3
# The buffer stores N_STEP-step returns, so the value bootstrapped from the
# state reached N_STEP steps later must be discounted by DISCOUNT ** N_STEP.
BOOTSTRAP_DISCOUNT = DISCOUNT ** N_STEP

# Environment setup
# NOTE(review): render_mode="human" draws every step, which slows training
# considerably; drop it if the viewer is not needed.
env = gym.make(env_name, render_mode="human")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = float(env.action_space.high[0])

agent = TD3(state_dim, action_dim, action_bound)
replay_buffer = NStepReplayBuffer(n_step=N_STEP, gamma=DISCOUNT)

timesteps = 0

MAX_EPISODES = 2000

# Running episode return recorded after every environment step.
rewards = []

try:
    for episode in range(MAX_EPISODES):
        state, _ = env.reset()
        episode_reward = 0
        done = False
        while not done:
            if timesteps < START_TIMESTEPS:
                action = env.action_space.sample()
            else:
                action = agent.select_action(np.array(state))
                action = (action + np.random.normal(0, EXPLORATION_NOISE, size=action_dim)).clip(-action_bound, action_bound)

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Only a true termination may stop bootstrapping; a time-limit
            # truncation is not a terminal state of the task.
            replay_buffer.push((state, action, reward, next_state, float(terminated)))
            if truncated and not terminated:
                # No done flag marks this episode boundary, so drop the
                # partial window to keep n-step returns within one episode.
                replay_buffer.n_step_buffer.clear()
            state = next_state

            episode_reward += reward
            rewards.append(episode_reward)
            # One log line per step; the Reward field is the running total.
            log(episode, episode_reward, timesteps)

            timesteps += 1

            if len(replay_buffer) > BATCH_SIZE:
                agent.train(replay_buffer, BATCH_SIZE, BOOTSTRAP_DISCOUNT, TAU, POLICY_DELAY)

        print(f"Episode {episode + 1} ended with reward: {episode_reward:.2f}")
finally:
    # Close the viewer/simulator even when the run is interrupted.
    env.close()

# Logging reward list
# A list cannot be passed to a text-mode write(); store it as a binary NumPy
# array instead (read it back with np.load("rewards_list.bin")).
with open("rewards_list.bin", "wb") as f:
    np.save(f, np.asarray(rewards, dtype=np.float64))

Episode 1 ended with reward: 29.42
Episode 2 ended with reward: 8.07
Episode 3 ended with reward: 12.76
Episode 4 ended with reward: 5.92
Episode 5 ended with reward: 10.94
Episode 6 ended with reward: 13.48
Episode 7 ended with reward: 7.91
Episode 8 ended with reward: 10.90
Episode 9 ended with reward: 11.02
Episode 10 ended with reward: 26.61
Episode 11 ended with reward: 48.91
Episode 12 ended with reward: 14.98
Episode 13 ended with reward: 15.23
Episode 14 ended with reward: 5.88
Episode 15 ended with reward: 9.25
Episode 16 ended with reward: 16.77
Episode 17 ended with reward: 11.35
Episode 18 ended with reward: 22.53
Episode 19 ended with reward: 12.95
Episode 20 ended with reward: 22.09
Episode 21 ended with reward: 6.12
Episode 22 ended with reward: 12.78
Episode 23 ended with reward: 53.73
Episode 24 ended with reward: 47.58
Episode 25 ended with reward: 19.15
Episode 26 ended with reward: 13.82
Episode 27 ended with reward: 10.81
Episode 28 ended with reward: 55.04
Episode

Exception ignored in: <function WindowViewer.__del__ at 0x0000013B23B56980>
Traceback (most recent call last):
  File "c:\Users\91748\.conda\envs\rl_env\Lib\site-packages\gymnasium\envs\mujoco\mujoco_rendering.py", line 380, in __del__
    self.free()
  File "c:\Users\91748\.conda\envs\rl_env\Lib\site-packages\gymnasium\envs\mujoco\mujoco_rendering.py", line 369, in free
    glfw.destroy_window(self.window)
  File "c:\Users\91748\.conda\envs\rl_env\Lib\site-packages\glfw\__init__.py", line 1281, in destroy_window
    _glfw.glfwDestroyWindow(window)
OSError: exception: access violation reading 0x0000000000000000


Episode 1698 ended with reward: 910.77
Episode 1699 ended with reward: 834.18
Episode 1700 ended with reward: 894.53
Episode 1701 ended with reward: 1229.91
Episode 1702 ended with reward: 1939.47
Episode 1703 ended with reward: 948.41
Episode 1704 ended with reward: 805.97
Episode 1705 ended with reward: 634.15
Episode 1706 ended with reward: 1350.45
Episode 1707 ended with reward: 812.10
Episode 1708 ended with reward: 471.05
Episode 1709 ended with reward: 502.12
Episode 1710 ended with reward: 581.71
Episode 1711 ended with reward: 686.38
Episode 1712 ended with reward: 601.15
Episode 1713 ended with reward: 1575.58
Episode 1714 ended with reward: 1341.78
Episode 1715 ended with reward: 1811.07
Episode 1716 ended with reward: 1887.16
Episode 1717 ended with reward: 1542.76
Episode 1718 ended with reward: 1992.44
Episode 1719 ended with reward: 1163.48
Episode 1720 ended with reward: 961.31
Episode 1721 ended with reward: 696.15
Episode 1722 ended with reward: 2103.18
Episode 1723 e

In [None]:
def evaluate_policy(env, policy, episodes=10):
    """Run ``policy`` for several episodes and report the mean episode return.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Object exposing ``select_action(state)``.
        episodes: Number of evaluation episodes.

    Returns:
        Sum of all rewards divided by ``episodes``.
    """
    total_reward = 0.0
    for _ in range(episodes):
        state, _ = env.reset()
        done = False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, terminated, truncated, _ = env.step(action)
            # An episode is over on termination OR truncation; ignoring the
            # time-limit flag would keep stepping a policy that never fails.
            done = terminated or truncated
            total_reward += reward
    avg_reward = total_reward / episodes
    print(f"Average Evaluation Reward over {episodes} episodes: {avg_reward:.2f}")
    return avg_reward

# NOTE(review): this evaluates `td3`, the agent built in the episode-based
# training cell, not `agent` from "Training Loop V2" - confirm which one is
# meant before trusting the number.
eval_env = gym.make("Hopper-v4", render_mode=None)
try:
    eval_avg_reward = evaluate_policy(eval_env, td3)
finally:
    # Release the simulator once evaluation is finished.
    eval_env.close()
eval_avg_reward

In [None]:
def process_episode_data(file_path, cumulative=True):
    """Summarise a training log into per-episode reward and length.

    Each non-blank line must look like
    ``Episode: <int>, Reward: <float>, Timesteps: <int>``.

    Args:
        file_path: Path of the log file to read.
        cumulative: When True (default), the Reward field is treated as the
            running episode return - which is what this notebook's training
            loop passes to ``log`` - so the episode total is the last value
            logged. When False, the field is treated as a per-step reward
            and all values, including the first line's, are summed.

    Returns:
        Dict mapping episode number to
        ``{'total_reward': float, 'timesteps_passed': int}``, where
        ``timesteps_passed`` is last minus first logged timestep, plus one.
    """
    episode_data = {}

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline).
                continue

            # Parse each line
            parts = line.split(', ')
            episode = int(parts[0].split(': ')[1])
            reward = float(parts[1].split(': ')[1])
            timesteps = int(parts[2].split(': ')[1])

            entry = episode_data.get(episode)
            if entry is None:
                # The first line of an episode contributes its reward too.
                episode_data[episode] = {
                    'total_reward': reward,
                    'start_timestep': timesteps,
                    'end_timestep': timesteps
                }
            else:
                if cumulative:
                    entry['total_reward'] = reward
                else:
                    entry['total_reward'] += reward
                entry['end_timestep'] = timesteps

    # Calculate timesteps passed for each episode
    return {
        episode: {
            'total_reward': data['total_reward'],
            'timesteps_passed': data['end_timestep'] - data['start_timestep'] + 1
        }
        for episode, data in episode_data.items()
    }

# Example usage: summarise the log file that log() appends to during training.
episode_stats = process_episode_data('training_log.txt')
# Prints the whole per-episode dict (one entry per logged episode).
print(episode_stats)