In [None]:
!pip install -e .

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from env.beam_env.BeamEnvironment import BeamEnvironment
import torch
from Agent import DDPG

# Print the registered environment ids so it is easy to see whether
# 'BeamEnvironment-v0' is available before gym.make is called.
print(gym.envs.registry)

env_name = 'BeamEnvironment-v0'
env = gym.make(env_name)

# Network sizes and the action bound are read from the environment's spaces.
# NOTE(review): only high[0] is used, so this assumes every action component
# shares the same upper bound — confirm against BeamEnvironment.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

agent = DDPG(state_dim, action_dim, max_action)
episodes = 5000  # the training cell assigns its own episode count before looping
max_steps = 500
batch_size = 64

# Exploration schedule: noise_variance is the noise level handed to
# agent.select_action; the training loop multiplies it by exploration_decay
# after every episode.
exploration_decay = 0.999
noise_variance = 3.5
episode_rewards = []

# Resume from the checkpoint files written by the training loop when all four
# are present. On a fresh checkout none of them exist yet, and loading them
# unconditionally would abort the notebook before training could create them.
checkpoint_files = {
    'actor': 'actor.pth',
    'actor_target': 'actor_target.pth',
    'critic': 'critic.pth',
    'critic_target': 'critic_target.pth',
}
try:
    # Read every file before touching the agent so that a missing file cannot
    # leave the networks half-restored. map_location='cpu' keeps the files
    # loadable on a machine that lacks the device they were saved from;
    # load_state_dict then copies the values into the networks' own parameters.
    checkpoint = {
        network: torch.load(path, map_location='cpu')
        for network, path in checkpoint_files.items()
    }
except FileNotFoundError as missing:
    print(f'No complete checkpoint found ({missing}); starting from freshly initialised networks.')
else:
    for network, weights in checkpoint.items():
        getattr(agent, network).load_state_dict(weights)

# TODO (next idea): apply a large negative reward when displacements and
# velocities become large (i.e. when the episode "dies").

In [None]:
# Disabled scratch cell: resets the environment and then applies a fixed
# sequence of actions, one env.step call per entry, keeping only the returned
# state from each call.
# state = env.reset()
# action_sequence = [-11.0, -20.0, -26.0, -39.0, -49.0, -64.0, -84.0, -108.0, -136.0, -159.0, -180.0]
# for action in action_sequence:
#     state, _, _, _ = env.step([action])

In [None]:
# Disabled scratch cell: plots every second entry of state[0:200] against 100
# evenly spaced points on [0, 1], then prints the maximum of state[0:200] and
# the position(s) where that maximum occurs.
# import matplotlib.pyplot as plt
# plt.plot(np.linspace(0,1,100),state[0:200:2])
# plt.ylim(-5,5)
# plt.show()

# print(np.max(state[0:200]))
# print(np.where(state[0:200]==np.max(state[0:200])))

In [None]:
def _save_checkpoint(agent):
    """Write the weights of the agent's four networks to .pth files in the working directory."""
    torch.save(agent.actor_target.state_dict(), 'actor_target.pth')
    torch.save(agent.actor.state_dict(), 'actor.pth')
    torch.save(agent.critic_target.state_dict(), 'critic_target.pth')
    torch.save(agent.critic.state_dict(), 'critic.pth')


# Training: run `episodes` episodes of at most `max_steps` steps each, storing
# every transition in the agent's replay buffer and training once per step as
# soon as the buffer holds more than `batch_size` transitions.
episodes = 300
for episode in range(episodes):
    # NOTE(review): gymnasium documents reset() -> (obs, info) and step() ->
    # (obs, reward, terminated, truncated, info). This loop unpacks the older
    # forms (bare observation, 4-tuple); it assumes BeamEnvironment returns
    # those — TODO confirm.
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.select_action(np.array(state), noise=noise_variance)
        print(action)
        next_state, reward, done, _ = env.step(action)
        # Progress line: largest magnitude among every second entry of
        # next_state[0:200] and of next_state[200:]. The value after "reward:"
        # is the running episode total *before* this step's reward is added.
        print('state:',np.max(np.abs(next_state[0:200:2])),np.max(np.abs(next_state[200::2])),f'episode:{episode}, step:{step}, reward:{episode_reward}')
        # The last tuple element is 1.0 while the episode continues and 0.0 on
        # the step where `done` is reported.
        agent.replay_buffer.add((state, action, next_state, reward, float(not done)))
        state = next_state
        episode_reward += reward

        if len(agent.replay_buffer.storage) > batch_size:
            agent.train(batch_size)

        if done:
            print('step',step)
            break

    # Shrink the exploration noise for the next episode. This mutates the
    # notebook-level value, so re-running the cell continues from the decayed level.
    noise_variance = exploration_decay * noise_variance
    # agent.reset_noise(sigma=noise_variance)
    episode_rewards.append(episode_reward)
    print(f"Episode: {episode + 1}, Total Reward: {episode_reward}")
    if episode_reward>0:
        print('yay')

    # Checkpoint every 10th episode, and also after the final one: without the
    # second condition the last save happens at the last multiple of 10 and the
    # training done in the remaining episodes is never written to disk.
    if episode % 10 == 0 or episode == episodes - 1:
        _save_checkpoint(agent)


In [None]:
# Training curve: total reward collected in each training episode.
plt.plot(episode_rewards)
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.title('Training reward per episode')
plt.show()

In [None]:
# inference (testing)
# Roll the current policy out for a fixed number of steps, recording the last
# node's displacement and velocity entries of each observation together with
# the per-step reward.
displacement_entries = slice(0, 200, 2)
# NOTE(review): the training loop's progress print reads this part of the state
# as next_state[200::2]; the slice here stops before the final entry. Confirm
# both select the same last value for this environment's state length.
velocity_entries = slice(200, -1, 2)

state = env.reset()
traced_node_displacements = [state[displacement_entries][-1]]
traced_node_velocity = [state[velocity_entries][-1]]
step_rewards = []
max_steps = 500
for step in range(max_steps):
    action = agent.select_action(np.array(state), noise=0.01)
    next_state, reward, done, _ = env.step(action)
    traced_node_displacements.append(next_state[displacement_entries][-1])  # displacement of the last node
    traced_node_velocity.append(next_state[velocity_entries][-1])  # velocity of the last node
    step_rewards.append(reward)
    state = next_state
    # The early exit on `done` is disabled, so the rollout always runs for
    # max_steps steps:
    # if done:
    #     break

In [None]:
# Displacement trace recorded during the evaluation rollout.
plt.plot(traced_node_displacements)
plt.xlabel('Step')
plt.ylabel('Displacement (m)')
plt.show()

# Per-step reward from the same rollout.
plt.plot(step_rewards)
plt.xlabel('Step')
plt.ylabel('Reward')
plt.show()