In [11]:
import os
from icecream import ic

#! Set the environment variables to override gpu (specically for my device)
os.environ['HSA_OVERRIDE_GFX_VERSION'] = '10.3.0'
os.environ['ROCBLAS_TENSILE_LIBRARY'] = '/home/autrio/.local/lib/python3.10/site-packages/torch/lib/rocblas/library/TensileLibrary_lazy_gfx1030.dat'



In [12]:
import gymnasium as gym
from gymnasium import spaces
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from stable_baselines3 import PPO
from collections import deque
import random

class LNNModel(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super(LNNModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, state_dim)  # Predicts next state
        )

    def forward(self, state, action):
        # print(">>>>", state.shape, action.shape)
        x = torch.cat([state, action], dim=-1)
        return self.model(x)


class RewardNetwork(nn.Module):
    def __init__(self, state_dim, hidden_dim=256):
        super(RewardNetwork, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)  # Predicts reward
        )

    def forward(self, state):
        return self.model(state)


class ReplayBuffer:
    def __init__(self, max_size=100000):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            torch.tensor(states, dtype=torch.float32),
            torch.tensor(actions, dtype=torch.float32),
            torch.tensor(rewards, dtype=torch.float32),
            torch.tensor(next_states, dtype=torch.float32),
            torch.tensor(dones, dtype=torch.float32),
        )

    def __len__(self):
        return len(self.buffer)

class CustomGymEnvironment(gym.Env):
    """
    Custom Gym environment that uses LNN for state transitions, Reward Network for reward calculations,
    and incorporates a replay buffer.
    """
    def __init__(self, lnn_model, reward_model, state_dim, action_dim, action_space, observation_space, device, replay_buffer):
        super(CustomGymEnvironment, self).__init__()
        self.lnn_model = lnn_model
        self.reward_model = reward_model
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device
        self.replay_buffer = replay_buffer

        # Define observation and action space
        self.observation_space = observation_space
        self.action_space = action_space

        # Initialize state
        self.state = np.zeros(state_dim)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        # print("<<<", self.state.shape)
        if len(self.replay_buffer) > 0:
            # Sample a random state from the replay buffer
            sample = self.replay_buffer.sample(1)
            # print(sample.shape)
            if sample:
                self.state = sample[0][0]
        else:
            # Otherwise, initialize randomly
            self.state = self.observation_space.sample()
            # print(self.state.shape)
        # print("<<<2", self.state.shape)
        return self.state, {}

    def step(self, action):
        # Convert state and action to tensors
        # print(action.shape)
        state_tensor = torch.tensor(self.state, dtype=torch.float32, device=self.device).unsqueeze(0)
        action_tensor = torch.tensor(action, dtype=torch.float32, device=self.device).unsqueeze(0)
        # print("Hii",state_tensor.shape, action_tensor.shape)
        # Use LNN to predict the next state
        next_state = self.lnn_model(state_tensor, action_tensor).squeeze(0).cpu().detach().numpy()

        # Use Reward Network to calculate reward
        reward = (
            self.reward_model(
                torch.tensor(next_state, dtype=torch.float32, device=self.device).unsqueeze(0)
            )
            .item()
        )

        # Check if the episode is done
        done = np.abs(self.state[0])  <= 0.1 and np.abs(self.state[1]) <= 0.01   # Example condition

        # Store the transition in the replay buffer
        self.replay_buffer.add(self.state, action, reward, next_state, done)

        # Update state
        self.state = next_state

        return next_state, reward, done, False, {}


def train_with_lnn_and_ppo(
    model, env, custom_env, device, num_episodes, batch_size
):
    reward_optimizer = optim.Adam(reward_network.parameters(), lr=2e-6)
    lnn_optimizer = optim.Adam(custom_env.lnn_model.parameters(), lr=2e-6)
    reward_loss_fn = nn.MSELoss()
    lnn_loss_fn = nn.MSELoss()

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False

        while not done:
            # Collect transitions using PPO policy
            action, _ = model.predict(state, deterministic=False)
            state1, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # print(state.shape, action.shape)
            custom_env.replay_buffer.add(state, action, reward, state1, done)


        # Train LNN and Reward Network
        if len(custom_env.replay_buffer) >= batch_size:
            states, actions, rewards, next_states, dones = custom_env.replay_buffer.sample(batch_size)

            # Train LNN
            lnn_preds = custom_env.lnn_model(states.to(device), actions.to(device))
            lnn_loss = lnn_loss_fn(lnn_preds, next_states.to(device))
            lnn_optimizer.zero_grad()
            lnn_loss.backward()
            lnn_optimizer.step()

            # Train Reward Network
            reward_preds = custom_env.reward_model(next_states.to(device))
            reward_loss = reward_loss_fn(reward_preds, rewards.to(device).unsqueeze(-1))
            reward_optimizer.zero_grad()
            reward_loss.backward()
            reward_optimizer.step()

            print(f"Episode: {episode}, LNN loss: {lnn_loss}, Rewards Loss: {reward_loss}")
        # Train PPO policy
        model.learn(total_timesteps=2048, reset_num_timesteps=False)

from stable_baselines3.common.logger import configure
from stable_baselines3.common.monitor import Monitor

tmp_path = "./"
# set up logger
new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])



# Setup
env = gym.make("Pendulum-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# print("Action Dim", action_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lnn_model = LNNModel(state_dim, action_dim).to(device)
reward_network = RewardNetwork(state_dim).to(device)
replay_buffer = ReplayBuffer(max_size=10000)

action_space = env.action_space

custom_env = CustomGymEnvironment(
    lnn_model=lnn_model,
    reward_model=reward_network,
    state_dim=state_dim,
    action_dim=action_dim,
    action_space=action_space,
    observation_space=env.observation_space,
    device=device,
    replay_buffer=replay_buffer,
)

model = PPO("MlpPolicy", custom_env, verbose=1)
model.set_logger(new_logger)

# Train
trained_model, trained_lnn, trained_reward = train_with_lnn_and_ppo(
    model,  env, custom_env, device, num_episodes=50, batch_size=4096
)

# Save models
# torch.save(custom_env.lnn_model.state_dict(), "lnn_cartpole.pth")
# torch.save(custom_env.reward_model.state_dict(), "reward_cartpole.pth")
# trained_model.save("ppo_cartpole_with_lnn")
model.save("ppo_cartpole_with_lnn")


Logging to ./


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  state_tensor = torch.tensor(self.state, dtype=torch.float32, device=self.device).unsqueeze(0)


-----------------------------
| time/              |      |
|    fps             | 803  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 687          |
|    iterations           | 1            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0049869497 |
|    clip_fraction        | 0.0277       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | -0.256       |
|    learning_rate        | 0.0003       |
|    loss                 | 0.00564      |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0025      |
|    std                  | 0.996        |
|    value_loss           | 0.00648      |
----------------

TypeError: cannot unpack non-iterable NoneType object

In [None]:
model.save("ppo_cartpole_with_lnn")


In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make("Pendulum-v1", render_mode = 'human')
# 5. Load the trained model (optional)
model = PPO.load("ppo_cartpole_with_lnn", env=env)

# 6. Evaluate the trained policy
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=3)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# 7. Run the trained agent
obs, _ = env.reset()
for i in range(1000):  # Run for a fixed number of timesteps
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, _ = env.step(action)
    env.render()
    if terminated or truncated:
        obs, _ = env.reset()
        print("Done ", i)
    # if not i%20: print(i)

env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Mean reward: -1080.87 +/- 308.68
Done  199
Done  399
Done  599
Done  799
Done  999
