In [1]:
import torch
# Environment report: the PyTorch build and the CUDA devices it can see.
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.device_count())
# current_device() / get_device_name() raise on machines without a usable
# CUDA device, so only query them when one is present. This keeps the cell
# runnable under "Restart Kernel -> Run All" on CPU-only hosts.
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))

2.9.1+cu126
True
1
0
NVIDIA GeForce RTX 4060


In [14]:
from torch import nn
from pytorch_lightning import LightningModule
from torch import Tensor
from typing import Tuple
from torch.utils.data import DataLoader, Dataset
from collections import OrderedDict, deque, namedtuple
import pandas as pd
import numpy as np
import gymnasium as gym
from torch.utils.data.dataset import IterableDataset

In [3]:
class PolicyNet(nn.Module):
    """Small fully connected network mapping an observation to per-action scores.

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear. The last layer
    has no activation, so the output is one raw score per action.

    Args:
        n_observations: Number of input features per observation.
        n_actions: Number of output scores (one per action).
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, n_observations, n_actions, hidden_size=128):
        super().__init__()
        # Layers are created in input-to-output order and wrapped in a single
        # Sequential, so parameters are addressed as model.0, model.2, model.4.
        layers = [
            nn.Linear(n_observations, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack and return the per-action scores."""
        scores = self.model(x)
        return scores

In [4]:
# Smoke test: push a batch of two identical 8-feature observations through a
# freshly initialised network and display the resulting (2, 3) score tensor.
model = PolicyNet(n_observations=8, n_actions=3, hidden_size=128)
tensor = torch.tensor([list(range(1, 9))] * 2, dtype=torch.float32)
model(tensor)

tensor([[-0.0632,  0.3109, -0.5438],
        [-0.0632,  0.3109, -0.5438]], grad_fn=<AddmmBackward0>)

In [None]:
class MemoryReplay():
    """Fixed-capacity FIFO store of experience tuples for replay sampling.

    Backed by ``deque(maxlen=max_memory)``: once the buffer is full, appending
    a new experience discards the oldest one.

    Args:
        max_memory: Maximum number of experiences kept at any time.
    """

    def __init__(self, max_memory):
        self.memory = deque(maxlen=max_memory)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)

    def append(self, replay):
        """Store one experience.

        Args:
            replay: A tuple of per-step data (states, actions, rewards, etc.).
                All stored tuples should share the same field layout so they
                can be regrouped field-by-field in ``sample``.
        """
        self.memory.append(replay)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct experiences uniformly at random.

        Args:
            batch_size: Number of experiences to draw.

        Returns:
            A tuple with one ``np.ndarray`` per experience field; each array
            holds ``batch_size`` entries and index ``i`` across all arrays
            refers to the same sampled experience.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored items.
        """
        stored = len(self.memory)
        if batch_size > stored:
            raise ValueError(
                f"cannot sample {batch_size} experiences from a buffer holding {stored}"
            )
        # np.random.sample() only accepts an output size, so sample positions
        # instead and look the experiences up by index.
        positions = np.random.choice(stored, batch_size, replace=False)
        picked = [self.memory[int(pos)] for pos in positions]
        # Transpose the list of experiences into one array per field.
        return tuple(np.array(field) for field in zip(*picked))

In [None]:
class RLDataset(IterableDataset):
    """Iterable dataset that re-samples a replay buffer on every pass.

    Args:
        buffer: Object exposing ``sample(n)`` that returns five parallel
            sequences: states, actions, rewards, dones, new_states.
        sample_size: Number of experiences requested from the buffer each
            time the dataset is iterated.
    """

    def __init__(self, buffer, sample_size=200):
        self.sample_size = sample_size
        self.buffer = buffer

    def __iter__(self):
        """Yield one ``(state, action, reward, done, new_state)`` tuple per sampled row."""
        states, actions, rewards, dones, new_states = self.buffer.sample(self.sample_size)
        columns = (states, actions, rewards, dones, new_states)
        # The number of rows is taken from ``dones``; every column is indexed
        # at the same position to rebuild a single experience.
        for row in range(len(dones)):
            yield tuple(column[row] for column in columns)

In [None]:
# Interacts with the environment and records transitions in the replay buffer.
class Agent():
    """Steps a Gym-style environment and stores each transition for replay.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()``.
        replay_buffer: Object with ``append(experience)``; receives one
            ``(state, action, reward, done, new_state)`` tuple per step.
    """

    def __init__(self, env, replay_buffer):
        self.env = env
        self.replay_buffer = replay_buffer
        self.state = None
        self.reset()

    def reset(self):
        """Reset the environment and remember its initial observation."""
        result = self.env.reset()
        # Gymnasium's reset() returns (observation, info); older Gym-style
        # environments return the bare observation. Keep only the observation.
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            result = result[0]
        self.state = result

    @torch.no_grad()
    def get_action(self, net, epsilon, device="cpu"):
        """Choose an action epsilon-greedily for the current state.

        Args:
            net: Callable mapping a ``(1, n_observations)`` float tensor to a
                ``(1, n_actions)`` tensor of action scores.
            epsilon: Probability of taking a random action instead of the
                highest-scoring one.
            device: Device on which the state tensor is created; must match
                the device ``net`` lives on.

        Returns:
            The sampled action (exploration) or the index of the
            highest-scoring action as an ``int`` (exploitation).
        """
        if np.random.random() < epsilon:
            return self.env.action_space.sample()  # provided by the gym space

        # Add a leading batch dimension of 1; the network expects batches.
        state = torch.tensor(
            np.asarray(self.state), dtype=torch.float32, device=device
        ).unsqueeze(0)
        q_values = net(state)
        # torch.max(..., dim=1) returns (values, indices); the action is the index.
        _, best_action = torch.max(q_values, dim=1)
        return int(best_action.item())

    @torch.no_grad()
    def play_step(self, net, epsilon, device="cpu"):
        """Take one environment step and store the resulting transition.

        Args:
            net: Network used for greedy action selection.
            epsilon: Exploration probability passed to ``get_action``.
            device: Device passed to ``get_action``.

        Returns:
            ``(reward, episode_over)`` for the step just taken.
        """
        action = self.get_action(net, epsilon, device)

        outcome = self.env.step(action)
        if len(outcome) == 5:
            # Gymnasium API: (obs, reward, terminated, truncated, info).
            new_state, reward, terminated, truncated, _info = outcome
            terminated = bool(terminated)
            episode_over = terminated or bool(truncated)
        else:
            # Legacy Gym API: (obs, reward, done, info).
            new_state, reward, done, _info = outcome
            terminated = bool(done)
            episode_over = terminated

        # Store only true termination as the done flag: a time-limit
        # truncation does not mean the next state has zero value.
        self.replay_buffer.append((self.state, action, reward, terminated, new_state))

        if episode_over:
            self.reset()
        else:
            self.state = new_state

        return reward, episode_over

        

In [7]:
class MyEnv(gym.Env):
    """Placeholder custom environment; every Gymnasium hook is still a stub.

    TODO: define ``observation_space`` / ``action_space`` and implement the
    dynamics. Until then ``reset`` and ``step`` return ``None`` rather than
    the tuples the Gymnasium API specifies.
    """

    def __init__(self):
        super().__init__()

    def reset(self, *, seed=None, options=None):
        """Start a new episode.

        Accepts the keyword-only ``seed`` / ``options`` arguments of the
        Gymnasium ``Env.reset`` signature. ``seed`` is forwarded to the base
        class so ``self.np_random`` is seeded; ``options`` is unused for now.

        TODO: return ``(observation, info)``.
        """
        super().reset(seed=seed)
        return None

    def step(self, action=None):
        """Advance the environment by one step using ``action``.

        ``action`` defaults to ``None`` only so that existing zero-argument
        calls keep working; callers are expected to pass it.

        TODO: return ``(observation, reward, terminated, truncated, info)``.
        """
        return None

    def render(self):
        """Render the environment (not implemented yet)."""
        pass

In [None]:
class TargetNet(LightningModule):
    """DQN-style Lightning module: an online policy network trained against a
    periodically synchronised copy of itself (the target network).

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Exploration probability (stored; used by the agent).
        lr: Learning rate for the Adam optimizer.
        input_dim: Number of features in one observation.
        batch_size: Number of transitions per optimisation batch.
        n_actions: Number of discrete actions.
        memory_replay_size: Capacity of the replay buffer.
        sync_rate: Copy the policy weights into the target network every
            ``sync_rate`` optimisation steps.

    Raises:
        ValueError: If ``sync_rate`` is smaller than 1.
    """

    def __init__(self, gamma, epsilon, lr, input_dim, batch_size, n_actions,
                 memory_replay_size=1000, sync_rate=10):
        super().__init__()
        if sync_rate < 1:
            raise ValueError("sync_rate must be a positive integer")

        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.batch_size = batch_size
        self.memory_replay_size = memory_replay_size
        self.sync_rate = sync_rate
        # TODO: look into LightningModule.save_hyperparameters()

        self.policy_net = PolicyNet(input_dim, n_actions)
        # The target network starts as an exact copy of the policy network and
        # is never optimised directly, only overwritten in training_step.
        self.target_net = PolicyNet(input_dim, n_actions)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.requires_grad_(False)

        self.memory_replay = MemoryReplay(memory_replay_size)
        self.env = MyEnv()
        self.agent = Agent(self.env, self.memory_replay)
        # TODO: experience collection (agent.play_step) is not wired into
        # training yet; the replay buffer must be filled before fitting.

    def configure_optimizers(self):
        """Optimise the policy network only; the target network is a frozen copy."""
        return torch.optim.Adam(self.policy_net.parameters(), lr=self.lr)

    def forward(self, x):
        """Return the policy network's per-action scores for ``x``."""
        return self.policy_net(x)

    def loss(self, batch):
        """Mean-squared TD error for a batch of transitions.

        Args:
            batch: ``(states, actions, rewards, dones, next_states)`` tensors
                with a shared leading batch dimension.

        Returns:
            Scalar tensor: MSE between Q(s, a) from the policy network and
            ``reward + gamma * max_a' Q_target(s', a')`` (with the bootstrap
            term zeroed for terminal transitions).
        """
        states, actions, rewards, dones, next_states = batch

        # gather(1, idx) picks, for every row, the score of the action that
        # was actually taken: (batch, n_actions) -> (batch,).
        q_values = self.policy_net(states.float())
        state_action_values = q_values.gather(1, actions.long().unsqueeze(-1)).squeeze(-1)

        with torch.no_grad():
            next_state_values = self.target_net(next_states.float()).max(1)[0]
            # Boolean-mask assignment needs a bool tensor; terminal
            # transitions have no future value.
            next_state_values[dones.bool()] = 0.0
            next_state_values = next_state_values.detach()

        # Collated Python floats arrive as float64; align with the network dtype.
        rewards = rewards.to(next_state_values.dtype)
        expected_state_action_values = next_state_values * self.gamma + rewards

        return nn.MSELoss()(state_action_values, expected_state_action_values)

    def shared_step(self, mode: str, batch: Tuple[Tensor, ...], batch_index: int):
        """Compute and log the TD loss for one batch.

        Args:
            mode: Prefix for the logged metric name (e.g. ``'train'``).
            batch: Transition batch accepted by ``loss``.
            batch_index: Index of the batch within the epoch (unused).
        """
        loss = self.loss(batch)
        self.log(f"{mode}_step_loss", loss, prog_bar=False)
        return loss

    def training_step(self, batch, batch_index):
        """One optimisation step, plus a periodic target-network sync."""
        loss = self.shared_step('train', batch, batch_index)
        if self.global_step % self.sync_rate == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
        return loss

    def validation_step(self, batch, batch_index):
        """Log the TD loss on a validation batch."""
        return self.shared_step('validation', batch, batch_index)

    def __dataloader(self):
        """Build a DataLoader that serves batches sampled from the replay buffer."""
        # NOTE(review): each pass asks the buffer for memory_replay_size
        # samples, i.e. its full capacity - confirm the buffer is filled that
        # far before training starts, or lower the sample size.
        dataset = RLDataset(self.memory_replay, self.memory_replay_size)
        dataloader = DataLoader(
            dataset=dataset,
            batch_size=self.batch_size,
        )
        return dataloader

    def train_dataloader(self):
        """Lightning hook: the name-mangled ``__dataloader`` is invisible to the Trainer."""
        return self.__dataloader()

In [13]:
# Construction smoke test with placeholder hyperparameters (not tuned values).
t = TargetNet(gamma=1, epsilon=2, lr=3, input_dim=4, batch_size=5, n_actions=6)

work
