# AS 3.1 Moonlander assignement
***By Joris Heemskerk & Bas de Blok***

This notebook contains all the code required to train and run the moonlander algoritm

## 1. Base assignment

The base assignment is to implement an object-oriented Deep Q-learning Network in Python. This network will be trained on the Lunar Lander enviornment.

### 1.1. Imports

In [1]:
from agent import Agent
from policy import Policy
from action import Action
from state import State
from memory import Memory

from tqdm import tqdm
import torch
import gymnasium as gym
from torch import nn
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f71f54280a0>

### 1.2. Defining the constants

In [2]:
NUMBER_OF_RUNS = 1

### 1.3. Defining the network

In [3]:
class QNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(8, 150),
            nn.ReLU(),
            nn.Linear(150, 120),
            nn.ReLU(),
            nn.Linear(120, 4),
        )
        self.device = (
            "cuda"if torch.cuda.is_available()
            else 
                "mps" if torch.backends.mps.is_available()
            else 
                "cpu"
        )
        print(f"Using \033[32m{self.device }\033[0m device\n")

    def forward(self, x: torch.Tensor)-> torch.Tensor:
        print(f"{x = }, {type(x) = }")
        logits = self.linear_relu_stack(x)
        print(f"{logits = }, {type(logits) = }")
        return logits
    
    def train_model(
        self, 
        train_loader: torch.utils.data.DataLoader, 
        loss_fn: nn.Module, 
        optimizer: torch.optim, 
        num_epochs: int
    ):
        for epoch in range(num_epochs):
            print(f"-------------------------------\nEpoch {epoch+1}")
            for batch, (X, y) in tqdm(enumerate(train_loader), total=len(train_loader)):
                X, y = X.to(self.device), y.to(self.device)
                
                # Compute prediction and loss
                pred = self.forward(X)
                loss = loss_fn(pred, y)

                # Backpropagation
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                optimizer.step()

                if batch % 100 == 0:
                    loss, current = loss.item(), batch * len(X)
        print("Training done!")

In [4]:
policy = Policy(network=QNetwork(), epsilon=6.090)
memory = Memory(max_size=3_0_0)

agent = Agent(policy=policy, memory=memory)

env = gym.make("LunarLander-v2", render_mode="human")
agent.train(
    environment=env,
    n_iterations=2,
    gamma=0.9, 
    batch_size=4,
    num_epochs=1,
    loss_fn=nn.CrossEntropyLoss(),
    optimizer=torch.optim.Adam,
    n_episodes=64,
    seed=42,
    )
env.close()

Using [32mcuda[0m device

Welcome to the run function
Running! At iteration 0.
state_to_pass = tensor([ 0.0023,  1.4181,  0.2326,  0.3205, -0.0027, -0.0527,  0.0000,  0.0000])
x = tensor([ 0.0023,  1.4181,  0.2326,  0.3205, -0.0027, -0.0527,  0.0000,  0.0000]), type(x) = <class 'torch.Tensor'>
logits = tensor([ 0.1583, -0.1599, -0.0578, -0.0392], grad_fn=<ViewBackward0>), type(logits) = <class 'torch.Tensor'>
in Policy::select_action1, a = tensor(0)
in Policy::select_action2, 0
in Policy::select_action2, <class 'int'>
in Policy::select_action2, Action.NOTHING
trying to store to memory. Currently 1/300 items stored
actually storing to memory
checking if terminated. is_terminated = False, truncated = False
Running! At iteration 1.
state_to_pass = tensor([ 0.0046,  1.4248,  0.2323,  0.2948, -0.0053, -0.0521,  0.0000,  0.0000])
x = tensor([ 0.0046,  1.4248,  0.2323,  0.2948, -0.0053, -0.0521,  0.0000,  0.0000]), type(x) = <class 'torch.Tensor'>
logits = tensor([ 0.1575, -0.1614, -0.0579,

  0%|          | 0/16 [00:00<?, ?it/s]


AttributeError: 'QNetwork' object has no attribute 'device'

In [6]:
policy = Policy(network=QNetwork(), epsilon=6.090)
memory = Memory(max_size=3_000)

agent = Agent(policy=policy, memory=memory)

env = gym.make("LunarLander-v2", render_mode="human")
agent.run(env)
env.close()

Using [32mcuda[0m device

Welcome to the run function
Running! At iteration 0.
state_to_pass = tensor([ 0.0023,  1.4181,  0.2326,  0.3205, -0.0027, -0.0527,  0.0000,  0.0000],
       device='cuda:0')
x = tensor([ 0.0023,  1.4181,  0.2326,  0.3205, -0.0027, -0.0527,  0.0000,  0.0000],
       device='cuda:0'), type(x) = <class 'torch.Tensor'>
logits = tensor([ 0.0618,  0.0310, -0.0081, -0.0788], device='cuda:0',
       grad_fn=<ViewBackward0>), type(logits) = <class 'torch.Tensor'>
in Policy::select_action1, a = tensor(0, device='cuda:0')
in Policy::select_action2, 0
in Policy::select_action2, <class 'int'>
in Policy::select_action2, Action.NOTHING
trying to store to memory. Currently 1/3000 items stored
actually storing to memory
checking if terminated. is_terminated = False, truncated = False
Running! At iteration 1.
state_to_pass = tensor([ 0.0046,  1.4248,  0.2323,  0.2948, -0.0053, -0.0521,  0.0000,  0.0000],
       device='cuda:0')
x = tensor([ 0.0046,  1.4248,  0.2323,  0.2948, 

In [5]:
env.close()

In [None]:
X_train = torch.randn(100000, 8)  # 1000 samples, 8 features each
y_train = torch.randint(0, 4, (100000,))

train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

model = QNetwork()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model.train_model(train_loader, criterion, optimizer, 1)

In [None]:
device = (
    "cuda"if torch.cuda.is_available()
    else 
        "mps" if torch.backends.mps.is_available()
    else 
        "cpu"
)
print(f"Using \033[32m{device}\033[0m device\n")

model = QNetwork().to(device)
print(model)

### 1.4. Defining the agent

### 1.5. Training the model

### 1.6. Testing the model

In [None]:
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset(seed=42)
i = 0
while True:
   i += 1
   action = env.action_space.sample()  # this is where you would insert your policy
   # print(action)
   observation, reward, terminated, truncated, info = env.step(action)
   # print(f"Step\n\t{observation = }\n\t{reward = }\n\t{terminated = }\n\t{truncated = }\n\t{info = }")

   if terminated or truncated:
      break
print(i)
env.close()