# ***This code defines a Deep Q-Network (DQN) model and its trainer for reinforcement learning***
***1. Importing Required Libraries***

torch: PyTorch library for deep learning.

torch.nn: Contains classes for building neural networks.

torch.optim: Optimizers for training.

torch.nn.functional: Contains activation functions like ReLU.

os: Used to manage file paths for saving models.

In [16]:
'''
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
'''

'\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nimport torch.nn.functional as F\nimport os\n'

***2. Defining the Neural Network (Linear_QNet)***

input_size: Number of input features (e.g., the game state).

hidden_size: Number of neurons in the hidden layer.

output_size: Number of possible actions the agent can take.

Uses fully connected layers (Linear layers).



In [15]:
'''
class Linear_QNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)  # First layer
        self.linear2 = nn.Linear(hidden_size, output_size)  # Output layer
'''

'\nclass Linear_QNet(nn.Module):\n    def __init__(self, input_size, hidden_size, output_size):\n        super().__init__()\n        self.linear1 = nn.Linear(input_size, hidden_size)  # First layer\n        self.linear2 = nn.Linear(hidden_size, output_size)  # Output layer\n'

***3. Forward Propagation (Computing Q-values)***

The first layer is followed by a ReLU activation function.

The output layer directly returns the predicted Q-values for each possible action.



In [14]:
'''
    def forward(self, x):
        x = F.relu(self.linear1(x))  # Apply ReLU activation
        x = self.linear2(x)  # Compute final output (Q-values)
        return x
'''

'   \n    def forward(self, x):\n        x = F.relu(self.linear1(x))  # Apply ReLU activation\n        x = self.linear2(x)  # Compute final output (Q-values)\n        return x\n'

***4. Saving the Model***

Saves the model's weights (state_dict()) to a .pth file.

Creates a folder if it doesn't exist to store the model.




In [13]:
 '''
    def save(self, file_name='model.pth'):
        model_folder_path = './model'
        if not os.path.exists(model_folder_path):
            os.makedirs(model_folder_path)

        file_name = os.path.join(model_folder_path, file_name)
        torch.save(self.state_dict(), file_name)
'''

"  \n   def save(self, file_name='model.pth'):\n       model_folder_path = './model'\n       if not os.path.exists(model_folder_path):\n           os.makedirs(model_folder_path)\n\n       file_name = os.path.join(model_folder_path, file_name)\n       torch.save(self.state_dict(), file_name)\n"

***5. Training Class (QTrainer)***

lr (Learning Rate): Controls how much the model updates at each step.

gamma (Discount Factor): Determines how much future rewards are considered.

Adam optimizer: Efficient optimization algorithm.

MSELoss: Measures the difference between predicted Q-values and target Q-values.



In [12]:
'''
class QTrainer:
    def __init__(self, model, lr, gamma):
        self.lr = lr  # Learning rate
        self.gamma = gamma  # Discount factor for future rewards
        self.model = model
        self.optimizer = optim.Adam(model.parameters(), lr=self.lr)  # Adam optimizer
        self.criterion = nn.MSELoss()  # Mean Squared Error loss function
'''

'\nclass QTrainer:\n    def __init__(self, model, lr, gamma):\n        self.lr = lr  # Learning rate\n        self.gamma = gamma  # Discount factor for future rewards\n        self.model = model\n        self.optimizer = optim.Adam(model.parameters(), lr=self.lr)  # Adam optimizer\n        self.criterion = nn.MSELoss()  # Mean Squared Error loss function\n'

***6. Training Step (train_step)***

Converts inputs into PyTorch tensors for training.



In [6]:
  '''
    def train_step(self, state, action, reward, next_state, done):
        state = torch.tensor(state, dtype=torch.float)
        next_state = torch.tensor(next_state, dtype=torch.float)
        action = torch.tensor(action, dtype=torch.long)
        reward = torch.tensor(reward, dtype=torch.float)
'''

***Handling Single Data Points***

Ensures that a single input is converted into batch format (so it works with PyTorch models).



In [11]:
'''
        if len(state.shape) == 1:
            state = torch.unsqueeze(state, 0)
            next_state = torch.unsqueeze(next_state, 0)
            action = torch.unsqueeze(action, 0)
            reward = torch.unsqueeze(reward, 0)
            done = (done, )
'''

'\n        if len(state.shape) == 1:\n            state = torch.unsqueeze(state, 0)\n            next_state = torch.unsqueeze(next_state, 0)\n            action = torch.unsqueeze(action, 0)\n            reward = torch.unsqueeze(reward, 0)\n            done = (done, )\n'

***7. Compute Q-values for Current State***

The model predicts the Q-values for the current state.

The Q-value represents the expected future reward for each action.



In [10]:
   '''
        pred = self.model(state)  # Predict Q-values for current state
'''

'\n     pred = self.model(state)  # Predict Q-values for current state\n'

***8. Compute Target Q-values***

If the game is over (done=True), the target Q-value is just the reward.

If the game continues (done=False), the target Q-value is computed with the Bellman update:

$$Q(s, a) = r + \gamma \max_{a'} Q(s', a')$$



where:

$r$ is the reward.

$\gamma$ is the discount factor.

$\max_{a'} Q(s', a')$ is the best future Q-value.



In [9]:
    '''
        target = pred.clone()  # Copy predictions to modify targets
        for idx in range(len(done)):
            Q_new = reward[idx]
            if not done[idx]:  # If the game is not over, update Q-value
                Q_new = reward[idx] + self.gamma * torch.max(self.model(next_state[idx]))

            target[idx][torch.argmax(action[idx]).item()] = Q_new
'''

'\n    target = pred.clone()  # Copy predictions to modify targets\n    for idx in range(len(done)):\n        Q_new = reward[idx]\n        if not done[idx]:  # If the game is not over, update Q-value\n            Q_new = reward[idx] + self.gamma * torch.max(self.model(next_state[idx]))\n\n        target[idx][torch.argmax(action[idx]).item()] = Q_new\n'

***9. Compute Loss and Update Weights***


Resets gradients before computing them.

Calculates loss using Mean Squared Error between predicted and target Q-values.

Performs backpropagation (loss.backward()).

Updates the weights (optimizer.step()) using the Adam optimizer configured in QTrainer.



In [8]:
'''
        self.optimizer.zero_grad()  # Reset gradients
        loss = self.criterion(target, pred)  # Compute loss
        loss.backward()  # Compute gradients
        self.optimizer.step()  # Update model weights
'''

'     \n        self.optimizer.zero_grad()  # Reset gradients\n        loss = self.criterion(target, pred)  # Compute loss\n        loss.backward()  # Compute gradients\n        self.optimizer.step()  # Update model weights\n'

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os

class Linear_QNet(nn.Module):
    """Two-layer fully connected network that maps a state to per-action Q-values.

    Architecture: ``Linear(input_size, hidden_size) -> ReLU ->
    Linear(hidden_size, output_size)``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers.

        Args:
            input_size: Number of input features.
            hidden_size: Number of units in the hidden layer.
            output_size: Number of outputs (one per action).
        """
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the network output for ``x``.

        Args:
            x: Float tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with the last dimension replaced by ``output_size``.
        """
        hidden = F.relu(self.linear1(x))
        return self.linear2(hidden)

    def save(self, file_name='model.pth'):
        """Write the model's ``state_dict`` to ``./model/<file_name>``.

        The ``./model`` folder is resolved against the current working
        directory and is created when missing.

        Args:
            file_name: Name of the checkpoint file inside ``./model``.
        """
        model_folder_path = './model'
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(model_folder_path, exist_ok=True)

        file_name = os.path.join(model_folder_path, file_name)
        torch.save(self.state_dict(), file_name)


class QTrainer:
    """One-step Q-learning trainer: Adam optimizer with an MSE loss.

    Attributes are set in ``__init__``: ``lr``, ``gamma``, ``model``,
    ``optimizer`` (Adam over ``model.parameters()``) and ``criterion``
    (``nn.MSELoss``).
    """

    def __init__(self, model, lr, gamma):
        """Store the hyper-parameters and build the optimizer and loss.

        Args:
            model: ``nn.Module`` that maps a state batch to per-action values.
            lr: Learning rate passed to Adam.
            gamma: Discount factor applied to the best next-state value.
        """
        self.lr = lr
        self.gamma = gamma
        self.model = model
        self.optimizer = optim.Adam(model.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()

    def train_step(self, state, action, reward, next_state, done):
        """Run one optimization step on a single transition or a batch.

        The target for the taken action is ``reward`` when ``done`` is true
        and ``reward + gamma * max(model(next_state))`` otherwise; all other
        action values keep their predicted value, so they add no loss.

        Args:
            state: One state (1-D) or a batch of states (2-D), array-like.
            action: Per-sample action vector; the index of its largest
                entry is used as the taken action (e.g. one-hot).
            reward: Scalar reward, or one reward per sample.
            next_state: Same layout as ``state``.
            done: Single flag for a 1-D ``state``, otherwise a sequence with
                one flag per sample.

        Note:
            Assumes the model returns a ``(batch, n_actions)`` tensor for a
            2-D input and that ``len(done)`` equals the batch size.
        """
        state = torch.tensor(state, dtype=torch.float)
        next_state = torch.tensor(next_state, dtype=torch.float)
        action = torch.tensor(action, dtype=torch.long)
        reward = torch.tensor(reward, dtype=torch.float)

        if state.dim() == 1:
            # Promote a single transition to a batch of size one: (x,) -> (1, x).
            state = state.unsqueeze(0)
            next_state = next_state.unsqueeze(0)
            action = action.unsqueeze(0)
            reward = reward.unsqueeze(0)
            done = (done, )

        # Predicted Q-values for the current states (gradient flows through here).
        pred = self.model(state)

        # Build the TD target without tracking gradients: the bootstrap value
        # is treated as a constant, so only `pred` is differentiated.
        with torch.no_grad():
            finished = torch.tensor(done, dtype=torch.bool)
            # One batched forward pass instead of one pass per sample.
            best_next = self.model(next_state).max(dim=1).values
            # torch.where (not a multiply by 0) so a non-finite bootstrap
            # value cannot leak into terminal transitions.
            q_new = torch.where(finished, reward, reward + self.gamma * best_next)

            target = pred.clone()
            rows = torch.arange(target.shape[0])
            target[rows, action.argmax(dim=1)] = q_new

        self.optimizer.zero_grad()
        # nn.MSELoss signature is (input, target).
        loss = self.criterion(pred, target)
        loss.backward()

        self.optimizer.step()



