In [3]:
# importing libraries

import torch                  # providing neural network (NN) building
import torch.nn as nn         # providing classes and functions for building NNs
import torch.optim as optim   # optimizers needed for adjusting model parameters to minimize the loss through training NNs 
import numpy as np            # preprocessing data before feeding it into models
import random                 # providing functions for generating random numbers, initializing weights
import math                   # essential mathematical operations to complement operations in Pytorch or Numpy
from collections import deque # importing "deque" class from "collections" module, data structure for fast appends and pops from both sides.
                              # stores recent experiences so they can be replayed in reinforcement learning (RL),
                              # i.e. it implements a replay memory that improves model stability

# Initial simulation parameters (units in square brackets)
time_steps                = 200           # time steps per learning episode
episodes                  = 1000          # learning episodes
P_pludged_in              = 0.04          # plugged-in power available to Tx1 [W]
TX2_communication_Power   = 0.01          # Tx2 transmission power [W]
commnucation_distance     = 800           # Tx1-Rx1 and Tx2-Rx2 link distance [m]
EnergyHarvesting_distance = 10            # Tx1-Tx2 distance [m]
Fading_samples            = 10            # fading samples drawn per channel (communication and EH)
Noise_Power               = 10 ** -15     # noise power [W]
Bandwidth                 = 5 * 10 ** 6   # communication bandwidth [Hz]
Theta_1                   = 10 ** 7       # data threshold of the Tx1 buffer [bits]
Theta_2                   = 10 ** 7       # data threshold of the Tx2 buffer [bits]
B_Max                     = 0.003         # battery capacity [J]
EnergyHarvestinmg_eta     = 0.1           # power-conversion efficiency of the EH receiver
lambda_1                  = 10 ** 6       # arrival-data density for Tx1
lambda_2                  = 10 ** 6       # arrival-data density for Tx2
ALPHA                     = 2             # path-loss exponent (communication and EH channels)
penalty_weight            = 0.5           # weight of the queue-stability term relative to energy consumption
Reward_episoeds           = np.zeros(episodes)  # per-episode reward log, one slot per episode

# Rayleigh fading channel model
def FadingChannel(TX_Power, distance, ALPHA, fading_samples):
    """Draw a batch of fading realisations and apply distance path loss.

    Each realisation is a complex coefficient whose real and imaginary parts
    are independent N(0, 1) draws (so the mean of |h|^2 is 2, i.e. the fading
    is not normalised to unit power).

    Args:
        TX_Power: transmit power.
        distance: link distance; path loss is ``distance ** -ALPHA``.
        ALPHA: path-loss exponent.
        fading_samples: number of fading realisations to draw.

    Returns:
        (RX_Power, channel_gain): the received power averaged over all
        realisations, and the array of per-realisation channel gains
        (path loss times |h|^2).
    """
    in_phase    = np.random.normal(0, 1, fading_samples)   # real part is drawn first
    quadrature  = np.random.normal(0, 1, fading_samples)   # imaginary part is drawn second
    envelope_sq = np.abs(in_phase + 1j * quadrature) ** 2
    path_loss   = pow(distance, -ALPHA)

    channel_gain = path_loss * envelope_sq
    RX_Power     = np.mean(TX_Power * path_loss * envelope_sq)
    return RX_Power, channel_gain

# data transfer model
def calculate_transferred_data(Received_Power, Noise_Power, Bandwidth, communication_time):
    """Compute SNR, achievable rate and the data delivered in one time slot.

    The rate is ``Bandwidth * log2(1 + SNR)`` and the delivered data is that
    rate multiplied by the slot duration.

    Args:
        Received_Power: scalar received signal power.
        Noise_Power: scalar noise power (must be non-zero).
        Bandwidth: channel bandwidth.
        communication_time: duration over which the rate is sustained.

    Returns:
        (SNR, Rate, Data) as a 3-tuple.
    """
    snr        = Received_Power / Noise_Power
    throughput = Bandwidth * math.log2(1 + snr)
    return snr, throughput, throughput * communication_time

# Define the environment the agent interacts with
class Environment:
    """Two-transmitter link with energy harvesting, data queues and a battery.

    The state is a length-6 NumPy array laid out as
    ``[B_t, heh_t, h1_t, h2_t, Q1_t, Q2_t]``: battery level, three
    exponentially distributed channel indicators, and the two queue backlogs.

    Besides its own attributes the class reads these module-level settings:
    ``P_pludged_in``, ``TX2_communication_Power``, ``commnucation_distance``,
    ``EnergyHarvesting_distance``, ``ALPHA``, ``Fading_samples``,
    ``Noise_Power``, ``Bandwidth``, ``B_Max``, ``lambda_1``, ``lambda_2`` and
    ``penalty_weight``, plus the helpers ``FadingChannel`` and
    ``calculate_transferred_data``.
    """

    def __init__(self, EnergyHarvestinmg_eta):
        """Store the EH conversion efficiency and initialise the state.

        Args:
            EnergyHarvestinmg_eta: power-conversion efficiency applied to the
                harvested power in ``step``. It may be reassigned on the
                instance later; ``step`` always uses the current value.
        """
        self.EnergyHarvestinmg_eta = EnergyHarvestinmg_eta
        self.state = self.reset()

    def reset(self):
        """Start a new episode: empty battery, empty queues, fresh channel draws.

        Returns:
            The new state array (also stored on ``self.state``).
        """
        B_t = 0.0
        h1_t = np.random.exponential(1)
        h2_t = np.random.exponential(1)
        heh_t = np.random.exponential(1)
        Q1_t = 0.0
        Q2_t = 0.0
        self.state = np.array([B_t, heh_t, h1_t, h2_t, Q1_t, Q2_t])
        return self.state

    def step(self, action):
        """Advance the environment by one communication round.

        Args:
            action: iterable ``(P_t1, tau1, tau_eh, tau2)`` - Tx1 transmit
                power and the time fractions for Tx1 transmission, energy
                harvesting and Tx2 transmission. Values are clipped to their
                valid ranges and the fractions are rescaled if they sum to
                more than 1.

        Returns:
            (state, reward): the updated state array and the scalar reward.
        """
        P_t1, tau1, tau_eh, tau2 = action

        # Constrain every action component to its admissible range.
        P_t1   = np.clip(P_t1, 0.0001, P_pludged_in)
        tau1   = np.clip(tau1, 0, 1)
        tau_eh = np.clip(tau_eh, 0, 1)
        tau2   = np.clip(tau2, 0, 1)

        # Enforce tau1 + tau_eh + tau2 <= 1 (one communication round) by rescaling.
        if tau1 + tau_eh + tau2 > 1:
            norm_factor = tau1 + tau_eh + tau2
            tau1       /= norm_factor
            tau_eh     /= norm_factor
            tau2       /= norm_factor

        # Channel model. Fading is redrawn here; the channel entries stored in
        # the state vector are not consumed by these computations.
        RX1_Power, Tx1_channel_gain = FadingChannel(P_t1, commnucation_distance, ALPHA, Fading_samples)
        RX2_Power, Tx2_channel_gain = FadingChannel(TX2_communication_Power, commnucation_distance, ALPHA, Fading_samples)
        # Whatever plugged-in power Tx1 does not transmit with is radiated for harvesting.
        EH_Power, EH_channel_gain   = FadingChannel(P_pludged_in - P_t1, EnergyHarvesting_distance, ALPHA, Fading_samples)

        # Data delivered by each transmitter during its share of the round.
        SNR1, Rate1, D1_t = calculate_transferred_data(RX1_Power, Noise_Power, Bandwidth, tau1)
        SNR2, Rate2, D2_t = calculate_transferred_data(RX2_Power, Noise_Power, Bandwidth, tau2)

        # Queue update: serve (never below empty), then add Poisson arrivals.
        Q1_t_new = max(0, self.state[4] - D1_t) + np.random.poisson(lambda_1)
        Q2_t_new = max(0, self.state[5] - D2_t) + np.random.poisson(lambda_2)

        # Harvested energy. Use the instance's efficiency, not the module-level
        # constant, so the constructor argument and any later change to
        # ``self.EnergyHarvestinmg_eta`` actually take effect.
        P_harvested = self.EnergyHarvestinmg_eta * EH_Power * tau_eh

        # Battery update, kept within [0, B_Max].
        B_t_new = max(0, min(B_Max, self.state[0] - TX2_communication_Power * tau2 + P_harvested))

        # Reward: negative energy spent minus a weighted backlog term.
        # NOTE(review): the backlog term uses the unclamped (Q - D), unlike the
        # queue update above, so serving more data than is queued produces a
        # positive reward - confirm whether max(0, Q - D) was intended.
        reward = -(P_t1*tau1 + TX2_communication_Power*tau2) - penalty_weight*(self.state[4]-D1_t + self.state[5]-D2_t)

        # New state, with fresh exponential channel draws.
        self.state = np.array([B_t_new, np.random.exponential(1), np.random.exponential(1), np.random.exponential(1), Q1_t_new, Q2_t_new])

        return self.state, reward

# Neural-network model used by the Deep Q-Network (DQN) training loop
class DQN(nn.Module):
    """Fully connected network: state vector in, ``action_size`` values out.

    Architecture: Linear(state_size, 64) -> ReLU -> Linear(64, 64) -> ReLU ->
    Linear(64, action_size). The output layer has no activation.
    """

    def __init__(self, state_size, action_size):
        """Create the three linear layers.

        Args:
            state_size: number of input features.
            action_size: number of output values.
        """
        super().__init__()
        hidden_width = 64
        self.fc1 = nn.Linear(state_size, hidden_width)    # input layer
        self.fc2 = nn.Linear(hidden_width, hidden_width)  # hidden layer
        self.fc3 = nn.Linear(hidden_width, action_size)   # output layer

    def forward(self, x):
        """Run ``x`` through both ReLU-activated layers, then the linear head."""
        activations = x
        for layer in (self.fc1, self.fc2):
            # ReLU supplies the non-linearity between the dense layers.
            activations = torch.relu(layer(activations))
        # No activation on the head: raw values, one per output.
        return self.fc3(activations)

# Training loop for the DQN agent on a given environment
def train_dqn(env, episodes, initial_EnergyHarvestinmg_eta, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995, batch_size=32):
    """Train a DQN on ``env`` and return the average reward of every episode.

    Relies on the module-level ``time_steps`` (steps per episode),
    ``P_pludged_in`` (upper bound of the first action component) and the
    ``DQN`` model class.

    Args:
        env: environment exposing ``reset()``, ``step(action)``, ``state`` and
            a writable ``EnergyHarvestinmg_eta`` attribute.
        episodes: number of training episodes.
        initial_EnergyHarvestinmg_eta: efficiency written to the environment
            before training; from episode ``episodes // 2`` onward the
            environment is switched to 100 times this value.
        gamma: discount factor for future rewards.
        epsilon: initial exploration rate of the epsilon-greedy policy.
        epsilon_min: exploration-rate floor.
        epsilon_decay: multiplicative decay applied to epsilon after each episode.
        batch_size: number of stored transitions sampled per training step.

    Returns:
        NumPy array of length ``episodes`` holding, for each episode, the
        total reward divided by ``time_steps``.
    """
    env.EnergyHarvestinmg_eta = initial_EnergyHarvestinmg_eta
    state_size  = env.state.shape[0]
    action_size = 4

    # Replay memory of (state, action, reward, next_state) tuples; the oldest
    # entries are dropped once 2000 are stored.
    memory = deque(maxlen=2000)

    # Online network plus a periodically synchronised copy used for targets.
    model        = DQN(state_size, action_size)
    target_model = DQN(state_size, action_size)
    target_model.load_state_dict(model.state_dict())

    optimizer = optim.Adam(model.parameters())
    criterion = nn.MSELoss()

    # Size the result by the ``episodes`` argument rather than writing into a
    # module-level array whose length may not match it.
    episode_rewards = np.zeros(episodes)

    # Integer midpoint: comparing against ``episodes / 2`` never matches when
    # ``episodes`` is odd, which would silently skip the environment change.
    switch_episode = episodes // 2

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0

        if episode == switch_episode:
            env.EnergyHarvestinmg_eta = initial_EnergyHarvestinmg_eta * 100

        for t in range(time_steps):
            if np.random.rand() <= epsilon:
                # Explore: uniform random action, first component scaled to the power range.
                action = np.random.rand(action_size)
                action[0] = action[0] * P_pludged_in
            else:
                # Exploit: the network output is used directly as the action vector.
                action = model(torch.tensor(state, dtype=torch.float32)).detach().numpy()
                action[0] = np.clip(action[0], 0.0001, P_pludged_in)  # keep P_t1 within bounds

            next_state, reward = env.step(action)
            total_reward += reward

            memory.append((state, action, reward, next_state))
            state = next_state

            if len(memory) > batch_size:
                minibatch = random.sample(memory, batch_size)
                for s, a, r, s_next in minibatch:
                    # The bootstrapped return needs no gradient.
                    with torch.no_grad():
                        target = r + gamma * torch.max(target_model(torch.tensor(s_next, dtype=torch.float32))).item()

                    # NOTE(review): the regression target is the stored action
                    # vector scaled by the bootstrapped return. That is not the
                    # usual Q-learning target (which updates only the value of
                    # the chosen discrete action); a proper fix needs a discrete
                    # action set. Behaviour is preserved here; only the unused
                    # extra forward pass that preceded this line was removed.
                    target_f = torch.tensor(a, dtype=torch.float32) * target

                    loss = criterion(model(torch.tensor(s, dtype=torch.float32)), target_f)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            # Synchronise the target network every 10 time steps.
            if t % 10 == 0:
                target_model.load_state_dict(model.state_dict())

        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

        print(f'Episode {episode}/{episodes} - Total Reward: {total_reward/time_steps:.2f}, Epsilon: {epsilon:.2f}')

        episode_rewards[episode] = total_reward/time_steps

    return episode_rewards

# Seed every random generator the simulation draws from (Python, NumPy,
# PyTorch) so that Restart & Run All reproduces the saved reward curve.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

initial_EnergyHarvestinmg_eta = 0.1
env = Environment(initial_EnergyHarvestinmg_eta)
# Train with the same efficiency the environment was built with, instead of
# the separate module-level constant that merely happens to hold the same value.
Reward_episoeds = train_dqn(env, episodes, initial_EnergyHarvestinmg_eta)
np.save('reward_L2RL.npy', Reward_episoeds)

Episode 0/1000 - Total Reward: 38646984.62, Epsilon: 0.99
Episode 1/1000 - Total Reward: 38139285.38, Epsilon: 0.99
Episode 2/1000 - Total Reward: 40366227.53, Epsilon: 0.99
Episode 3/1000 - Total Reward: 40164120.31, Epsilon: 0.98
Episode 4/1000 - Total Reward: 38978705.05, Epsilon: 0.98
Episode 5/1000 - Total Reward: 38088199.42, Epsilon: 0.97
Episode 6/1000 - Total Reward: 38690848.98, Epsilon: 0.97
Episode 7/1000 - Total Reward: 40232315.68, Epsilon: 0.96
Episode 8/1000 - Total Reward: 39284937.16, Epsilon: 0.96
Episode 9/1000 - Total Reward: 39327690.97, Epsilon: 0.95
Episode 10/1000 - Total Reward: 39522118.36, Epsilon: 0.95
Episode 11/1000 - Total Reward: 39936044.20, Epsilon: 0.94
Episode 12/1000 - Total Reward: 39515945.73, Epsilon: 0.94
Episode 13/1000 - Total Reward: 39724291.61, Epsilon: 0.93
Episode 14/1000 - Total Reward: 37348126.04, Epsilon: 0.93
Episode 15/1000 - Total Reward: 38517704.55, Epsilon: 0.92
Episode 16/1000 - Total Reward: 38813333.68, Epsilon: 0.92
Episode

In [4]:
# Inspect the harvesting efficiency left on the environment after training.
# NOTE(review): the recorded output for this cell (0.5) matches neither the
# initial value 0.1 nor 0.1 * 100 set by train_dqn, so it looks stale -
# re-run the notebook from a fresh kernel to confirm.
env.EnergyHarvestinmg_eta

0.5