<a href="https://colab.research.google.com/github/Lazarus-GS/NOMA-FYP-MATLAB/blob/main/ML%20Model/RLnoma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Raw CSV of the optimal multiple-access dataset, served from raw.githubusercontent.com.
data_url = "https://raw.githubusercontent.com/Lazarus-GS/NOMA-FYP-MATLAB/main/New/optimal_ma_dataset.csv"
# Fetch the CSV from the URL and parse it into a DataFrame (requires network access).
data = pd.read_csv(data_url)
# Print the first rows as a quick sanity check of the available columns.
print(data.head())

  NearUserPos FarUserPos  NearUserDist  FarUserDist  TransmitPower  SNR  \
0          d4         a1     35.355339    77.781746           -114  -30   
1          d4         a1     35.355339    77.781746           -112  -28   
2          d4         a1     35.355339    77.781746           -110  -26   
3          d4         a1     35.355339    77.781746           -108  -24   
4          d4         a1     35.355339    77.781746           -106  -22   

   SumRateFarUser  SumRateNearUser  BERFarUser  BERNearUser  \
0    4.780000e-07     4.780000e-07      0.5063       0.5002   
1    7.630000e-07     7.630000e-07      0.5005       0.5054   
2    1.200000e-06     1.200000e-06      0.4946       0.5033   
3    1.940000e-06     1.940000e-06      0.4947       0.4975   
4    3.040000e-06     3.040000e-06      0.4987       0.5061   

   OutageProbFarUser  OutageProbNearUser OptimalMA     Score  \
0                1.0                 1.0       OMA  0.004980   
1                1.0                 1.0  

Environment and Ground Truth

In [None]:
import numpy as np

class MANetworkEnv:
    """Environment that rewards choosing the dataset's optimal multiple-access scheme.

    A state is the list ``[near_user_dist, far_user_dist, snr]``. Actions are
    integer indices: ``0`` selects OMA and ``1`` selects NOMA. Every new state
    is sampled at random and does not depend on the action that was taken.

    Parameters
    ----------
    data : pandas.DataFrame
        Lookup table; the columns ``NearUserDist``, ``FarUserDist``, ``SNR``
        and ``OptimalMA`` are read.
    max_steps : int or None, optional
        Number of steps after which ``step`` reports ``done=True``. ``None``
        disables the limit, so an episode never ends on its own.
    max_distance : float, optional
        Upper bound of the uniform range the user distances are sampled from.
    """

    # Action index for each string label of the ``OptimalMA`` column.
    ACTION_BY_LABEL = {"OMA": 0, "NOMA": 1}
    # Action treated as optimal when no dataset row is close to the state.
    DEFAULT_ACTION = 0
    # Absolute tolerance used when matching a state against dataset rows.
    MATCH_ATOL = 0.5

    def __init__(self, data, max_steps=100, max_distance=20):
        self.data = data
        # NOTE(review): the dataset preview shows distances of about 35 and 78,
        # outside the default [0, 20] sampling range, so such rows can never be
        # matched -- confirm the intended range against the full dataset.
        self.max_distance = max_distance
        self.action_space = 2  # 0: OMA, 1: NOMA
        self.max_steps = max_steps
        self.state = None      # observation the next action will be judged against
        self.steps_taken = 0   # steps taken since the last reset()

    def _sample_state(self):
        """Draw a random ``[near_user_dist, far_user_dist, snr]`` observation."""
        near_user_dist = np.random.uniform(0, self.max_distance)
        far_user_dist = np.random.uniform(0, self.max_distance)
        snr = np.random.uniform(-30, 30)  # SNR range from -30dB to 30dB
        return [near_user_dist, far_user_dist, snr]

    @classmethod
    def _label_to_action(cls, label):
        """Convert an ``OptimalMA`` entry (string label or number) to an action index.

        Raises
        ------
        ValueError
            If a string label is not one of the keys of ``ACTION_BY_LABEL``.
        """
        if isinstance(label, str):
            key = label.strip().upper()
            if key not in cls.ACTION_BY_LABEL:
                raise ValueError(f"Unrecognised OptimalMA label: {label!r}")
            return cls.ACTION_BY_LABEL[key]
        # Already numeric: use it as the action index directly.
        return int(label)

    def _optimal_action(self, state):
        """Return the optimal action for ``state``, or ``DEFAULT_ACTION`` if no row matches."""
        near_user_dist, far_user_dist, snr = state
        close_rows = (
            np.isclose(self.data['NearUserDist'], near_user_dist, atol=self.MATCH_ATOL)
            & np.isclose(self.data['FarUserDist'], far_user_dist, atol=self.MATCH_ATOL)
            & np.isclose(self.data['SNR'], snr, atol=self.MATCH_ATOL)
        )
        matches = self.data.loc[close_rows, 'OptimalMA']
        if matches.empty:
            return self.DEFAULT_ACTION
        # Several rows may be within tolerance; the first one decides.
        return self._label_to_action(matches.iloc[0])

    def reset(self):
        """Start a new episode and return its initial state."""
        self.state = self._sample_state()
        self.steps_taken = 0
        return list(self.state)

    def step(self, action):
        """Score ``action`` against the current state, then move to a new state.

        Parameters
        ----------
        action : int
            ``0`` for OMA or ``1`` for NOMA.

        Returns
        -------
        tuple
            ``(next_state, reward, done)``. ``reward`` is ``1`` when ``action``
            is the optimal scheme for the state that was current when the
            action was chosen and ``-1`` otherwise. ``done`` becomes ``True``
            once ``max_steps`` steps have been taken since the last reset.
        """
        if self.state is None:
            # step() was called before reset(): start an episode implicitly.
            self.reset()

        # Judge the action against the state the agent actually observed,
        # not against a freshly sampled one.
        reward = 1 if action == self._optimal_action(self.state) else -1

        self.state = self._sample_state()
        self.steps_taken += 1
        done = self.max_steps is not None and self.steps_taken >= self.max_steps
        return list(self.state), reward, done


Q-learning Agent

In [None]:
class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent.

    Q-values are kept in ``q_table``, a dict that maps a state key (a tuple)
    to an array holding one value per action.

    Parameters
    ----------
    n_actions : int
        Number of discrete actions.
    learning_rate : float, optional
        Step size of the Q-value update.
    discount_factor : float, optional
        Weight given to the best Q-value of the next state.
    exploration_rate : float, optional
        Initial probability of choosing a random action.
    exploration_decay_rate : float, optional
        Factor the exploration rate is multiplied by after every update.
    state_decimals : int or None, optional
        Number of decimals numeric states are rounded to before they are used
        as table keys, so that nearby continuous states share one entry.
        ``None`` keys the table by the exact state values.
    """

    def __init__(self, n_actions, learning_rate=0.01, discount_factor=0.9, exploration_rate=1.0, exploration_decay_rate=0.995, state_decimals=0):
        self.n_actions = n_actions
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = exploration_rate
        self.epsilon_decay = exploration_decay_rate
        self.state_decimals = state_decimals
        self.q_table = {}

    def _state_key(self, state):
        """Return the hashable table key for ``state``.

        Exact float tuples almost never repeat for continuously sampled
        states, so numeric states are rounded to ``state_decimals`` first.
        """
        if self.state_decimals is None:
            return tuple(state)
        try:
            values = np.asarray(state, dtype=float).ravel()
        except (TypeError, ValueError):
            # Non-numeric state: it cannot be rounded, so key on it verbatim.
            return tuple(state)
        return tuple(np.round(values, self.state_decimals).tolist())

    def get_action(self, state):
        """Choose an action: random with probability ``epsilon``, otherwise greedy."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.n_actions)
        return self.get_best_action(state)

    def get_best_action(self, state):
        """Return the highest-valued action for ``state``; random if the state is unseen."""
        state_key = self._state_key(state)
        if state_key not in self.q_table:
            return np.random.choice(self.n_actions)
        return int(np.argmax(self.q_table[state_key]))

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning update for a transition, then decay ``epsilon``.

        Table entries for ``state`` and ``next_state`` are created (all zeros)
        if they do not exist yet.
        """
        state_key = self._state_key(state)
        next_state_key = self._state_key(next_state)

        if state_key not in self.q_table:
            self.q_table[state_key] = np.zeros(self.n_actions)
        if next_state_key not in self.q_table:
            self.q_table[next_state_key] = np.zeros(self.n_actions)

        # Q-learning update rule: move Q(s, a) towards r + gamma * max_a' Q(s', a')
        best_next_value = np.max(self.q_table[next_state_key])
        td_target = reward + self.gamma * best_next_value
        self.q_table[state_key][action] += self.lr * (td_target - self.q_table[state_key][action])

        # Decay the exploration rate
        self.epsilon *= self.epsilon_decay

In [None]:
def train(agent, env, episodes=1000, max_steps=100):
    """Run the agent in the environment and return the total reward of each episode.

    Parameters
    ----------
    agent
        Object providing ``get_action(state)`` and
        ``update(state, action, reward, next_state)``.
    env
        Object providing ``reset()`` and ``step(action)``, where ``step``
        returns ``(next_state, reward, done)``.
    episodes : int, optional
        Number of episodes to run.
    max_steps : int or None, optional
        Upper bound on the steps taken in one episode. It guarantees that
        training finishes even if the environment never reports ``done``.
        ``None`` removes the bound.

    Returns
    -------
    list
        Sum of the rewards collected in each episode, in episode order.
    """
    total_rewards = []
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        steps = 0
        done = False
        # Stop on whichever comes first: the environment ends the episode or
        # the step budget is used up.
        while not done and (max_steps is None or steps < max_steps):
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            agent.update(state, action, reward, next_state)
            state = next_state
            total_reward += reward
            steps += 1
        total_rewards.append(total_reward)
        if episode % 100 == 0:
            print(f"Episode: {episode}, Total Reward: {total_reward}")
    return total_rewards

In [None]:
# Plotting library used at the end of this cell
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so that state sampling and
# epsilon-greedy exploration give the same run after a kernel restart
np.random.seed(42)

# Create the environment
env = MANetworkEnv(data)

# Create the agent
agent = QLearningAgent(n_actions=2)

# Train the agent
rewards = train(agent, env, episodes=10000)

# Plot the rewards to observe learning
fig, ax = plt.subplots()
ax.plot(rewards)
ax.set_xlabel("Episodes")
ax.set_ylabel("Total Rewards")
ax.set_title("Training Progress")
plt.show()