<a href="https://colab.research.google.com/github/Mugdha1503/Multipath_TCP_with_DRL/blob/main/Copy_of_RL%5E2_Scheduler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
from gym import spaces
import pandas as pd
import random
from torch.distributions import Categorical

# Read the raw per-path measurement table from disk.
file_path = '/content/Dataset2.csv'
dataset = pd.read_csv(filepath_or_buffer=file_path)

# Metric columns that get coerced to numeric and rescaled below:
# three metrics for path 1 followed by the same three for path 2.
columns_to_clean = (
    ['P1_RTT', 'P1_CWND', 'P1_inflight']
    + ['P2_RTT', 'P2_CWND', 'P2_inflight']
)

# Clean columns
def clean_column(col, frame=None):
    """Return column ``col`` as numbers, with unparseable/missing cells mean-filled.

    Args:
        col: Name of the column to clean.
        frame: DataFrame to read the column from. Defaults to the module-level
            ``dataset`` so existing ``clean_column(col)`` calls keep working.

    Returns:
        A new Series in which every value that could not be parsed as a number
        (or was already missing) is replaced by the mean of the parseable
        values. If no value is parseable the mean is NaN and the column stays
        all-NaN.
    """
    source = dataset if frame is None else frame
    numeric = pd.to_numeric(source[col], errors='coerce')
    # Non-inplace fill: for an already-numeric column ``to_numeric`` can hand
    # back data that still shares memory with ``source``, so an inplace fill
    # could silently modify the caller's frame.
    return numeric.fillna(numeric.mean())

# Replace every metric column with its cleaned numeric version, then discard
# the 'Unnamed: 6' column when it is present (a missing column is ignored).
dataset = dataset.assign(
    **{name: clean_column(name) for name in columns_to_clean}
).drop(columns=['Unnamed: 6'], errors='ignore')

# Normalize columns
def normalize(column):
    """Min-max scale ``column`` so its smallest value maps to 0 and largest to 1.

    Args:
        column: A pandas Series (or anything offering ``min``/``max`` and
            element-wise arithmetic).

    Returns:
        ``(column - min) / (max - min)``. A constant column has no spread, so
        it is mapped to all zeros instead of the NaNs that 0/0 would produce.
    """
    min_val = column.min()
    span = column.max() - min_val
    if span == 0:
        # Multiplying by 0.0 yields float zeros with the input's container
        # type and index, matching the dtype of the normal branch.
        return (column - min_val) * 0.0
    return (column - min_val) / span

# Min-max scale each metric column independently (normalize is applied per column).
dataset[columns_to_clean] = dataset[columns_to_clean].apply(normalize)

# Reward calculation function
def calculate_reward(path_1, path_2, action):
    """Score a path choice from the weighted metrics of both paths.

    Each path is reduced to a weighted sum of its first three metrics
    (weights 0.5, 0.3, 0.2). For ``action == 0`` the reward is
    ``score_2 - score_1``; for any other action it is ``score_1 - score_2``.
    The result is limited to the range [-100, 100].

    Args:
        path_1: Metrics of path 1, in the same order as the weights.
        path_2: Metrics of path 2, in the same order as the weights.
        action: The chosen action; only whether it equals 0 matters.

    Returns:
        The clamped reward.
    """
    weights = (0.5, 0.3, 0.2)

    def weighted_score(metrics):
        # zip stops at the shorter input, so metrics beyond the third are ignored.
        return sum(weight * metric for weight, metric in zip(weights, metrics))

    score_1 = weighted_score(path_1)
    score_2 = weighted_score(path_2)

    if action == 0:
        reward = score_2 - score_1
    else:
        reward = score_1 - score_2

    capped = min(reward, 100)
    return max(-100, capped)

# Define custom environment
class NetworkEnv(gym.Env):
    """Gym environment that replays a table of two-path measurements row by row.

    Every row of ``data`` supplies three metrics per path (RTT, CWND,
    inflight). An observation is the six metrics of the row the agent has to
    decide on, and the chosen action (0 or 1) is scored against that same row
    by ``calculate_reward``. One episode is a single pass over ``data``.
    """

    # Observation layout: path 1 metrics followed by path 2 metrics.
    _PATH_1_COLUMNS = ('P1_RTT', 'P1_CWND', 'P1_inflight')
    _PATH_2_COLUMNS = ('P2_RTT', 'P2_CWND', 'P2_inflight')

    def __init__(self, data):
        """Store the measurement table and declare observation/action spaces.

        Args:
            data: DataFrame containing the six ``P1_*`` / ``P2_*`` columns.
        """
        super().__init__()
        self.data = data
        self.current_step = 0
        # Six features: three metrics for each of the two paths.
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(6,), dtype=np.float32)
        self.action_space = spaces.Discrete(2)

    def _observation(self, index):
        """Return the six metrics of row ``index`` as a float32 vector."""
        row = self.data.iloc[index]
        columns = self._PATH_1_COLUMNS + self._PATH_2_COLUMNS
        return np.array([row[name] for name in columns], dtype=np.float32)

    def step(self, action):
        """Score ``action`` on the current row and advance to the next one.

        Args:
            action: 0 or 1, forwarded to ``calculate_reward``.

        Returns:
            ``(observation, reward, done, info)``. ``observation`` is the row
            the *next* action will be scored on, so what the agent sees always
            matches what it is rewarded for. On the final step (``done`` is
            True) there is no next row and the last row is returned again.
        """
        row = self.data.iloc[self.current_step]
        path_1 = [row[name] for name in self._PATH_1_COLUMNS]
        path_2 = [row[name] for name in self._PATH_2_COLUMNS]
        reward = calculate_reward(path_1, path_2, action)

        self.current_step += 1
        done = self.current_step >= len(self.data)

        # Clamp so the terminal observation stays a valid row of the table.
        next_index = min(self.current_step, len(self.data) - 1)
        return self._observation(next_index), reward, done, {}

    def reset(self):
        """Rewind to the first row and return it as the initial observation."""
        self.current_step = 0
        return self._observation(self.current_step)

# RNN policy with GRU
class RNNPolicy(nn.Module):
    """Single-layer GRU followed by a linear head that emits action logits.

    The policy always works with a batch of one: inputs are reshaped to
    ``(seq_len, 1, input_size)`` and the hidden state is ``(1, 1, hidden_size)``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Build the GRU and the output layer.

        Args:
            input_size: Length of one input feature vector.
            hidden_size: Width of the GRU hidden state.
            output_size: Number of logits produced per time step.
        """
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        """Run the GRU over ``x`` and return per-step logits.

        Args:
            x: Tensor whose elements can be viewed as
                ``(seq_len, 1, input_size)``.
            hidden: GRU hidden state of shape ``(1, 1, hidden_size)``.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape
            ``(seq_len, output_size)`` and ``hidden`` is the updated state.
        """
        # Ensure the input is in the correct shape (seq_len, batch, input_size)
        x = x.view(-1, 1, self.gru.input_size)
        out, hidden = self.gru(x, hidden)
        # Drop the size-1 batch axis (dim 1), not the sequence axis: squeezing
        # dim 0 only works for seq_len == 1 and leaves a stray axis otherwise,
        # which makes Categorical.log_prob broadcast to (seq_len, seq_len).
        logits = self.fc(out.squeeze(1))
        return logits, hidden

    def init_hidden(self):
        """Return an all-zero hidden state of shape ``(1, 1, hidden_size)``."""
        return torch.zeros(1, 1, self.gru.hidden_size)

# Embedding inputs for RNN
def embed_input(state, action, reward, done):
    """Pack one time step into a single feature row for the recurrent policy.

    The row is laid out as ``state`` values, a two-slot one-hot encoding of
    ``action``, the ``reward``, and a 1.0/0.0 flag derived from ``done``.

    Args:
        state: 1-D sequence of state features.
        action: Index (into two slots) of the action to one-hot encode.
        reward: Scalar reward.
        done: Truthy when the episode has terminated.

    Returns:
        A float tensor of shape ``(1, len(state) + 4)``.
    """
    one_hot = np.zeros(2)
    one_hot[action] = 1.0

    tail = [reward, float(bool(done))]
    features = np.concatenate((state, one_hot, tail))

    # Leading axis of size 1 serves as the sequence-length dimension.
    return torch.FloatTensor(features)[None]

# GAE computation for variance reduction
def compute_gae(rewards, values, gamma=0.99, lam=0.95):
    """Compute per-step return targets with Generalized Advantage Estimation.

    Walking the episode backwards, the advantage is accumulated as
    ``gae = delta + gamma * lam * gae`` with
    ``delta = reward + gamma * next_value - value``; the value after the last
    step is taken to be 0. The target for each step is ``gae + value``.

    Args:
        rewards: Sequence of per-step rewards.
        values: Sequence of per-step value estimates, indexed like ``rewards``.
        gamma: Discount factor.
        lam: GAE smoothing factor.

    Returns:
        A 1-D float tensor with one entry per reward (empty for no rewards).
    """
    reversed_returns = []
    gae = 0
    next_value = 0
    for step in range(len(rewards) - 1, -1, -1):
        delta = rewards[step] + gamma * next_value - values[step]
        gae = delta + gamma * lam * gae
        # Append and reverse once at the end; inserting at the front of a
        # list on every step is quadratic in the episode length.
        reversed_returns.append(gae + values[step])
        next_value = values[step]
    reversed_returns.reverse()
    return torch.FloatTensor(reversed_returns)

# Policy update step. Despite the name, no KL constraint is enforced: this is
# a single clipped-gradient policy-gradient step with a mean-return baseline.
def trpo_step(policy, trajectories, max_kl=0.01, opt=None):
    """Apply one policy-gradient update from a single episode.

    The episode is replayed through ``policy`` from a fresh hidden state and
    the loss ``-mean((return - mean(return)) * log_prob(action))`` is
    minimised with one optimizer step (gradient norm clipped to 1.0).

    Args:
        policy: Module returning ``(logits, hidden)`` and offering
            ``init_hidden()``.
        trajectories: Sequence of ``(state_tensor, action, return, old_log_prob)``
            tuples, one per time step. ``old_log_prob`` is accepted but unused.
        max_kl: Accepted for interface compatibility; currently unused.
        opt: Optimizer to step. When omitted, the module-level ``optimizer``
            is used, as before.

    Returns:
        None. An empty ``trajectories`` is a no-op.
    """
    if not trajectories:
        return

    states, actions, returns, _old_log_probs = zip(*trajectories)

    # Convert to tensors
    states = torch.stack(states)
    actions = torch.tensor(actions, dtype=torch.long)
    returns = torch.tensor([float(value) for value in returns], dtype=torch.float32)
    advantages = returns - returns.mean()

    logits, _ = policy(states, policy.init_hidden())
    # Flatten to (T, n_actions). If the policy leaves a size-1 batch axis in
    # place, log_prob would broadcast (T, 1) against (T,) into a (T, T) matrix
    # and pair every step with every other step's action and return.
    logits = logits.reshape(-1, logits.shape[-1])
    new_log_probs = Categorical(logits=logits).log_prob(actions)
    loss = -torch.mean(advantages * new_log_probs)

    # Backprop and gradient update
    active_optimizer = optimizer if opt is None else opt
    policy.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(policy.parameters(), 1.0)
    active_optimizer.step()

# Train the meta-policy using RL²
def train_rl2(env, policy, optimizer, num_trials, num_episodes):
    """Roll out episodes with the recurrent policy and update it after each one.

    For every trial, ``num_episodes`` episodes are collected. At each step the
    policy input is built from the current state together with the previous
    action and previous reward (zeros on the first step). After an episode,
    discounted return targets are computed with ``compute_gae`` and handed to
    ``trpo_step``. The mean undiscounted episode reward is printed per trial.

    Args:
        env: Environment with ``reset()`` returning an observation and
            ``step(action)`` returning ``(observation, reward, done, info)``.
        policy: Recurrent policy returning ``(logits, hidden)`` and offering
            ``init_hidden()``.
        optimizer: Kept for interface compatibility. ``trpo_step`` is called
            without an optimizer argument and therefore uses the module-level
            ``optimizer``.
        num_trials: Number of trials to run.
        num_episodes: Number of episodes per trial.

    NOTE(review): RL² normally carries the hidden state across the episodes of
    a trial; here it is reset at the start of every episode, which is what
    ``trpo_step`` assumes when it replays an episode from ``init_hidden()``.
    """
    for trial in range(num_trials):
        total_rewards = []
        for _ in range(num_episodes):
            state = env.reset()
            hidden = policy.init_hidden()
            steps = []    # (input tensor, action, log-prob) per time step
            rewards = []

            prev_action, prev_reward = 0, 0.0
            done = False
            while not done:
                # Feed back the previous action and reward so the recurrent
                # policy can condition on them instead of constant zeros.
                state_tensor = embed_input(state, prev_action, prev_reward, done)

                # The rollout is only used for sampling; gradients are
                # recomputed inside trpo_step, so skip building a graph here.
                with torch.no_grad():
                    logits, hidden = policy(state_tensor, hidden)
                    dist = Categorical(logits=logits)
                    sampled = dist.sample()
                    log_prob = dist.log_prob(sampled).item()
                action = sampled.item()

                next_state, reward, done, _ = env.step(action)

                steps.append((state_tensor, action, log_prob))
                rewards.append(reward)
                state, prev_action, prev_reward = next_state, action, reward

            values = [0] * len(rewards)  # Placeholder, replace with critic if used
            returns = compute_gae(rewards, values).tolist()

            total_rewards.append(sum(rewards))
            # Train on the discounted return targets rather than raw rewards.
            trajectory = [
                (state_tensor, action, target, log_prob)
                for (state_tensor, action, log_prob), target in zip(steps, returns)
            ]
            trpo_step(policy, trajectory)

        print(f"Trial {trial + 1}/{num_trials}, Avg Reward: {np.mean(total_rewards):.2f}")

# Hyperparameters
hidden_size = 128
output_size = 2  # one logit per action
# Policy input = 6 state features + 2-slot one-hot action + reward + done flag.
input_size = 6 + 2 + 1 + 1
learning_rate = 0.001
num_trials = 10
num_episodes = 10

# Build the environment over the preprocessed table, then the policy and its optimizer.
env = NetworkEnv(data=dataset)
policy = RNNPolicy(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

# Run the training loop.
train_rl2(
    env=env,
    policy=policy,
    optimizer=optimizer,
    num_trials=num_trials,
    num_episodes=num_episodes,
)

Trial 1/10, Avg Reward: 5.11
Trial 2/10, Avg Reward: 29.03
Trial 3/10, Avg Reward: 34.20
Trial 4/10, Avg Reward: 34.19
Trial 5/10, Avg Reward: 34.19
Trial 6/10, Avg Reward: 34.19
Trial 7/10, Avg Reward: 34.19
Trial 8/10, Avg Reward: 34.19
Trial 9/10, Avg Reward: 34.19
Trial 10/10, Avg Reward: 34.19
