In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import gym
from gym import spaces
import pandas as pd

In [None]:
# Location of the raw measurements CSV; read once into a DataFrame that the
# following cells clean and normalise.
file_path = '/content/Dataset2.csv'
dataset = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Per-path measurement columns (RTT, CWND, in-flight) for path 1 and path 2.
columns_to_clean = [
    'P1_RTT', 'P1_CWND', 'P1_inflight',
    'P2_RTT', 'P2_CWND', 'P2_inflight',
]

# Function to clean each column
def clean_column(col, df=None):
    """Return column ``col`` as a numeric Series with missing values filled.

    Values that cannot be parsed as numbers are coerced to NaN and then
    replaced by the mean of the parseable values. If nothing in the column is
    numeric the mean itself is NaN, so 0 is used as the fill value instead;
    otherwise the column would stay entirely NaN.

    Parameters
    ----------
    col : str
        Name of the column to clean.
    df : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``dataset`` so existing ``clean_column(col)`` calls keep working.

    Returns
    -------
    pandas.Series
        A new Series; the source frame is not modified.
    """
    frame = dataset if df is None else df

    # Convert values to numeric, coerce errors (non-numeric entries become NaN)
    numeric = pd.to_numeric(frame[col], errors='coerce')

    # Mean of the valid entries; NaN when the column has no valid entry at all.
    fill_value = numeric.mean()
    if pd.isna(fill_value):
        fill_value = 0

    # fillna without inplace returns a fresh Series and leaves the input alone.
    return numeric.fillna(fill_value)

# Overwrite each measurement column with its cleaned, numeric counterpart.
for col in columns_to_clean:
    cleaned = clean_column(col)
    dataset[col] = cleaned

# Remove the 'Unnamed: 6' column when it exists; errors='ignore' makes the
# drop a no-op when it does not.
dataset = dataset.drop(['Unnamed: 6'], axis=1, errors='ignore')

# Print the first rows of the cleaned frame.
print(dataset.head())

   P1_RTT  P1_CWND  P1_inflight  P2_RTT  P2_CWND  P2_inflight
0     0.0    46.72          178     0.0    46.72           25
1     0.0    46.72         1375     0.0    46.72          178
2     0.0    46.72         1375     0.0    46.72         1528
3     0.0    46.72         1375     0.0    46.72         2878
4     0.0    46.72         1375     0.0    46.72         4228


  and should_run_async(code)


In [None]:
def normalize(column):
    """Min-max scale ``column`` into the range [0, 1].

    Parameters
    ----------
    column : pandas.Series or numpy.ndarray
        Numeric values to rescale.

    Returns
    -------
    Same container type as ``column``
        ``(column - min) / (max - min)``. When every value is identical the
        range is zero and the division would yield NaN for each entry, so a
        float column of zeros is returned instead.
    """
    min_val = column.min()
    max_val = column.max()
    span = max_val - min_val

    # Constant column: avoid 0/0 and map everything to 0.0.
    if span == 0:
        return (column - min_val).astype(float)

    return (column - min_val) / span

# Rescale every measurement column through normalize(), one column at a time.
for col in columns_to_clean:
    dataset[col] = dataset[col].pipe(normalize)

  and should_run_async(code)


In [None]:
def calculate_reward(path_1, path_2, action):
    """Score both paths and reward the agent for picking the lower-scoring one.

    Each path is an iterable of (RTT, CWND, in-flight) values combined into a
    weighted sum with weights 0.5, 0.3 and 0.2. The reward is the other path's
    score minus the chosen path's score, so it is positive when the chosen
    path has the smaller score. ``action == 0`` selects path 1; any other
    value is treated as path 2. The result is limited to [-100, 100].
    """
    weights = [0.5, 0.3, 0.2]  # Example weights for RTT, CWND, and in-flight packets

    def weighted_score(path):
        # Weighted sum over the (weight, metric) pairs of one path.
        return sum(w * p for w, p in zip(weights, path))

    score_1 = weighted_score(path_1)
    score_2 = weighted_score(path_2)

    # Other path's score minus the chosen path's score.
    chose_path_1 = action == 0
    reward = score_2 - score_1 if chose_path_1 else score_1 - score_2

    # Clamp in two steps: upper bound first, then lower bound.
    capped = min(reward, 100)
    return max(-100, capped)

  and should_run_async(code)


In [None]:
class NetworkEnv(gym.Env):
    """Gym environment that replays a table of two-path measurements row by row.

    Each step scores the current row with ``calculate_reward`` for the chosen
    path and then advances to the next row. Observations are 3-element
    float32 vectors (RTT, CWND, in-flight) of a single path; actions are
    0 (path 1) or 1 (path 2).
    """

    PATH_1_COLUMNS = ('P1_RTT', 'P1_CWND', 'P1_inflight')
    PATH_2_COLUMNS = ('P2_RTT', 'P2_CWND', 'P2_inflight')

    def __init__(self, data):
        super().__init__()
        self.data = data
        self.current_step = 0
        self.action_space = spaces.Discrete(2)  # Two possible paths: 0 for Path 1, 1 for Path 2
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(3,), dtype=np.float32)

    def step(self, action):
        """Score ``action`` on the current row and move on to the next one.

        Returns ``(observation, reward, done, info)``; ``done`` becomes True
        once the row index has reached the length of the data.
        """
        row = self.data.iloc[self.current_step]

        # Metrics of both paths for this row.
        path_1 = [row[name] for name in self.PATH_1_COLUMNS]
        path_2 = [row[name] for name in self.PATH_2_COLUMNS]

        reward = calculate_reward(path_1, path_2, action)

        self.current_step += 1
        done = self.current_step >= len(self.data)

        # The observation is the chosen path's metrics from the row that was
        # just scored.
        # NOTE(review): this is the pre-increment row, not the upcoming one -
        # confirm that the one-row lag is intended.
        chosen_path = path_1 if action == 0 else path_2
        new_state = np.array(chosen_path, dtype=np.float32)

        return new_state, reward, done, {}

    def reset(self):
        """Rewind to the first row and return its path-1 metrics as the observation."""
        self.current_step = 0
        first_row = self.data.iloc[self.current_step]
        initial = [first_row[name] for name in self.PATH_1_COLUMNS]
        return np.array(initial, dtype=np.float32)

    def render(self, mode='human'):
        """Rendering is not implemented; this is a no-op."""
        pass

In [None]:
class CriticNetwork(nn.Module):
    """Two-hidden-layer MLP mapping a (state, action) pair to one scalar Q-value.

    The network owns its Adam optimizer (learning rate ``beta``) and moves
    itself to ``cuda:0`` when CUDA is available, otherwise to the CPU.
    """

    def __init__(self, beta, input_dims, n_actions, fc1_dims=256, fc2_dims=256):
        super().__init__()
        # State and action are concatenated, hence the summed input width.
        self.fc1 = nn.Linear(in_features=input_dims + n_actions, out_features=fc1_dims)
        self.fc2 = nn.Linear(in_features=fc1_dims, out_features=fc2_dims)
        self.q = nn.Linear(in_features=fc2_dims, out_features=1)

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        if torch.cuda.is_available():
            self.device = torch.device('cuda:0')
        else:
            self.device = torch.device('cpu')
        self.to(self.device)

    def forward(self, state, action):
        """Return Q(state, action) with shape ``(batch, 1)``; inputs are joined along dim 1."""
        joined = torch.cat((state, action), dim=1)
        hidden = self.fc1(joined).relu()
        hidden = self.fc2(hidden).relu()
        return self.q(hidden)

In [None]:
class ValueNetwork(nn.Module):
    """Two-hidden-layer MLP mapping a state to one scalar value estimate.

    The network owns its Adam optimizer (learning rate ``beta``) and moves
    itself to ``cuda:0`` when CUDA is available, otherwise to the CPU.
    """

    def __init__(self, beta, input_dims, fc1_dims=256, fc2_dims=256):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dims, out_features=fc1_dims)
        self.fc2 = nn.Linear(in_features=fc1_dims, out_features=fc2_dims)
        self.v = nn.Linear(in_features=fc2_dims, out_features=1)

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        if torch.cuda.is_available():
            self.device = torch.device('cuda:0')
        else:
            self.device = torch.device('cpu')
        self.to(self.device)

    def forward(self, state):
        """Return V(state) with shape ``(batch, 1)``."""
        hidden = self.fc1(state).relu()
        hidden = self.fc2(hidden).relu()
        return self.v(hidden)

In [None]:
class ActorNetwork(nn.Module):
    """Two-hidden-layer MLP mapping a state to ``n_actions`` tanh-squashed outputs.

    The network owns its Adam optimizer (learning rate ``alpha``) and moves
    itself to ``cuda:0`` when CUDA is available, otherwise to the CPU.
    """

    def __init__(self, alpha, input_dims, fc1_dims=256, fc2_dims=256, n_actions=2):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dims, out_features=fc1_dims)
        self.fc2 = nn.Linear(in_features=fc1_dims, out_features=fc2_dims)
        self.mu = nn.Linear(in_features=fc2_dims, out_features=n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        if torch.cuda.is_available():
            self.device = torch.device('cuda:0')
        else:
            self.device = torch.device('cpu')
        self.to(self.device)

    def forward(self, state):
        """Return one value per action, each squashed into [-1, 1] by tanh."""
        hidden = self.fc1(state).relu()
        hidden = self.fc2(hidden).relu()
        return self.mu(hidden).tanh()  # Output between -1 and 1

In [None]:
class ReplayBuffer:
    """Fixed-size ring buffer of (state, action, reward, next_state, done) transitions.

    Parameters
    ----------
    max_size : int
        Capacity; once full, the oldest entries are overwritten.
    input_shape : int or sequence of int
        Shape of a single state. A bare int ``n`` means a flat vector of
        length ``n``; a tuple or list is used as the per-state shape.
    n_actions : int
        Width of a stored action row.
    """

    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0  # total number of transitions ever stored

        # Accept both an int and a shape sequence for the state dimensions.
        if np.ndim(input_shape) == 0:
            state_shape = (int(input_shape),)
        else:
            state_shape = tuple(int(dim) for dim in input_shape)

        self.state_memory = np.zeros((self.mem_size, *state_shape))
        self.new_state_memory = np.zeros((self.mem_size, *state_shape))
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, new_state, done):
        """Write one transition at the next slot, wrapping around when full.

        A scalar ``action`` is broadcast by NumPy across all ``n_actions``
        columns of its row; pass a length-``n_actions`` vector to store
        distinct per-column values.
        """
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = new_state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly (with replacement).

        Only slots that have been written are eligible.

        Raises
        ------
        ValueError
            If nothing has been stored yet.
        """
        max_mem = min(self.mem_cntr, self.mem_size)
        if max_mem == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        new_states = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, new_states, dones

In [None]:
class SACAgent:
    """Actor-critic agent for a discrete action space with twin critics.

    The agent holds an actor, two critics, a value network and a slowly
    tracking target copy of the value network, plus a replay buffer.
    Note that, despite the class name, no entropy term is used anywhere in
    the losses below.

    Parameters
    ----------
    alpha : float
        Learning rate of the actor.
    beta : float
        Learning rate of the critics and value networks.
    input_dims : int
        Size of a state vector.
    n_actions : int
        Number of discrete actions.
    gamma : float
        Discount factor.
    tau : float
        Interpolation factor for the soft target update.
    buffer_size : int
        Replay buffer capacity.
    batch_size : int
        Number of transitions per learning step.
    """

    def __init__(self, alpha, beta, input_dims, n_actions, gamma=0.99, tau=0.005, buffer_size=100000, batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.memory = ReplayBuffer(buffer_size, input_dims, n_actions)

        # Networks
        self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions)
        self.critic_1 = CriticNetwork(beta, input_dims, n_actions)
        self.critic_2 = CriticNetwork(beta, input_dims, n_actions)
        self.value = ValueNetwork(beta, input_dims)
        self.target_value = ValueNetwork(beta, input_dims)

        # tau=1 makes the target an exact copy of the value network.
        self.update_network_parameters(tau=1)

    def choose_action(self, state):
        """Sample a discrete action index for a single state.

        The actor output is treated as logits of a categorical distribution.
        No gradient is tracked because the result is only used for acting.
        """
        # Convert via a single ndarray: building a tensor from a list of
        # ndarrays is slow and triggers a PyTorch warning.
        obs = torch.as_tensor(np.asarray(state), dtype=torch.float32, device=self.actor.device)
        obs = obs.unsqueeze(0)  # add the batch dimension
        with torch.no_grad():
            logits = self.actor(obs)
        action = torch.distributions.Categorical(logits=logits).sample()
        return action.item()

    def _one_hot(self, action):
        """Encode a scalar action index as a one-hot vector; pass vectors through."""
        if np.ndim(action) != 0:
            return np.asarray(action, dtype=np.float64)
        encoded = np.zeros(self.n_actions)
        encoded[int(action)] = 1.0
        return encoded

    def store_transition(self, state, action, reward, new_state, done):
        """Store one transition, with the action one-hot encoded.

        Storing the raw index would broadcast it over every action column
        (e.g. ``[1, 1]``), which does not identify the action for the critics.
        """
        self.memory.store_transition(state, self._one_hot(action), reward, new_state, done)

    def learn(self):
        """Run one update of the value network, both critics and the actor.

        Does nothing until at least ``batch_size`` transitions are stored.
        """
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, new_states, dones = self.memory.sample_buffer(self.batch_size)

        device = self.actor.device
        states = torch.as_tensor(states, dtype=torch.float32, device=device)
        actions = torch.as_tensor(actions, dtype=torch.float32, device=device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=device)
        new_states = torch.as_tensor(new_states, dtype=torch.float32, device=device)
        dones = torch.as_tensor(dones, dtype=torch.bool, device=device)

        # TD target r + gamma * V_target(s'), with V_target(s') = 0 on terminal
        # transitions. Computed without gradient tracking so it is a constant
        # for every loss below and no retain_graph is needed.
        with torch.no_grad():
            next_value = self.target_value(new_states).view(-1)
            next_value = next_value.masked_fill(dones, 0.0)
            td_target = rewards + self.gamma * next_value

        # Value loss
        value = self.value(states).view(-1)
        value_loss = F.mse_loss(value, td_target)

        self.value.optimizer.zero_grad()
        value_loss.backward()
        self.value.optimizer.step()

        # Critic loss: each critic regresses to the target on its own, so both
        # receive a gradient for every sample.
        critic_value_1 = self.critic_1(states, actions).view(-1)
        critic_value_2 = self.critic_2(states, actions).view(-1)
        critic_loss = F.mse_loss(critic_value_1, td_target) + F.mse_loss(critic_value_2, td_target)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        # Actor loss: maximise the expected Q-value under the policy,
        # sum_a pi(a|s) * min(Q1, Q2)(s, a). The critics are queried with the
        # same one-hot action encoding they were trained on.
        action_probs = torch.softmax(self.actor(states), dim=-1)
        with torch.no_grad():
            q_columns = []
            for action_index in range(self.n_actions):
                one_hot = torch.zeros(states.shape[0], self.n_actions, device=device)
                one_hot[:, action_index] = 1.0
                q_columns.append(
                    torch.min(self.critic_1(states, one_hot), self.critic_2(states, one_hot))
                )
            q_values = torch.cat(q_columns, dim=1)  # (batch, n_actions)

        actor_loss = -(action_probs * q_values).sum(dim=1).mean()

        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # Soft update for the target value network
        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        """Blend the value network into its target: target = tau*value + (1-tau)*target.

        ``tau=None`` uses the agent's configured ``self.tau``.
        """
        if tau is None:
            tau = self.tau
        with torch.no_grad():
            for target_param, param in zip(self.target_value.parameters(), self.value.parameters()):
                target_param.copy_(tau * param + (1 - tau) * target_param)


In [None]:
def train_sac(env, agent, episodes=100):
    """Run ``episodes`` training episodes of ``agent`` on ``env``.

    Every step asks the agent for an action, applies it to the environment,
    stores the resulting transition and triggers one ``agent.learn()`` call.
    The summed reward of each episode is printed when the episode ends.
    Nothing is returned.
    """
    for episode in range(episodes):
        total_reward = 0
        state = env.reset()

        # At least one step is always taken; stop after the step that
        # reports done.
        while True:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, reward, next_state, done)
            agent.learn()

            total_reward += reward
            state = next_state
            if done:
                break

        print(f"Episode {episode + 1}/{episodes} - Total Reward: {total_reward}")


In [None]:
# Seed the random number generators so repeated runs are reproducible:
# NumPy drives replay-buffer sampling, PyTorch drives weight initialisation
# and action sampling.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Build the environment over the cleaned, normalised dataset and train the
# agent on it (3 state features, 2 discrete actions).
env = NetworkEnv(dataset)
agent = SACAgent(alpha=0.001, beta=0.001, input_dims=3, n_actions=2)
train_sac(env, agent, episodes=100)

Episode 1/100 - Total Reward: -4.98864593831923
Episode 2/100 - Total Reward: 1.2296033150611372
Episode 3/100 - Total Reward: -1.506571520324154
Episode 4/100 - Total Reward: 1.798845349174171
Episode 5/100 - Total Reward: 4.849266697580164
Episode 6/100 - Total Reward: 3.1542001060649
Episode 7/100 - Total Reward: -6.0731930910512535
Episode 8/100 - Total Reward: -5.173126579283965
Episode 9/100 - Total Reward: 0.13468801711104006
Episode 10/100 - Total Reward: -4.346080609358543
Episode 11/100 - Total Reward: 5.815910899912529
Episode 12/100 - Total Reward: -7.6697290021582685
Episode 13/100 - Total Reward: -13.147588870476138
Episode 14/100 - Total Reward: -16.890283266799962
Episode 15/100 - Total Reward: -3.7043036826865254
Episode 16/100 - Total Reward: -0.2512982829439213
Episode 17/100 - Total Reward: 15.868659455569748
Episode 18/100 - Total Reward: 8.865308952394985
Episode 19/100 - Total Reward: 3.9806269751394416
Episode 20/100 - Total Reward: 15.856106203238149
Episode 21