The folliwing are state, actions and reward for TCP_LLm project

Feature Encoder
Rewards

In [25]:
import torch
import torch.nn as nn

class EncoderNetwork(nn.Module):
    """
    Customized state encoder for TCP tasks.
    """
    def __init__(self, conv_size=4, embed_dim=128):
        super().__init__()
        self.past_k = conv_size
        self.embed_dim = embed_dim
        self.fc1 = nn.Sequential(nn.Linear(1, embed_dim), nn.LeakyReLU())  # current RTT
        self.fc2 = nn.Sequential(nn.Linear(1, embed_dim), nn.LeakyReLU())  # current loss rate
        self.conv3 = nn.Sequential(nn.Conv1d(1, embed_dim, conv_size), nn.LeakyReLU(), nn.Flatten())  # past k throughputs
        # No congestion window layer

    def forward(self, state):
        # Update this based on the new state shape and TCP features
        batch_size, seq_len = state.shape[0], state.shape[1]
        state = state.reshape(batch_size * seq_len, -1)  # Assuming new state shape for TCP

        current_rtt = state[..., 0:1]  # RTT
        current_loss = state[..., 1:2]  # Loss rate
        throughputs = state[..., 2:3]  # Past k throughputs

        features1 = self.fc1(current_rtt).reshape(batch_size, seq_len, -1)
        features2 = self.fc2(current_loss).reshape(batch_size, seq_len, -1)
        features3 = self.conv3(throughputs).reshape(batch_size, seq_len, -1)

        return features1, features2, features3  # Return the processed TCP features


Rewards

In [26]:
import numpy as np
import torch

def compute_tcp_reward(features, reward_weights):
    """
    Compute the reward for TCP flow control based on features extracted by the encoder.

    Args:
    - features (tuple of tensors): Extracted features from the encoder, including RTT, loss rate, and throughputs.
    - reward_weights (dict): Weights for each feature type to scale the reward.

    Returns:
    - float: The computed reward value.
    """
    # Extract features
    current_rtt = features[0].mean(dim=1)  # Average across the sequence length
    current_loss = features[1].mean(dim=1)
    throughputs = features[2].mean(dim=1)

    # Convert to numpy for reward computation
    current_rtt = current_rtt.cpu().detach().numpy()
    current_loss = current_loss.cpu().detach().numpy()
    throughputs = throughputs.cpu().detach().numpy()

    # Ensure that the values are positive to avoid log(0) or negative values
    throughput = np.clip(throughputs, 1e-6, None)
    latency = np.clip(current_rtt, 1e-6, None)
    loss_rate = np.clip(current_loss, 1e-6, None)

    # Compute the reward
    reward = reward_weights['throughput'] * np.log(throughput) - \
             reward_weights['latency'] * np.log(latency) - \
             reward_weights['loss_rate'] * np.log(loss_rate)

    return reward.mean()  # Return the mean reward for the batch

# Example usage:
reward_weights = {'throughput': 0.5, 'latency': 0.25, 'loss_rate': 0.25}


RL Policy

In [27]:
import torch
import torch.nn as nn

class EncoderNetwork(nn.Module):
    """
    Customized state encoder for TCP tasks.
    """
    def __init__(self, conv_size=4, embed_dim=128):
        super().__init__()
        self.past_k = conv_size
        self.embed_dim = embed_dim
        self.fc1 = nn.Sequential(nn.Linear(1, embed_dim), nn.LeakyReLU())  # current RTT
        self.fc2 = nn.Sequential(nn.Linear(1, embed_dim), nn.LeakyReLU())  # current loss rate
        self.conv3 = nn.Sequential(nn.Conv1d(1, embed_dim, conv_size), nn.LeakyReLU(), nn.Flatten())  # past k throughputs
        # No congestion window layer

    def forward(self, state):
        # Update this based on the new state shape and TCP features
        batch_size, seq_len = state.shape[0], state.shape[1]
        state = state.reshape(batch_size * seq_len, -1)  # Assuming new state shape for TCP

        current_rtt = state[..., 0:1]  # RTT
        current_loss = state[..., 1:2]  # Loss rate
        throughputs = state[..., 2:3]  # Past k throughputs

        features1 = self.fc1(current_rtt).reshape(batch_size, seq_len, -1)
        features2 = self.fc2(current_loss).reshape(batch_size, seq_len, -1)
        features3 = self.conv3(throughputs).reshape(batch_size, seq_len, -1)

        return features1, features2, features3  # Return the processed TCP features




import numpy as np
import torch

def compute_tcp_reward(features, reward_weights):
    """
    Compute the reward for TCP flow control based on features extracted by the encoder.

    Args:
    - features (tuple of tensors): Extracted features from the encoder, including RTT, loss rate, and throughputs.
    - reward_weights (dict): Weights for each feature type to scale the reward.

    Returns:
    - float: The computed reward value.
    """
    # Extract features
    current_rtt = features[0].mean(dim=1)  # Average across the sequence length
    current_loss = features[1].mean(dim=1)
    throughputs = features[2].mean(dim=1)

    # Convert to numpy for reward computation
    current_rtt = current_rtt.cpu().detach().numpy()
    current_loss = current_loss.cpu().detach().numpy()
    throughputs = throughputs.cpu().detach().numpy()

    # Ensure that the values are positive to avoid log(0) or negative values
    throughput = np.clip(throughputs, 1e-6, None)
    latency = np.clip(current_rtt, 1e-6, None)
    loss_rate = np.clip(current_loss, 1e-6, None)

    # Compute the reward
    reward = reward_weights['throughput'] * np.log(throughput) - \
             reward_weights['latency'] * np.log(latency) - \
             reward_weights['loss_rate'] * np.log(loss_rate)

    return reward.mean()  # Return the mean reward for the batch

# Example usage:
reward_weights = {'throughput': 0.5, 'latency': 0.25, 'loss_rate': 0.25}









import torch
import torch.nn as nn
from collections import deque

class OfflineRLPolicy(nn.Module):
    def __init__(
            self,
            state_feature_dim,
            action_dim,  # Changed from bitrate_levels to action_dim
            state_encoder,
            plm,
            plm_embed_size,
            max_length=None,
            max_ep_len=100,
            device='cuda' if torch.cuda.is_available() else 'cpu',
            device_out=None,
            residual=False,
            conv_size=4,
            which_layer=-1,  # for early stopping: specify which layer to stop
            **kwargs
    ):
        super().__init__()

        if device_out is None:
            device_out = device

        self.action_dim = action_dim  # Changed from bitrate_levels to action_dim
        self.max_length = max_length

        self.plm = plm
        self.plm_embed_size = plm_embed_size

        # =========== multimodal encoder (start) ===========
        self.state_encoder = state_encoder
        self.state_feature_dim = state_feature_dim
        self.embed_timestep = nn.Embedding(max_ep_len + 1, plm_embed_size).to(device)
        self.embed_return = nn.Linear(1, plm_embed_size).to(device)
        self.embed_action = nn.Linear(1, plm_embed_size).to(device)
        self.embed_state1 = nn.Linear(state_feature_dim, plm_embed_size).to(device)  # RTT
        self.embed_state2 = nn.Linear(state_feature_dim, plm_embed_size).to(device)  # Loss rate
        self.embed_state3 = nn.Linear(state_feature_dim * (6 - conv_size + 1), plm_embed_size).to(device)  # Throughput

        self.embed_ln = nn.LayerNorm(plm_embed_size).to(device)
        # =========== multimodal encoder (end) ===========

        self.action_head = nn.Linear(plm_embed_size, action_dim).to(device)  # Changed to action_dim

        self.device = device
        self.device_out = device_out

        # the following are used for evaluation
        self.states_dq = deque([torch.zeros((1, 0, plm_embed_size), device=device)], maxlen=max_length)
        self.returns_dq = deque([torch.zeros((1, 0, plm_embed_size), device=device)], maxlen=max_length)
        self.actions_dq = deque([torch.zeros((1, 0, plm_embed_size), device=device)], maxlen=max_length)

        self.residual = residual
        self.which_layer = which_layer
        self.modules_except_plm = nn.ModuleList([  # used to save and load modules except plm
            self.state_encoder, self.embed_timestep, self.embed_return, self.embed_action, self.embed_ln,
            self.embed_state1, self.embed_state2, self.embed_state3, self.action_head
        ])

    def forward(self, states, actions, returns, timesteps, attention_mask=None):
        """
        Forward function, used for training.
        """
        assert actions.shape[0] == 1, 'batch size should be 1 to avoid CUDA memory exceed'

        # Step 1: process actions, returns, and timesteps first as they are simple
        actions = actions.to(self.device)  # shape: (1, seq_len, 1)
        returns = returns.to(self.device)  # shape: (1, seq_len, 1)
        timesteps = timesteps.to(self.device)  # shape: (1, seq_len)

        # 1.1 embed action, return, timestep
        action_embeddings = self.embed_action(actions)  # shape: (1, seq_len, embed_size)
        returns_embeddings = self.embed_return(returns)  # shape: (1, seq_len, embed_size)
        time_embeddings = self.embed_timestep(timesteps)  # shape: (1, seq_len, embed_size)

        # 1.2 time embeddings are treated similar to positional embeddings
        action_embeddings = action_embeddings + time_embeddings
        returns_embeddings = returns_embeddings + time_embeddings

        # Step 2: process states, turn them into embeddings.
        states = states.to(self.device)  # shape: (1, seq_len, features)
        features1, features2, features3 = self.state_encoder(states)

        # Embed each feature output
        states_embeddings1 = self.embed_state1(features1) + time_embeddings  # RTT
        states_embeddings2 = self.embed_state2(features2) + time_embeddings  # Loss rate
        states_embeddings3 = self.embed_state3(features3) + time_embeddings  # Throughput

        # Step 3: stack returns, states, actions embeddings.
        stacked_inputs = []
        action_embed_positions = np.zeros(returns_embeddings.shape[1])  # record the positions of action embeddings
        for i in range(returns_embeddings.shape[1]):
            stacked_input = torch.cat((returns_embeddings[0, i:i + 1], states_embeddings1[0, i:i + 1],
                                       states_embeddings2[0, i:i + 1], states_embeddings3[0, i:i + 1],
                                       action_embeddings[0, i:i + 1]), dim=0)
            stacked_inputs.append(stacked_input)
            action_embed_positions[i] = (i + 1) * (2 + 3)  # Updated for 3 features
        stacked_inputs = torch.cat(stacked_inputs, dim=0).unsqueeze(0)
        stacked_inputs = stacked_inputs[:, -self.plm_embed_size:, :]  # truncate sequence length
        stacked_inputs_ln = self.embed_ln(stacked_inputs)  # layer normalization

        # Step 4: feed stacked embeddings into the plm
        if attention_mask is None:
            attention_mask = torch.ones((stacked_inputs_ln.shape[0], stacked_inputs_ln.shape[1]), dtype=torch.long, device=self.device)

        transformer_outputs = self.plm(
            inputs_embeds=stacked_inputs_ln,
            attention_mask=attention_mask,
            output_hidden_states=True,
            stop_layer_idx=self.which_layer,
        )
        logits = transformer_outputs['last_hidden_state']
        if self.residual:
            logits = logits + stacked_inputs_ln  # residual add

        # Step 5: predict actions
        logits_used = logits[:, action_embed_positions - 2]
        action_pred = self.action_head(logits_used)

        return action_pred


In [28]:
# import pandas as pd

# # Specify the paths to your six CSV files
# file1 = '/Users/shyamshrestha/Desktop/my data/client1_pcc.csv'
# file2 = '/Users/shyamshrestha/Desktop/untitled folder/ client1 _bbr_full.csv'
# file3 = '/Users/shyamshrestha/Desktop/untitled folder/client2_pcc.csv'
# file4 = '/Users/shyamshrestha/Desktop/untitled folder/client1_cubic_full.csv'
# file5 = '/Users/shyamshrestha/Desktop/untitled folder/client2_bbr.csv'
# file6 = '/Users/shyamshrestha/Desktop/untitled folder/client2_cubic.csv'

# # List of file paths
# file_paths = [file1, file2, file3, file4, file5, file6]

# # List to hold data from all files
# data_frames = []

# # Read each CSV file and append to the list
# for file in file_paths:
#     df = pd.read_csv(file)
#     data_frames.append(df)

# # Concatenate all data frames into one
# combined_df = pd.concat(data_frames, ignore_index=True)

# # Specify the output file name
# output_file = '/Users/shyamshrestha/Desktop/untitled folder/allcombined.csv'

# # Save the combined data frame to a new CSV file
# combined_df.to_csv(output_file, index=False)

# print(f"All CSV files combined into {output_file}")


experience pool

In [30]:
import numpy as np
import pandas as pd
import pickle
import torch

def compute_tcp_reward(features, reward_weights):
    """
    Compute the reward for TCP flow control based on features extracted by the encoder.

    Args:
    - features (tuple of tensors): Extracted features from the encoder, including RTT, loss rate, and throughputs.
    - reward_weights (dict): Weights for each feature type to scale the reward.

    Returns:
    - float: The computed reward value.
    """
    # Extract features
    current_rtt = features[0]
    current_loss = features[1]
    throughputs = features[2]

    # Convert to numpy for reward computation
    current_rtt = current_rtt.cpu().detach().numpy()
    current_loss = current_loss.cpu().detach().numpy()
    throughputs = throughputs.cpu().detach().numpy()

    # Ensure that the values are positive to avoid log(0) or negative values
    throughput = np.clip(throughputs, 1e-6, None)
    latency = np.clip(current_rtt, 1e-6, None)
    loss_rate = np.clip(current_loss, 1e-6, None)

    # Compute the reward
    reward = reward_weights['throughput'] * np.log(throughput) - \
             reward_weights['latency'] * np.log(latency) - \
             reward_weights['loss_rate'] * np.log(loss_rate)

    return reward.mean()  # Return the mean reward for the batch

class ExperiencePool:
    """
    Experience pool for collecting trajectories.
    """
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.dones = []

    def add(self, state, action, reward, done):
        self.states.append(state)  # sometimes state is also called obs (observation)
        self.actions.append(action)
        self.rewards.append(reward)
        self.dones.append(done)

    def __len__(self):
        return len(self.states)


# Define the CCA mapping
cca_mapping = {'Cubic': 0, 'BBR': 1, 'PCC': 2}

# Load your data
data_path = '/content/encoded_file.csv'  # Replace with your actual data file path
df = pd.read_csv(data_path)

# Create an instance of ExperiencePool
exp_pool = ExperiencePool()

# Initialize the global reward variable
global_reward = 0

# Iterate through each row and update the experience pool
for index, row in df.iterrows():
    # Extract state features
    current_rtt = row['Latency']  # RTT
    current_loss = row['LossRate']  # Loss rate
    throughput = row['Throughput']  # Throughput

    # Normalize or preprocess if needed
    state = np.array([current_rtt, current_loss, throughput])

    # Reshape state to match the expected input of EncoderNetwork
    # Assuming seq_len = 1 for a single time step
    state_tensor = torch.tensor(state, dtype=torch.float).unsqueeze(0).unsqueeze(0)  # Shape (1, 1, 3)

    # Action is CCA selection
    action = row['CCAs']  # Already in numeric format

    # Compute reward based on your reward function
    reward_weights = {'throughput': 0.5, 'latency': 0.25, 'loss_rate': 0.25}
    reward = compute_tcp_reward(
        features=(state_tensor[:, :, 0:1], state_tensor[:, :, 1:2], state_tensor[:, :, 2:3]),
        reward_weights=reward_weights
    )

    # Assuming 'done' is an indicator of end of an episode, set it accordingly
    done = 0  # Placeholder, adjust according to your logic

    # Add to experience pool
    exp_pool.add(state=state, action=action, reward=reward, done=done)

# Save the experience pool
pickle_save_path = 'exp_pool.pkl'
pickle.dump(exp_pool, open(pickle_save_path, 'wb'))
print(f"Done. Experience pool saved at:", pickle_save_path)

FileNotFoundError: [Errno 2] No such file or directory: '/content/encoded_file.csv'

In [None]:
import os
import sys
import numpy as np
import torch
import pickle

from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from pprint import pprint
from munch import Munch
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.tensorboard import SummaryWriter
from baseline_special.utils.utils import load_traces
from plm_special.trainer import Trainer
from plm_special.evaluate import evaluate_on_env
from plm_special.test import test_on_env
from plm_special.data.dataset import ExperienceDataset
from plm_special.models.rl_policy import OfflineRLPolicy
from plm_special.models.state_encoder import EncoderNetwork
from plm_special.utils.utils import set_random_seed
from plm_special.utils.plm_utils import load_plm
from plm_special.utils.console_logger import ConsoleLogger
from config import cfg


PLM_LAYER_SIZES = {
    'gpt2': {
        'base': 24,
        'small': 12,
        'large': 36,
        'xl': 48
    },
    'llama': {
        'base': 32,
    },
    't5-lm': {
        'base': 12,
        'small': 6,
        'large': 24,
        'xl': 24
    }
}

# Updated reward computation function
def process_reward(reward,
                   max_reward=exp_dataset_info.max_reward,
                   min_reward=exp_dataset_info.min_reward,
                   scale=args.scale):
    # Assuming reward is already scaled in your implementation
    return reward

def save_model(args, model, save_dir):
    if args.rank > 0:
        model.plm.save_pretrained(save_dir)
        torch.save(model.modules_except_plm.state_dict(), os.path.join(save_dir, 'modules_except_plm.bin'))
    else:
        torch.save(model.state_dict(), os.path.join(save_dir, 'model.bin'))

def load_model(args, model, model_dir):
    if args.rank > 0:
        model.plm.load_adapter(model_dir, adapter_name='default')
        model.modules_except_plm.load_state_dict(torch.load(os.path.join(model_dir, 'modules_except_plm.bin')))
    else:
        model.load_state_dict(torch.load(os.path.join(model_dir, 'model.bin')))
    return model

def adapt(args, model, exp_dataset, exp_dataset_info, eval_env_settings, checkpoint_dir, best_model_dir, eval_process_reward_fn):
    optimizer = AdamW(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay,
    )
    lr_scheduler = LambdaLR(
        optimizer,
        lambda steps: min((steps + 1) / args.warmup_steps, 1)
    )
    loss_fn = CrossEntropyLoss()
    trainer = Trainer(args, model=model, optimizer=optimizer, exp_dataset=exp_dataset, loss_fn=loss_fn, device=args.device, lr_scheduler=lr_scheduler,
                      grad_accum_steps=args.grad_accum_steps)

    target_return = exp_dataset_info.max_return * args.target_return_scale
    best_eval_return = 0.

    total_train_losses = []
    for epoch in range(args.num_epochs):
        train_logs, train_losses = trainer.train_epoch()
        total_train_losses.extend(train_losses)
        print('='* 20, f'Training Iteration #{epoch}', '=' * 20)
        print('>' * 10, 'Training Information:')
        pprint(train_logs)

        if epoch % args.save_checkpoint_per_epoch == 0:
            checkpoint_dir_epoch = os.path.join(checkpoint_dir, str(epoch))
            if not os.path.exists(checkpoint_dir_epoch):
                os.makedirs(checkpoint_dir_epoch)
            save_model(args, model, checkpoint_dir_epoch)
            print('Checkpoint saved at:', checkpoint_dir_epoch)

        if epoch % args.eval_per_epoch == 0:
            eval_logs = evaluate_on_env(args, env_settings=eval_env_settings, model=model, target_return=target_return, max_ep_num=args.trace_num,
                                        process_reward_fn=eval_process_reward_fn)
            episodes_return = eval_logs['episodes_return']
            if best_eval_return < episodes_return:
                best_eval_return = episodes_return
                save_model(args, model, best_model_dir)
                print('Best model saved at:', best_model_dir)

            eval_logs['best_return'] = best_eval_return
            print('>' * 10, 'Evaluation Information')
            pprint(eval_logs)
    train_losses_path = os.path.join(checkpoint_dir, 'train_losses.txt')
    np.savetxt(train_losses_path, total_train_losses, fmt='%.6f', delimiter='\n')


def test(args, model, exp_dataset_info, env_settings, model_dir, result_dir, test_process_reward_fn):
    model = load_model(args, model, model_dir)
    print('Load model from:', model_dir)
    target_return = exp_dataset_info.max_return * args.target_return_scale
    results = test_on_env(args, model, result_dir, env_settings, target_return, args.trace_num, test_process_reward_fn, seed=args.seed)
    print(results)
    print('Test time:', results['time'], '\nMean reward:', results['mean_reward'])
    print('Results saved at:', result_dir)


def run(args):
    assert args.plm_type in cfg.plm_types
    assert args.plm_size in cfg.plm_sizes
    assert args.exp_pool_path is not None, 'please specify a experience pool path for training'
    assert args.trace in cfg.trace_dirs.keys()
    assert args.video in cfg.video_size_dirs.keys()

    set_random_seed(args.seed)

    trace_dir = cfg.trace_dirs[args.trace]
    video_size_dir = cfg.video_size_dirs[args.video]
    all_cooked_time ,all_cooked_bw ,all_file_names, all_mahimahi_ptrs = load_traces(trace_dir)
    args.trace_num = min(args.trace_num, len(all_file_names))
    if args.trace_num == -1:
        args.trace_num = len(all_file_names)
    if args.trace_num == len(all_file_names):
        args.fixed_order = True

    env_settings = {
        'all_cooked_time': all_cooked_time,
        'all_cooked_bw': all_cooked_bw,
        'all_file_names': all_file_names,
        'all_mahimahi_ptrs': all_mahimahi_ptrs,
        'video_size_dir': video_size_dir,
        'fixed': args.fixed_order,
        'trace_num': args.trace_num,
    }

    exp_pool = pickle.load(open(args.exp_pool_path, 'rb'))
    exp_dataset = ExperienceDataset(exp_pool, gamma=args.gamma, scale=args.scale, max_length=args.w, sample_step=args.sample_step)
    exp_dataset_info = Munch(exp_dataset.exp_dataset_info)
    print('Experience dataset info:')
    pprint(exp_dataset_info)

    plm, *_ = load_plm(args.plm_type, os.path.join(cfg.plm_dir, args.plm_type, args.plm_size),
                       device_input_side=args.device, device_output_side=args.device_out, device_middle_side=args.device_mid)

    if args.plm_type != 'llama':
        plm = plm.to(args.device)

    if args.rank != -1:
        plm = peft_model(plm, args.plm_type, rank=args.rank)

    assert args.state_feature_dim is not None, 'please specify state feature dim to create state encoder'
    state_encoder = EncoderNetwork(embed_dim=args.state_feature_dim)
    state_encoder = state_encoder.to(args.device)

    plm_embed_size = cfg.plm_embed_sizes[args.plm_type][args.plm_size]
    max_ep_len = exp_dataset_info.max_timestep + 1
    rl_policy = OfflineRLPolicy(state_feature_dim=args.state_feature_dim, bitrate_levels=BITRATE_LEVELS, state_encoder=state_encoder, plm=plm, plm_embed_size=plm_embed_size,
                                           max_length=args.w, max_ep_len=max_ep_len, device=args.device, device_out=args.device_out, which_layer=args.which_layer)

    train_exp_pool_info = args.exp_pool_path.split('/')[-4:-1]
    train_exp_pool_info = '_'.join(train_exp_pool_info)
    models_dir = os.path.join(cfg.plm_ft_dir, f'{args.plm_type}_{args.plm_size}', train_exp_pool_info + f'_ss_{args.sample_step}', f'rank_{args.rank}_w_{args.w}_gamma_{args.gamma}_sfd_{args.state_feature_dim}'\
                              f'_lr_{args.lr}_wd_{args.weight_decay}_warm_{args.warmup_steps}_epochs_{args.num_epochs}_seed_{args.seed}')
    results_dir = os.path.join(cfg.results_dir, f'{args.trace}_{args.video}', f'trace_num_{args.trace_num}_fixed_{args.fixed_order}', f'{args.plm_type}_{args.plm_size}',
                               f'early_stop_{args.which_layer}_rank_{args.rank}_w_{args.w}_gamma_{args.gamma}_sfd_{args.state_feature_dim}_target_return_{args.target_return_scale}')

    print('Models saved at:', models_dir)
    print('Results saved at:', results_dir)

    if not os.path.exists(models_dir):
        os.makedirs(models_dir)
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    # Selective adaption or testing
    if args.mode == 'train':
        adapt(args, rl_policy, exp_dataset, exp_dataset_info, env_settings, models_dir, os.path.join(models_dir, 'best'), eval_process_reward_fn=process_reward)
    elif args.mode == 'test':
        test(args, rl_policy, exp_dataset_info, env_settings, models_dir, results_dir, test_process_reward_fn=process_reward)
    else:
        raise NotImplementedError(f'Unsupported mode: {args.mode}')


def main():
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('--exp_pool_path', type=str, default=None, help='experience pool path')
    parser.add_argument('--num_epochs', type=int, default=50, help='number of epochs')
    parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')
    parser.add_argument('--weight_decay', type=float, default=1e-4, help='weight decay')
    parser.add_argument('--warmup_steps', type=int, default=1000, help='warmup steps for learning rate scheduler')
    parser.add_argument('--grad_accum_steps', type=int, default=1, help='gradient accumulation steps')
    parser.add_argument('--rank', type=int, default=-1, help='Rank of PEFT model')
    parser.add_argument('--plm_type', type=str, default='gpt2', help='pre-trained language model type')
    parser.add_argument('--plm_size', type=str, default='small', help='pre-trained language model size')
    parser.add_argument('--w', type=int, default=40, help='temporal window size')
    parser.add_argument('--sample_step', type=int, default=1, help='sample step')
    parser.add_argument('--gamma', type=float, default=0.99, help='discount factor')
    parser.add_argument('--state_feature_dim', type=int, default=None, help='state feature dimension')
    parser.add_argument('--target_return_scale', type=float, default=1.0, help='target return scale')
    parser.add_argument('--which_layer', type=int, default=-1, help='which layer to extract hidden states as input of policy')
    parser.add_argument('--trace_num', type=int, default=-1, help='number of traces')
    parser.add_argument('--trace', type=str, default='high', help='trace')
    parser.add_argument('--video', type=str, default='high', help='video size')
    parser.add_argument('--fixed_order', type=bool, default=False, help='whether to load trace in fixed order')
    parser.add_argument('--scale', type=bool, default=False, help='whether to scale the reward')

    parser.add_argument('--save_checkpoint_per_epoch', type=int, default=10, help='save checkpoint per number of epochs')
    parser.add_argument('--eval_per_epoch', type=int, default=5, help='evaluation per number of epochs')
    parser.add_argument('--mode', type=str, default='train', help='train or test')

    # Device arguments
    parser.add_argument('--device', type=str, default='cuda', help='Device for state_encoder and plm')
    parser.add_argument('--device_mid', type=str, default='cuda', help='Device for middle layers of plm')
    parser.add_argument('--device_out', type=str, default='cuda', help='Device for output layers of plm')

    parser.add_argument('--seed', type=int, default=42, help='random seed')

    args = parser.parse_args()
    run(args)


if __name__ == '__main__':
    main()



In [44]:
import pandas as pd
import numpy as np
import pickle

class ExperiencePool:
    """
    Experience pool for collecting trajectories.
    """
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.dones = []
 
    def add(self, state, action, reward, done):
        self.states.append(state)  # sometimes state is also called observation (obs)
        self.actions.append(action)
        self.rewards.append(reward)
        self.dones.append(done)
 
    def __len__(self):
        return len(self.states)

# Load your dataset from a CSV file
csv_file_path = '/home/sit-research/Desktop/NetLLM-master/tcp-llm_dataset/encoded_file2.csv'  # Replace with the actual path to your dataset
df = pd.read_csv(csv_file_path)

# Define the list of columns to include in the state
columns_to_use = [
    'Throughput',     # Measured throughput
    'LossRate',       # Packet loss rate
    'Latency',        # Round-trip time (RTT)
    'SendingRate',    # Sending rate of the flow
]
for column in columns_to_use:
    df[column] = df[column].astype(np.float32)
# Initialize the experience pool
exp_pool = ExperiencePool()

# Iterate through each row and calculate rewards
for index, row in df.iterrows():
    # Create state from relevant columns


    state = np.array(row[columns_to_use])
    
    # Example reward function: Maximize throughput, minimize latency and packet loss
    throughput = row['Throughput']
    latency = row['Latency']
    packet_loss = row['LossRate']
    
    # Reward function
    reward = throughput / (latency + 1) - packet_loss
    
    # Use the current CCA as the action
    cca_action = row['CCAs']  # Assumes CCAs are already encoded as integers
    
    # Add to experience pool
    exp_pool.add(state=state, action=np.float32(cca_action), reward=np.float32(reward), done=0)
    break

# Save the experience pool using pickle
pickle_save_path = 'exp_pool_with_cca_actions.pkl'
pickle.dump(exp_pool, open(pickle_save_path, 'wb'))
print(f"Done. Experience pool saved at: {pickle_save_path}")


Done. Experience pool saved at: exp_pool_with_cca_actions.pkl


In [42]:
exp_pool.states[0].dtype

dtype('O')

In [33]:
type(exp_pool.states)

list

In [34]:
len(exp_pool.states)

1

In [35]:
exp_pool.states[0][0]

9069003.0

In [36]:
exp_pool.states[0][1]

0.0034533278085291386

In [37]:
exp_pool.states[0][2]

0.0002651966060511768

In [38]:
exp_pool.states[0][3]

441188.5625

In [39]:
type(exp_pool.states[0])

numpy.ndarray

In [40]:
type(exp_pool.actions[0])

numpy.float32

In [41]:
type(exp_pool.actions)

list

In [None]:
import numpy as np
import pandas as pd
import pickle
import torch

def compute_tcp_reward(features, reward_weights):
    """
    Compute the reward for TCP flow control based on features extracted by the encoder.

    Args:
    - features (tuple of tensors): Extracted features from the encoder, including RTT, loss rate, and throughputs.
    - reward_weights (dict): Weights for each feature type to scale the reward.

    Returns:
    - np.array: The computed reward value.
    """
    # Extract features
    current_rtt = features[0]
    current_loss = features[1]
    throughputs = features[2]

    # Convert to numpy for reward computation
    current_rtt = current_rtt.cpu().detach().numpy()
    current_loss = current_loss.cpu().detach().numpy()
    throughputs = throughputs.cpu().detach().numpy()

    # Ensure that the values are positive to avoid log(0) or negative values
    throughput = np.clip(throughputs, 1e-6, None)
    latency = np.clip(current_rtt, 1e-6, None)
    loss_rate = np.clip(current_loss, 1e-6, None)

    # Compute the reward
    reward = reward_weights['throughput'] * np.log(throughput) - \
             reward_weights['latency'] * np.log(latency) - \
             reward_weights['loss_rate'] * np.log(loss_rate)

    return reward.mean()  # Return the mean reward for the batch

class ExperiencePool:
    """
    Experience pool for collecting trajectories.
    """
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.dones = []

    def add(self, state, action, reward, done):
        self.states.append(np.array(state))  # Store state as np.array
        self.actions.append(np.array(action))  # Store action as np.array
        self.rewards.append(np.array(reward))  # Store reward as np.array
        self.dones.append(np.array(done))  # Store done as np.array

    def __len__(self):
        return len(self.states)

# Define the CCA mapping
cca_mapping = {0: 'Cubic', 1: 'BBR', 2: 'PCC'}  # Adjusted mapping for actions

# Load your data
data_path = '/home/sit-research/Desktop/NetLLM-master/tcp-llm_dataset/encoded_file.csv'  # Replace with your actual data file path
df = pd.read_csv(data_path)

# Create an instance of ExperiencePool
exp_pool = ExperiencePool()

# Initialize the global reward variable
global_reward = 0

# Iterate through each row and update the experience pool
for index, row in df.iterrows():
    # Extract state features
    current_rtt = row['Latency']  # RTT
    current_loss = row['LossRate']  # Loss rate
    throughput = row['Throughput']  # Throughput

    # Normalize or preprocess if needed
    state = np.array([current_rtt, current_loss, throughput])

    # Reshape state to match the expected input of EncoderNetwork
    # Assuming seq_len = 1 for a single time step
    state_tensor = torch.tensor(state, dtype=torch.float).unsqueeze(0).unsqueeze(0)  # Shape (1, 1, 3)

    # Action is CCA selection
    action = np.array(row['CCAs'])  # Convert action to np.array

    # Compute reward based on your reward function
    reward_weights = {'throughput': 0.5, 'latency': 0.25, 'loss_rate': 0.25}
    reward = compute_tcp_reward(
        features=(state_tensor[:, :, 0:1], state_tensor[:, :, 1:2], state_tensor[:, :, 2:3]),
        reward_weights=reward_weights
    )

    # Convert reward to np.array
    reward = np.array(reward)

    # Assuming 'done' is an indicator of end of an episode, set it accordingly
    done = np.array(0)  # Placeholder, adjust according to your logic

    # Add to experience pool
    exp_pool.add(state=state, action=action, reward=reward, done=done)

# Save the experience pool
pickle_save_path = 'exp_pool_tllm.pkl'
with open(pickle_save_path, 'wb') as f:
    pickle.dump(exp_pool, f)

print(f"Done. Experience pool saved at:", pickle_save_path)


In [2]:
!python train_script.py --which-layer 3


python: can't open file 'train_script.py': [Errno 2] No such file or directory
