<a href="https://colab.research.google.com/github/Mugdha1503/maml_rl/blob/main/MAML_Scheduler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
from gym import spaces
import pandas as pd
import random

# Load the raw measurements from a hard-coded CSV path.
# NOTE(review): '/content' looks like a Google Colab working directory —
# confirm the path before running anywhere else.
file_path = '/content/Dataset2.csv'
dataset = pd.read_csv(file_path)

# Metric columns for the two paths (P1, P2). These are the columns that get
# coerced to numeric and min-max normalised further down in this file.
columns_to_clean = ['P1_RTT', 'P1_CWND', 'P1_inflight', 'P2_RTT', 'P2_CWND', 'P2_inflight']

# Clean columns
def clean_column(col, data=None):
    """Return one column as a numeric Series with missing values filled in.

    Args:
        col: Label of the column to clean.
        data: DataFrame to read the column from. Defaults to the module-level
            ``dataset`` so existing ``clean_column(col)`` calls keep working.

    Returns:
        A new Series in which every entry that could not be parsed as a
        number is replaced by the mean of the entries that could. If nothing
        in the column parses, that mean is NaN and the result stays all-NaN.
    """
    frame = dataset if data is None else data
    numeric = pd.to_numeric(frame[col], errors='coerce')
    # Use the non-inplace form: it returns a fresh Series, so the source
    # frame is never modified behind the caller's back.
    return numeric.fillna(numeric.mean())

# Overwrite each metric column with the Series returned by clean_column
# (values coerced to numeric, NaNs filled with the column mean).
for col in columns_to_clean:
    dataset[col] = clean_column(col)

# Drop any unnecessary columns
# NOTE(review): 'Unnamed: 6' is presumably a stray empty column from the CSV
# export — confirm. errors='ignore' makes this a no-op when it is absent.
dataset = dataset.drop(columns=['Unnamed: 6'], errors='ignore')

# Normalize columns
def normalize(column):
    """Min-max scale *column* so its smallest value maps to 0 and largest to 1.

    Args:
        column: A pandas Series (or NumPy array) of numeric values.

    Returns:
        ``(column - min) / (max - min)``. When every value is identical the
        range is zero; in that case a float column of zeros is returned
        instead of the NaNs that 0/0 would produce.
    """
    min_val = column.min()
    value_range = column.max() - min_val
    if value_range == 0:
        # Constant column: there is no spread to scale, so map everything to 0.
        return (column - min_val).astype(float)
    return (column - min_val) / value_range

# Rescale every metric column with normalize() (min-max scaling) and store
# the result back into the dataset.
for col in columns_to_clean:
    dataset[col] = normalize(dataset[col])

# Define the reward calculation function
def calculate_reward(path_1, path_2, action):
    """Score both paths and return the clipped score gap for *action*.

    Each path is a sequence of three metrics combined with fixed weights
    (0.5, 0.3, 0.2). For ``action == 0`` the reward is path 2's score minus
    path 1's; for any other action it is the reverse. The result is limited
    to the interval [-100, 100].
    """
    metric_weights = [0.5, 0.3, 0.2]

    def weighted_score(metrics):
        return sum(weight * value for weight, value in zip(metric_weights, metrics))

    first_score = weighted_score(path_1)
    second_score = weighted_score(path_2)

    if action == 0:
        gap = second_score - first_score
    else:
        gap = first_score - second_score

    return max(-100, min(gap, 100))

# Define a function to sample a random MDP
def sample_mdp():
    """Build and return a new ``NetworkEnv`` over the module-level ``dataset``."""
    environment = NetworkEnv(dataset)
    return environment

# Define custom environment
class NetworkEnv(gym.Env):
    """Gym environment that walks through the rows of a DataFrame, one per step.

    Each row carries three metrics for path 1 and three for path 2. The agent
    picks one of two discrete actions; the reward comes from
    ``calculate_reward`` and the observation handed back is the metric triple
    of the row just consumed, taken from path 1 for action 0 and from path 2
    otherwise.
    """

    # Column names holding the (RTT, CWND, inflight) metrics of each path.
    _PATH_1_COLUMNS = ('P1_RTT', 'P1_CWND', 'P1_inflight')
    _PATH_2_COLUMNS = ('P2_RTT', 'P2_CWND', 'P2_inflight')

    def __init__(self, data):
        super().__init__()
        self.data = data
        self.current_step = 0
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(3,), dtype=np.float32)

    def step(self, action):
        """Consume the current row and return ``(observation, reward, done, info)``.

        ``done`` becomes True once the last row has been consumed; call
        ``reset`` before stepping again, otherwise the row lookup goes out
        of bounds.
        """
        row = self.data.iloc[self.current_step]
        path_1 = [row[name] for name in self._PATH_1_COLUMNS]
        path_2 = [row[name] for name in self._PATH_2_COLUMNS]
        reward = calculate_reward(path_1, path_2, action)

        self.current_step += 1

        chosen_metrics = path_1 if action == 0 else path_2
        observation = np.array(chosen_metrics, dtype=np.float32)
        finished = self.current_step >= len(self.data)
        return observation, reward, finished, {}

    def reset(self):
        """Rewind to the first row and return its path-1 metrics as float32."""
        self.current_step = 0
        row = self.data.iloc[0]
        return np.array([row[name] for name in self._PATH_1_COLUMNS], dtype=np.float32)

    def render(self, mode='human'):
        """Rendering is not implemented; this method does nothing."""
        return None

# Define a simple policy network
class PolicyNetwork(nn.Module):
    """Two-layer perceptron that maps a state to action probabilities.

    Architecture: Linear(input_size -> hidden_size), ReLU,
    Linear(hidden_size -> output_size), softmax over the last dimension.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the probability of each action for the state(s) in *x*."""
        hidden = self.fc1(x).relu()
        logits = self.fc2(hidden)
        return self.softmax(logits)

# Model-Agnostic Meta-Learning (MAML) around a policy-gradient objective
class MAML:
    """Meta-learner that adapts a policy per task and updates its initialisation.

    For every task the policy parameters are adapted with a few gradient
    steps (inner loop); the loss of the *adapted* parameters on a fresh
    trajectory is then back-propagated to the shared initial parameters
    (outer loop).

    Args:
        policy: ``nn.Module`` mapping a batch of states to action
            probabilities. It is evaluated on whole trajectories at once,
            so it must treat the rows of a batch independently.
        alpha: Step size of the inner-loop gradient descent.
        beta: Learning rate of the Adam optimiser used for the meta-update.
        num_inner_steps: Number of inner-loop gradient steps per task.
    """

    def __init__(self, policy, alpha, beta, num_inner_steps):
        self.policy = policy
        self.alpha = alpha  # Task-specific learning rate
        self.beta = beta    # Meta-update learning rate
        self.num_inner_steps = num_inner_steps
        self.optimizer = optim.Adam(self.policy.parameters(), lr=self.beta)

    def _forward(self, x, params):
        """Run the policy on *x* with *params* substituted for its own weights.

        *params* must follow the order of ``self.policy.parameters()``.
        """
        try:
            from torch.func import functional_call
        except ImportError:  # PyTorch < 2.0
            from torch.nn.utils.stateless import functional_call

        names = [name for name, _ in self.policy.named_parameters()]
        return functional_call(self.policy, dict(zip(names, params)), (x,))

    def _sampling_policy(self, params):
        """Return a callable that evaluates the policy with *params*, without autograd."""
        def act(state):
            # Sampling only needs the probabilities, never their gradient.
            with torch.no_grad():
                return self._forward(state, params)
        return act

    def adapt(self, task, initial_params):
        """Return task-adapted parameters after ``num_inner_steps`` gradient steps.

        Args:
            task: Environment handed to ``sample_trajectory``.
            initial_params: Parameters to start from, ordered like
                ``self.policy.parameters()``.

        Returns:
            A list of tensors that stay connected to *initial_params* in the
            autograd graph, so a loss on them can be differentiated with
            respect to the initial parameters.
        """
        adapted_params = list(initial_params)

        # Perform task-specific adaptation (inner loop)
        for _ in range(self.num_inner_steps):
            # Roll out with the *current* adapted parameters, not the base policy
            states, actions, rewards, _ = sample_trajectory(
                task, self._sampling_policy(adapted_params))

            loss = self.compute_loss(states, actions, rewards, adapted_params)

            # create_graph keeps the step differentiable for the meta-update;
            # allow_unused tolerates policies with parameters the loss never touches.
            grads = torch.autograd.grad(loss, adapted_params, create_graph=True, allow_unused=True)

            # Update parameters using gradient descent
            adapted_params = [param - self.alpha * grad if grad is not None else param
                              for param, grad in zip(adapted_params, grads)]

        return adapted_params

    def compute_loss(self, states, actions, rewards, params):
        """Policy-gradient loss of a trajectory evaluated with *params*.

        Args:
            states: Sequence of state tensors; each is flattened to one row.
            actions: Index of the action taken in each state.
            rewards: Reward received for each step.
            params: Policy parameters to evaluate with, ordered like
                ``self.policy.parameters()``.

        Returns:
            Scalar tensor ``-sum(log pi(a_t | s_t) * r_t)``.
        """
        # One batched forward pass instead of one call per state
        batch = torch.cat([state.reshape(1, -1) for state in states], dim=0)
        probs = self._forward(batch, params).reshape(batch.shape[0], -1)

        taken = torch.tensor([int(a) for a in actions], dtype=torch.long, device=probs.device)
        chosen_probs = probs.gather(1, taken.unsqueeze(1)).squeeze(1)
        # Keep log() finite when the softmax saturates to exactly 0
        log_probs = torch.log(chosen_probs.clamp_min(1e-12))

        reward_tensor = torch.tensor([float(r) for r in rewards],
                                     dtype=torch.float32, device=probs.device)

        return -torch.sum(log_probs * reward_tensor)

    def meta_update(self, tasks):
        """Run one meta-update over *tasks* and print the average task reward.

        Raises:
            ValueError: If *tasks* is empty.
        """
        if len(tasks) == 0:
            raise ValueError("meta_update requires at least one task")

        total_rewards = 0  # To accumulate rewards across tasks

        self.optimizer.zero_grad()
        for task in tasks:
            initial_params = list(self.policy.parameters())

            # Adapt the policy to the task
            adapted_params = self.adapt(task, initial_params)

            # Get a new trajectory with the adapted policy
            states, actions, rewards, _ = sample_trajectory(
                task, self._sampling_policy(adapted_params))

            # Compute the loss for the adapted policy
            loss = self.compute_loss(states, actions, rewards, adapted_params)

            # Back-propagating per task accumulates the same summed gradient
            # as one backward over the summed loss, but frees each task's
            # graph immediately.
            loss.backward()

            # Calculate the total reward for the task
            total_rewards += sum(rewards)  # Sum of rewards for this task

        # Perform meta-update using the accumulated gradients
        self.optimizer.step()

        # Print the average reward for this meta-update
        avg_reward = total_rewards / len(tasks)
        print(f"Average reward for this meta-update: {avg_reward:.2f}")

    def train(self, tasks, num_meta_iterations):
        """Call ``meta_update(tasks)`` *num_meta_iterations* times, logging progress."""
        for iteration in range(num_meta_iterations):
            print(f"Meta iteration {iteration + 1}/{num_meta_iterations} starting...")
            self.meta_update(tasks)
            print(f"Meta iteration {iteration + 1}/{num_meta_iterations} complete")

# Define a function to sample a trajectory from the environment
def sample_trajectory(env, policy):
    """Roll out one full episode of *env*, sampling actions from *policy*.

    Args:
        env: Object with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done, info)``.
        policy: Callable that maps a float tensor holding one state (with a
            leading batch dimension of 1) to a tensor of action
            probabilities.

    Returns:
        ``(states, actions, rewards, done)`` where ``states`` holds the
        state tensors fed to the policy, ``actions`` the sampled action
        indices, ``rewards`` the per-step rewards, and ``done`` the final
        termination flag from the environment.
    """
    state = env.reset()
    done = False
    states, actions, rewards = [], [], []

    while not done:
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        # The probabilities are only used for sampling, so skip building an
        # autograd graph for this forward pass.
        with torch.no_grad():
            action_probs = policy(state_tensor).detach().numpy().squeeze()
        action = np.random.choice(len(action_probs), p=action_probs)

        next_state, reward, done, _ = env.step(action)

        states.append(state_tensor)
        actions.append(action)
        rewards.append(reward)

        state = next_state

    return states, actions, rewards, done

# Helper function to sample a batch of tasks
def sample_tasks(num_tasks, dataset):
    """Return a list of *num_tasks* new ``NetworkEnv`` instances built on *dataset*."""
    return [NetworkEnv(dataset) for _ in range(num_tasks)]

# Hyperparameters
input_size = 3  # State size (3: RTT, CWND, Inflight)
hidden_size = 128  # Width of the policy network's hidden layer
output_size = 2  # Action size (2 paths)
alpha = 0.01  # Inner loop learning rate
beta = 0.001  # Outer loop (meta-update) learning rate
num_inner_steps = 5  # Gradient steps per task in the inner loop
num_meta_iterations = 50  # Number of meta-training iterations
num_tasks = 10  # Number of tasks to sample for meta-training

# Initialize the policy and the MAML trainer (environments are created below)
policy = PolicyNetwork(input_size, hidden_size, output_size)
maml = MAML(policy, alpha, beta, num_inner_steps)

# Sample tasks and train MAML
# Every task is a NetworkEnv over the same dataset, and the same task list is
# reused for all meta-iterations.
tasks = sample_tasks(num_tasks, dataset)
maml.train(tasks, num_meta_iterations)

Meta iteration 1/50 starting...
Average reward for this meta-update: -2.95
Meta iteration 1/50 complete
Meta iteration 2/50 starting...
Average reward for this meta-update: -0.02
Meta iteration 2/50 complete
Meta iteration 3/50 starting...
Average reward for this meta-update: 2.70
Meta iteration 3/50 complete
Meta iteration 4/50 starting...
Average reward for this meta-update: -1.92
Meta iteration 4/50 complete
Meta iteration 5/50 starting...
Average reward for this meta-update: 2.89
Meta iteration 5/50 complete
Meta iteration 6/50 starting...
Average reward for this meta-update: 1.26
Meta iteration 6/50 complete
Meta iteration 7/50 starting...
Average reward for this meta-update: 2.59
Meta iteration 7/50 complete
Meta iteration 8/50 starting...
Average reward for this meta-update: 4.00
Meta iteration 8/50 complete
Meta iteration 9/50 starting...
Average reward for this meta-update: 4.63
Meta iteration 9/50 complete
Meta iteration 10/50 starting...
Average reward for this meta-update: 