In [None]:
!pip install -U bitsandbytes

!pip install -U transformers peft accelerate
 
!pip install -U accelerate peft
 

In [None]:
!pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126
!pip install transformers peft accelerate datasets


In [None]:
!pip install safetensors


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer , BitsAndBytesConfig
import torch

# Release cached CUDA blocks that a previous run of this cell may have left behind.
torch.cuda.empty_cache()

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
quant_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_cfg = BitsAndBytesConfig(**quant_kwargs)

# device_map="auto" leaves device placement to the library.
load_kwargs = {"quantization_config": bnb_cfg, "device_map": "auto"}
base_model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

# The tokenizer pads with the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Simple in-memory adapter cache (placeholder; eviction still has to be written).
adapter_cache = {}
def get_adapter(adapter_name):
    """Return the adapter registered under ``adapter_name``, loading it on a miss.

    On a cache hit the stored object is returned unchanged. On a miss the adapter
    is loaded from ``./adapters/<adapter_name>``, stored in ``adapter_cache`` and
    returned; if the cache then holds more than ``CACHE_LIMIT`` entries,
    ``evict_one()`` is called.

    NOTE(review): ``load_adapter``, ``CACHE_LIMIT`` and ``evict_one`` are not
    defined in the visible part of this notebook, so the miss path cannot run yet.
    """
    try:
        return adapter_cache[adapter_name]
    except KeyError:
        pass

    loaded = load_adapter(f"./adapters/{adapter_name}")
    adapter_cache[adapter_name] = loaded
    if len(adapter_cache) > CACHE_LIMIT:
        evict_one()  # define later
    return loaded


In [None]:
#adapter from: https://huggingface.co/DreamGallery/Qwen-Qwen2.5-1.5B-Instruct-1727452927
# from peft import PeftModel

# model = PeftModel.from_pretrained(
#     base_model,
#     "DreamGallery/Qwen-Qwen2.5-1.5B-Instruct-1727452927",  # adapter path
#     trust_remote_code=True
# )


In [None]:
prompt = "Explain the difference between L1 and L2 regularization in machine learning."

# Tokenize first, then move the tensors to the device the base model lives on.
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(base_model.device)

with torch.no_grad():  # inference only, so no gradients are needed
    out = base_model.generate(max_new_tokens=64, **inputs)

completion = tokenizer.decode(out[0], skip_special_tokens=True)
print(completion)


In [None]:
# Prompt dataset "fka/awesome-chatgpt-prompts". It is tokenized later by
# tokenize_ds1, which reads its "act" and "prompt" columns.
from datasets import load_dataset

ds1 = load_dataset("fka/awesome-chatgpt-prompts")

In [None]:
from datasets import load_dataset

# Data-science instruction dataset. tokenize_ds later reads its "messages" and
# "evaluation" columns.
ds = load_dataset("RUC-DataLab/DataScience-Instruct-500K")

In [None]:
from datasets import load_dataset

# Medical reasoning dataset, "en" configuration. tokenize_ds3 later reads its
# "Question" and "Response" columns.
ds2 = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en")

In [None]:
import datasets
print(datasets.__version__)

In [None]:
from datasets import Dataset

# The mapped versions of these objects are indexed with ["train"] in the training
# cell, so they are presumably DatasetDicts keyed by split rather than plain Datasets.
print(type(ds))   # expected: <class 'datasets.dataset_dict.DatasetDict'> (confirm on first run)
print(type(ds1))  # expected: <class 'datasets.dataset_dict.DatasetDict'> (confirm on first run)

In [None]:
import datasets
print(datasets.__version__)

In [None]:
# Label value that the cross-entropy loss skips (PyTorch's default ignore_index).
IGNORE_INDEX = -100


def build_causal_lm_features(prompts, targets, tok=None, max_length=512, separator="\n"):
    """Turn prompt/target text pairs into fixed-length causal-LM training features.

    The previous implementation tokenized the prompt into ``input_ids`` and the
    target into ``labels`` separately (``text_target=``). For a decoder-only model
    the labels must line up token-for-token with ``input_ids``, and padded
    positions must not contribute to the loss. This helper therefore:

    * concatenates ``prompt + separator`` and ``target + EOS`` into one sequence,
    * sets labels to ``IGNORE_INDEX`` on the prompt and on padding, so only the
      target tokens are learned,
    * truncates so the prompt never takes more than the space the target leaves
      free, or half the window when both are long, so at least one target token
      always survives.

    Args:
        prompts: a list of prompt strings, or a single string (un-batched ``map``).
        targets: the matching target string(s).
        tok: tokenizer to use; defaults to the notebook-level ``tokenizer``.
        max_length: length every returned sequence is padded/truncated to.
        separator: text appended to each prompt before the target starts.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``. Values are
        lists of lists for batched input and flat lists for a single pair.
    """
    tok = tokenizer if tok is None else tok

    single = isinstance(prompts, str)
    if single:
        prompts, targets = [prompts], [targets]

    pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id
    features = {"input_ids": [], "attention_mask": [], "labels": []}

    for prompt_text, target_text in zip(prompts, targets):
        prompt_ids = list(tok(str(prompt_text) + separator, add_special_tokens=False)["input_ids"])
        target_ids = list(tok(str(target_text), add_special_tokens=False)["input_ids"])
        target_ids.append(tok.eos_token_id)  # teach the model where the answer ends

        if len(prompt_ids) + len(target_ids) > max_length:
            prompt_budget = max(max_length - len(target_ids), max_length // 2)
            prompt_ids = prompt_ids[:prompt_budget]
            target_ids = target_ids[: max_length - len(prompt_ids)]

        n_real = len(prompt_ids) + len(target_ids)
        n_pad = max_length - n_real
        features["input_ids"].append(prompt_ids + target_ids + [pad_id] * n_pad)
        features["attention_mask"].append([1] * n_real + [0] * n_pad)
        features["labels"].append(
            [IGNORE_INDEX] * len(prompt_ids) + target_ids + [IGNORE_INDEX] * n_pad
        )

    if single:
        return {key: rows[0] for key, rows in features.items()}
    return features


def tokenize_ds(batch):
    """Tokenize a batch with "messages" (prompt) and "evaluation" (target) columns.

    A list of message dicts is flattened to "role: content" lines; any other
    value is converted with ``str``.
    """
    prompts = []
    targets = []

    for messages, evaluation in zip(batch["messages"], batch["evaluation"]):
        if isinstance(messages, list):
            prompt = "\n".join([f"{m.get('role', '')}: {m.get('content', '')}" for m in messages])
        else:
            prompt = str(messages)

        prompts.append(prompt)
        targets.append(str(evaluation))

    return build_causal_lm_features(prompts, targets)


def tokenize_ds1(batch):
    """Tokenize a batch with "act" (prompt) and "prompt" (target) columns."""
    prompts = []
    targets = []

    for act, prompt in zip(batch["act"], batch["prompt"]):
        inst = " ".join(act) if isinstance(act, list) else str(act)
        prompts.append(inst)
        targets.append(str(prompt))

    return build_causal_lm_features(prompts, targets)


def tokenize_ds3(batch):
    """Tokenize "Question" (prompt) and "Response" (target); works batched or per-row."""
    return build_causal_lm_features(batch["Question"], batch["Response"])



# Tokenize in batches of 256 rows. The results are indexed with ["train"] in the
# training cell, so they are presumably DatasetDicts keyed by split.
train1 = ds.map(tokenize_ds, batched=True, batch_size=256)
train2 = ds1.map(tokenize_ds1, batched=True, batch_size=256)



In [None]:
# batched=False: tokenize_ds3 is called once per row, so batch["Question"] and
# batch["Response"] are presumably single values rather than lists.
train3 = ds2.map(tokenize_ds3, batched=False)

In [None]:
from peft import LoraConfig 
def _make_lora_cfg():
    """Build the rank-4 LoRA config (query/value projections) used by every adapter."""
    return LoraConfig(
        r=4,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.1,
        task_type="CAUSAL_LM",
    )


# Two separate config objects with identical hyper-parameters.
lora_cfg_1 = _make_lora_cfg()
lora_cfg_2 = _make_lora_cfg()


In [None]:

cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
# get_device_name(0) raises when no CUDA device exists, so only query it when one does.
print("Current device:", torch.cuda.get_device_name(0) if cuda_ok else "CPU (no CUDA device)")

In [None]:
!nvidia-smi

In [None]:
from peft import get_peft_model
from transformers import TrainingArguments, Trainer
torch.cuda.empty_cache()


def train_and_save_adapter(lora_cfg, train_split, output_dir, save_dir, num_samples=100):
    """Train one LoRA adapter on top of the shared ``base_model`` and save it.

    ``get_peft_model`` injects the LoRA layers into ``base_model`` in place, so
    deleting the returned wrapper does not remove them. The original cell wrapped
    the same ``base_model`` three times, which stacked each new adapter on the
    previous one. Here the LoRA layers are stripped with ``unload()`` once the
    adapter is saved, so every adapter starts from the clean base model.

    Args:
        lora_cfg: ``LoraConfig`` describing the adapter.
        train_split: tokenized training split to draw samples from.
        output_dir: Trainer checkpoint directory.
        save_dir: directory the adapter weights are written to.
        num_samples: number of leading rows used, capped at the split size
            (kept small for faster runtime).
    """
    peft_model = get_peft_model(base_model, lora_cfg)
    training_args = TrainingArguments(
        per_device_train_batch_size=1,
        num_train_epochs=1,
        learning_rate=2e-4,
        output_dir=output_dir,
        gradient_accumulation_steps=1,
        logging_dir="./logs",         # where logs are saved
        logging_steps=5,              # print every 5 steps
        report_to="none",             # disable WandB, keep console output
        disable_tqdm=False            # show progress bar
    )
    subset = train_split.select(range(min(num_samples, len(train_split))))

    trainer = Trainer(model=peft_model, args=training_args, train_dataset=subset)
    trainer.train()
    peft_model.save_pretrained(save_dir)

    # Remove the injected LoRA modules (without merging) so base_model is clean again.
    peft_model.unload()
    del trainer, peft_model
    torch.cuda.empty_cache()


train_and_save_adapter(lora_cfg_1, train1["train"], "./adapter1", "./adapters/adapter1")
train_and_save_adapter(lora_cfg_2, train2["train"], "./adapter2", "./adapters/adapter2")

#adapter 3 (reuses lora_cfg_2, as in the original cell):
train_and_save_adapter(lora_cfg_2, train3["train"], "./adapter3", "./adapters/adapter3")


In [None]:
import os
from safetensors.torch import load_file

def get_dir_size_mb(path):
    """Return the combined size, in MB (1024 * 1024 bytes), of all regular files under ``path``.

    The directory tree is walked recursively; entries that are not regular files
    are skipped. An empty or missing directory yields 0.
    """
    candidates = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(path)
        for filename in filenames
    )
    n_bytes = sum(os.path.getsize(fp) for fp in candidates if os.path.isfile(fp))
    return n_bytes / (1024 * 1024)  # MB

def get_adapter_vram_mb(adapter_path, dtype_bytes=2):
    """Estimate the memory, in MB, needed to hold an adapter's weights.

    Reads ``adapter_model.safetensors`` inside ``adapter_path``, counts the
    elements of every tensor in it and multiplies by ``dtype_bytes``.

    dtype_bytes:
        fp16 / bf16 = 2
        fp32 = 4
    """
    weights = load_file(os.path.join(adapter_path, "adapter_model.safetensors"))  # SAFE loader
    n_params = 0
    for tensor in weights.values():
        n_params += tensor.numel()
    return n_params * dtype_bytes / (1024 * 1024)

# Configuration
# Use the same relative directory that the training cell saves to and the
# inference cell loads from ("./adapters/<name>"), instead of a Kaggle-only
# absolute path.
base_path = "./adapters"
adapter_names = ["adapter1", "adapter2", "adapter3"]

# Process each adapter
print("=" * 55)
print("ADAPTER SIZE ANALYSIS")
print("=" * 55)

for adapter_name in adapter_names:
    adapter_path = os.path.join(base_path, adapter_name)

    print(f"\n📦 {adapter_name}")
    print("-" * 30)

    if os.path.exists(adapter_path):
        # Disk size: every file in the adapter directory
        size_mb = get_dir_size_mb(adapter_path)
        print(f"  💾 Disk size:   {size_mb:.2f} MB")

        # VRAM size: parameter count * 2 bytes (fp16 / bf16)
        vram_cost = get_adapter_vram_mb(adapter_path, dtype_bytes=2)
        print(f" VRAM cost:   {vram_cost:.2f} MB")
    else:
        print(f" Directory not found: {adapter_path}")

print("\n" + "=" * 55)

In [None]:
from transformers import  AutoTokenizer
from peft import PeftModel

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
adapter_path = "./adapters/adapter1"

tokenizer = AutoTokenizer.from_pretrained(model_name)


# Attach the saved adapter weights to the already-loaded base model.
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval()

text ="""### Instruction:
explain the difference between L1 and L2 regularization in machine learning.

### Input:

### Response:"""

# Fix: tokenize `text`. The original tokenized `prompt`, a leftover global from
# another cell, so the instruction template above was never used.
# The tensors follow the base model's device instead of a hard-coded "cuda".
inputs = tokenizer(text, return_tensors="pt").to(base_model.device)
outputs = model.generate(**inputs, max_new_tokens=150, temperature=0.7, top_p=0.9)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)

In [None]:
!git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2


In [None]:

from sentence_transformers import SentenceTransformer , util

# Keep the sentence-embedding model under its own name. Other cells bind the
# generic name `model` to a PeftModel and to a PPO agent, which would silently
# break select_adapter if it looked up `model` at call time.
embedding_model = SentenceTransformer("./all-MiniLM-L6-v2")
adapter_tasks = [
    "role-playing instruction following across many creative and utility roles",
    "data science and machine learning problem solving with step-by-step reasoning",
    "medical diagnosis and clinical reasoning with detailed chain-of-thought explanations",
    "summarization of long articles"
]
adapter_embs = embedding_model.encode(adapter_tasks)

# Show only the shape; the full embedding matrix is too large to be useful output.
print("adapter embedding matrix shape:", adapter_embs.shape)
def select_adapter(user_prompt):
    """Pick the task description most similar to ``user_prompt``.

    The prompt is embedded with the sentence-embedding model and compared to
    every entry of ``adapter_tasks`` by cosine similarity.

    Returns:
        (task_description, similarity) for the best-matching entry of
        ``adapter_tasks``.
    """
    prompt_emb = embedding_model.encode([user_prompt], normalize_embeddings=True)[0]
    sims = util.cos_sim(prompt_emb, adapter_embs)[0]
    top = sims.argmax().item()
    best_score = sims[top].item()
    return adapter_tasks[top], best_score
task, score = select_adapter("Can you summarize this text?")
print(task, score)


In [None]:
# Route a medical-style question through the embedding-based selector.
# NOTE: this rebinds the notebook-level `prompt` variable.
prompt = "Explain how blood pressure works in the human body."
# select_adapter returns (task description, similarity score) for the best match.
adapter, scores = select_adapter(prompt)
print(adapter)   # best-matching task description
print(scores)    # its cosine similarity (a single float)


In [None]:
# import numpy as np
# import random

# class AdapterEnv:

#     def __init__(self, adapters, gpus):
     

#         self.adapters = adapters
#         self.gpus = gpus
#         self.queue_length = 0
#         self.avg_latency = 0
#         self.throughput = 0
#         self.baseline_inference_time = 1.0  # normalized baseline

#         self.current_query = None
#         self.current_step = 0

#     # STATE
#     def _get_state(self):
#         """Return the full state vector."""

#         # Adapter characteristics
#         adapter_vec = []
#         for a in self.adapters.values():
#             adapter_vec.extend([
#                 a["size_mb"],
#                 a["mem_required"],
#                 a["load_time"],
#                 a["latency"],
#                 a["domain_id"],
#                 1 if a["loaded"] else 0,
#                 a["which_gpu"] if a["loaded"] else -1
#             ])

#         # GPU characteristics
#         gpu_vec = []
#         for g in self.gpus.values():
#             gpu_vec.extend([
#                 g["free_vram"],
#                 g["used_vram"],
#                 len(g["loaded_adapters"]),
#                 g["eviction_time"]
#             ])

#         # System context
#         sys_vec = [
#             self.queue_length,
#             self.avg_latency,
#             self.throughput
#         ]

#         return np.array(adapter_vec + gpu_vec + sys_vec, dtype=np.float32)


#     # ACTION SPACE

#     def action_space(self):
#         """
#         Actions are encoded like this:
#         type 0 → selection (pick adapter id or  -1 = no-op)
#         type 1 → placement (load / evict / reuse / swap)
#         type 2 → routing (send query to GPU id)
#         """

#         selection_actions = list(self.adapters.keys()) + [-1]  # -1 = do nothing
#         placement_actions = ["load", "evict", "reuse", "swap"]
#         routing_actions = list(self.gpus.keys())

#         return {
#             "selection": selection_actions,
#             "placement": placement_actions,
#             "routing": routing_actions
#         }

#     def sample_action(self):
#         """Randomly choose valid actions."""
#         return {
#             "selection": random.choice(self.action_space()["selection"]),
#             "placement": random.choice(self.action_space()["placement"]),
#             "routing": random.choice(self.action_space()["routing"])
#         }

#     # STEP 
#     def step(self, action, user_query):
#         """
#         user_query contains:
#             - domain_id
#             - estimated_load
#         """

#         self.current_query = user_query
#         selected_adapter = action["selection"]
#         placement = action["placement"]
#         chosen_gpu = action["routing"]

#         reward = 0
#         cost = 0

#         # SELECTION COST
#         if selected_adapter != -1:
#             correct = (self.adapters[selected_adapter]["domain_id"] ==
#                        user_query["domain_id"])
#             reward += 1 if correct else -1

#         # PLACEMENT COST (LOAD / EVICT / REUSE / SWAP)
  
#         if placement == "load":
#             a = self.adapters[selected_adapter]
#             g = self.gpus[chosen_gpu]
#             # simulate load time and VRAM use
#             cost += a["load_time"]
#             g["free_vram"] -= a["mem_required"]
#             g["used_vram"] += a["mem_required"]
#             a["loaded"] = True
#             a["which_gpu"] = chosen_gpu
#             g["loaded_adapters"].append(selected_adapter)

#         elif placement == "evict":
#             a = self.adapters[selected_adapter]
#             g = self.gpus[a["which_gpu"]]
#             # eviction time
#             cost += g["eviction_time"]
#             g["free_vram"] += a["mem_required"]
#             g["used_vram"] -= a["mem_required"]
#             a["loaded"] = False
#             a["which_gpu"] = -1
#             if selected_adapter in g["loaded_adapters"]:
#                 g["loaded_adapters"].remove(selected_adapter)

#         elif placement == "swap":
#             cost += 2  # arbitrary extra penalty

        
#         # ROUTING COST (INFERENCE)
#         if selected_adapter != -1:
#             adapter_latency = self.adapters[selected_adapter]["latency"]
#         else:
#             adapter_latency = 2  # penalize doing nothing

#         inference_time = adapter_latency
#         cost += inference_time / self.baseline_inference_time

#         # GPU balancing reward
#         gpu_loads = [g["used_vram"] for g in self.gpus.values()]
#         balance = np.std(gpu_loads)
#         reward -= balance  # penalize imbalance

#         # Final reward = positive selection score - costs
#         final_reward = reward - cost

#         # update system context
#         self.avg_latency = 0.9 * self.avg_latency + 0.1 * inference_time
#         self.queue_length = max(0, self.queue_length - 1)

#         # next state
#         next_state = self._get_state()
#         done = False

#         return next_state, final_reward, done, {}

#     # ----------------------------------------------------------------
#     # RESET
#     # ----------------------------------------------------------------
#     def reset(self):
#         self.queue_length = 0
#         self.avg_latency = 0
#         self.throughput = 0
#         for a in self.adapters.values():
#             a["loaded"] = False
#             a["which_gpu"] = -1
#         for g in self.gpus.values():
#             g["loaded_adapters"].clear()
#             g["used_vram"] = 0
#             g["free_vram"] = g["total_vram"]
#         return self._get_state()


In [None]:
import pandas as pd
# Trace CSV read from a Kaggle input path; `df` is later handed to AdapterEnv.
# NOTE(review): the path is absolute and only resolves on Kaggle.
df = pd.read_csv('/kaggle/input/google-cluster-sample/borg_traces_data.csv')
df.head()


In [None]:
df.collection_type.unique()

In [None]:
from typing import Tuple
import gymnasium as gym
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

MAX_GPU_VRAM_MB = 2        # divisor that turns adapter VRAM (MB) into a fraction of one GPU
MAX_DISK_MB = 100          # max adapter size 

class AdapterEnv(gym.Env):
    """Toy Gymnasium environment for placing LoRA adapters on simulated GPUs.

    Actions are one flat ``Discrete`` index, decoded in ``step`` as
    ``action_type * (num_adapters * num_gpus) + adapter_id * num_gpus + gpu_id``
    with ``action_type`` 0=select, 1=load, 2=evict, 3=reuse, 4=send.

    Observations are a float32 vector in [0, 1]: four features per adapter
    (loaded flag, VRAM cost, load cost, reuse age), two per GPU (free VRAM,
    fraction of adapters loaded) and three system statistics (queue length,
    average latency, throughput).
    """

    metadata = {"render_modes": []}

    def __init__(self, df: pd.DataFrame, num_adapters=3, num_gpus=2): #change nums here
        """Store the trace frame and set up spaces plus per-adapter/per-GPU state."""
        super().__init__()
        self.df = df
        self.num_adapters = num_adapters
        self.num_gpus = num_gpus
        self.current_idx = 0
        self.vram_capacity = 1.0
        self.baseline_latency = 0.5
        self.max_throughput = 1.0
        adapter_vram_mb = np.array([1.04] * num_adapters, dtype=np.float32)
        adapter_disk_mb = np.array([6.37] * num_adapters, dtype=np.float32)
        # GPU and adapter state
        self.adapter_loaded = np.zeros((num_gpus, num_adapters), dtype=np.float32)
        self.free_vram = np.ones(num_gpus, dtype=np.float32)
        self.queue_length = 0.0
        self.avg_latency = 0.0
        self.throughput = 0.0
        self.adapter_vram = adapter_vram_mb / MAX_GPU_VRAM_MB
        self.adapter_load_cost = adapter_disk_mb / MAX_DISK_MB
        self.adapter_last_used = np.zeros(num_adapters, dtype=np.int32)
        # Actions: 5 action types for every (adapter, gpu) pair. The original used
        # 5 * num_adapters, which made action types 3 (reuse) and 4 (send)
        # unreachable with the decoding in step().
        self.num_actions = 5 * self.num_adapters * self.num_gpus
        self.action_space = gym.spaces.Discrete(self.num_actions)

        # Observation
        self.adapter_feat_len = 4
        self.gpu_feat_len = 2
        obs_size = (num_adapters * self.adapter_feat_len) + (num_gpus * self.gpu_feat_len) + 3
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(obs_size,), dtype=np.float32)

    def _get_obs(self):
        """Build the observation vector: adapter stats, then GPU stats, then system stats."""
        adapter_stats = []
        for a in range(self.num_adapters):
            loaded = float(self.adapter_loaded[:, a].any())
            vram_cost = self.adapter_vram[a]   # normalized
            load_cost = self.adapter_load_cost[a]
            # Steps since last use, scaled by 1000 and capped at 1.
            reuse_age = min(
                (self.current_idx - self.adapter_last_used[a]) / 1000.0, 1.0)
            adapter_stats.extend([loaded, vram_cost, load_cost, reuse_age])
        adapter_stats = np.array(adapter_stats, dtype=np.float32)

        gpu_stats = []
        for gpu in range(self.num_gpus):
            # Fraction (not count) of adapters resident on this GPU, so the value
            # stays inside the declared [0, 1] observation bounds.
            loaded_fraction = np.sum(self.adapter_loaded[gpu]) / max(self.num_adapters, 1)
            gpu_stats.extend([self.free_vram[gpu], loaded_fraction])
        gpu_stats = np.array(gpu_stats, dtype=np.float32)
        system_stats = np.array([self.queue_length, self.avg_latency, self.throughput], dtype=np.float32)
        obs = np.concatenate([adapter_stats, gpu_stats, system_stats]).astype(np.float32)
        # Guard against float32 round-off pushing free_vram marginally outside [0, 1].
        return np.clip(obs, 0.0, 1.0)

    def reset(self, seed=None, options=None):
        """Start a new episode: unload everything and restore the initial statistics."""
        super().reset(seed=seed)
        self.adapter_loaded[:] = 0.0
        self.free_vram[:] = 1.0
        #self.queue_length = 0.5
        self.avg_latency = 0.5
        self.throughput = 0.5
        self.current_idx = 0
        # Without this, last-used steps from the previous episode exceed
        # current_idx and produce a negative reuse age.
        self.adapter_last_used[:] = 0
        return self._get_obs(), {}
        
    def step(self, action):
        """Apply one flat action and return ``(obs, reward, terminated, truncated, info)``."""
        action = int(action)
        # decode action type
        pairs = self.num_adapters * self.num_gpus
        action_type = action // pairs
        rem = action % pairs
        adapter_id = rem // self.num_gpus
        gpu_id = rem % self.num_gpus
        # ---------------- Reward shaping ---------------- #

        # 1. Execution reward: merely selecting earns nothing, load/reuse/send earn 1.
        r_execution = 1.0 if action_type in (1, 3, 4) else 0.0

        # 2. Latency penalty
        r_latency = - (self.avg_latency / self.baseline_latency)

        # 3. Throughput reward
        r_throughput = self.throughput / self.max_throughput

        # 4. VRAM safety
        vram_used = 1.0 - self.free_vram[gpu_id]
        if vram_used > self.vram_capacity:
            r_vram = -1.0
        else:
            r_vram = 0.1 * (1 - vram_used / self.vram_capacity)

        # 5. Eviction & swap penalties
        if action_type == 2:
            # Evicting is rewarded only when no adapter would fit in the free VRAM.
            r_evict = 0.2 if self.free_vram[gpu_id] < np.min(self.adapter_vram) else -0.1
        else:
            r_evict = 0.0
        needs_swap = (
            action_type == 1
            and self.adapter_loaded[gpu_id].any()
            and self.free_vram[gpu_id] < self.adapter_vram[adapter_id]
        )
        r_swap = -0.6 if needs_swap else 0.0

        # 6. GPU balancing
        gpu_vram_usages = 1.0 - self.free_vram
        r_balance = -np.std(gpu_vram_usages)
        if action_type in {1, 3, 4}:  # load, reuse, send
            self.adapter_last_used[adapter_id] = self.current_idx
        # Final reward
        reward = (
            0.05 * r_execution +
            0.02 * r_latency +
            0.01 * r_throughput +
            0.01 * r_balance +
            0.001 * (r_vram + r_evict + r_swap)
        )

        # ----- Action effects ----- #
        if action_type == 1:  # load
            vram_cost = self.adapter_vram[adapter_id]

            if self.free_vram[gpu_id] >= vram_cost:
                if self.adapter_loaded[gpu_id, adapter_id] == 0.0:
                    self.adapter_loaded[gpu_id, adapter_id] = 1.0
                    self.free_vram[gpu_id] -= vram_cost
                    reward -= self.adapter_load_cost[adapter_id]
                else:
                    reward -= 1  # already loaded
            else:
                reward -= 1.0  # illegal load (OOM)

        elif action_type == 2:  # evict
            if self.adapter_loaded[gpu_id, adapter_id] == 1.0:
                self.adapter_loaded[gpu_id, adapter_id] = 0.0
                self.free_vram[gpu_id] += self.adapter_vram[adapter_id]
            else:
                reward -= 1  # useless eviction

        elif action_type == 3:  # reuse
            if self.adapter_loaded[gpu_id, adapter_id] == 1.0:
                reward += 1
            else:
                reward -= 1

        elif action_type == 4:  # send
            if self.adapter_loaded[gpu_id, adapter_id] == 1.0:
                reward += 1.0

        # move to next job
        self.current_idx += 1
        terminated = self.current_idx >= 200  # fixed episode length of 200 steps
        truncated = False

        return self._get_obs(), float(reward), terminated, truncated,  {}

    def render(self):
        """Rendering is not implemented."""
        pass


# Make sure the environment works properly
env = AdapterEnv(df, num_adapters=3, num_gpus=2)
check_env(env, warn=True)
# ------------------ Training ------------------ #
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=256,
    ent_coef=0.02,           # More exploration
    vf_coef=0.5,             # Focus on value function
    gamma=0.99
)

# Train for a reasonable number of timesteps
print("Adapters:", env.num_adapters, "GPUs:", env.num_gpus, "Actions:", env.action_space.n)
model.learn(total_timesteps=100_000)

# Save the trained model
model.save("adapter_ppo")
print("Training complete.")

# ------------------ Inference ------------------ #
obs, _ = env.reset()
for i in range(50):  # roll out 50 greedy steps
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    print(f"Step {i}: Action {action}, Reward {reward}")
    # An episode can end by termination or by truncation; restart in both cases.
    if terminated or truncated:
        obs, _ = env.reset()
    

In [None]:
# import gymnasium as gym
# from collections import defaultdict
# import random
# import numpy as np

# MAX_GPU_VRAM_MB = 2
# MAX_DISK_MB = 100

# class AdapterEnv(gym.Env):

#     def __init__(self, num_adapters=3, num_gpus=2):
#         super().__init__()

#         self.num_adapters = num_adapters
#         self.num_gpus = num_gpus

#         self.adapter_vram = np.array([1.04] * num_adapters) / MAX_GPU_VRAM_MB
#         self.adapter_load_cost = np.array([6.37] * num_adapters) / MAX_DISK_MB

#         # 4 actions × adapter × gpu
#         self.num_actions = 4 * num_adapters * num_gpus
#         self.action_space = gym.spaces.Discrete(self.num_actions)

#         self.reset()

#     def reset(self, seed=None, options=None):
#         self.adapter_loaded = np.zeros((self.num_gpus, self.num_adapters), dtype=int)
#         self.free_vram = np.ones(self.num_gpus)

#         self.queue_length = np.random.randint(5, 15)   
#         self.step_count = 0

#         return self._get_state(), {}

#     def _get_state(self):
#         return (
#             tuple(self.adapter_loaded.flatten()),
#             tuple(np.round(self.free_vram, 1)),
#             self.queue_length                              
#         )

#     def step(self, action):
#         action_type = action // (self.num_adapters * self.num_gpus)
#         rem = action % (self.num_adapters * self.num_gpus)
#         adapter_id = rem // self.num_gpus
#         gpu_id = rem % self.num_gpus

#         reward = 0.0

#         # ---- LOAD ----
#         if action_type == 0:
#             cost = self.adapter_vram[adapter_id]
#             if self.free_vram[gpu_id] >= cost and self.adapter_loaded[gpu_id, adapter_id] == 0:
#                 self.adapter_loaded[gpu_id, adapter_id] = 1
#                 self.free_vram[gpu_id] -= cost
#                 reward = +0.5
#             else:
#                 reward = -0.5

#         # ---- EVICT ----
#         elif action_type == 1:
#             if self.adapter_loaded[gpu_id, adapter_id] == 1:
#                 self.adapter_loaded[gpu_id, adapter_id] = 0
#                 self.free_vram[gpu_id] += self.adapter_vram[adapter_id]
#                 reward = +0.2
#             else:
#                 reward = -0.2

#         # ---- REUSE ----
#         elif action_type == 2:
#             if self.adapter_loaded[gpu_id, adapter_id] == 1 and self.queue_length > 0:
#                 self.queue_length -= 1
#                 reward = +1.5  # Same as load+send, but saves VRAM
         
                
#             else:
#                 reward = -0.5

#         # ---- SEND  ----
#         elif action_type == 3:
#             if self.adapter_loaded[gpu_id, adapter_id] and self.queue_length > 0:
#                 self.queue_length -= 1                    
#                 reward = +1.0                               
#             else:
#                 reward = -1.0

#         # ---- Arrival process ----
#         arrivals = np.random.poisson(0.3)               
#         self.queue_length += arrivals
#         self.queue_length = max(self.queue_length, 0)
#         if self.queue_length == 0:
#             reward += 20.0   # strong terminal reward
#         self.step_count += 1
#         reward -= 0.05
#         # Episode ends when work is done OR timeout
#         terminated = self.queue_length == 0 or self.step_count >= 200  

#         return self._get_state(), reward, terminated, False, {}

# env = AdapterEnv(num_adapters=3, num_gpus=2)

# # Q-table
# Q = defaultdict(lambda: np.zeros(env.action_space.n))

# alpha = 0.1
# gamma = 0.95
# epsilon = 0.2
# episodes = 5000

# for ep in range(episodes):
#     state, _ = env.reset()
#     done = False

#     while not done:
#         if random.random() < epsilon:
#             action = env.action_space.sample()
#         else:
#             action = np.argmax(Q[state])

#         next_state, reward, done, _, _ = env.step(action)

#         # Q-learning update
#         Q[state][action] += alpha * (
#             reward + gamma * np.max(Q[next_state]) - Q[state][action]
#         )

#         state = next_state

#     if ep % 500 == 0:
#         print(f"Episode {ep}")
# print("\n--- Q-table inspection ---")

# sample_state = next(iter(Q))
# print("Sample state:")
# print(sample_state)

# print("\nQ-values:")
# for i, v in enumerate(Q[sample_state]):
#     print(f"Action {i:2d}: {v:.3f}")


In [None]:
def evaluate_policy(num_episodes=200, env=None, q_table=None):
    """Run the greedy policy of a Q-table for several episodes and report statistics.

    Args:
        num_episodes: number of evaluation episodes.
        env: environment with Gymnasium-style ``reset()`` / ``step()``. Defaults
            to the notebook-level ``env``.
        q_table: mapping ``state -> array of action values``. Defaults to the
            notebook-level ``Q``.

    Returns:
        Mean episode reward.

    Raises:
        NameError: if a default is needed but ``env`` / ``Q`` is not defined yet.

    Differences from the original: an episode also ends when the environment
    reports ``truncated``, and states missing from the table fall back to
    action 0 (the argmax of an all-zero row) without being inserted into the
    table, so evaluation no longer grows a ``defaultdict``.
    """
    namespace = globals()
    if env is None:
        if "env" not in namespace:
            raise NameError("evaluate_policy needs `env`; run the environment cell first or pass env=...")
        env = namespace["env"]
    if q_table is None:
        if "Q" not in namespace:
            raise NameError("evaluate_policy needs `Q`; run the Q-learning cell first or pass q_table=...")
        q_table = namespace["Q"]

    rewards = []
    steps_to_completion = []

    for _ in range(num_episodes):
        state, _ = env.reset()
        episode_reward = 0
        steps = 0

        while True:
            action_values = q_table.get(state)
            action = 0 if action_values is None else np.argmax(action_values)
            state, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            steps += 1

            if terminated or truncated:
                rewards.append(episode_reward)
                steps_to_completion.append(steps)
                break

    print(f"\nPolicy Evaluation ({num_episodes} episodes):")
    print(f"Average reward: {np.mean(rewards):.2f} ± {np.std(rewards):.2f}")
    print(f"Average steps to complete: {np.mean(steps_to_completion):.2f}")
    print(f"Success rate: {100 * np.mean([r > 0 for r in rewards]):.1f}%")

    return np.mean(rewards)

# NOTE(review): evaluate_policy reads the globals `env` and `Q`. In this notebook `Q` is
# first created in the Q-learning cell further down, so that cell must run before this call.
avg_reward = evaluate_policy()

In [None]:
import gymnasium as gym
from collections import defaultdict
import random
import numpy as np

MAX_GPU_VRAM_MB = 2    # divisor that turns adapter VRAM (MB) into a fraction of one GPU
MAX_DISK_MB = 100      # divisor that normalizes adapter load cost

class AdapterEnv(gym.Env):
    """Queue-draining adapter-placement environment used for tabular Q-learning.

    Flat actions decode as
    ``action_type * (num_adapters * num_gpus) + adapter_id * num_gpus + gpu_id``
    with ``action_type`` 0=load, 1=evict, 2=reuse, 3=send. An episode ends when
    the request queue is empty or after 200 steps.

    States are hashable tuples ``(loaded flags, free VRAM per GPU, queue length)``
    so they can serve as Q-table keys.
    NOTE(review): this does not match the declared ``Dict`` observation space.
    """

    def __init__(self, num_adapters=3, num_gpus=2):
        """Set up per-adapter costs and the action/observation spaces."""
        super().__init__()

        self.num_adapters = num_adapters
        self.num_gpus = num_gpus

        # Different adapter sizes for more realism
        # NOTE(review): both arrays hold exactly three entries, so num_adapters > 3
        # would index out of range in step().
        self.adapter_vram = np.array([0.8, 1.0, 1.2]) / MAX_GPU_VRAM_MB  # Different sizes
        self.adapter_load_cost = np.array([5.0, 6.37, 7.5]) / MAX_DISK_MB

        # 4 actions × adapter × gpu
        self.num_actions = 4 * num_adapters * num_gpus
        self.action_space = gym.spaces.Discrete(self.num_actions)

        # Define observation space properly
        self.observation_space = gym.spaces.Dict({
            'loaded': gym.spaces.MultiBinary(num_gpus * num_adapters),
            'free_vram': gym.spaces.Box(low=0.0, high=1.0, shape=(num_gpus,)),
            'queue': gym.spaces.Discrete(100)  # Max queue length
        })

    def reset(self, seed=None, options=None):
        """Begin an episode with empty GPUs and a random queue of 5-14 requests."""
        super().reset(seed=seed)
        self.adapter_loaded = np.zeros((self.num_gpus, self.num_adapters), dtype=int)
        self.free_vram = np.ones(self.num_gpus)
        # Draw from the environment's own generator so reset(seed=...) is reproducible.
        # The original used the global np.random, which ignores the seed.
        self.queue_length = int(self.np_random.integers(5, 15))
        self.step_count = 0
        self.total_reward = 0.0

        return self._get_state(), {}

    def _get_state(self):
        """Return the current state as a hashable tuple (no rounding of free VRAM)."""
        return (
            tuple(self.adapter_loaded.flatten()),
            tuple(self.free_vram),  # No rounding!
            self.queue_length
        )

    def step(self, action):
        """Apply one flat action; return ``(state, reward, terminated, False, {})``."""
        action_type = action // (self.num_adapters * self.num_gpus)
        rem = action % (self.num_adapters * self.num_gpus)
        adapter_id = rem // self.num_gpus
        gpu_id = rem % self.num_gpus

        reward = 0.0
        processed_request = False

        # ---- LOAD ----
        if action_type == 0:
            cost = self.adapter_vram[adapter_id]
            if self.free_vram[gpu_id] >= cost and self.adapter_loaded[gpu_id, adapter_id] == 0:
                self.adapter_loaded[gpu_id, adapter_id] = 1
                self.free_vram[gpu_id] -= cost
                reward = +0.3  # Smaller reward for preparation
            else:
                reward = -0.3  # Smaller penalty

        # ---- EVICT ----
        elif action_type == 1:
            if self.adapter_loaded[gpu_id, adapter_id] == 1:
                # Measure VRAM pressure BEFORE freeing. The original checked after
                # adding the adapter's VRAM back, when free VRAM is always >= 0.4,
                # so the "useful eviction" reward could never trigger.
                vram_was_scarce = self.free_vram[gpu_id] < 0.3
                self.adapter_loaded[gpu_id, adapter_id] = 0
                self.free_vram[gpu_id] += self.adapter_vram[adapter_id]
                # Reward eviction only if it was useful; penalize it otherwise.
                reward = +0.1 if vram_was_scarce else -0.1
            else:
                reward = -0.2

        # ---- REUSE ----
        elif action_type == 2:
            if self.adapter_loaded[gpu_id, adapter_id] == 1 and self.queue_length > 0:
                self.queue_length -= 1  # PROCESS REQUEST FIRST
                processed_request = True
            else:
                reward = -0.3

        # ---- SEND ----
        elif action_type == 3:
            if self.adapter_loaded[gpu_id, adapter_id] == 1 and self.queue_length > 0:
                self.queue_length -= 1
                processed_request = True
                reward = +1.0  # Primary way to process requests
            else:
                reward = -0.5

        # ---- Arrival process ----
        arrivals = int(self.np_random.poisson(0.3))
        self.queue_length += arrivals
        self.queue_length = max(self.queue_length, 0)

        # Efficiency penalty (encourages faster processing)
        reward -= 0.02  # Small time penalty

        # Bonus for processing requests
        if processed_request:
            reward += 0.1  # Small bonus for being productive

        self.step_count += 1
        self.total_reward += reward

        # Episode ends when work is done OR timeout
        terminated = self.queue_length == 0 or self.step_count >= 200

        # Terminal reward (only if queue actually cleared)
        if terminated and self.queue_length == 0:
            # Scale terminal reward by efficiency
            efficiency = self.total_reward / self.step_count if self.step_count > 0 else 0
            terminal_bonus = 10.0 + 5.0 * efficiency
            reward += terminal_bonus
        # Additional flat per-step penalty; it is not included in total_reward.
        reward -= 0.05
        return self._get_state(), reward, terminated, False, {}

# Testing function
def test_environment():
    """Smoke-test a default ``AdapterEnv`` by stepping a few hand-picked actions.

    Three scenarios are printed for manual inspection (nothing is asserted):
    a load followed by a send, a send with nothing loaded, and a reuse with
    nothing loaded.

    Returns:
        The environment instance that was exercised.
    """
    env = AdapterEnv()

    print("Testing environment actions:")

    # Scenario 1: load adapter 0 on GPU 0 (action 0), then send it (action 18).
    initial_obs, _ = env.reset()
    print(f"Initial state: Queue={initial_obs[2]}, Free VRAM={initial_obs[1]}")

    obs, step_reward, _terminated, _truncated, _info = env.step(0)
    print(f"Action 0 (Load A0→G0): Reward={step_reward:.2f}, Queue={obs[2]}")

    # 18 = 3*6 + 0*2 + 0, i.e. action type 3 (send), adapter 0, GPU 0.
    obs, step_reward, _terminated, _truncated, _info = env.step(18)
    print(f"Action 18 (Send A0→G0): Reward={step_reward:.2f}, Queue={obs[2]}")

    # Scenario 2: send on a freshly reset env, where nothing has been loaded.
    env.reset()
    obs, step_reward, _terminated, _truncated, _info = env.step(18)
    print(f"\nSend without loading: Reward={step_reward:.2f} (should be negative)")

    # Scenario 3: reuse on a freshly reset env.
    env.reset()
    # 12 = 2*6 + 0*2 + 0, i.e. action type 2 (reuse), adapter 0, GPU 0.
    reuse_action = 2 * 6 + 0 * 2 + 0
    obs, step_reward, _terminated, _truncated, _info = env.step(reuse_action)
    print(f"Reuse without loading: Reward={step_reward:.2f} (should be negative)")

    return env

# Run tests
# The smoke test returns the environment it exercised; that same instance is
# reused for training and evaluation below.
env = test_environment()

# Now train with the fixed environment
print("\n" + "="*50)
print("Training Q-learning agent...")
print("="*50)

# Q-table
# Maps a state to an array holding one Q-value per action. Indexing an unseen
# state creates an all-zero row for it.
Q = defaultdict(lambda: np.zeros(env.action_space.n))

alpha = 0.1      # learning rate of the Q-update
gamma = 0.95     # discount factor applied to the bootstrapped next-state value
epsilon = 0.2    # initial exploration probability (decayed after every episode)
episodes = 5000  # number of training episodes

# Track learning progress
episode_rewards = []  # total reward collected in each episode
episode_lengths = []  # number of steps taken in each episode

for ep in range(episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    steps = 0

    while not done:
        # Epsilon-greedy
        # Explore with probability epsilon, and always when the state has no
        # Q-table row yet (the membership test does not create one).
        if random.random() < epsilon or state not in Q:
            action = env.action_space.sample()
        else:
            # Exploit: pick the action with the highest learned value
            action = np.argmax(Q[state])

        # Only the third return value is used as the episode-end flag; the
        # fourth one is ignored here.
        next_state, reward, done, _, _ = env.step(action)

        # Q-learning update (handle terminal states)
        if done:
            # Terminal state has value 0
            target = reward
        else:
            target = reward + gamma * np.max(Q[next_state])
        
        # Move the current estimate a fraction alpha towards the target
        Q[state][action] += alpha * (target - Q[state][action])

        state = next_state
        episode_reward += reward
        steps += 1

    episode_rewards.append(episode_reward)
    episode_lengths.append(steps)
    
    # Decay epsilon
    # Multiplicative decay with a floor of 0.01
    epsilon = max(0.01, epsilon * 0.999)
    
    if ep % 500 == 0:
        # Avg100 is reported as 0 until at least 100 episodes have finished;
        # the epsilon shown is the already-decayed value.
        avg_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) >= 100 else 0
        print(f"Episode {ep}: Reward={episode_reward:.1f}, "
              f"Avg100={avg_reward:.1f}, Epsilon={epsilon:.3f}")

print("\n" + "="*50)
print("Training Complete!")
print("="*50)

# Analyze results
print(f"\nFinal 100 episodes average reward: {np.mean(episode_rewards[-100:]):.2f}")
print(f"Final 100 episodes average length: {np.mean(episode_lengths[-100:]):.2f}")

# Inspect learned policy
print("\n--- Q-table inspection ---")
if Q:
    # The first key in iteration order, i.e. the first state inserted
    sample_state = next(iter(Q))
    print(f"Sample state: Queue={sample_state[2]}, Free VRAM={sample_state[1]}")
    
    print("\nTop 5 actions for this state:")
    q_values = Q[sample_state]
    # argsort is ascending: take the last five indices and reverse them so the
    # best action comes first
    top_actions = np.argsort(q_values)[-5:][::-1]
    
    for action in top_actions:
        # Decode the flat action index into (action type, adapter, GPU)
        action_type = action // (env.num_adapters * env.num_gpus)
        rem = action % (env.num_adapters * env.num_gpus)
        adapter_id = rem // env.num_gpus
        gpu_id = rem % env.num_gpus
        action_names = ['Load', 'Evict', 'Reuse', 'Send']
        print(f"  Action {action:2d}: {action_names[action_type]} "
              f"A{adapter_id}→G{gpu_id}: {q_values[action]:.3f}")
else:
    print("Q-table is empty!")

# Evaluate policy
def evaluate_policy(Q, env, num_episodes=100):
    """Run the greedy policy stored in ``Q`` and print summary statistics.

    For every visited state that has an entry in ``Q`` the highest-valued
    action is taken; for any other state an action is sampled from
    ``env.action_space``. Each episode runs until ``env.step`` reports the
    episode as finished (third return value).

    Args:
        Q: Mapping from state to an array of per-action values.
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()``.
        num_episodes: Number of evaluation episodes to run.

    Returns:
        List with the total reward of each episode, in order.
    """
    returns = []

    for _episode in range(num_episodes):
        obs, _info = env.reset()
        episode_return = 0
        finished = False

        while not finished:
            # Greedy where the table knows the state, random otherwise
            chosen = np.argmax(Q[obs]) if obs in Q else env.action_space.sample()
            obs, step_reward, finished, _truncated, _info = env.step(chosen)
            episode_return += step_reward

        returns.append(episode_return)

    # "Success" below means the episode finished with a strictly positive total.
    print(f"\nPolicy Evaluation ({num_episodes} episodes):")
    print(f"  Average reward: {np.mean(returns):.2f} ± {np.std(returns):.2f}")
    print(f"  Min reward: {np.min(returns):.2f}")
    print(f"  Max reward: {np.max(returns):.2f}")
    print(f"  Success rate: {100 * np.mean([r > 0 for r in returns]):.1f}%")

    return returns

# Evaluate
# Runs the learned Q-table on the training environment with the default number
# of episodes and keeps the per-episode totals.
rewards = evaluate_policy(Q, env)

In [None]:
#visualization
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
import matplotlib.pyplot as plt
import numpy as np

def simple_animate_episode(Q, env, max_steps=30):
    """Roll out one episode with the greedy policy in ``Q`` and animate it.

    The episode is run on ``env`` itself. While it runs, the state, action,
    reward and ``env.queue_length`` of every step are recorded, so all four
    panels describe the same trajectory. (Replaying the actions on a second
    environment would not reproduce the queue, because arrivals are random.)

    Panels: free VRAM per GPU, queue length over time, cumulative count of
    each action type, and a text box describing the last action.

    Args:
        Q: Mapping from state to an array of per-action values. States without
            an entry get a random action from ``env.action_space``.
        env: Environment exposing ``reset``, ``step``, ``action_space``,
            ``num_adapters``, ``num_gpus`` and ``queue_length``. State index 1
            is read as the per-GPU free VRAM.
        max_steps: Upper bound on the number of steps to simulate and animate.

    Returns:
        ``IPython.display.HTML`` object wrapping the JavaScript animation.
    """
    # ---- Roll out the episode, recording everything the animation needs ----
    state, _ = env.reset()
    states_history = [state]
    actions_history = []
    rewards_history = []
    tracked_queue = [env.queue_length]

    done = False
    step = 0
    while not done and step < max_steps:
        if state in Q:
            action = np.argmax(Q[state])
        else:
            action = env.action_space.sample()

        actions_history.append(action)
        state, reward, done, _, _ = env.step(action)
        states_history.append(state)
        rewards_history.append(reward)
        tracked_queue.append(env.queue_length)
        step += 1

    # Flat action index = action_type * (adapters * gpus) + adapter * gpus + gpu
    slots_per_type = env.num_adapters * env.num_gpus
    action_names = ['Load', 'Evict', 'Reuse', 'Send']
    action_type_history = [int(a) // slots_per_type for a in actions_history]

    # ---- Static figure layout (2x2 grid) ----
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    vram_ax, queue_ax = axes[0]
    action_ax, info_ax = axes[1]

    # 1. VRAM bar chart (top left): one bar per GPU, whatever num_gpus is
    vram_bars = vram_ax.bar(range(env.num_gpus), [0] * env.num_gpus,
                            color=['blue', 'orange'])
    vram_ax.set_title('GPU VRAM Usage')
    vram_ax.set_xlabel('GPU ID')
    vram_ax.set_ylabel('Free VRAM')
    vram_ax.set_ylim(0, 1)
    vram_ax.set_xticks(range(env.num_gpus))
    vram_ax.set_xticklabels([f'GPU {i}' for i in range(env.num_gpus)])

    # 2. Queue plot (top right)
    queue_line, = queue_ax.plot([], [], 'r-', linewidth=2, marker='o', markersize=4)
    queue_ax.set_title('Queue Length Over Time')
    queue_ax.set_xlabel('Step')
    queue_ax.set_ylabel('Queue Length')
    queue_ax.grid(True, alpha=0.3)
    max_queue = max(tracked_queue)
    queue_ax.set_ylim(0, max_queue * 1.1 if max_queue > 0 else 20)
    queue_ax.set_xlim(0, len(states_history))

    # 3. Action distribution (bottom left)
    action_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
    action_bars = action_ax.bar(action_names, [0] * len(action_names), color=action_colors)
    action_ax.set_title('Action Types Taken')
    action_ax.set_ylabel('Count')
    action_ax.set_ylim(0, 5)  # Start with small limit

    # 4. Text info (bottom right)
    info_text = info_ax.text(0.05, 0.95, '', fontsize=11,
                             transform=info_ax.transAxes,
                             verticalalignment='top',
                             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.9))
    info_ax.axis('off')  # Hide axes for text box

    fig.suptitle('Adapter Management Agent - Episode 0', fontsize=14, fontweight='bold', y=0.98)

    def update(frame):
        """Draw the situation after ``frame`` actions have been executed."""
        if frame >= len(states_history):
            return

        free_vram = states_history[frame][1]

        # 1. VRAM bars
        for bar, level in zip(vram_bars, free_vram):
            bar.set_height(level)

        # 2. Queue curve up to and including this frame
        queue_line.set_data(range(frame + 1), tracked_queue[:frame + 1])

        # 3. Action counts are derived from the history rather than accumulated
        #    in place, so drawing a frame more than once cannot double-count.
        counts = [action_type_history[:frame].count(kind)
                  for kind in range(len(action_names))]
        for bar, count in zip(action_bars, counts):
            bar.set_height(count)
        if frame > 0:
            action_ax.set_ylim(0, max(counts) * 1.2)

        # 4. Info text
        vram_summary = ", ".join(f"GPU{i}={v:.2f}" for i, v in enumerate(free_vram))
        info_str = f"Step: {frame}\n"
        info_str += f"Free VRAM: {vram_summary}\n"
        info_str += f"Queue: {tracked_queue[frame]}\n"

        if frame > 0:
            action = int(actions_history[frame - 1])
            adapter_id, gpu_id = divmod(action % slots_per_type, env.num_gpus)
            info_str += f"\nLast Action: {action_names[action_type_history[frame - 1]]}\n"
            info_str += f"  Adapter {adapter_id} → GPU {gpu_id}\n"
            info_str += f"Reward: {rewards_history[frame - 1]:.2f}\n"
            info_str += f"Total Reward: {sum(rewards_history[:frame]):.2f}"

        info_text.set_text(info_str)

        return (queue_line, *vram_bars, *action_bars, info_text)

    anim = FuncAnimation(fig, update, frames=len(states_history),
                         interval=500, blit=False, repeat=False)

    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust for suptitle

    rendered = HTML(anim.to_jshtml())
    # Close the figure once it has been rendered into HTML: it is no longer
    # needed, and in a notebook an open figure is typically shown a second time
    # as a static image below the animation.
    plt.close(fig)
    return rendered

# Generate animation
# As the last expression of the cell, the returned HTML object is what the
# notebook displays.
simple_animate_episode(Q, env, max_steps=20)

In [3]:
import gymnasium as gym
from collections import defaultdict, deque
import numpy as np
import pandas as pd
 
class AdapterEnv(gym.Env):
    """Trace-driven environment for placing adapters on GPUs and serving tasks.

    Tasks arrive from ``arrivals_by_time`` (time step -> list of task dicts,
    each with at least an ``"adapter_id"`` key) and wait in a FIFO queue. Only
    the task at the head of the queue can be served.

    Actions are flat indices ``action_type * (A * G) + adapter * G + gpu`` with
    ``A = num_adapters`` and ``G = num_gpus``:

    * 0 LOAD  - make an adapter resident on a GPU (consumes VRAM).
    * 1 EVICT - remove a resident adapter (frees VRAM).
    * 2 REUSE - serve the head task if its adapter is resident on that GPU.
      A REUSE that cannot serve anything only costs the time penalty.
    * 3 SEND  - same as REUSE on success, but a failed SEND is penalised.

    The state is a hashable tuple
    ``(loaded mask flattened GPU-major, free VRAM per GPU,
    queue length capped at 99, adapter id of the head task)``; the last entry
    equals ``num_adapters`` when the queue is empty.

    Args:
        arrivals_by_time: Mapping from integer time step to a list of tasks.
        num_adapters: Number of distinct adapters.
        num_gpus: Number of GPUs.
        max_steps: Episode length limit.
        adapter_vram: Optional sequence with the VRAM footprint of each
            adapter, in the same units as ``GPU_VRAM_CAPACITY``. Defaults to
            ``DEFAULT_ADAPTER_VRAM``. Must have at least ``num_adapters``
            entries; extra entries are ignored.

    Raises:
        ValueError: If fewer VRAM footprints than adapters are available.
    """

    # Footprints used when the caller does not pass ``adapter_vram``
    DEFAULT_ADAPTER_VRAM = (0.8, 1.0, 1.2)
    # Footprints are divided by this so that a completely free GPU reads 1.0
    GPU_VRAM_CAPACITY = 16.0

    def __init__(self, arrivals_by_time, num_adapters=3, num_gpus=2, max_steps=200,
                 adapter_vram=None):
        super().__init__()
        self.num_adapters = num_adapters
        self.num_gpus = num_gpus
        self.max_steps = max_steps
        self.arrivals_by_time = arrivals_by_time
        # Last time step at which the trace still injects tasks
        self.max_time = max(arrivals_by_time.keys()) if arrivals_by_time else 0

        # Normalised VRAM cost per adapter. Failing here is clearer than an
        # IndexError inside step() the first time a high adapter id is used.
        raw_costs = self.DEFAULT_ADAPTER_VRAM if adapter_vram is None else adapter_vram
        costs = np.asarray(raw_costs, dtype=float) / self.GPU_VRAM_CAPACITY
        if costs.ndim != 1 or costs.shape[0] < num_adapters:
            raise ValueError(
                f"adapter_vram must provide a footprint for each of the "
                f"{num_adapters} adapters, got {costs.size} value(s)"
            )
        self.adapter_vram = costs[:num_adapters]

        # Action space: [LOAD, EVICT, REUSE, SEND] × adapter × gpu
        self.action_space = gym.spaces.Discrete(4 * num_adapters * num_gpus)

        # State: Tuple format for Q-table compatibility
        self.observation_space = gym.spaces.Tuple((
            gym.spaces.MultiBinary(num_gpus * num_adapters),
            gym.spaces.Box(low=0.0, high=1.0, shape=(num_gpus,)),
            gym.spaces.Discrete(100),
            gym.spaces.Discrete(num_adapters + 1)
        ))

    def reset(self, seed=None, options=None):
        """Start a new episode with empty GPUs and the time-0 arrivals queued.

        Returns:
            ``(state, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.adapter_loaded = np.zeros((self.num_gpus, self.num_adapters), dtype=int)
        self.free_vram = np.ones(self.num_gpus)
        self.queue = deque()
        self.time = 0
        self.step_count = 0
        self.total_reward = 0.0

        self._arrival_step()
        return self._get_state(), {}

    def _arrival_step(self):
        """Append the tasks the trace schedules for the current time step."""
        if self.time in self.arrivals_by_time:
            self.queue.extend(self.arrivals_by_time[self.time])

    def _refresh_free_vram(self, gpu_id):
        """Recompute the free VRAM of one GPU from its set of resident adapters.

        Deriving the value from the mask makes it depend only on which
        adapters are resident. Incremental ``+=``/``-=`` bookkeeping can leave
        last-bit rounding differences that would turn otherwise identical
        situations into distinct Q-table keys.
        """
        used = float(self.adapter_loaded[gpu_id] @ self.adapter_vram)
        self.free_vram[gpu_id] = 1.0 - used

    def _get_state(self):
        """Return the current state as a hashable tuple (see class docstring)."""
        next_adapter_id = self.queue[0]["adapter_id"] if self.queue else self.num_adapters
        return (
            tuple(self.adapter_loaded.flatten()),
            tuple(self.free_vram),
            min(len(self.queue), 99),
            next_adapter_id
        )

    def step(self, action):
        """Apply one action, advance time by one step and inject new arrivals.

        Returns:
            ``(state, reward, terminated, truncated, info)``. ``truncated`` is
            always ``False``: reaching ``max_steps`` is reported through
            ``terminated``, which is also set once the queue is empty and the
            trace has no further arrivals. ``info['processed']`` tells whether
            a task was served by this action.
        """
        # Decode action: 0=LOAD, 1=EVICT, 2=REUSE, 3=SEND
        action_type = action // (self.num_adapters * self.num_gpus)
        rem = action % (self.num_adapters * self.num_gpus)
        adapter_id = rem // self.num_gpus
        gpu_id = rem % self.num_gpus

        reward = -0.05  # Time penalty
        processed_task = False

        # ---- LOAD (type 0) ----
        if action_type == 0:
            cost = self.adapter_vram[adapter_id]
            if self.adapter_loaded[gpu_id, adapter_id] == 0 and self.free_vram[gpu_id] >= cost:
                self.adapter_loaded[gpu_id, adapter_id] = 1
                self._refresh_free_vram(gpu_id)
                reward += 0.3
            else:
                # Already resident, or not enough free VRAM
                reward -= 0.5

        # ---- EVICT (type 1) ----
        elif action_type == 1:
            if self.adapter_loaded[gpu_id, adapter_id] == 1:
                self.adapter_loaded[gpu_id, adapter_id] = 0
                self._refresh_free_vram(gpu_id)
                reward += 0.05
            else:
                reward -= 0.3

        # ---- REUSE (type 2) ----
        elif action_type == 2:
            # No extra penalty when nothing can be served
            if self.queue:
                task = self.queue[0]
                if (task["adapter_id"] == adapter_id and
                        self.adapter_loaded[gpu_id, adapter_id] == 1):
                    self.queue.popleft()
                    processed_task = True
                    reward += 1.0

        # ---- SEND (type 3) ----
        elif action_type == 3:
            if self.queue:
                task = self.queue[0]
                if (task["adapter_id"] == adapter_id and
                        self.adapter_loaded[gpu_id, adapter_id] == 1):
                    self.queue.popleft()
                    processed_task = True
                    reward += 1.0
                else:
                    # Wrong adapter for the head task, or adapter not resident
                    reward -= 0.5
            else:
                reward -= 0.3

        # Advance simulation
        self.time += 1
        self._arrival_step()
        self.step_count += 1
        self.total_reward += reward

        terminated = (
            self.step_count >= self.max_steps or
            (not self.queue and self.time > self.max_time)
        )

        return self._get_state(), reward, terminated, False, {'processed': processed_task}


def load_google_trace(csv_path, max_rows=50000, num_adapters=3):
    """Load a cluster-trace CSV and group its rows into per-time-step task lists.

    Only the rows that are needed are parsed: at most ``max_rows + 1`` rows are
    read, the extra one serving to detect that the file was truncated. A
    leading unnamed index column (``'Unnamed: 0'``) is used as the row index.

    Args:
        csv_path: Path to the CSV file. It must contain the columns ``time``,
            ``collection_type`` and ``assigned_memory``; ``collection_id`` is
            optional.
        max_rows: Maximum number of rows to keep. ``None`` or ``0`` keeps all.
        num_adapters: Number of adapters that ``collection_type`` is folded
            onto (``collection_type % num_adapters``).

    Returns:
        ``defaultdict(list)`` mapping ``int(time)`` to a list of task dicts
        with the keys ``'adapter_id'``, ``'vram'`` (``assigned_memory`` divided
        by the largest value among the kept rows) and ``'collection_id'`` (the
        row index when the column is absent).

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        ValueError: If a required column is missing or ``num_adapters < 1``.
    """
    import os
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV not found: {csv_path}")
    if num_adapters < 1:
        raise ValueError(f"num_adapters must be >= 1, got {num_adapters}")

    print(f"Loading {csv_path}...")
    # One row beyond the cap is enough to know whether truncation happened,
    # without parsing the remainder of a potentially very large file.
    row_cap = (max_rows + 1) if max_rows else None
    df = pd.read_csv(csv_path, nrows=row_cap)

    # A CSV written together with its index has an empty first header cell,
    # which pandas names 'Unnamed: 0'; treat that column as the index.
    if 'Unnamed: 0' in df.columns:
        df = df.set_index('Unnamed: 0')

    if max_rows and len(df) > max_rows:
        df = df.head(max_rows).copy()
        print(f"Limited to {max_rows} rows")

    # Validate required columns
    required_cols = ['time', 'collection_type', 'assigned_memory']
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        raise ValueError(f"CSV missing required columns: {missing}")

    # Normalisation constant; fall back to 1.0 when the maximum is zero or
    # undefined (no rows, or only missing values).
    max_mem = df['assigned_memory'].max()
    if pd.isna(max_mem) or max_mem == 0:
        max_mem = 1.0

    collection_ids = df['collection_id'] if 'collection_id' in df.columns else df.index

    # Column-wise iteration avoids building one Series per row
    arrivals_by_time = defaultdict(list)
    columns = zip(df['time'], df['collection_type'], df['assigned_memory'], collection_ids)
    for time_value, collection_type, memory, collection_id in columns:
        arrivals_by_time[int(time_value)].append({
            'adapter_id': int(collection_type) % num_adapters,
            'vram': memory / max_mem,
            'collection_id': collection_id,
        })

    print(f"✓ Loaded {len(df)} tasks across {len(arrivals_by_time)} time steps")
    return arrivals_by_time


def test_env_heuristic(env, max_steps=100):
    """Test with greedy heuristic policy """
    state, _ = env.reset()
    
    A = env.num_adapters * env.num_gpus
    G = env.num_gpus
    
    total_reward = 0
    
    print(f"\n=== GOOGLE TRACE HEURISTIC TEST ===")
    print(f"Start: Queue={len(env.queue) if hasattr(env, 'queue') else state[2]} | "
          f"VRAM: {[f'{v:.2f}' for v in state[1]]}")
    print("-" * 60)
    
    for step in range(max_steps):
        action = 0
        action_type_name = "WAIT"
        adapter_id = 0
        gpu_id = 0
        
        if env.queue:
            next_adapter = env.queue[0]['adapter_id']
            loaded = np.array(state[0]).reshape(env.num_gpus, env.num_adapters)
            
            # 1. SEND if possible
            sent = False
            for g in range(env.num_gpus):
                if loaded[g, next_adapter] == 1:
                    action = 3 * A + next_adapter * G + g
                    action_type_name = "SEND"
                    adapter_id = next_adapter
                    gpu_id = g
                    sent = True
                    break
            
            # 2. LOAD if needed
            if not sent:
                cost = env.adapter_vram[next_adapter]
                for g in range(env.num_gpus):
                    if env.free_vram[g] >= cost:
                        action = 0 * A + next_adapter * G + g
                        action_type_name = "LOAD"
                        adapter_id = next_adapter
                        gpu_id = g
                        sent = True
                        break
            
            # 3. EVICT fallback
            if not sent:
                for g in range(env.num_gpus):
                    for a in range(env.num_adapters):
                        if loaded[g, a] == 1:
                            action = 1 * A + a * G + g
                            action_type_name = "EVICT"
                            adapter_id = a
                            gpu_id = g
                            sent = True
                            break
                    if sent:
                        break
        
        # Execute action
        prev_queue_len = len(env.queue) if hasattr(env, 'queue') else state[2]
        prev_vram = state[1]
        
        state, reward, done, _, _ = env.step(action)
        total_reward += reward
        
        # Calculate what changed
        new_queue_len = len(env.queue) if hasattr(env, 'queue') else state[2]
        queue_change = prev_queue_len - new_queue_len
        
        # Log this step
        print(f"Step {step:3d}: {action_type_name} A{adapter_id}→G{gpu_id} | "
              f"Reward: {reward:+.2f} | "
              f"Queue:{new_queue_len} |"
              f"VRAM: {[f'{v:.2f}' for v in state[1]]}")
        
        if done:
            print(f"Episode terminated at step {step}")
            break
    
    print("-" * 60)
    print(f"Total reward: {total_reward:.2f}")
    print(f"Average reward per step: {total_reward/(step+1):.2f}")
    
    return total_reward


def train_q_learning(env, episodes=5000, alpha=0.1, gamma=0.95,
                     epsilon=0.2, epsilon_decay=0.995):
    """Train a tabular Q-learning agent with an epsilon-greedy policy.

    Args:
        env: Environment exposing ``reset``, ``step`` and ``action_space``
            (with ``n`` and ``sample()``); states must be hashable.
        episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Factor ``epsilon`` is multiplied by after each episode.

    Returns:
        ``(Q, rewards_history, epsilon_history)``: the Q-table (a
        ``defaultdict`` mapping state to an array of action values), the total
        reward of each episode, and the epsilon used during each episode.
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    rewards_history = []
    epsilon_history = []
    for ep in range(episodes):
        state, _ = env.reset()
        episode_reward = 0
        done = False

        while not done:
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])

            next_state, reward, done, _, _ = env.step(action)
            episode_reward += reward

            # No future return exists after the final transition, so the
            # target must not bootstrap from next_state once the episode ends.
            if done:
                target = reward
            else:
                target = reward + gamma * np.max(Q[next_state])
            Q[state][action] += alpha * (target - Q[state][action])

            state = next_state

        rewards_history.append(episode_reward)
        epsilon_history.append(epsilon)
        epsilon *= epsilon_decay

        if ep % 500 == 0:
            # Mean over (at most) the last 500 episodes; the epsilon printed is
            # the already-decayed value
            avg = np.mean(rewards_history[-500:])
            print(f"Ep {ep:4d} | Avg Reward: {avg:6.2f} | Epsilon: {epsilon:.3f}")

    return Q, rewards_history, epsilon_history


# === MAIN EXECUTION ===
# Loads the trace, builds the environment, runs the heuristic baseline and then
# trains the Q-learning agent. The names bound here are left in the notebook
# namespace.
if __name__ == "__main__":
    # Location of the trace CSV as a Kaggle input dataset
    CSV_PATH = "/kaggle/input/google-cluster-sample/borg_traces_data.csv"   
    
    print("Loading Google Cluster Dataset...")
    # Only the first 5000 rows of the trace are used
    arrivals_by_time = load_google_trace(CSV_PATH, max_rows=5000)
    
    env = AdapterEnv(arrivals_by_time, num_adapters=3, num_gpus=2, max_steps=200)
    
    # Test heuristic
    # Prints a step-by-step log; the returned total reward is not kept
    test_env_heuristic(env)
    
    # Train
    print("\nStarting Q-learning...")
    # Returns the Q-table, per-episode rewards and per-episode epsilon values
    Q_table, rewards ,  epsilon_history= train_q_learning(env, episodes=5000)

Loading Google Cluster Dataset...
Loading /kaggle/input/google-cluster-sample/borg_traces_data.csv...
Limited to 5000 rows
✓ Loaded 5000 tasks across 4175 time steps

=== GOOGLE TRACE HEURISTIC TEST ===
Start: Queue=681 | VRAM: ['1.00', '1.00']
------------------------------------------------------------
Step   0: LOAD A1→G0 | Reward: +0.25 | Queue:681 |VRAM: ['0.94', '1.00']
Step   1: SEND A1→G0 | Reward: +0.95 | Queue:680 |VRAM: ['0.94', '1.00']
Step   2: LOAD A0→G0 | Reward: +0.25 | Queue:680 |VRAM: ['0.89', '1.00']
Step   3: SEND A0→G0 | Reward: +0.95 | Queue:679 |VRAM: ['0.89', '1.00']
Step   4: SEND A0→G0 | Reward: +0.95 | Queue:678 |VRAM: ['0.89', '1.00']
Step   5: SEND A1→G0 | Reward: +0.95 | Queue:677 |VRAM: ['0.89', '1.00']
Step   6: SEND A0→G0 | Reward: +0.95 | Queue:676 |VRAM: ['0.89', '1.00']
Step   7: SEND A0→G0 | Reward: +0.95 | Queue:675 |VRAM: ['0.89', '1.00']
Step   8: SEND A0→G0 | Reward: +0.95 | Queue:674 |VRAM: ['0.89', '1.00']
Step   9: SEND A0→G0 | Reward: +0.95 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
from collections import deque

def simple_animate_episode(Q, env, max_steps=30):
    """Roll out one episode with the greedy policy in ``Q`` and animate it.

    The episode is run on ``env`` itself. While it runs, the state, action,
    reward and ``len(env.queue)`` of every step are recorded, so no second
    environment has to be created and replayed to obtain the queue curve.

    Panels: free VRAM per GPU, queue length over time, cumulative count of
    each action type, and a text box describing the last action.

    Args:
        Q: Mapping from state to an array of per-action values. States without
            an entry get a random action from ``env.action_space``.
        env: Environment exposing ``reset``, ``step``, ``action_space``,
            ``num_adapters``, ``num_gpus`` and ``queue``. State index 1 is
            read as the per-GPU free VRAM.
        max_steps: Upper bound on the number of steps to simulate and animate.

    Returns:
        ``IPython.display.HTML`` object wrapping the JavaScript animation.
    """
    # ---- Roll out the episode, recording everything the animation needs ----
    state, _ = env.reset()
    states_history = [state]
    actions_history = []
    rewards_history = []
    tracked_queue = [len(env.queue)]

    done = False
    step = 0
    while not done and step < max_steps:
        if state in Q:
            action = np.argmax(Q[state])
        else:
            action = env.action_space.sample()

        actions_history.append(action)
        state, reward, done, _, _ = env.step(action)
        states_history.append(state)
        rewards_history.append(reward)
        tracked_queue.append(len(env.queue))
        step += 1

    # Flat action index = action_type * (adapters * gpus) + adapter * gpus + gpu
    slots_per_type = env.num_adapters * env.num_gpus
    action_names = ['Load', 'Evict', 'Reuse', 'Send']
    action_type_history = [int(a) // slots_per_type for a in actions_history]

    # ---- Static figure layout (2x2 grid) ----
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    vram_ax, queue_ax = axes[0]
    action_ax, info_ax = axes[1]

    # 1. VRAM bar chart (top left)
    vram_bars = vram_ax.bar(range(env.num_gpus), [0] * env.num_gpus,
                            color=['blue', 'orange'])
    vram_ax.set_title('GPU VRAM Usage')
    vram_ax.set_xlabel('GPU ID')
    vram_ax.set_ylabel('Free VRAM')
    vram_ax.set_ylim(0, 1)
    vram_ax.set_xticks(range(env.num_gpus))
    vram_ax.set_xticklabels([f'GPU {i}' for i in range(env.num_gpus)])

    # 2. Queue plot (top right)
    queue_line, = queue_ax.plot([], [], 'r-', linewidth=2, marker='o', markersize=4)
    queue_ax.set_title('Queue Length Over Time')
    queue_ax.set_xlabel('Step')
    queue_ax.set_ylabel('Queue Length')
    queue_ax.grid(True, alpha=0.3)
    max_queue = max(tracked_queue)
    queue_ax.set_ylim(0, max_queue * 1.1 if max_queue > 0 else 20)
    queue_ax.set_xlim(0, max(1, len(states_history)))

    # 3. Action distribution (bottom left)
    action_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
    action_bars = action_ax.bar(action_names, [0] * len(action_names), color=action_colors)
    action_ax.set_title('Action Types Taken')
    action_ax.set_ylabel('Count')
    action_ax.set_ylim(0, 5)  # Start with small limit

    # 4. Text info (bottom right)
    info_text = info_ax.text(0.05, 0.95, '', fontsize=11,
                             transform=info_ax.transAxes,
                             verticalalignment='top',
                             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.9))
    info_ax.axis('off')  # Hide axes for text box

    fig.suptitle('Adapter Management Agent - Episode Simulation',
                 fontsize=14, fontweight='bold', y=0.98)

    def update(frame):
        """Draw the situation after ``frame`` actions have been executed."""
        if frame >= len(states_history):
            return

        free_vram = np.array(states_history[frame][1])

        # 1. VRAM bars
        for bar, level in zip(vram_bars, free_vram):
            bar.set_height(level)

        # 2. Queue curve up to and including this frame
        queue_line.set_data(range(frame + 1), tracked_queue[:frame + 1])

        # 3. Action counts are derived from the history rather than accumulated
        #    in place, so drawing a frame more than once cannot double-count.
        counts = [action_type_history[:frame].count(kind)
                  for kind in range(len(action_names))]
        for bar, count in zip(action_bars, counts):
            bar.set_height(count)
        if frame > 0:
            action_ax.set_ylim(0, max(counts) * 1.2)

        # 4. Info text (one entry per GPU instead of a fixed GPU0/GPU1 pair)
        vram_summary = ", ".join(f"GPU{i}={v:.2f}" for i, v in enumerate(free_vram))
        info_str = f"Step: {frame}\n"
        info_str += f"Free VRAM: {vram_summary}\n"
        info_str += f"Queue: {tracked_queue[frame]}\n"

        if frame > 0:
            action = int(actions_history[frame - 1])
            adapter_id, gpu_id = divmod(action % slots_per_type, env.num_gpus)
            info_str += f"\nLast Action: {action_names[action_type_history[frame - 1]]}\n"
            info_str += f"  Adapter {adapter_id} → GPU {gpu_id}\n"
            info_str += f"Reward: {rewards_history[frame - 1]:.2f}\n"
            info_str += f"Total Reward: {sum(rewards_history[:frame]):.2f}"

        info_text.set_text(info_str)

        return (queue_line, *vram_bars, *action_bars, info_text)

    anim = FuncAnimation(fig, update, frames=len(states_history),
                         interval=500, blit=False, repeat=False)

    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust for suptitle

    rendered = HTML(anim.to_jshtml())
    # Close the figure once it has been rendered into HTML: it is no longer
    # needed, and in a notebook an open figure is typically shown a second time
    # as a static image below the animation.
    plt.close(fig)
    return rendered

# === USAGE ===
# After training:
# Uses Q_table and env from the training cell, so that cell must have been run
# first.
print("\n  Generating animation...")
animation = simple_animate_episode(Q_table, env, max_steps=30)
display(animation)  


  Generating animation...
