In [None]:
#empty cache just in case
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -U bitsandbytes

In [None]:

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers peft accelerate datasets


In [None]:
!pip install -U transformers==4.44.2
# !pip install -U bitsandbytes==0.43.3
!pip install -U accelerate peft
# import bitsandbytes as bnb
# print(bnb.__version__)



In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer , BitsAndBytesConfig
import torch

torch.cuda.empty_cache()

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
#configuration for quantizing the qwen model (windows setup)
# bnb_cfg = BitsAndBytesConfig(
#     load_in_4bit=True,                # 4-bit
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",        
#     bnb_4bit_compute_dtype=torch.bfloat16 
# )

#linux quantization:
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,                
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",        
    bnb_4bit_compute_dtype=torch.bfloat16 
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_cfg,
    #torch_dtype=torch.float16,   
    quantization_config= bnb_cfg ,   
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


In [None]:
#simple adapter cache to fix later:
adapter_cache = {}
def get_adapter(adapter_name):
    if adapter_name in adapter_cache:
        return adapter_cache[adapter_name]
    model = load_adapter(f"./adapters/{adapter_name}")
    adapter_cache[adapter_name] = model
    if len(adapter_cache) > CACHE_LIMIT:
        evict_one()  # define later
    return model


In [None]:
#adapter from: https://huggingface.co/DreamGallery/Qwen-Qwen2.5-1.5B-Instruct-1727452927
# from peft import PeftModel

# model = PeftModel.from_pretrained(
#     base_model,
#     "DreamGallery/Qwen-Qwen2.5-1.5B-Instruct-1727452927",  # adapter path
#     trust_remote_code=True
# )


In [None]:
prompt = "Explain the difference between L1 and L2 regularization in machine learning."
inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
with torch.no_grad():
    out = base_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))


In [None]:
#dataset of prompts:
from datasets import load_dataset

ds1 = load_dataset("fka/awesome-chatgpt-prompts")

In [None]:
from datasets import load_dataset

ds = load_dataset("RUC-DataLab/DataScience-Instruct-500K")

In [None]:
from datasets import load_dataset

ds2 = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en")

In [None]:
import pandas as pd
ds = pd.DataFrame(ds)
print(ds.head())

In [None]:
def tokenize_ds(batch):
    prompts = []
    targets = []

    for messages, evaluation in zip(batch["messages"], batch["evaluation"]):
        if isinstance(messages, list):
            prompt = "\n".join([f"{m.get('role', '')}: {m.get('content', '')}" for m in messages])
        else:
            prompt = str(messages)

        prompts.append(prompt)
        targets.append(str(evaluation))

    return tokenizer(
        prompts,
        text_target=targets,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

def tokenize_ds1(batch):
    prompts = []
    targets = []

    for act, prompt in zip(batch["act"], batch["prompt"]):
        if isinstance(act, list):
            inst = " ".join(act)
        else:
            inst = str(act)

        prompts.append(inst)
        targets.append(str(prompt))

    return tokenizer(
        prompts,
        text_target=targets,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
def tokenize_ds3(batch):
    return tokenizer(
        batch["Question"],
        text_target=batch["Response"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )



train1 = ds.map(tokenize_ds, batched=True, batch_size=256)
train2 = ds1.map(tokenize_ds1, batched=False)
# train3 = ds2.map(tokenize_ds2, batched=False)

In [None]:
from peft import LoraConfig 
lora_cfg_1 = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)
lora_cfg_2 = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)


In [None]:
# import bitsandbytes as bnb
# print(bnb.__version__)

print("CUDA available:", torch.cuda.is_available())
print("Current device:", torch.cuda.get_device_name(0))

In [None]:
!nvidia-smi

In [None]:
from peft import get_peft_model
from transformers import TrainingArguments, Trainer
torch.cuda.empty_cache()

model1 = get_peft_model(base_model, lora_cfg_1)
args1 = TrainingArguments(
    per_device_train_batch_size=1,
    num_train_epochs=1,
    learning_rate=2e-4,
    output_dir="./adapter1",
    gradient_accumulation_steps=1,
    logging_dir="./logs",         # where logs are saved
    logging_steps=5,              # print every 5 steps
    report_to="none",             # disable WandB, keep console output
    disable_tqdm=False            # show progress bar
)
small_train1 = train1["train"].select(range(100))  # use only first 100 samples for faster runtime 

trainer1 = Trainer(model=model1, args=args1, train_dataset=small_train1)
trainer1.train()
model1.save_pretrained("./adapters/adapter1")

del model1
torch.cuda.empty_cache()

model2 = get_peft_model(base_model, lora_cfg_2)
args2 = TrainingArguments(
    per_device_train_batch_size=1,
    num_train_epochs=1,
    learning_rate=2e-4,
    output_dir="./adapter2",
    gradient_accumulation_steps=1,
    logging_dir="./logs",         # where logs are saved
    logging_steps=5,              # print every 5 steps
    report_to="none",             # disable WandB, keep console output
    disable_tqdm=False            # show progress bar
)
small_train2 = train2["train"].select(range(100))
trainer2 = Trainer(model=model2, args=args2, train_dataset=small_train2)
trainer2.train()
model2.save_pretrained("./adapters/adapter2")


#adapter 3:

model3 = get_peft_model(base_model, lora_cfg_2)
args3 = TrainingArguments(
    per_device_train_batch_size=1,
    num_train_epochs=1,
    learning_rate=2e-4,
    output_dir="./adapter3",
    gradient_accumulation_steps=1,
    logging_dir="./logs",         # where logs are saved
    logging_steps=5,              # print every 5 steps
    report_to="none",             # disable WandB, keep console output
    disable_tqdm=False            # show progress bar
)
small_train3 = train3["train"].select(range(100))
trainer3 = Trainer(model=model3, args=args3, train_dataset=small_train3)
trainer3.train()
model3.save_pretrained("./adapters/adapter3")


In [None]:
from peft import PeftModel
#model 2 is on GPU:
print(next(model2.parameters()).device)
#model 1 is on GPU:
model1_loaded = PeftModel.from_pretrained(base_model, "./adapters/adapter1")
print(next(model1_loaded.parameters()).device)


In [None]:
from transformers import  AutoTokenizer
from peft import PeftModel

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
adapter_path = "./adapters/adapter1"

tokenizer = AutoTokenizer.from_pretrained(model_name)


model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval()

text ="""### Instruction:
explain the difference between L1 and L2 regularization in machine learning.

### Input:

### Response:"""
 
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=150, temperature=0.7, top_p=0.9)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)

In [None]:
!git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2


In [None]:

from sentence_transformers import SentenceTransformer , util

model = SentenceTransformer("./all-MiniLM-L6-v2")
adapter_tasks = [
    "role-playing instruction following across many creative and utility roles",
    "data science and machine learning problem solving with step-by-step reasoning",
    "medical diagnosis and clinical reasoning with detailed chain-of-thought explanations",
    "summarization of long articles"
]
adapter_embs = model.encode(adapter_tasks)

print(adapter_embs)
def select_adapter(user_prompt):
    prompt_emb = model.encode([user_prompt], normalize_embeddings=True)[0]
    sims = util.cos_sim(prompt_emb, adapter_embs)[0]
    top = sims.argmax().item()
    best_score = sims[top].item()
    return adapter_tasks[top], best_score
task, score = select_adapter("Can you summarize this text?")
print(task, score)


In [None]:
prompt = "Explain how blood pressure works in the human body."
adapter, scores = select_adapter(prompt)
print(adapter)   
print(scores)    


In [None]:
import numpy as np
import random

class AdapterEnv:

    def __init__(self, adapters, gpus):
     

        self.adapters = adapters
        self.gpus = gpus
        self.queue_length = 0
        self.avg_latency = 0
        self.throughput = 0
        self.baseline_inference_time = 1.0  # normalized baseline

        self.current_query = None
        self.current_step = 0

    # STATE
    def _get_state(self):
        """Return the full state vector."""

        # Adapter characteristics
        adapter_vec = []
        for a in self.adapters.values():
            adapter_vec.extend([
                a["size_mb"],
                a["mem_required"],
                a["load_time"],
                a["latency"],
                a["domain_id"],
                1 if a["loaded"] else 0,
                a["which_gpu"] if a["loaded"] else -1
            ])

        # GPU characteristics
        gpu_vec = []
        for g in self.gpus.values():
            gpu_vec.extend([
                g["free_vram"],
                g["used_vram"],
                len(g["loaded_adapters"]),
                g["eviction_time"]
            ])

        # System context
        sys_vec = [
            self.queue_length,
            self.avg_latency,
            self.throughput
        ]

        return np.array(adapter_vec + gpu_vec + sys_vec, dtype=np.float32)


    # ACTION SPACE

    def action_space(self):
        """
        Actions are encoded like this:
        type 0 → selection (pick adapter id or  -1 = no-op)
        type 1 → placement (load / evict / reuse / swap)
        type 2 → routing (send query to GPU id)
        """

        selection_actions = list(self.adapters.keys()) + [-1]  # -1 = do nothing
        placement_actions = ["load", "evict", "reuse", "swap"]
        routing_actions = list(self.gpus.keys())

        return {
            "selection": selection_actions,
            "placement": placement_actions,
            "routing": routing_actions
        }

    def sample_action(self):
        """Randomly choose valid actions."""
        return {
            "selection": random.choice(self.action_space()["selection"]),
            "placement": random.choice(self.action_space()["placement"]),
            "routing": random.choice(self.action_space()["routing"])
        }

    # STEP 
    def step(self, action, user_query):
        """
        user_query contains:
            - domain_id
            - estimated_load
        """

        self.current_query = user_query
        selected_adapter = action["selection"]
        placement = action["placement"]
        chosen_gpu = action["routing"]

        reward = 0
        cost = 0

        # SELECTION COST
        if selected_adapter != -1:
            correct = (self.adapters[selected_adapter]["domain_id"] ==
                       user_query["domain_id"])
            reward += 1 if correct else -1

        # PLACEMENT COST (LOAD / EVICT / REUSE / SWAP)
  
        if placement == "load":
            a = self.adapters[selected_adapter]
            g = self.gpus[chosen_gpu]
            # simulate load time and VRAM use
            cost += a["load_time"]
            g["free_vram"] -= a["mem_required"]
            g["used_vram"] += a["mem_required"]
            a["loaded"] = True
            a["which_gpu"] = chosen_gpu
            g["loaded_adapters"].append(selected_adapter)

        elif placement == "evict":
            a = self.adapters[selected_adapter]
            g = self.gpus[a["which_gpu"]]
            # eviction time
            cost += g["eviction_time"]
            g["free_vram"] += a["mem_required"]
            g["used_vram"] -= a["mem_required"]
            a["loaded"] = False
            a["which_gpu"] = -1
            if selected_adapter in g["loaded_adapters"]:
                g["loaded_adapters"].remove(selected_adapter)

        elif placement == "swap":
            cost += 2  # arbitrary extra penalty

        
        # ROUTING COST (INFERENCE)
        if selected_adapter != -1:
            adapter_latency = self.adapters[selected_adapter]["latency"]
        else:
            adapter_latency = 2  # penalize doing nothing

        inference_time = adapter_latency
        cost += inference_time / self.baseline_inference_time

        # GPU balancing reward
        gpu_loads = [g["used_vram"] for g in self.gpus.values()]
        balance = np.std(gpu_loads)
        reward -= balance  # penalize imbalance

        # Final reward = positive selection score - costs
        final_reward = reward - cost

        # update system context
        self.avg_latency = 0.9 * self.avg_latency + 0.1 * inference_time
        self.queue_length = max(0, self.queue_length - 1)

        # next state
        next_state = self._get_state()
        done = False

        return next_state, final_reward, done, {}

    # ----------------------------------------------------------------
    # RESET
    # ----------------------------------------------------------------
    def reset(self):
        self.queue_length = 0
        self.avg_latency = 0
        self.throughput = 0
        for a in self.adapters.values():
            a["loaded"] = False
            a["which_gpu"] = -1
        for g in self.gpus.values():
            g["loaded_adapters"].clear()
            g["used_vram"] = 0
            g["free_vram"] = g["total_vram"]
        return self._get_state()


In [None]:
import pandas as pd

df = pd.read_csv('/kaggle/input/google-cluster-sample/borg_traces_data.csv')
df.head()


In [8]:
from typing import Tuple
import gymnasium as gym
import numpy as np
from stable_baselines3 import DQN


class AdapterEnv(gym.Env):
    metadata = {"render_modes": []}

    def __init__(self, num_adapters: int, num_gpus: int):
        super().__init__()

        self.num_adapters = num_adapters
        self.num_gpus = num_gpus

        # State
        self.adapter_loaded = np.zeros((num_gpus, num_adapters), dtype=np.float32)
        self.free_vram = np.ones(num_gpus, dtype=np.float32)
        self.queue_length = 0.0
        self.avg_latency = 0.0
        self.throughput = 0.0

        # Flattened discrete action space
        self.action_space = gym.spaces.Discrete(num_gpus * num_adapters)

        # Observation space
        self.adapter_feat_len = 4
        self.gpu_feat_len = 3
        obs_size = (num_adapters * self.adapter_feat_len) + \
                   (num_gpus * self.gpu_feat_len) + 3

        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=(obs_size,), dtype=np.float32
        )

    # ---------------------------------------------
    def _get_obs(self):
        adapter_stats = np.random.rand(self.num_adapters * self.adapter_feat_len).astype(np.float32)

        gpu_stats = []
        for gpu in range(self.num_gpus):
            gpu_stats.extend([
                self.free_vram[gpu],
                np.sum(self.adapter_loaded[gpu]),
                0.0
            ])
        gpu_stats = np.array(gpu_stats, dtype=np.float32)

        system_stats = np.array([
            self.queue_length,
            self.avg_latency,
            self.throughput
        ], dtype=np.float32)

        return np.concatenate([adapter_stats, gpu_stats, system_stats]).astype(np.float32)

    # ---------------------------------------------
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)

        self.adapter_loaded[:] = 0.0
        self.free_vram[:] = 1.0
        self.queue_length = 0.5
        self.avg_latency = 0.5
        self.throughput = 0.5

        return self._get_obs(), {}

    # ---------------------------------------------
    def step(self, action):
        # Decode flattened action
        gpu_id = action // self.num_adapters
        adapter_id = action % self.num_adapters

        if self.adapter_loaded[gpu_id, adapter_id] == 0.0:
            self.adapter_loaded[gpu_id, adapter_id] = 1.0
            self.free_vram[gpu_id] -= 0.1
            self.queue_length -= 0.05
            reward = 1.0
        else:
            reward = -0.5

        self.queue_length = np.clip(self.queue_length, 0, 1)
        self.free_vram = np.clip(self.free_vram, 0, 1)

        self.avg_latency = 1.0 - self.queue_length
        self.throughput = 1.0 - self.avg_latency

        return self._get_obs(), float(reward), False, False, {}

    def render(self):
        pass


# -------------------------- Train -------------------------- #
env = AdapterEnv(num_adapters=10, num_gpus=4)
model = DQN("MlpPolicy", env, learning_rate=1e-3, verbose=1) #remember to change to PPO or A2C for more actions (evict and stuff)
model.learn(total_timesteps=50_000)
model.save("adapter_dqn")
print("Training complete.")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Training complete.
