### E-LlaMA-13B Expert Creation
### Binary Mask

In [1]:
# Dependencies
#!pip install --upgrade pip
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121  # For CUDA 12.1
#!pip install transformers accelerate safetensors bitsandbytes xformers
#!pip install scipy sentencepiece
#!pip install ipython rich matplotlib pandas tqdm
#sudo apt-get install gcsfuse
#sudo apt-get update
#sudo apt-get install fuse
#sudo modprobe fuse

In [1]:
# load dependencies
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [2]:
# Connect to Bucket - only needed if copying files from bucket

# Auto-mount GCS bucket on login
#sudo mkdir -p $HOME/MeLLaMA-13B
#sudo fusermount -u $HOME/MeLLaMA-13B 2>/dev/null
#sudo gcsfuse --implicit-dirs dhxj_models $HOME/MeLLaMA-13B

In [1]:
# Activate MoeME env

# Mount SSD to VM
!sudo ln -s /mnt/models ~/models

# Ensure models folder is visible in explorer
!sudo ln -s /mnt/models ~/models

ln: failed to create symbolic link '/home/dyh2111/models/models': File exists


In [4]:
#Get cuda ver - should be 12.4
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Mar_28_02:18:24_PDT_2024
Cuda compilation tools, release 12.4, V12.4.131
Build cuda_12.4.r12.4/compiler.34097967_0


In [5]:
# Install Torch for CUDA 12.4
#!pip3 install torch torchvision torchaudio

In [3]:
# Verify GPU works
torch.cuda.is_available()

True

In [7]:
# Get GPU info
!nvidia-smi -L

GPU 0: NVIDIA L4 (UUID: GPU-4b52dab4-145c-2bf0-ad48-073aa89568d2)


In [5]:
# Set model path
model_path = "/mnt/models/MeLLaMA-13B"

In [6]:
# Load Baseline MeLLaMA-13B Model

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load model with architecture access
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [7]:
# View all params - prints them out
for name, param in model.named_parameters():
    print(name, param.shape)


model.embed_tokens.weight torch.Size([32000, 5120])
model.layers.0.self_attn.q_proj.weight torch.Size([5120, 5120])
model.layers.0.self_attn.k_proj.weight torch.Size([5120, 5120])
model.layers.0.self_attn.v_proj.weight torch.Size([5120, 5120])
model.layers.0.self_attn.o_proj.weight torch.Size([5120, 5120])
model.layers.0.mlp.gate_proj.weight torch.Size([13824, 5120])
model.layers.0.mlp.up_proj.weight torch.Size([13824, 5120])
model.layers.0.mlp.down_proj.weight torch.Size([5120, 13824])
model.layers.0.input_layernorm.weight torch.Size([5120])
model.layers.0.post_attention_layernorm.weight torch.Size([5120])
model.layers.1.self_attn.q_proj.weight torch.Size([5120, 5120])
model.layers.1.self_attn.k_proj.weight torch.Size([5120, 5120])
model.layers.1.self_attn.v_proj.weight torch.Size([5120, 5120])
model.layers.1.self_attn.o_proj.weight torch.Size([5120, 5120])
model.layers.1.mlp.gate_proj.weight torch.Size([13824, 5120])
model.layers.1.mlp.up_proj.weight torch.Size([13824, 5120])
model.l

In [9]:
# Test prompt on the model
# Note: 4096 Max token length

def prompt_model(model, tokenizer, prompt, max_new_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [10]:
prompt = "The following are questions from the medical board exam, choose the best answer."

In [11]:
# Test prompt

question1 = 'A 27-year-old woman comes to the office for counseling prior to conception. She states that a friend recently delivered a newborn with a neural tube defect and she wants to decrease her risk for having a child with this condition. She has no history of major medical illness and takes no medications. Physical examination shows no abnormalities. It is most appropriate to recommend that this patient begin supplementation with a vitamin that is a cofactor in which of the following processes?\
(A) Biosynthesis of nucleotides\
(B) Protein gamma glutamate carboxylation\
(C) Scavenging of free radicals\
(D) Transketolation\
(E) Triglyceride lipolysis'

response = prompt_model(model, tokenizer, f"Instructions: {prompt} Question: {question1}")
print(response)

Instructions: The following are questions from the medical board exam, choose the best answer. Question: A 27-year-old woman comes to the office for counseling prior to conception. She states that a friend recently delivered a newborn with a neural tube defect and she wants to decrease her risk for having a child with this condition. She has no history of major medical illness and takes no medications. Physical examination shows no abnormalities. It is most appropriate to recommend that this patient begin supplementation with a vitamin that is a cofactor in which of the following processes?(A) Biosynthesis of nucleotides(B) Protein gamma glutamate carboxylation(C) Scavenging of free radicals(D) Transketolation(E) Triglyceride lipolysis Answer: A The recommended daily allowance (RDA) for folic acid is 400 micrograms per day for women of childbearing age. The RDA for vitamin B12 is 2.4 micro


In [12]:
# Test prompt

question = "A 50-year-old man comes to the office because of a 2-month history of increasing daytime somnolence. He has\
obstructive sleep apnea for which he has only intermittently used a continuous positive airway pressure device. He is\
170 cm (5 ft 7 in) tall and weighs 181 kg (400 lb); BMI is 63 kg/m2\
. His temperature is 37°C (98.6°F), pulse is 100/min,\
respirations are 12/min, and blood pressure is 135/80 mm Hg. Physical examination shows a gray-blue tinge to the lips,\
earlobes, and nail beds. Cardiac examination shows no other abnormalities. Arterial blood gas analysis on room air\
shows a pH of 7.31, PCO2 of 70 mm Hg, and PO2 of 50 mm Hg. Which of the following additional findings would be\
most likely in this patient?"

response = prompt_model(model, tokenizer, f"Instructions: {prompt} Question: {question}")
print(response)

Instructions: The following are questions from the medical board exam, choose the best answer. Question: A 50-year-old man comes to the office because of a 2-month history of increasing daytime somnolence. He hasobstructive sleep apnea for which he has only intermittently used a continuous positive airway pressure device. He is170 cm (5 ft 7 in) tall and weighs 181 kg (400 lb); BMI is 63 kg/m2. His temperature is 37°C (98.6°F), pulse is 100/min,respirations are 12/min, and blood pressure is 135/80 mm Hg. Physical examination shows a gray-blue tinge to the lips,earlobes, and nail beds. Cardiac examination shows no other abnormalities. Arterial blood gas analysis on room airshows a pH of 7.31, PCO2 of 70 mm Hg, and PO2 of 50 mm Hg. Which of the following additional findings would bemost likely in this patient?
A. Oxygen saturation of 80% on room air.
B. Oxygen saturation of 90% on room air.
C. Oxygen saturation of 90% on 1


In [19]:
from torchinfo import summary

summary(model, input_size=(1, tokenizer.model_max_length), dtypes=[torch.long])
dummy_input = tokenizer("Hello world", return_tensors="pt")
summary(model, input_data=dummy_input)


TypeError: rand(): argument 'size' failed to unpack the object at pos 2 with error "Overflow when unpacking long"