## Import all the necessary packages for inference and fine-tuning (not all of them may be used)

In [1]:
import os
import random
from textwrap import dedent
from typing import Dict, List

import matplotlib as mpl
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from colored import Back, Fore, Style
from datasets import Dataset, load_dataset
from matplotlib.ticker import PercentFormatter
from peft import (
    LoraConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


## Prepare for model inference profiling

In [3]:
# Seed shared by every random number generator used in this notebook.
SEED = 42
# Hugging Face access token used when downloading the tokenizer and model.
# It is read from the HF_TOKEN environment variable so that no credential is
# stored in the notebook. When the variable is unset this is None.
# NOTE(review): with None the Hugging Face libraries are expected to fall back
# to a locally cached login, if there is one - confirm for the installed version.
access_token = os.environ.get("HF_TOKEN")

def seed_everything(seed: int):
    """Seed the `random`, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed unchanged to each library's seeding function.
    """
    # Same order as before: Python stdlib, then NumPy, then PyTorch.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

# Checkpoint identifier and the string used as the padding token.
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
PAD_TOKEN = "<|pad|>"

# Seed the random number generators before any model work happens.
seed_everything(SEED)

In [4]:
# --- Tokenizer -------------------------------------------------------------
print("Downloding the toknizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=access_token,
    use_fast=True,
)
# Pad on the left side and register the dedicated pad token.
tokenizer.padding_side = "left"
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
print("Done")

# --- Model -----------------------------------------------------------------
print("Downloding the model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=access_token,
    device_map="auto",
)
print("Done")

# Show which device each module of the model was placed on.
print(model.hf_device_map)

Downloding the toknizer...
Done
Downloding the model...


Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.10s/it]


Done
{'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.norm': 1, 'model.rotary_emb': 1, 'lm_head': 1}


In [5]:
# For now, only simple examples are considered (each input is just a question)
# system prompt from : https://www.reddit.com/r/LocalLLaMA/comments/1cry85p/lmstudio_better_system_prompt_for_llama_3_8b_and/
# llama 3 formatting : https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-3/

def format_example(input: dict, tokenizer: "AutoTokenizer") -> str:
    """Render one question as a chat prompt that ends on the assistant header.

    A system / user / assistant conversation is built, with the assistant turn
    left empty, and rendered to text with the tokenizer's chat template. The
    end-of-turn marker that the template appends after the empty assistant turn
    is removed, so the returned text stops where the assistant's reply starts.

    Args:
        input: Mapping with a "question" key holding the user question.
        tokenizer: Object exposing `apply_chat_template(messages, tokenize=False)`
            that returns the rendered conversation as a string.

    Returns:
        The rendered prompt with one trailing "<|eot_id|>" removed (if present)
        and surrounding whitespace stripped.
    """
    eot_marker = "<|eot_id|>"
    prompt = dedent(
        f"""
    {input["question"]}
        """
    )
    messages = [
        {
            "role": "system",
            "content": "You are a knowledgeable, efficient, and direct AI assistant. Provide concise answers, focusing on the key information needed. Offer suggestions tactfully when appropriate to improve outcomes. Engage in productive collaboration with the user."
        },
        {
            "role": "user",
            "content": prompt
        },
        {
            "role": "assistant",
            "content": ""
        }
    ]
    rendered = tokenizer.apply_chat_template(messages, tokenize=False)
    # Remove the marker as an exact suffix. `str.rstrip("<|eot_id|>")` treats its
    # argument as a *set of characters* and would keep eating any trailing
    # '<', '|', 'e', 'o', 't', '_', 'i', 'd' or '>' that precede the marker.
    if rendered.endswith(eot_marker):
        rendered = rendered[: -len(eot_marker)]
    return rendered.strip()

def count_tokens(input: str, tokenizer: AutoTokenizer) -> int:
    """Return how many token ids the tokenizer produces for `input`.

    Args:
        input: Text to tokenize.
        tokenizer: Callable tokenizer whose result exposes an 'input_ids' entry.

    Returns:
        Length of the 'input_ids' sequence, with special tokens requested.
    """
    # NOTE(review): special tokens are requested here. If `input` is already a
    # chat-template rendering that starts with a begin-of-text marker, the count
    # may include a second one - confirm against the tokenizer's behaviour.
    encoding = tokenizer(
        input,
        return_attention_mask=False,
        add_special_tokens=True,
    )
    token_ids = encoding['input_ids']
    return len(token_ids)

# Smoke test: render one sample question and report its length in tokens.
input_example = dict(question="What is the capital of France?")
formatted_input = format_example(input_example, tokenizer)
print(formatted_input, f'length: {count_tokens(formatted_input, tokenizer)}', sep="\n")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a knowledgeable, efficient, and direct AI assistant. Provide concise answers, focusing on the key information needed. Offer suggestions tactfully when appropriate to improve outcomes. Engage in productive collaboration with the user.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of France?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
length: 84


In [6]:
# prepare custom dataset(dummy) for speed measurement
def prepare_dataset_for_speed_eval(input: dict, batch_size: int, tokenizer: AutoTokenizer):
    """Tokenize `batch_size` copies of one formatted prompt as a single batch.

    Args:
        input: Mapping with a "question" key, forwarded to `format_example`.
        batch_size: Number of copies of the prompt placed in the batch.
        tokenizer: Tokenizer used both to format the prompt and to encode it.

    Returns:
        The tokenizer's output for the batch, padded and returned as PyTorch
        tensors.
    """
    replicated_prompts = [format_example(input, tokenizer) for _ in range(batch_size)]
    # NOTE(review): the tokenizer's default special-token handling is used here.
    # The formatted prompt already begins with a begin-of-text marker, so a
    # second one may be prepended - confirm against the tokenizer's behaviour.
    return tokenizer(replicated_prompts, return_tensors="pt", padding=True)

In [7]:
# Build one example batch on the GPU and inspect its shape and attention mask.
BATCH_SIZE = 8
input_example = dict(question="Who is the richest person in the world? Is it Elon Musk?")
dataset = prepare_dataset_for_speed_eval(input_example, BATCH_SIZE, tokenizer)
dataset = dataset.to("cuda")
print(dataset['input_ids'].shape, dataset['attention_mask'], sep="\n")

torch.Size([8, 91])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

## Profile prefill/decoding stage of the LLM (Demo version)

In [8]:

import json

# Number of copies of each prompt per profiled batch.
BATCH_SIZE = 4
# Number of tokens generated in each decoding measurement.
MAX_NEW_TOKENS = 50
# Iteration counts: untimed warm-up runs, then timed prefill and decoding runs.
WARMUP_STEPS, PREFILL_STEPS, DECODING_STEPS = 50, 20, 20

# Result boards; the profiling loop appends one averaged latency per question.
prefill_latencies_board, decoding_latencies_board = [], []

# Prompts used for profiling, listed roughly from shortest to longest. Each
# entry is wrapped as {"question": <text>}.
questions = [
    {"question": text}
    for text in (
        "Do you like music?",
        "What time do you wake up?",
        "Which season do you like most?",
        "What do you usually eat for breakfast?",
        "Can you name three things you enjoy about your job or studies?",
        "If you had a superpower, what would it be, and how would you use it?",
        "How do you spend your weekends, and what makes them enjoyable?",
        "Can you share a memorable experience from your childhood in a few sentences?",
        "If you could change one thing about the world, what would it be and why do you think it’s important?",
        "Imagine you have unlimited resources for a year. What would you do, and how would it impact others?",
        "If you were tasked with organizing a community event to bring people together, what type of event would you plan, and how would you ensure its success?",
        "Suppose you were given the opportunity to learn any skill instantly. Which skill would you choose, and how do you think it would improve your life?",
        "Imagine you could meet a famous historical figure from the past. Who would it be, what questions would you ask them, and why would you choose this person?",
        "If you were to write a book about your life, what would the title be, and what key lessons or experiences would you include in the story?",
        "Suppose you had to design a plan to make your community more environmentally friendly. What specific changes or initiatives would you propose, and how would you encourage others to participate?",
        "If you could live in a different era or time period, which one would you choose, and what aspects of life during that time appeal to you the most?",
        "Imagine you are offered a chance to create a new invention that could solve a pressing global problem. What would your invention do, and how would it impact society?",
        "Consider a situation where you are asked to give a speech to inspire young people to follow their dreams. What message would you share, and what examples from your own life would you use to motivate them?",
        "If you were in charge of a large project that required managing a diverse team of people, how would you foster collaboration, resolve conflicts, and ensure the project's success?",
        "Picture a scenario where you have unlimited resources to improve education globally. What changes would you implement, how would you address inequalities, and what long-term outcomes would you hope to achieve from your initiatives?",
    )
]


# pad_token_matching: grow the embedding matrix to the tokenizer's vocabulary
# size and tell generation which id is used for padding.
model.resize_token_embeddings(len(tokenizer))
model.generation_config.pad_token_id = tokenizer.pad_token_id

# One GPU-resident batch per question, plus the token length of each prompt.
datasets = [
    prepare_dataset_for_speed_eval(sample, BATCH_SIZE, tokenizer).to("cuda")
    for sample in questions
]
dataset_lengths_in_tokens = [
    count_tokens(format_example(sample, tokenizer), tokenizer)
    for sample in questions
]

def _timed_generate_ms(llm, inputs, **generate_kwargs) -> float:
    """Run one `llm.generate` call and return its duration in milliseconds.

    The duration is measured with a pair of CUDA events recorded immediately
    before and after the call, with a device synchronisation on both sides.
    """
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    llm.generate(**inputs, **generate_kwargs)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end)


# Profiling prefill and decoding.
# For each question two averages are appended to the result boards:
#   * prefill: mean time of a generate call that produces a single token;
#   * decoding: mean time per decoding step, i.e. the time of a full
#     MAX_NEW_TOKENS generation minus the prefill time, divided by the number
#     of forward passes that follow the prefill pass.
with torch.inference_mode():
    for i, dataset in enumerate(datasets):
        # warmup (first batch only)
        if i == 0:
            print("Warming up...")
            torch.cuda.synchronize()
            for _ in range(WARMUP_STEPS):
                model.generate(**dataset, max_new_tokens = 1)
            torch.cuda.synchronize()
            print("Warming up done \n")

        print(f'Profiling step {i+1}')
        # actual measurement(prefill)
        print("Start measuring prefill latency...")
        prefill_latencies = [
            _timed_generate_ms(model, dataset, max_new_tokens=1)
            for _ in range(PREFILL_STEPS)
        ]
        print("Prefill Measurement done")

        # actual measurement(decoding)
        # `max_new_tokens` bounds the generated length directly, so it does not
        # depend on a separately computed prompt length. `min_new_tokens` keeps
        # generation from stopping early at an end-of-sequence token, so every
        # run produces exactly MAX_NEW_TOKENS tokens and the divisor below is exact.
        print("Start measuring decoding latency...")
        prefill_decoding_latencies = [
            _timed_generate_ms(
                model,
                dataset,
                max_new_tokens=MAX_NEW_TOKENS,
                min_new_tokens=MAX_NEW_TOKENS,
            )
            for _ in range(DECODING_STEPS)
        ]
        print("Decoding Measurement done")

        # Record the latencies
        avg_prefil = sum(prefill_latencies) / len(prefill_latencies)
        avg_total = sum(prefill_decoding_latencies) / len(prefill_decoding_latencies)
        # The first generated token comes out of the prefill pass, so only
        # MAX_NEW_TOKENS - 1 decoding passes remain once prefill is subtracted.
        decode_steps = max(MAX_NEW_TOKENS - 1, 1)
        avg_decode = (avg_total - avg_prefil) / decode_steps
        prefill_latencies_board.append(avg_prefil)
        decoding_latencies_board.append(avg_decode)
        
# Save the results: the two latency boards and the prompt lengths are written
# to a single JSON file in the working directory.
results = {
    "prefill_latencies": prefill_latencies_board,
    "decoding_latencies": decoding_latencies_board,
    "dataset_lengths_in_tokens": dataset_lengths_in_tokens,
}
with open("profiling_results.json", "w") as f:
    f.write(json.dumps(results))

print("Result saved, profiling done")


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Warming up...
Warming up done 

Profiling step 1
Start measuring prefill latency...
Prefill Measurement done
Start measuring decoding latency...
Decoding Measurement done
Profiling step 2
Start measuring prefill latency...
Prefill Measurement done
Start measuring decoding latency...
Decoding Measurement done
Profiling step 3
Start measuring prefill latency...
Prefill Measurement done
Start measuring decoding latency...
Decoding Measurement done
Profiling step 4
Start measuring prefill latency...
Prefill Measurement done
Start measuring decoding latency...
Decoding Measurement done
Profiling step 5
Start measuring prefill latency...
Prefill Measurement done
Start measuring decoding latency...
Decoding Measurement done
Profiling step 6
Start measuring prefill latency...
Prefill Measurement done
Start measuring decoding latency...
Decoding Measurement done
Profiling step 7
Start measuring prefill latency...
Prefill Measurement done
Start measuring decoding latency...
Decoding Measurement 