In [2]:
from fpga_qwen import *

# Report the size/count hyperparameters, one per line. The names are not
# defined in this cell; they are expected to come from the notebook's star
# import.
print(
    f"DECODER_NUM_WQ:     {DECODER_NUM_WQ}",
    f"DECODER_NUM_WS:     {DECODER_NUM_WS}",
    f"DECODER_W_BYTES:    {DECODER_W_BYTES}",
    f"CLS_NUM_WQ:         {CLS_NUM_WQ}",
    f"CLS_NUM_WS:         {CLS_NUM_WS}",
    f"CLS_W_BYTES:        {CLS_W_BYTES}",
    f"DECODER_NUM_X:      {DECODER_NUM_X}",
    f"DECODER_X_BITS:     {DECODER_X_BITS}",
    f"DECODER_X_BYTES:    {DECODER_X_BYTES}",
    f"CLS_NUM_Y:          {CLS_NUM_Y}",
    f"CLS_Y_BITS:         {CLS_Y_BITS}",
    f"CLS_Y_BYTES:        {CLS_Y_BYTES}",
    f"KV_CACHE_BYTES:     {KV_CACHE_BYTES}",
    f"EMBEDDING_BYTES:    {EMBEDDING_BYTES}",
    f"TOTAL_BYTES:        {TOTAL_BYTES}",
    sep="\n",
)

# Report the ADDR_* values as zero-padded, 16-digit (64-bit) upper-case hex.
print(
    f"ADDR_DECODER_X:    {ADDR_DECODER_X:016X}",
    f"ADDR_DECODER_Y:    {ADDR_DECODER_Y:016X}",
    f"ADDR_CLS_Y:        {ADDR_CLS_Y:016X}",
    f"ADDR_DECODER_W:    {ADDR_DECODER_W:016X}",
    f"ADDR_CLS_W:        {ADDR_CLS_W:016X}",
    f"ADDR_KV_CACHE:     {ADDR_KV_CACHE:016X}",
    sep="\n",
)

# Zero every element of the KV cache buffer before the accelerator is built.
ddr_kv_cache[:] = 0

# Build the Qwen FPGA accelerator object that the later cells drive.
accelerator = QWEN_ACCELERATOR(qwen, tokenizer, S)

DECODER_NUM_WQ:     16285696
DECODER_NUM_WS:     2035712
DECODER_W_BYTES:    11705344
CLS_NUM_WQ:         136134656
CLS_NUM_WS:         17016832
CLS_W_BYTES:        97846784
DECODER_NUM_X:      7168
DECODER_X_BITS:     229376
DECODER_X_BYTES:    28672
CLS_NUM_Y:          1215488
CLS_Y_BITS:         38895616
CLS_Y_BYTES:        4861952
KV_CACHE_BYTES:     16515072
EMBEDDING_BYTES:    272269312
TOTAL_BYTES:        378775040
ADDR_DECODER_X:    00000000375B0000
ADDR_DECODER_Y:    00000000375B8000
ADDR_CLS_Y:        0000000037D00000
ADDR_DECODER_W:    0000000038200000
ADDR_CLS_W:        0000000048E00000
ADDR_KV_CACHE:     000000004EC00000


In [3]:
def apply_chat_template(prompt):
    """Convert a single user prompt into the model's input ids.

    The prompt is placed in a two-turn conversation (a fixed system message
    followed by the user message), rendered through the tokenizer's chat
    template with the generation prompt appended, and the rendered result is
    then passed to ``tokenizer.encode``.

    Args:
        prompt: Content of the user turn.

    Returns:
        The value returned by ``tokenizer.encode`` for the rendered chat
        (presumably a list of token ids -- confirm against the tokenizer).
    """
    system_turn = {
        "role": "system",
        "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
    }
    user_turn = {"role": "user", "content": prompt}

    # tokenize=False: the template step is asked for the rendered chat rather
    # than ids; encoding happens separately below.
    chat_text = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    return tokenizer.encode(chat_text, add_special_tokens=True)

In [11]:
"""A DEMO for in-time generation"""
# Alternative prompts kept for quick experiments; uncomment one to try it
# (an uncommented line placed above the active assignment is overridden by it).
# The first candidate ends with a Chinese question ("Where is this sentence from?").
# prompt = "To be, or not to be, that is the question. 这句话出自哪里？"
# prompt = "Please introduce Peking University."
# prompt = "Complete the text: In the beginning God created the heavens and the earth. Now the earth was formless and empty, darkness was over the surface of the deep, and the Spirit of God was hovering over the waters."
# prompt = "give the source code of  'quick sort' in c++ as short as possible, no comment."
# prompt = "What is one plus one? You only need to give the answer."
# The next candidate is Chinese for "What does 1+1 equal?".
# prompt = "1+1等于多少？"
prompt = "How many tastes does oreo have?"

input_ids = apply_chat_template(prompt)

# Stream the generation: decode and print each item yielded by generate_iter
# as soon as it arrives. Special tokens are kept in the decoded text.
for output_ids in accelerator.generate_iter(input_ids, include_prefill=True):
    output_seq = tokenizer.decode(output_ids, skip_special_tokens=False)
    # end="" keeps the pieces on one continuous stream; flush=True pushes each
    # piece to the front-end immediately instead of relying on whatever
    # buffering policy stdout happens to have.
    print(output_seq, end="", flush=True)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
give the source code of  'quick sort' in c++ as short as possible, no comment.<|im_end|>
<|im_start|>assistant
```cpp
#include <iostream>
#include <vector>

void quickSort(std::vector<int>& arr, int low, int high) {
    if (low < high) {
        int pi = partition(arr, low, high);
        quickSort(arr, low, pi - 1);
        quickSort(arr, pi + 1, high);
    }
}

int partition(std::vector<int>& arr, int low, int high) {
    int pivot = arr[high];
    int i = low - 1;
    for (int j = low; j < high; j++) {
        if (arr[j] < pivot) {
            i++;
            std::swap(arr[i],

In [6]:
"""directly generate all output"""
# One-shot generation: produce the whole output first, then decode and print
# it once. The active prompt is Chinese for "What is the capital of China?";
# the commented-out line below is the English equivalent.
prompt = "中国的首都是？"
# prompt = "What is the capital of China?"

# Wrap the prompt in the chat template and encode it.
input_ids = apply_chat_template(prompt)
# include_prefill=True presumably makes the returned ids include the prompt
# tokens as well as the generated ones -- confirm against QWEN_ACCELERATOR.
output_ids = accelerator.generate(input_ids, include_prefill=True)
# skip_special_tokens=False keeps special tokens in the decoded text.
output_seq = tokenizer.decode(output_ids, skip_special_tokens=False)
print(output_seq)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
中国的首都是？<|im_end|>
<|im_start|>assistant
中国的首都是北京。<|im_end|>


In [9]:
# test throughput

# Clocks.fclk0_mhz = 375


def _report_throughput(label, run_once, tokens_per_call=1, repeats=10):
    """Time ``run_once(0)`` repeatedly and print a tokens-per-second figure.

    Args:
        label: Prefix of the printed line (e.g. "Prefill" or "Decode").
        run_once: Callable invoked as ``run_once(0)`` for each measurement.
            The meaning of the ``0`` argument is defined by the accelerator
            -- TODO confirm.
        tokens_per_call: Number of tokens credited to one call; the printed
            value is ``tokens_per_call / elapsed_seconds``.
        repeats: Number of timed calls; one line is printed per call.
    """
    for _ in range(repeats):
        # perf_counter is a monotonic, high-resolution clock, so the measured
        # interval cannot be distorted by wall-clock adjustments.
        start = time.perf_counter()
        run_once(0)
        elapsed = time.perf_counter() - start
        print(f"{label} Throughput: {tokens_per_call / elapsed:.2f} tok/s")


# One prefill call is credited with T tokens (T is defined outside this cell;
# presumably the prefill length -- confirm).
_report_throughput("Prefill", accelerator.run_prefill, tokens_per_call=T)

# One run_hardware call is credited with a single token.
_report_throughput("Decode", accelerator.run_hardware)


Prefill Throughput: 165.01 tok/s
Prefill Throughput: 165.14 tok/s
Prefill Throughput: 165.69 tok/s
Prefill Throughput: 165.50 tok/s
Prefill Throughput: 165.67 tok/s
Prefill Throughput: 165.28 tok/s
Prefill Throughput: 165.60 tok/s
Prefill Throughput: 165.60 tok/s
Prefill Throughput: 165.73 tok/s
Prefill Throughput: 165.52 tok/s
Decode Throughput: 9.44 tok/s
Decode Throughput: 9.44 tok/s
Decode Throughput: 9.40 tok/s
Decode Throughput: 9.45 tok/s
Decode Throughput: 9.41 tok/s
Decode Throughput: 9.42 tok/s
Decode Throughput: 9.39 tok/s
Decode Throughput: 9.44 tok/s
Decode Throughput: 9.41 tok/s
Decode Throughput: 9.46 tok/s
