In [1]:
import os
import sys
import numpy as np
import pandas as pd
import torch as th
from torch.utils.tensorboard import SummaryWriter
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig,
                          TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, DataCollatorForTokenClassification)
from transformers.integrations import TensorBoardCallback
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, prepare_model_for_kbit_training)
from vllm import (LLM, SamplingParams)
from vllm_warpper import vLLMWrapper

In [2]:
device = th.device("cuda" if th.cuda.is_available() else "cpu")
devive_cnt = th.cuda.device_count()
print(f"device = {device}; devive_cnt = {devive_cnt}")
print(th.__version__)
print(th.version.cuda)

device = cuda; devive_cnt = 1


In [3]:
path_project = os.getcwd()
path_data = os.path.join(os.path.dirname(path_project), "data")
path_model = os.path.join(os.path.dirname(path_project), "model")
path_output = os.path.join(os.path.dirname(path_project), "output")

# step-1: LLM

In [4]:
checkpoint = "Qwen1.5-4B-Chat"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
if tokenizer.padding_side != "right":
    tokenizer.padding_side = "right"

In [8]:
model_base = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    torch_dtype=th.bfloat16
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
allocated_memory = th.cuda.memory_reserved()
cached_memory = th.cuda.memory_cached()
print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

已分配的GPU内存：8.73G, 已缓存的GPU内存：8.73G


# step-2: prompt

In [9]:
schema = """
{
 "infoList": [
    {
     "ssid": "string",
     "securityProtocol": "string",
     "bandwidth": "string"
    }
    ]
}
"""
schema = schema.replace("\n", "").replace(" ", "")

In [10]:
content_sys = (
    "You are a helpful assistant that answers in JSON. "
    f"Here's the json schema you must adhere to: \n<json>\n{schema}\n</json>\n"
    )
print(content_sys)

You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to: 
<json>
{"infoList":[{"ssid":"string","securityProtocol":"string","bandwidth":"string"}]}
</json>



In [11]:
content_usr = (
    "I'm currently configuring a wireless access point for our office network and I "
    "need to generate a JSON object that accurately represents its settings. "
    "The access point's SSID should be 'OfficeNetSecure', it uses WPA2-Enterprise "
    "as its security protocol, and it's capable of a bandwidth of up to 1300 Mbps "
    "on the 5 GHz band. This JSON object will be used to document our network "
    "configurations and to automate the setup process for additional access "
    "points in the future."
    )
suffix = "\nPlease provide a JSON presentation based on the schema.\n"
content_usr += suffix
print(content_usr)

I'm currently configuring a wireless access point for our office network and I need to generate a JSON object that accurately represents its settings. The access point's SSID should be 'OfficeNetSecure', it uses WPA2-Enterprise as its security protocol, and it's capable of a bandwidth of up to 1300 Mbps on the 5 GHz band. This JSON object will be used to document our network configurations and to automate the setup process for additional access points in the future.
Please provide a JSON presentation based on the schema.



In [13]:
messages = [
    {"role": "system", "content": content_sys},
    {"role": "user", "content": content_usr}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
print(text)

<|im_start|>system
You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to: 
<json>
{"infoList":[{"ssid":"string","securityProtocol":"string","bandwidth":"string"}]}
</json>
<|im_end|>
<|im_start|>user
I'm currently configuring a wireless access point for our office network and I need to generate a JSON object that accurately represents its settings. The access point's SSID should be 'OfficeNetSecure', it uses WPA2-Enterprise as its security protocol, and it's capable of a bandwidth of up to 1300 Mbps on the 5 GHz band. This JSON object will be used to document our network configurations and to automate the setup process for additional access points in the future.
Please provide a JSON presentation based on the schema.
<|im_end|>
<|im_start|>assistant



# step-3: inference-huggingface

In [14]:
max_new_tokens = 128
top_p = 0.9
temperature = 0.1
repetition_penalty = 1.5

In [15]:
model_inputs = tokenizer([text], return_tensors="pt").to(device)

In [16]:
model_base.eval()
with th.inference_mode():
    generated_ids = model_base.generate(
        model_inputs.input_ids,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        # temperature=temperature,
        # repetition_penalty=repetition_penalty
    )

In [17]:
generated_ids = [O[len(I): ] for (I, O) in zip(model_inputs.input_ids, generated_ids)]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

{
    "ssid": "OfficeNetSecure",
    "securityProtocol": "WPA2-Enterprise",
    "bandwidth": "1300 Mbps"
}


# step-4: inference-vllm

In [None]:
model_base_v = LLM(model=os.path.join(path_model, checkpoint))

In [None]:
model_base = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    torch_dtype=th.bfloat16
)

In [None]:
allocated_memory = th.cuda.memory_reserved()
cached_memory = th.cuda.memory_cached()
print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")