In [3]:
import numpy as np
import os
import time
import gc
from transformers import AutoTokenizer
from threading import Lock
from utils.session import Session
from config import InferenceConfig
import torch

class Inference:
    """Token-by-token chat generation on top of a project ``Session`` backend.

    The class wires a Hugging Face tokenizer to ``utils.session.Session`` and
    exposes ``predict`` for chat-style generation plus a small logits sampler.
    """

    def __init__(self, config: "InferenceConfig") -> None:
        """Load the tokenizer, build the session and reset the run state.

        Args:
            config: Supplies ``max_input_length``, ``max_output_length``,
                ``tokenizer_dir``, ``session_type`` and ``kv_cache_length``;
                it is also handed unchanged to ``Session.fromConfig``.
        """
        self.max_input_length = config.max_input_length
        self.max_output_length = config.max_output_length
        # trust_remote_code=True allows the tokenizer repository to run its own
        # Python code, so tokenizer_dir must point at a trusted checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained(
            config.tokenizer_dir, trust_remote_code=True
        )
        self.session = Session.fromConfig(config)
        self.session_type = config.session_type
        self.kv_cache_length = config.kv_cache_length
        # Snapshot returned by getState(); nothing in this class updates it.
        self.state: dict = {"code": 200, "isEnd": False, "message": ""}
        self.lock = Lock()
        # reset() sets self.first = True and clears the session counters.
        self.reset()
        print("[INFO] init success")

    def generate_cache(self, prompt: str):
        """Run the session once over ``prompt`` and greedily pick a next token.

        Returns:
            ``None`` when ``prompt`` is empty, otherwise ``(next_token, logits)``
            where ``logits`` is element 0 of what ``self.session.run`` returned.
        """
        if len(prompt) == 0:
            return
        self.first = False
        input_ids = np.asarray(self.tokenizer.encode(prompt), dtype=np.int64).reshape(1, -1)
        logits = self.session.run(input_ids)[0]
        # NOTE(review): predict() indexes the raw session output as
        # logits[0][-1:], whereas here element 0 is taken first and then
        # indexed with [0][-1:] again. Confirm which layout Session.run
        # actually returns; one of the two call sites is probably off by a axis.
        next_token = self.sample_logits(logits[0][-1:])
        return next_token, logits

    def sample_logits(self, logits: np.ndarray, sampling_method: str = "greedy", sampling_value: float = None, temperature: float = 1.0) -> np.ndarray:
        """Choose a token id from ``logits``.

        Args:
            logits: Scores whose last axis enumerates the candidate token ids.
                For ``top_k`` / ``top_p`` only the first row is sampled.
            sampling_method: ``"greedy"``, ``"top_k"`` or ``"top_p"``.
            sampling_value: ``k`` for ``top_k`` (number of candidates kept) or
                the cumulative-probability threshold for ``top_p``.
            temperature: Divisor applied to the logits before the softmax;
                ``0`` forces greedy selection.

        Returns:
            Greedy: ``np.argmax(logits, axis=-1)`` as int64.
            Sampling: a one-element array holding the drawn token id.

        Raises:
            ValueError: If ``sampling_method`` is not one of the three names.
            AssertionError: If sampling is requested without ``sampling_value``.
        """
        if temperature == 0 or sampling_method == "greedy":
            return np.argmax(logits, axis=-1).astype(np.int64)
        if sampling_method not in ("top_k", "top_p"):
            raise ValueError(f"Unknown sampling method {sampling_method}")
        assert sampling_value is not None

        # float64 keeps the renormalised probabilities within the tolerance
        # np.random.choice enforces on sum(p).
        row = np.atleast_2d(np.asarray(logits, dtype=np.float64))[0] / temperature
        # Subtracting the maximum leaves the softmax unchanged but prevents
        # exp() from overflowing to inf (and the probabilities becoming NaN).
        row = row - np.max(row)
        probs = np.exp(row)
        probs /= probs.sum()

        order = np.argsort(probs)[::-1]  # token ids, most probable first
        sorted_probs = probs[order]

        if sampling_method == "top_k":
            # Keep exactly k candidates (the previous slice kept k + 1).
            keep = int(sampling_value)
        else:
            # Keep the shortest prefix whose cumulative probability exceeds p,
            # including the token that crosses the threshold.
            cumulative = np.cumsum(sorted_probs)
            keep = int(np.searchsorted(cumulative, sampling_value, side="right")) + 1
        keep = min(max(keep, 1), sorted_probs.size)

        candidate_ids = order[:keep]
        candidate_probs = sorted_probs[:keep] / sorted_probs[:keep].sum()
        return np.array([np.random.choice(candidate_ids, p=candidate_probs)])

    def predict(self, prompt, history=None, system_prompt="You are a helpful assistant.", max_new_tokens=1024):
        """Generate a reply to ``prompt`` with greedy decoding.

        Args:
            prompt: The new user message.
            history: Optional iterable of ``(user_msg, bot_msg)`` pairs that
                are replayed before ``prompt``.
            system_prompt: Content of the leading system message.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            ``self.tokenizer.decode`` applied to the generated ids. When
            generation stops on the EOS token that token is part of the
            decoded ids. An empty string is returned when no token can be
            generated (token budget <= 0).

        Raises:
            ValueError: If ``self.session_type`` is not ``"pytorch"``.
        """
        if history is None:
            history = []
        messages = [{"role": "system", "content": system_prompt}]
        for (use_msg, bot_msg) in history:
            messages.append({"role": "user", "content": use_msg})
            messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": prompt})

        text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        if self.session_type == "pytorch":
            input_ids = self.tokenizer([text], return_tensors="pt")["input_ids"].to(torch.long).reshape(1, -1)
        else:
            raise ValueError(f"unknown session_type {self.session_type}")

        # Keep only the most recent max_input_length prompt tokens.
        input_ids = input_ids[:, -self.max_input_length:]
        self.first = False
        ids_list = []
        input_length = input_ids.shape[1]
        # The budget is limited both by the remaining room up to
        # max_output_length and by max_new_tokens; it may be <= 0.
        max_output_len = min(self.max_output_length - input_length, max_new_tokens)

        for _ in range(max(max_output_len, 0)):
            logits = self.session.run(input_ids)
            # After the first step only the freshly sampled token is fed back.
            input_ids = self.sample_logits(logits[0][-1:]).reshape(1, -1)
            token_id = input_ids[0].item()
            ids_list.append(token_id)

            # Early stop if EOS token is generated
            if token_id == self.tokenizer.eos_token_id:
                break

        if not ids_list:
            # Nothing was generated; previously this path raised
            # UnboundLocalError because text_out was never assigned.
            return ""
        # Decode once at the end instead of re-decoding the whole list per step.
        return self.tokenizer.decode(ids_list)

    def reset(self):
        """Mark the instance as fresh and reset the underlying session."""
        self.first = True
        self.session.run_times = 0
        self.session.reset()

    def getState(self):
        """Return a shallow copy of ``self.state`` taken under ``self.lock``."""
        with self.lock:
            return self.state.copy()

# Hardcoded configuration (you can set values directly here).
# Arguments are grouped by concern; all are passed by keyword.
config = InferenceConfig(
    # --- backend selection ---
    session_type="pytorch",
    cpu_thread=4,
    # --- model locations ---
    hf_model_dir="/data/shaos/data/Qwen2.5-VL-3B-Instruct",
    om_model_path="path_to_om_model",
    onnx_model_path="path_to_onnx_model",
    # tokenizer_dir is intentionally not passed here:
    # tokenizer_dir="path_to_tokenizer"
    # --- sequence / cache limits ---
    max_batch=1,
    max_input_length=1024,
    max_output_length=2048,
    max_prefill_length=4,
    kv_cache_length=2048,
    # --- numeric precision ---
    dtype="float32",
    torch_dtype="float32",
)

# Running the prediction: build the engine, ask one question, show the reply.
inference = Inference(config)
prompt = "What is the capital of France?"
output = inference.predict(prompt)
print("Generated Output:", output)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] init success
Generated Output: The capital of France is Paris.<|im_end|>


In [None]:
import numpy as np
import os
import time
import gc
from transformers import AutoTokenizer
from threading import Lock
from utils.session import Session
from config import InferenceConfig
import torch
import torch_npu

class Inference:
    """Token-by-token chat generation on top of a project ``Session`` backend.

    Variant of the engine that additionally records whether an Ascend NPU is
    visible to ``torch`` (via ``torch_npu``) in ``self.device``.
    """

    def __init__(self, config: "InferenceConfig") -> None:
        """Load the tokenizer, build the session and select a torch device.

        Args:
            config: Supplies ``max_input_length``, ``max_output_length``,
                ``tokenizer_dir``, ``session_type`` and ``kv_cache_length``;
                it is also handed unchanged to ``Session.fromConfig``.
        """
        self.max_input_length = config.max_input_length
        self.max_output_length = config.max_output_length
        # trust_remote_code=True allows the tokenizer repository to run its own
        # Python code, so tokenizer_dir must point at a trusted checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained(
            config.tokenizer_dir, trust_remote_code=True
        )
        self.session = Session.fromConfig(config)
        self.session_type = config.session_type
        self.kv_cache_length = config.kv_cache_length
        # Snapshot returned by getState(); nothing in this class updates it.
        self.state: dict = {"code": 200, "isEnd": False, "message": ""}
        self.lock = Lock()
        # reset() sets self.first = True and clears the session counters.
        self.reset()
        print("[INFO] init success")

        # Set device to NPU when available, otherwise fall back to CPU.
        # NOTE(review): self.device is not read anywhere else in this class,
        # so inputs are not moved to it here; confirm whether Session handles
        # device placement itself.
        if torch.npu.is_available():
            self.device = torch.device("npu")
            print("[INFO] NPU context set successfully")
        else:
            self.device = torch.device("cpu")
            # Previously the success message was printed even on this path.
            print("[INFO] NPU not available, falling back to CPU")

    def generate_cache(self, prompt: str):
        """Run the session once over ``prompt`` and greedily pick a next token.

        Returns:
            ``None`` when ``prompt`` is empty, otherwise ``(next_token, logits)``
            where ``logits`` is element 0 of what ``self.session.run`` returned.
        """
        if len(prompt) == 0:
            return
        self.first = False
        input_ids = np.asarray(self.tokenizer.encode(prompt), dtype=np.int64).reshape(1, -1)
        logits = self.session.run(input_ids)[0]
        # NOTE(review): predict() indexes the raw session output as
        # logits[0][-1:], whereas here element 0 is taken first and then
        # indexed with [0][-1:] again. Confirm which layout Session.run
        # actually returns; one of the two call sites is probably off by a axis.
        next_token = self.sample_logits(logits[0][-1:])
        return next_token, logits

    def sample_logits(self, logits: np.ndarray, sampling_method: str = "greedy", sampling_value: float = None, temperature: float = 1.0) -> np.ndarray:
        """Choose a token id from ``logits``.

        Args:
            logits: Scores whose last axis enumerates the candidate token ids.
                For ``top_k`` / ``top_p`` only the first row is sampled.
            sampling_method: ``"greedy"``, ``"top_k"`` or ``"top_p"``.
            sampling_value: ``k`` for ``top_k`` (number of candidates kept) or
                the cumulative-probability threshold for ``top_p``.
            temperature: Divisor applied to the logits before the softmax;
                ``0`` forces greedy selection.

        Returns:
            Greedy: ``np.argmax(logits, axis=-1)`` as int64.
            Sampling: a one-element array holding the drawn token id.

        Raises:
            ValueError: If ``sampling_method`` is not one of the three names.
            AssertionError: If sampling is requested without ``sampling_value``.
        """
        if temperature == 0 or sampling_method == "greedy":
            return np.argmax(logits, axis=-1).astype(np.int64)
        if sampling_method not in ("top_k", "top_p"):
            raise ValueError(f"Unknown sampling method {sampling_method}")
        assert sampling_value is not None

        # float64 keeps the renormalised probabilities within the tolerance
        # np.random.choice enforces on sum(p).
        row = np.atleast_2d(np.asarray(logits, dtype=np.float64))[0] / temperature
        # Subtracting the maximum leaves the softmax unchanged but prevents
        # exp() from overflowing to inf (and the probabilities becoming NaN).
        row = row - np.max(row)
        probs = np.exp(row)
        probs /= probs.sum()

        order = np.argsort(probs)[::-1]  # token ids, most probable first
        sorted_probs = probs[order]

        if sampling_method == "top_k":
            # Keep exactly k candidates (the previous slice kept k + 1).
            keep = int(sampling_value)
        else:
            # Keep the shortest prefix whose cumulative probability exceeds p,
            # including the token that crosses the threshold.
            cumulative = np.cumsum(sorted_probs)
            keep = int(np.searchsorted(cumulative, sampling_value, side="right")) + 1
        keep = min(max(keep, 1), sorted_probs.size)

        candidate_ids = order[:keep]
        candidate_probs = sorted_probs[:keep] / sorted_probs[:keep].sum()
        return np.array([np.random.choice(candidate_ids, p=candidate_probs)])

    def predict(self, prompt, history=None, system_prompt="You are a helpful assistant.", max_new_tokens=1024):
        """Generate a reply to ``prompt`` with greedy decoding.

        Args:
            prompt: The new user message.
            history: Optional iterable of ``(user_msg, bot_msg)`` pairs that
                are replayed before ``prompt``.
            system_prompt: Content of the leading system message.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            ``self.tokenizer.decode`` applied to the generated ids. When
            generation stops on the EOS token that token is part of the
            decoded ids. An empty string is returned when no token can be
            generated (token budget <= 0).

        Raises:
            ValueError: If ``self.session_type`` is not ``"pytorch"``.
        """
        if history is None:
            history = []
        messages = [{"role": "system", "content": system_prompt}]
        for (use_msg, bot_msg) in history:
            messages.append({"role": "user", "content": use_msg})
            messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": prompt})

        text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        if self.session_type == "pytorch":
            input_ids = self.tokenizer([text], return_tensors="pt")["input_ids"].to(torch.long).reshape(1, -1)
        else:
            raise ValueError(f"unknown session_type {self.session_type}")

        # Keep only the most recent max_input_length prompt tokens.
        input_ids = input_ids[:, -self.max_input_length:]
        self.first = False
        ids_list = []
        input_length = input_ids.shape[1]
        # The budget is limited both by the remaining room up to
        # max_output_length and by max_new_tokens; it may be <= 0.
        max_output_len = min(self.max_output_length - input_length, max_new_tokens)

        for _ in range(max(max_output_len, 0)):
            logits = self.session.run(input_ids)
            # After the first step only the freshly sampled token is fed back.
            input_ids = self.sample_logits(logits[0][-1:]).reshape(1, -1)
            token_id = input_ids[0].item()
            ids_list.append(token_id)

            # Early stop if EOS token is generated
            if token_id == self.tokenizer.eos_token_id:
                break

        if not ids_list:
            # Nothing was generated; previously this path raised
            # UnboundLocalError because text_out was never assigned.
            return ""
        # Decode once at the end instead of re-decoding the whole list per step.
        return self.tokenizer.decode(ids_list)

    def reset(self):
        """Mark the instance as fresh and reset the underlying session."""
        self.first = True
        self.session.run_times = 0
        self.session.reset()

    def getState(self):
        """Return a shallow copy of ``self.state`` taken under ``self.lock``."""
        with self.lock:
            return self.state.copy()

# Hardcoded configuration (you can set values directly here).
# Arguments are grouped by concern; all are passed by keyword.
config = InferenceConfig(
    # --- backend selection ---
    session_type="pytorch",
    cpu_thread=4,
    # --- model locations ---
    hf_model_dir="/data/shaos/data/Qwen2.5-VL-3B-Instruct",
    om_model_path="path_to_om_model",
    onnx_model_path="path_to_onnx_model",
    # tokenizer_dir is intentionally not passed here:
    # tokenizer_dir="path_to_tokenizer"
    # --- sequence / cache limits ---
    max_batch=1,
    max_input_length=1024,
    max_output_length=2048,
    max_prefill_length=4,
    kv_cache_length=2048,
    # --- numeric precision ---
    dtype="float32",
    torch_dtype="float32",
)

# Running the prediction: build the engine, ask one question, show the reply.
inference = Inference(config)
prompt = "What is the capital of France?"
output = inference.predict(prompt)
print("Generated Output:", output)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] init success
[INFO] NPU context set successfully
