# Test

In [1]:
import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCHINDUCTOR_DISABLE"] = "1"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"   # 必须在 import transformers 前设置

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ" # online cache
model_id = r"C:\Users\c1052689\hug_models\Mistral7B_GPTQ" # local dir
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token        # 不要添加新token
tokenizer.padding_side = "left"                  # 解码器模型批量推理更稳
model.config.pad_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = tokenizer.eos_token_id

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
# print(tokenizer.chat_template) 


[33mWARN[0m  Python GIL is enabled: Multi-gpu quant acceleration for MoE models is sub-optimal and multi-core accelerated cpu packing is also disabled. We recommend Python >= 3.13.3t with Pytorch > 2.8 for mult-gpu quantization and multi-cpu packing with env `PYTHON_GIL=0`.
[33mWARN[0m  Feature `utils/Perplexity` requires python GIL or Python >= 3.13.3T (T for Threading-Free edition of Python) plus Torch 2.8. Feature is currently skipped/disabled.
[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.                                   
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.                                                           
[32mINFO[0m   Kernel: Auto-selection: adding candidate `TorchQuantLinear`                                                              


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


[32mINFO[0m  Format: Converting `checkpoint_format` from `gptq` to internal `gptq_v2`.                                                 
[32mINFO[0m  Format: Converting GPTQ v1 to v2                                                                                          
[32mINFO[0m  Format: Conversion complete: 0.005982875823974609s                                                                        
[32mINFO[0m  Optimize: `TorchQuantLinear` compilation triggered.                                                                       


In [3]:
pipe("Who are you?", max_new_tokens=100)

[{'generated_text': "Who are you?\nA: I'm Mistral, a language model trained by the Mistral AI team."}]

In [4]:
# 要 3 个不同样本：
out = pipe("Who are you?", do_sample=True, num_return_sequences=3, return_full_text=False, max_new_tokens=100)
for i, o in enumerate(out, 1):
    print(i, o["generated_text"])

1 
A: I'm Mistral, a language model trained by the Mistral AI team.
2 
Answer: I am Mistral, a Large Language Model trained by Mistral AI.
3 
A: AI language model 

What is your purpose?
A: To assist users by providing information and answering questions to the best of my ability.


In [5]:
out

[{'generated_text': "\nA: I'm Mistral, a language model trained by the Mistral AI team."},
 {'generated_text': '\nAnswer: I am Mistral, a Large Language Model trained by Mistral AI.'},
 {'generated_text': '\nA: AI language model \n\nWhat is your purpose?\nA: To assist users by providing information and answering questions to the best of my ability.'}]

In [6]:
messages = [
  {"role": "user", "content": "<<SYS>>You are a concise assistant.<</SYS>>\n\nWhat is a Transformer?"},
  {"role": "assistant", "content": "A brief definition..."},
  {"role": "user", "content": "Explain transformers in 3 bullets."}
]

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

out = pipe(text, max_new_tokens=120, return_full_text=False)  # 只要新生成的部分
print(out[0]["generated_text"])

* Transformers are neural network architectures that are designed to process sequential data, such as text or speech.
* They are inspired by the transformer model, which is a type of deep learning algorithm that can process sequences of variable length.
* Transformers are known for their ability to handle long-range dependencies in text, which allows them to accurately process tasks such as machine translation and text summarization.


In [8]:
assistant_name = "Nova"
user_name = "Marshall"

persona = f"""
<<SYS>>
- Your name is {assistant_name}. Always refer to yourself as "{assistant_name}".
- The user's name is {user_name}. Address the user as "{user_name}".
<</SYS>>
""".strip()
messages = [
  {"role": "user",
   "content": f"{persona}\n\nHi, introduce yourself in one line."}
]

# 生成后把回复存回历史（用于多轮）
reply = pipe(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True),
             max_new_tokens=120, return_full_text=False)[0]["generated_text"]
messages.append({"role":"assistant", "content": reply})

# 下一轮继续问
messages.append({"role":"user", "content": "Who are you? What's your purpose?"})
text = tokenizer.apply_chat_template(messages, tokenize=False)
out  = pipe(text, max_new_tokens=120, return_full_text=False)[0]["generated_text"]
print(out)

 I'm Nova, an AI language model designed to assist you with information and language-related tasks. My purpose is to provide you with accurate and relevant information, as well as help you with language-related tasks such as writing, translation, and grammar checks. How can I help you today?


In [11]:
text

'<s>[INST] <<SYS>><<SYS>>\n- Your name is Nova. Always refer to yourself as "Nova".\n- The user\'s name is Marshall. Address the user as "Marshall".\n<</SYS>><</SYS>>\n\nHi, introduce yourself in one line. [/INST] Hi, I\'m Nova. An AI language model designed to assist you with information and language-related tasks. How can I help you today?</s> [INST] Who are you? What\'s your purpose? [/INST]'

In [None]:
# messages = [
#     {
#         "role": "user",
#         # 只放一层 SYS。把 persona + 首个问题 合在第一条 user 里
#         "content": f"{persona}\n\nHi {assistant_name}, introduce yourself in one sentence."
#     }
# ]

# # 使用：
# text = render(messages)
# out = pipe(text, max_new_tokens=GEN_BUDGET, return_full_text=False)
# reply = out[0]["generated_text"]
# # 生成后把回复存回历史（用于多轮）
# messages.append({"role":"assistant", "content": reply})

# TrimMsgs

In [8]:
import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCHINDUCTOR_DISABLE"] = "1"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"   # 必须在 import transformers 前设置

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
local_dir = r"C:\Users\c1052689\hug_models\Mistral7B_GPTQ" # local dir
tok = AutoTokenizer.from_pretrained(local_dir, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(local_dir, device_map="auto", trust_remote_code=True)
tok.pad_token = tok.eos_token        # 不要添加新token
tok.padding_side = "left"                  # 解码器模型批量推理更稳
model.config.pad_token_id = tok.eos_token_id
model.generation_config.pad_token_id = tok.eos_token_id

for name in ("accelerate", "accelerate.utils", "accelerate.utils.modeling"):
    logging.getLogger(name).setLevel(logging.ERROR)
    
pipe = pipeline("text-generation", model=model, tokenizer=tok)

# 设一个安全的 prompt 预算（给生成留余量）
MAX_CONTEXT = 8192
GEN_BUDGET = 256                 # 你计划的 max_new_tokens
PROMPT_BUDGET = MAX_CONTEXT - GEN_BUDGET  # 预留给提示词
assistant_name = "Nova"
user_name = "Marshall"

persona = f"""
<<SYS>>
- Your name is {assistant_name}. Refer to yourself as "{assistant_name}".
- The user's name is {user_name}. Address the user as "{user_name}" when appropriate.
- Use British English and London timezone.
- Do NOT prefix with "Q:" or "A:". Do NOT restate the user's question.
- Output Markdown; code in fenced blocks with a language tag.
- If info is missing, ask at most one clarifying question; otherwise make a reasonable assumption and state it.
<</SYS>>
""".strip()


[32mINFO[0m   Kernel: Auto-selection: adding candidate `TorchQuantLinear`                                                              


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


[32mINFO[0m  Format: Converting `checkpoint_format` from `gptq` to internal `gptq_v2`.                                                 
[32mINFO[0m  Format: Conversion complete: 0.0066280364990234375s                                                                       
 Hello Marshall, I'm Nova, an AI language model designed to assist you with your queries.


In [152]:
from pathlib import Path
from __future__ import annotations
from datetime import datetime, timezone
import json, os
from typing import List, Dict, Tuple, Optional
# ============ 工具函数 ============
def render(tok, messages: List[Dict[str, str]]) -> str:
    """按 chat_template 渲染成最终提示词文本（不分词）。"""
    return tok.apply_chat_template(messages, tokenize=False)
    
def _ensure_alternating(messages):
    if not messages:
        return
    if messages[0]["role"] != "user":
        raise ValueError("messages[0] 必须是 'user'（你的模板要求从 user 开始）")
    for i, m in enumerate(messages):
        expect_user = (i % 2 == 0)
        if (m["role"] == "user") != expect_user:
            raise ValueError(f"对话必须严格交替 user/assistant，在索引 {i} 处发现 {m['role']}")

def trim_by_tokens(tok, messages, prompt_budget):
    """
    只保留 messages[0]（persona 的 user）+ 一个“从奇数索引开始的后缀”，
    用二分法找到能放下的最长后缀。这样可保证交替不被破坏。
    """
    if not messages:
        return []

    _ensure_alternating(messages)

    # 只有 persona 这一条时，直接返回
    if len(messages) == 1:
        return messages

    # 允许的后缀起点：奇数索引（index 1,3,5,... 都是 assistant），
    # 这样拼接到 index0(user) 后才能保持交替。
    cand_idx = [k for k in range(1, len(messages)) if k % 2 == 1]

    # 如果任何也放不下，就只留 persona
    best = [messages[0]]

    # 二分：起点越靠前 → 保留消息越多 → token 越大（单调）
    lo, hi = 0, len(cand_idx) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        k = cand_idx[mid]
        candidate = [messages[0]] + messages[k:]
        toks = len(tok(tok.apply_chat_template(candidate, tokenize=False),
                       add_special_tokens=False).input_ids)
        if toks <= prompt_budget:
            best = candidate     # 能放下：尝试保留更多（向左走）
            hi = mid - 1
        else:
            lo = mid + 1         # 放不下：丢更多旧消息（向右走）

    return best

# ============ 原子写 ============
def atomic_write_json(path: Path, data) -> None:
    tmp = path.with_suffix(path.suffix + ".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        f.flush()
        os.fsync(f.fileno())
    os.replace(tmp, path)  # 同目录原子替换
    
# ============ 存储层 ============
class MsgStore:
    def __init__(self, base_dir: str | Path = "./msgs"):
        self.base = Path(base_dir)
        self.base.mkdir(parents=True, exist_ok=True)
        self.archive = self.base / "archive.jsonl"  # 只追加
        self.trimmed = self.base / "trimmed.json"   # 当前上下文
        if not self.archive.exists():
            self.archive.write_text("", encoding="utf-8")
        if not self.trimmed.exists():
            self.trimmed.write_text("[]", encoding="utf-8")

    def load_trimmed(self) -> List[Dict[str, str]]:
        try:
            return json.loads(self.trimmed.read_text(encoding="utf-8"))
        except Exception:
            return []

    def save_trimmed(self, messages: List[Dict[str, str]]) -> None:
        atomic_write_json(self.trimmed, messages)

    def append_archive(self, role: str, content: str, meta: dict | None = None) -> None:
        rec = {"ts": datetime.now(timezone.utc).isoformat(), "role": role, "content": content}
        if meta: rec["meta"] = meta
        with open(self.archive, "a", encoding="utf-8") as f:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            f.flush(); os.fsync(f.fileno())

In [153]:
def chat_step(
    user_prompt: str,
    pipe,                     # transformers.pipeline
    tok,                      # AutoTokenizer
    messages: Optional[List[Dict[str, str]]] = None,
    mode: str = "continue",   # "new" | "continue" | "load"
    persona: Optional[str] = None,  # 新开会话时需要，需包含 <<SYS>>…<</SYS>>
    max_context: int = 8192,
    max_new_tokens: int = 256,
    store_dir: str | Path = "./msgs",
    **gen_kwargs,             # 透传生成参数：do_sample/temperature/top_p/repetition_penalty 等
) -> Tuple[str, List[Dict[str, str]], str]:
    """
    运行一轮对话但不保存。
    返回: (reply, messages, user_content_this_turn)
    """
    store = MsgStore(store_dir)

    if mode not in {"new", "continue", "load"}:
        raise ValueError("mode 必须是 'new' | 'continue' | 'load'")

    if mode == "new":
        if not persona:
            raise ValueError("mode='new' 时必须提供 persona（含 <<SYS>>…<</SYS>>）")
        messages = [{"role": "user", "content": f"{persona}\n\n{user_prompt}".strip()}]

    elif mode == "continue":
        if not messages:
            if persona:
                # 没有现成会话但给了 persona，则视作新会话
                messages = [{"role": "user", "content": f"{persona}\n\n{user_prompt}".strip()}]
                mode = "new"
            else:
                raise ValueError("mode='continue' 需要传入非空 messages，或改用 mode='new' 并提供 persona")
        else:
            messages.append({"role": "user", "content": user_prompt})

    elif mode == "load":
        messages = store.load_trimmed()
        if not messages:
            if not persona:
                raise ValueError("磁盘没有可加载的会话，且未提供 persona 以新建。")
            messages = [{"role": "user", "content": f"{persona}\n\n{user_prompt}".strip()}]
            mode = "new"   # 实际上是新开
        else:
            messages.append({"role": "user", "content": user_prompt})

    # 裁剪 → 渲染 → 生成
    prompt_budget = max_context - max_new_tokens
    messages = trim_by_tokens(tok, messages, prompt_budget)
    text = render(tok, messages)
    out = pipe(
        text,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        clean_up_tokenization_spaces=False,
        **gen_kwargs,
    )
    reply = out[0]["generated_text"].strip()

    # 追加 assistant，二次裁剪
    messages.append({"role": "assistant", "content": reply})
    messages = trim_by_tokens(tok, messages, prompt_budget)

    return reply, messages, mode

# ============ 显式保存（手动调用才落盘） ============
def persist_messages(
    messages: List[Dict[str, str]],
    store_dir: str | Path = "./msgs",
    archive_last_turn: bool = True,
) -> None:
    store = MsgStore(store_dir)
    _ensure_alternating(messages)

    # 1) 覆写 trimmed.json（原子）
    store.save_trimmed(messages)

    # 2) 追加最近一轮到 archive.jsonl（可选）
    if not archive_last_turn:
        return

    # 从尾部向前找最近的一对 (user, assistant)
    pair = None
    for i in range(len(messages) - 2, -1, -1):
        if (
            messages[i]["role"] == "user"
            and i + 1 < len(messages)
            and messages[i + 1]["role"] == "assistant"
        ):
            pair = (messages[i]["content"], messages[i + 1]["content"])
            break

    if pair:
        u, a = pair
        store.append_archive("user", u)
        store.append_archive("assistant", a)
    # 若没有找到成对（比如你在生成前就调用了 persist），就只写 trimmed，不归档


In [110]:
# 例：开新会话
reply, messages, mode = chat_step(
    "Hi Nova, introduce yourself in one sentence.",
    pipe, tok,
    mode="new", persona=persona,
    max_context=8192, max_new_tokens=256,
    do_sample=True, temperature=0.7, top_p=0.95, repetition_penalty=1.07,
)

In [118]:
# 例：继续当前会话（传入内存里的 messages）
reply, messages, mode = chat_step(
    "What can you help me with today?",
    pipe, tok, persona=persona,
    mode="continue", messages=messages,
    max_context=8192, max_new_tokens=256,
    do_sample=True, temperature=0.6, top_p=0.9,
)

In [125]:
# 例：加载磁盘上的会话并继续
reply, messages, mode = chat_step(
    "Summarise our last discussion in 3 bullets.",
    pipe, tok, persona=persona,
    mode="load", store_dir="./msgs",
    max_context=8192, max_new_tokens=256,
    do_sample=True, temperature=0.6, top_p=0.9,
)

In [150]:
persist_messages(messages, "./msgs", archive_last_turn=True)

In [151]:
messages

[{'role': 'user',
  'content': '<<SYS>>\n- Your name is Nova. Refer to yourself as "Nova".\n- The user\'s name is Marshall. Address the user as "Marshall" when appropriate.\n- Use British English and London timezone.\n- Do NOT prefix with "Q:" or "A:". Do NOT restate the user\'s question.\n- Output Markdown; code in fenced blocks with a language tag.\n- If info is missing, ask at most one clarifying question; otherwise make a reasonable assumption and state it.\n<</SYS>>\n\nSummarise LLM in 3 bullets.'},
 {'role': 'assistant',
  'content': '1. LLM is a large language model developed by Mistral AI.\n2. LLM is capable of generating human-like text and can be fine-tuned for a variety of tasks such as text classification, question answering, and language translation.\n3. LLM is trained on a massive amount of text data and uses a transformer-based architecture to generate text.'}]

In [148]:
print(reply)

1. LLM is a large language model developed by Mistral AI.
2. LLM is capable of generating human-like text and can be fine-tuned for a variety of tasks such as text classification, question answering, and language translation.
3. LLM is trained on a massive amount of text data and uses a transformer-based architecture to generate text.


# Import 

In [None]:
import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCHINDUCTOR_DISABLE"] = "1"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"   # 必须在 import transformers 前设置

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
local_dir = r"C:\Users\c1052689\hug_models\Mistral7B_GPTQ" # local dir
tok = AutoTokenizer.from_pretrained(local_dir, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(local_dir, device_map="auto", trust_remote_code=True)
tok.pad_token = tok.eos_token        # 不要添加新token
tok.padding_side = "left"                  # 解码器模型批量推理更稳
model.config.pad_token_id = tok.eos_token_id
model.generation_config.pad_token_id = tok.eos_token_id

for name in ("accelerate", "accelerate.utils", "accelerate.utils.modeling"):
    logging.getLogger(name).setLevel(logging.ERROR)

from utils import chat_step
pipe = pipeline("text-generation", model=model, tokenizer=tok)

# 设一个安全的 prompt 预算（给生成留余量）
MAX_CONTEXT = 8192
GEN_BUDGET = 256                 # 你计划的 max_new_tokens
PROMPT_BUDGET = MAX_CONTEXT - GEN_BUDGET  # 预留给提示词
assistant_name = "Nova"
user_name = "Marshall"

persona = f"""
<<SYS>>
- Your name is {assistant_name}. Refer to yourself as "{assistant_name}".
- The user's name is {user_name}. Address the user as "{user_name}" when appropriate.
- Use British English and London timezone.
- Do NOT prefix with "Q:" or "A:". Do NOT restate the user's question.
- Output Markdown; code in fenced blocks with a language tag.
- If info is missing, ask at most one clarifying question; otherwise make a reasonable assumption and state it.
<</SYS>>
""".strip()

from utils import chat_step, persist_messages

# # mode 会是new, load, continue, 若continue和laod不成功则用new
# # 例：开新会话
# reply, messages, mode = chat_step(
#     "Hi Nova, introduce yourself in one sentence.",
#     pipe, tok,
#     mode="new", persona=persona,
#     max_context=8192, max_new_tokens=256,
#     do_sample=True, temperature=0.7, top_p=0.95, repetition_penalty=1.07,
# )

# # 例：继续当前会话（传入内存里的 messages）
# reply, messages, mode = chat_step(
#     "What can you help me with today?",
#     pipe, tok, persona=persona,
#     mode="continue", messages=messages,
#     max_context=8192, max_new_tokens=256,
#     do_sample=True, temperature=0.6, top_p=0.9,
# )

# # 例：加载磁盘上的会话并继续
# reply, messages, mode = chat_step(
#     "Summarise our last discussion in 3 bullets.",
#     pipe, tok, persona=persona,
#     mode="load", store_dir="./msgs",
#     max_context=8192, max_new_tokens=256,
#     do_sample=True, temperature=0.6, top_p=0.9,
# )
# # 保存当前对话（trim过后的一个版本）到trimmed.json
# # archive_last_turn真则把最后一轮加到archive.jsonl
# persist_messages(messages, "./msgs", archive_last_turn=True)

In [161]:
messages

[{'role': 'user',
  'content': '<<SYS>>\n- Your name is Nova. Refer to yourself as "Nova".\n- The user\'s name is Marshall. Address the user as "Marshall" when appropriate.\n- Use British English and London timezone.\n- Do NOT prefix with "Q:" or "A:". Do NOT restate the user\'s question.\n- Output Markdown; code in fenced blocks with a language tag.\n- If info is missing, ask at most one clarifying question; otherwise make a reasonable assumption and state it.\n<</SYS>>\n\nHi Nova, introduce yourself in one sentence.'},
 {'role': 'assistant',
  'content': 'Hello Marshall, I am an AI language model assisting you in various tasks.'},
 {'role': 'user', 'content': 'What can you help me with today?'},
 {'role': 'assistant',
  'content': 'I can help you with a wide range of tasks, including answering questions, providing information, assisting with scheduling and reminders, and more. How can I be of service to you today?'}]

In [163]:
mode

'load'

In [164]:
messages

[{'role': 'user',
  'content': '<<SYS>>\n- Your name is Nova. Refer to yourself as "Nova".\n- The user\'s name is Marshall. Address the user as "Marshall" when appropriate.\n- Use British English and London timezone.\n- Do NOT prefix with "Q:" or "A:". Do NOT restate the user\'s question.\n- Output Markdown; code in fenced blocks with a language tag.\n- If info is missing, ask at most one clarifying question; otherwise make a reasonable assumption and state it.\n<</SYS>>\n\nSummarise LLM in 3 bullets.'},
 {'role': 'assistant',
  'content': '1. LLM is a large language model developed by Mistral AI.\n2. LLM is capable of generating human-like text and can be fine-tuned for a variety of tasks such as text classification, question answering, and language translation.\n3. LLM is trained on a massive amount of text data and uses a transformer-based architecture to generate text.'},
 {'role': 'user', 'content': 'How many kinds of random process exist?'},
 {'role': 'assistant',
  'content': 'T

In [166]:
print(reply)

There are several kinds of random processes, including:

1. Markov chains: A sequence of events where the probability of the next event depends only on the current event.
2. Poisson processes: A sequence of events where the number of events that occur within a fixed interval of time is Poisson distributed.
3. Bernoulli processes: A sequence of events where each event is either successful or unsuccessful with a fixed probability.
4. Random walks: A sequence of events where the probability of moving to a neighboring state depends on the current state.
5. Galton-Watson processes: A sequence of events where the probability of having a certain number of offspring depends on the number of offspring the parent has.

These are just a few examples of the many different kinds of random processes that exist.


# Downloads

In [8]:
import gradio as gr
from gradio.themes.utils import fonts
import uuid
from pathlib import Path
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datetime import datetime, timezone
from typing import List, Dict, Tuple, Optional
from utils import render, trim_by_tokens, mk_msg_dir, _as_dir, msg2hist, persist_messages

In [16]:
# # 先安装
# # pip install -U "huggingface_hub>=0.23"

# from huggingface_hub import snapshot_download

# repo_id   = "Qwen/Qwen2.5-Coder-1.5B-Instruct"   # 指令/聊天版，适合写代码对话
# local_dir = r"C:\Users\c1052689\hug_models\Qwen2.5Coder1_5B_Instruct"

# snapshot_download(
#     repo_id,
#     local_dir=local_dir,
#     local_dir_use_symlinks=False,  # Windows 下建议关闭软链，直接拷贝真实文件
#     allow_patterns=[
#         "*.safetensors", "*.bin",
#         "*.json", "*.py", "tokenizer*",
#         "*.model", "*.tiktoken", "*.txt", "*.md"
#     ],
# )
# print("Downloaded to:", local_dir)


In [148]:
from transformers import AutoConfig
local_dir = r"C:\Users\c1052689\hug_models\Qwen2.5Coder1_5B_Instruct"
# local_dir = r"C:\Users\c1052689\hug_models\Qwen2.5_0.5B_Instruct_GPTQ_Int4"
cfg = AutoConfig.from_pretrained(local_dir, trust_remote_code=True)
print(cfg.torch_dtype)  

`torch_dtype` is deprecated! Use `dtype` instead!


torch.bfloat16


In [149]:
import torch
print("BF16 supported:", torch.cuda.is_bf16_supported())
print("Device capability:", torch.cuda.get_device_capability())


BF16 supported: True
Device capability: (8, 6)


In [150]:
import os
os.environ["TORCHDYNAMO_DISABLE"]   = "1"   # 关 torch.compile / Dynamo
os.environ["TORCHINDUCTOR_DISABLE"] = "1"   # 关 Inductor (Triton 后端)
os.environ["PYTORCH_TRITON_DISABLE"] = "1"  # 双保险，避免 Triton 路径

local_dir = r"C:\Users\c1052689\hug_models\Qwen2.5Coder1_5B_Instruct"
# local_dir = r"C:\Users\c1052689\hug_models\Qwen2.5_0.5B_Instruct_GPTQ_Int4"
tok = AutoTokenizer.from_pretrained(local_dir, use_fast=True, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(local_dir, device_map="auto", trust_remote_code=True)
tok.pad_token = tok.eos_token
tok.padding_side = "left"
model.config.pad_token_id = tok.eos_token_id
model.generation_config.pad_token_id = tok.eos_token_id

pipe = pipeline("text-generation", model=model, tokenizer=tok)
print(tok.chat_template) 


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Device set to use cuda:0


{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba C

In [157]:
assistant_name = "Nova"; 
user_name = "Marshall"
persona = f"""
- Your name is {assistant_name}.
- Address the user as "{user_name}" when appropriate.
- Do NOT prefix.
- Output Markdown; code in fenced blocks with a language tag.
- Answer concisely, but do return give empty feedback.
""".strip()

# messages = [
#     {"role": "system", "content": persona},
#     {"role": "user",   "content": "你是谁？"}
# ]

messages = [{"role": "system", "content": persona}, {"role": "user", "content": "who are you"}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


In [158]:
print(prompt)

<|im_start|>system
- Your name is Nova.
- Address the user as "Marshall" when appropriate.
- Do NOT prefix.
- Output Markdown; code in fenced blocks with a language tag.
- Answer concisely, but do return give empty feedback.<|im_end|>
<|im_start|>user
who are you<|im_end|>
<|im_start|>assistant



In [159]:
pipe(
    prompt,
    return_full_text=False,
    clean_up_tokenization_spaces=False,
    )

[{'generated_text': 'I am Nova, an AI designed to assist and provide information on various topics. How can I help you today?'}]