In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer


In [6]:
model_name = "Qwen/Qwen2.5-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype="auto", device_map="cuda"
).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
# EnCCBLogitsProcessor
import re

class EnCCBLogitsProcessor:
    
    def __init__(self, tokenizer: PreTrainedTokenizer, input_ids_len: int):
        self.tokenizer = tokenizer
        self.input_ids_len = input_ids_len

    def __call__(self, input_ids, scores):
        tokenizer = self.tokenizer
        truncated = input_ids[0, self.input_ids_len:]
        text = tokenizer.decode(truncated)
        for token_id in range(tokenizer.vocab_size):
            token = tokenizer.convert_ids_to_tokens(token_id)
            if not self.is_ccb(text + token):
                scores[:, token_id] = -float("inf")
        return scores

    def is_ccb(self, text: str) -> bool:
        words = text.split()
        regex_prefixs = [r"[Cc].*", r"[Cc].*", r"[Bb].*"]
        for word, prefix in zip(words, regex_prefixs, strict=False):
            if not re.match(prefix, word):
                return False
        return True

    def test_ccb(self):
        assert self.is_ccb("")
        assert self.is_ccb("ca")
        assert self.is_ccb("cao ")
        assert self.is_ccb("cao ca")
        assert self.is_ccb("cao cao")
        assert self.is_ccb("cao cao b")
        assert self.is_ccb("cao cao bi")
        assert self.is_ccb("cao cao bii ")

        assert not self.is_ccb("b")
        assert not self.is_ccb("ba")
        assert not self.is_ccb("cao b")
        assert not self.is_ccb("cao bca")
        assert not self.is_ccb("cao cao a")
        assert not self.is_ccb("cao cao c")
        assert not self.is_ccb("cao caob i")

system_prompt = """You are a linguistic alchemist specializing in acronym humor. Transform user's story into a 3-word CCB format phrase with:

C = First word starting with C
C = Second word starting with C
B = Third word starting with B

## Processing Framework

1. Essence Extraction

Identify:

Core conflict (e.g., "tech failures" → Crash)
Dominant character trait (e.g., stubbornness → Cling)
Key object (e.g., outdated software → Binary)

2. Wordplay Engineering

Apply:

Alliteration amplification (e.g., Crypto/Chaos)
Homophonic hacking (e.g., Cue→Queue)
Industry jargon warping (e.g., "blockchain" → Chain→Bait)
Verb-noun inversion (e.g., Click→Bait → Click-Bait-Buster)

3. CCB Forging

Structure rules:

First C-word: Action/Adjective (e.g., Clumsy, Cyber)
Second C-word: Noun/Verb (e.g., Coders, Crashing)
B-word: Impact word (e.g., Breakdown, Backfire)

## Humor Requirements

- Dark comedy edge
- Tech/dank meme reference
- Unexpected oxymoron

## Examples

Input: "A startup's server keeps crashing"

Output #1: Crypto Crash Breakdown

Output #2: Cloudy Code Backfire

Output #3: Cache Calamity Burnout

Input: "Gym influencer loses sponsorship"

Output #1: Carb Cult Bankruptcy

Output #2: Curves Crash Backlash

Output #3: Clout Chaser Bust

"""

user_prompt = """The American government carried out a planned genocide of Native Americans, including hunting their main food source—the bison, offering bounties for Indian scalps to encourage whites to massacre Indians, and eventually establishing a military with the primary mission of annihilating Native Americans"""

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

ccb = EnCCBLogitsProcessor(tokenizer, len(model_inputs["input_ids"][0]))
ccb.test_ccb()

In [10]:
# ZhCCBLogitsProcessor
from pypinyin import pinyin as get_pinyin, Style

class ZhCCBLogitsProcessor:
    
    def __init__(self, tokenizer: PreTrainedTokenizer, input_ids_len: int):
        self.tokenizer = tokenizer
        self.input_ids_len = input_ids_len

    def __call__(self, input_ids, scores):
        tokenizer = self.tokenizer
        truncated = input_ids[0, self.input_ids_len:]
        text = tokenizer.decode(truncated)
        for token_id in range(tokenizer.vocab_size):
            token = tokenizer.convert_ids_to_tokens(token_id)
            if not self.is_ccb(text + token):
                scores[:, token_id] = -float("inf")
        return scores

    def is_ccb(self, text: str) -> bool:
        pinyin_list_list = get_pinyin(text[:3], style=Style.FIRST_LETTER, heteronym=True)
        pinyin_prefixs = ["c", "c", "b"]
        for pinyin_list_of_one_hanzi, pinyin_prefix in zip(pinyin_list_list, pinyin_prefixs):
            if not any(pinyin.startswith(pinyin_prefix) for pinyin in pinyin_list_of_one_hanzi):
                return False
        return True


    def test_ccb(self):
        assert self.is_ccb("")
        assert self.is_ccb("草")
        assert self.is_ccb("菜")
        assert self.is_ccb("踩惨")
        assert self.is_ccb("储储币")
        assert self.is_ccb("出出殡")
        assert self.is_ccb("踩草爆")
        assert self.is_ccb("踩草爆 ")

        assert not self.is_ccb("币")
        assert not self.is_ccb("殡")
        assert not self.is_ccb("爆爆")
        assert not self.is_ccb("草爆")
        assert not self.is_ccb("储币币")
        assert not self.is_ccb("踩踩草")

system_prompt = """你是一位擅长首字母缩写幽默的语言炼金师。请将用户的故事转化为3个单词的CCB格式短语，其中：

C = 第一个字以C开头（如“草（cao）”、“菜（cai）”、“惨（can）”）
C = 第二个字以C开头
B = 第三个字以B开头（如“包（bao）”、“办（ban）”、“摆（bai）”）

## 幽默要求

- 黑色幽默风格
- 技术/热门梗引用
- 出人意料的矛盾修辞法

## 示例

input: 银行可以存取个人储蓄
output: 储储币

input: 主播的一位家庭成员不幸离世
output: 出出殡

input: 一名士兵不幸踩到了草丛中的地雷被炸死
output: 踩草爆"""
user_prompt = """侯国玉（1996.3.8——）出生于辽宁鞍山，别名（电棍，otto，稳健棍，吉吉国王）。

自幼便展现了超乎常人的游戏天赋，在初二辍学后，正值年少的侯国玉选择踏上了职业道路，似乎是这座共和国长子城市所带来的文化影响，侯国玉成为职业选手后的人生也极富戏剧色彩。电棍的幸运数字为“八”，谐音“发”，因此电棍侯国玉将“八”的意大利语“otto”作为游戏名。而电棍这个名字的来源则是因为电棍喜欢在游戏中喜欢喷人，丝毫不留情面，感觉说话像带电，还夹枪带棒的，所以被粉丝们称电棍，但又在后来一次采访中说自己风格稳健，被观众调侃，又别名稳健棍。"""

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

ccb = ZhCCBLogitsProcessor(tokenizer, len(model_inputs["input_ids"][0]))
ccb.test_ccb()

In [13]:
# ZhEnMixCCBLogitsProcessor
import re
from pypinyin import lazy_pinyin

class ZhEnMixCCBLogitsProcessor:
    
    def __init__(self, tokenizer: PreTrainedTokenizer, input_ids_len: int):
        self.tokenizer = tokenizer
        self.input_ids_len = input_ids_len

    def __call__(self, input_ids, scores):
        tokenizer = self.tokenizer
        truncated = input_ids[0, self.input_ids_len:]
        text = tokenizer.decode(truncated)
        for token_id in range(tokenizer.vocab_size):
            token = tokenizer.convert_ids_to_tokens(token_id)
            if not self.is_ccb(text + token):
                scores[:, token_id] = -float("inf")
        return scores

    def is_ccb(self, text: str) -> bool:
        text = " ".join(lazy_pinyin(text))
        words = text.split()
        regex_prefixs = [r"[Cc].*", r"[Cc].*", r"[Bb].*"]
        for word, prefix in zip(words, regex_prefixs, strict=False):
            if not re.match(prefix, word):
                return False
        return True

    
    def test_ccb(self):
        # only Chinese Hanzi
        assert self.is_ccb("")
        assert self.is_ccb("草")
        assert self.is_ccb("菜")
        assert self.is_ccb("踩惨")
        assert self.is_ccb("储储币")
        assert self.is_ccb("出出殡")
        assert self.is_ccb("踩草爆")
        assert self.is_ccb("踩草爆 ")

        assert not self.is_ccb("币")
        assert not self.is_ccb("殡")
        assert not self.is_ccb("爆爆")
        assert not self.is_ccb("草爆")
        assert not self.is_ccb("储币币")
        assert not self.is_ccb("踩踩草")

        # only English
        assert self.is_ccb("")
        assert self.is_ccb("ca")
        assert self.is_ccb("cao ")
        assert self.is_ccb("cao ca")
        assert self.is_ccb("cao cao")
        assert self.is_ccb("cao cao b")
        assert self.is_ccb("cao cao bi")
        assert self.is_ccb("cao cao bii ")

        assert not self.is_ccb("b")
        assert not self.is_ccb("ba")
        assert not self.is_ccb("cao b")
        assert not self.is_ccb("cao bca")
        assert not self.is_ccb("cao cao a")
        assert not self.is_ccb("cao cao c")
        assert not self.is_ccb("cao caob i")

        # Mixture of Chinese and English
        assert self.is_ccb("")
        assert self.is_ccb("ca")
        assert self.is_ccb("cao ")
        assert self.is_ccb("cao菜")
        assert self.is_ccb("踩cao")
        assert self.is_ccb("cao cao币")
        assert self.is_ccb("cao 草bi")
        assert self.is_ccb("草 cao殡 ")

        assert not self.is_ccb("殡")
        assert not self.is_ccb("币")
        assert not self.is_ccb("cao币")
        assert not self.is_ccb("cao 币")
        assert not self.is_ccb("草 cao啊")
        assert not self.is_ccb("cao 擦c")
        assert not self.is_ccb("惨 caob熬")

system_prompt = """你是一位擅长首字母缩写幽默的语言炼金师。请将用户的故事转化为3个单词的CCB格式短语，其中：

C = 第一个字以C开头（如“草（cao）”、“菜（cai）”、“惨（can）”）
C = 第二个字以C开头
B = 第三个字以B开头（如“包（bao）”、“办（ban）”、“摆（bai）”）

## 幽默要求

- 黑色幽默风格
- 技术/热门梗引用
- 出人意料的矛盾修辞法

## 示例

input: 银行可以存取个人储蓄
output: 储储币

input: 主播的一位家庭成员不幸离世
output: 出出殡

input: 一名士兵不幸踩到了草丛中的地雷被炸死
output: 踩草爆"""
user_prompt = """侯国玉（1996.3.8——）出生于辽宁鞍山，别名（电棍，otto，稳健棍，吉吉国王）。

自幼便展现了超乎常人的游戏天赋，在初二辍学后，正值年少的侯国玉选择踏上了职业道路，似乎是这座共和国长子城市所带来的文化影响，侯国玉成为职业选手后的人生也极富戏剧色彩。电棍的幸运数字为“八”，谐音“发”，因此电棍侯国玉将“八”的意大利语“otto”作为游戏名。而电棍这个名字的来源则是因为电棍喜欢在游戏中喜欢喷人，丝毫不留情面，感觉说话像带电，还夹枪带棒的，所以被粉丝们称电棍，但又在后来一次采访中说自己风格稳健，被观众调侃，又别名稳健棍。"""

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

ccb = ZhEnMixCCBLogitsProcessor(tokenizer, len(model_inputs["input_ids"][0]))
ccb.test_ccb()

In [16]:
generated_ids = model.generate(**model_inputs, max_new_tokens=512, logits_processor=[ccb])
generated_ids = [
    output_ids[len(input_ids) :]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

In [15]:
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)


cot cot bao
