# 純文字模型

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HybridCache, Gemma3ForCausalLM, GemmaTokenizerFast

PATH = "./gemma/gemma3_4b"

quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

model = Gemma3ForCausalLM.from_pretrained(
    PATH,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True
    )
model = model.eval()
tokenizer = GemmaTokenizerFast.from_pretrained(PATH, truncate = True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
msg = """<start_of_turn>user
[一律用繁體中文回應]{prompt}<end_of_turn>
<start_of_turn>model
"""
def gemma_resp(prompt):
    MSG = msg.format(prompt=prompt)
    input_ids = torch.tensor(tokenizer.encode(MSG)).to(model.device)
    input_ids = input_ids.unsqueeze(0)
    attention_mask = torch.ones_like(input_ids)
    past_key_values = HybridCache(
        config = model.config,
        max_cache_len=1024,
        max_batch_size=1,
        device=model.device,
        dtype=torch.bfloat16
    )
    cache_position = torch.arange(
        input_ids.shape[-1], dtype=torch.long, device=model.device
    )
    eos_token_ids = [tokenizer.eos_token_id, 106]

    output_len = 32768
    res = list()
    for _ in range(output_len):
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                use_cache=True,
                past_key_values=past_key_values,
                cache_position=cache_position
            )
            logits = outputs.logits
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            token_id = next_token.item()
            if token_id in eos_token_ids:
                break
            res += [tokenizer.decode(token_id)]
            print(res[-1], end="", flush=True)
            input_ids = next_token
            attention_mask = None
            cache_position = cache_position[-1:] + 1
    return "".join(res)

In [9]:
res = gemma_resp("你能夠做什麼事情呢？")

我是一個大型語言模型，可以做很多事情呢！以下是一些我能做到的：

**1. 回答你的問題：**

*   **知識查詢：** 只要你問我任何問題，我都會盡力用我所學的知識來回答你。無論是歷史、科學、文化、還是其他任何領域，我都可以提供資訊。
*   **解釋複雜概念：** 如果你對某個概念感到困惑，我可以試著用簡單易懂的方式來解釋它。
*   **提供定義：** 只要你告訴我一個詞語，我就可以提供它的定義和解釋。

**2. 創作內容：**

*   **寫作：** 我可以幫你寫各種文章，例如：
    *   故事
    *   詩歌
    *   文章
    *   信件
    *   電子郵件
    *   廣告文案
*   **翻譯：** 我可以將文字從一種語言翻譯成另一種語言。
*   **摘要：** 我可以將長篇文章或段落簡短地總結。
*   **生成不同風格的文本：** 例如，我可以模仿莎士比亞的風格寫一首關於貓的詩，或者用現代口語寫一篇關於科技的文章。

**3. 協助你完成任務：**

*   **編寫程式碼：** 我可以幫你寫一些簡單的程式碼，例如 Python、JavaScript 等。
*   **解決數學問題：** 我可以解決一些簡單的數學問題。
*   **生成列表：** 我可以根據你的要求生成列表，例如購物清單、待辦事項清單等。
*   **提供建議：** 我可以根據你的描述，提供一些建議，例如旅遊建議、美食推薦等。

**4. 娛樂你：**

*   **講笑話：** 我可以講一些笑話逗你開心。
*   **玩文字遊戲：** 我可以和你一起玩一些文字遊戲。
*   **創作故事：** 我可以根據你的要求創作故事。

**5. 其他：**

*   **角色扮演：** 我可以扮演不同的角色，與你進行互動。
*   **提供創意：** 如果你卡住了，我可以提供一些創意，幫助你解決問題。
*   **學習：** 我可以根據你的提問，學習新的知識。

**請記住：** 我是一個語言模型，我的知識是基於我訓練時所學習的數據。我可能會犯錯，所以請務必對我的回答進行核實。

**現在，你想讓我做什麼呢？ 告訴我你的需求吧！**


# 多模態模型

In [1]:
import torch
from transformers import AutoProcessor, BitsAndBytesConfig, HybridCache, Gemma3ForConditionalGeneration, Cache
from PIL import Image
PATH = "./gemma/gemma3_4b"

quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
model = Gemma3ForConditionalGeneration.from_pretrained(
    PATH,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True
    )
model = model.eval()
processor = AutoProcessor.from_pretrained(PATH)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
import numpy as np
msg = "<start_of_turn>user\n{prompt}\n<end_of_turn><start_of_image>\n<start_of_turn>model"

def gemma_resp(prompt, image_path):
    # 讀圖片 + 縮放
    image = Image.open(image_path).convert("RGB")
    prompt = msg.format(prompt=prompt)
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    pixel_values = inputs["pixel_values"]
    attention_mask = torch.ones_like(input_ids)
    # cache_position = torch.tensor([input_ids.shape[-1]], device=model.device)
    seq_len = input_ids.shape[-1]
    cache_position = torch.arange(seq_len, device=model.device)
    
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            use_cache=True,
            attention_mask=attention_mask,
            cache_position=cache_position
        )
    logits = outputs.logits
    past_key_values = outputs.past_key_values
    next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
    input_ids = next_token

    eos_token_ids = {processor.tokenizer.eos_token_id, 106}

    output_len = 32768
    res = list()
    current_pos = seq_len
    for _ in range(output_len):
        attention_mask = torch.cat([attention_mask, torch.ones(1, 1, device=model.device, dtype=torch.long)], dim=-1)
        cache_position = torch.tensor([current_pos], device=model.device)
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                use_cache=True,
                past_key_values=past_key_values,
                cache_position=cache_position,
                attention_mask=attention_mask,
                pixel_values=None
            )
            logits = outputs.logits
            past_key_values = outputs.past_key_values
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            token_id = next_token.item()
            if token_id in eos_token_ids:
                break
            
            res += [processor.decode(token_id, skip_special_tokens=True)]
            print(res[-1], end="", flush=True)
            input_ids = next_token
            current_pos += 1
    return "".join(res)

In [10]:
gemma_resp("[繁體中文回答] 這張圖有幾個人 ?", "C:/Users/jhbai/Pictures/CUTE/image.png")

這張圖中，我能清楚地看到一個女孩（一位）。 雖然畫面中有許多坦克和巨大的機器人，但只有一個明顯的人。


'這張圖中，我能清楚地看到一個女孩（一位）。 雖然畫面中有許多坦克和巨大的機器人，但只有一個明顯的人。\n'

# Batch Inference

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HybridCache, Gemma3ForCausalLM, GemmaTokenizerFast

PATH = "./gemma/gemma3_4b"

quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

model = Gemma3ForCausalLM.from_pretrained(
    PATH,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True
    )
model = model.eval()
tokenizer = GemmaTokenizerFast.from_pretrained(PATH, truncate = True)

# 新增 padding_side='left'，因為在生成任務中，將填充放在左側更為常見且高效
tokenizer.padding_side = 'left'
# 如果 tokenizer 沒有預設的 pad_token，通常會將其設定為 eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

msg = """<start_of_turn>user
[一律用繁體中文回應]{prompt}<end_of_turn>
<start_of_turn>model
"""

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
def gemma_resp(prompts: list[str]):
    batch_size = len(prompts)
    formatted_prompts = [msg.format(prompt=p) for p in prompts]
    inputs = tokenizer(formatted_prompts, return_tensors="pt", padding=True).to(model.device)

    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    past_key_values = HybridCache(
        config = model.config,
        max_cache_len=1024,
        max_batch_size=batch_size,
        device=model.device,
        dtype=torch.bfloat16
    )
    # 不做 cache_position，避免與 padding 衝突
    eos_token_ids = [tokenizer.eos_token_id, 106]

    output_len = 32768
    res = [[] for _ in range(batch_size)]
    is_eos = torch.zeros(batch_size, dtype=torch.bool, device=model.device)

    for _ in range(output_len):
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                use_cache=True,
                past_key_values=past_key_values,
            )
            logits = outputs.logits
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)

            for i in range(batch_size):
                if not is_eos[i]:
                    token_id = next_token[i].item()
                    if token_id in eos_token_ids:
                        is_eos[i] = True 
                    else:
                        decoded_token = tokenizer.decode(token_id)
                        res[i].append(decoded_token)

            if is_eos.all():
                break
            
            input_ids = next_token
            # 在每次迭代時，將 attention_mask 擴展一個單位 (長度為1，值為1)，新 token 需要被計算
            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1), dtype=torch.long, device=model.device)], dim=1)

    final_outputs = ["".join(r) for r in res]
    return final_outputs

In [4]:
# --- 使用範例 ---
prompts_to_run = ["[繁體中文回答]一句話說明什麼是Martingale", "[繁體中文回答]一句話解釋Bolzano-Wierstrass Theorem", "[繁體中文回答]一句話描述Compiler的運作精神"]
responses = gemma_resp(prompts_to_run)
for i, (prompt, response) in enumerate(zip(prompts_to_run, responses)):
    print(f"Prompt: {prompt}")
    print(f"Response: {response}")

Prompt: [繁體中文回答]一句話說明什麼是Martingale
Response: 馬丁格拉策略是一種賭博策略，它主張在輸局後，將下注金額翻倍，直到贏錢，以此來彌補之前的損失。

Prompt: [繁體中文回答]一句話解釋Bolzano-Wierstrass Theorem
Response: 波茲瑪-魏爾斯特拉斯定理指出，任何連續函數，在閉合區間內都必有最大值和最小值。

Prompt: [繁體中文回答]一句話描述Compiler的運作精神
Response: 編譯器就像一位翻譯家，將高階程式碼轉譯成電腦能理解的低階指令。



# Word Embedding

In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HybridCache, Gemma3ForCausalLM, GemmaTokenizerFast

PATH = "./gemma/gemma3_4b"
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
model = Gemma3ForCausalLM.from_pretrained(
    PATH,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
    output_hidden_states=True  # 隱藏狀態
    )
model = model.eval()
tokenizer = GemmaTokenizerFast.from_pretrained(PATH, truncate = True)
tokenizer.padding_side = 'left'
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
msg = """<start_of_turn>user
[一律用繁體中文回應]{prompt}<end_of_turn>
<start_of_turn>model
"""

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [3]:
def gemma_resp(prompts: list[str]):
    batch_size = len(prompts)
    formatted_prompts = [msg.format(prompt=p) for p in prompts]
    inputs = tokenizer(formatted_prompts, return_tensors="pt", padding=True).to(model.device)

    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    past_key_values = HybridCache(
        config = model.config,
        max_cache_len=1024,
        max_batch_size=batch_size,
        device=model.device,
        dtype=torch.bfloat16
    )
    # 不做 cache_position，避免與 padding 衝突
    eos_token_ids = [tokenizer.eos_token_id, 106]

    output_len = 32768
    res = [[] for _ in range(batch_size)]
    is_eos = torch.zeros(batch_size, dtype=torch.bool, device=model.device)

    for _ in range(output_len):
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                use_cache=True,
                past_key_values=past_key_values,
            )
            logits = outputs.logits
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)

            for i in range(batch_size):
                if not is_eos[i]:
                    token_id = next_token[i].item()
                    if token_id in eos_token_ids:
                        is_eos[i] = True 
                    else:
                        decoded_token = tokenizer.decode(token_id)
                        res[i].append(decoded_token)

            if is_eos.all():
                break
            
            input_ids = next_token
            # 在每次迭代時，將 attention_mask 擴展一個單位 (長度為1，值為1)，新 token 需要被計算
            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1), dtype=torch.long, device=model.device)], dim=1)

    final_outputs = ["".join(r) for r in res]
    return final_outputs

In [4]:
def get_embeddings(texts: list[str]):
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = 'right'
    inputs = tokenizer(
        texts, 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=32768
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.hidden_states[-1]

    # Mean Pooling
    attention_mask = inputs['attention_mask']
    expanded_mask = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float() # (batch_size, seq_len, hidden_size) -> (batch_size, seq_len, 1)
    sum_embeddings = torch.sum(last_hidden_states * expanded_mask, 1)
    sum_mask = torch.clamp(expanded_mask.sum(1), min=1e-9)
    mean_pooled_embeddings = sum_embeddings / sum_mask
    normalized_embeddings = F.normalize(mean_pooled_embeddings, p=2, dim=1)

    tokenizer.padding_side = original_padding_side
    return normalized_embeddings

In [9]:
if __name__ == '__main__':
    # 1. 假設這是您的知識庫 (Knowledge Base)
    documents = [
        "機器人喜歡吃蔬菜",
        "機器人每天都會去爬山",
        "工程師最喜歡喝奶茶",
        "黑貓嚕嚕是我的本名",
        "機器人的薪水買不起房",
        "機器人討厭吃芭樂"
    ]
    doc_embeddings = get_embeddings(documents)

    query = "機器人的興趣是什麼？"
    query_embedding = get_embeddings([query])
    cosine_scores = F.cosine_similarity(query_embedding, doc_embeddings, dim=1)

    top_k = 3
    top_results = torch.topk(cosine_scores, k=min(top_k, len(documents)))

    print("\n--- 搜尋結果 (最相關的前 {} 筆) ---".format(top_k))
    for score, idx in zip(top_results.values, top_results.indices):
        print(documents[idx], score)
    print("="*20 + "AI回應" + "*"*20)

    retrieved_doc = [documents[i] for i in top_results.indices]
    qa_prompt = f"""
    根據以下資訊：
    "{retrieved_doc}"
    
    請回答這個問題："{query}"
    """
    answer = gemma_resp([qa_prompt])
    print(answer[0])


--- 搜尋結果 (最相關的前 3 筆) ---
機器人喜歡吃蔬菜 tensor(0.9360, device='cuda:0')
機器人的薪水買不起房 tensor(0.9283, device='cuda:0')
機器人每天都會去爬山 tensor(0.9228, device='cuda:0')
根據提供的資訊，機器人的興趣是：

*   **喜歡吃蔬菜**
*   **每天都會去爬山**

這兩個都是直接提到的興趣。

