# Gemma3N-E2N

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HybridCache, Gemma3ForCausalLM, GemmaTokenizerFast, AutoProcessor, Gemma3nForCausalLM
# transformers >= 4.53.0
# timm==1.0.19
# torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

PATH = "C:/Users/user/LLM/gemma3n"

quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

# model = Gemma3ForCausalLM.from_pretrained(
#     PATH,
#     quantization_config=quantization_config,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
#     low_cpu_mem_usage=True
#     )

model = Gemma3nForCausalLM.from_pretrained(PATH,
                                        device_map="auto", 
                                        torch_dtype=torch.bfloat16,
                                        low_cpu_mem_usage=True
                                        )

model = model.eval()
tokenizer = GemmaTokenizerFast.from_pretrained(PATH)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.26s/it]


In [2]:
msg = """<start_of_turn>user
[一律用繁體中文回應]{prompt}<end_of_turn>
<start_of_turn>model
"""
def gemma3_resp(prompt):
    MSG = msg.format(prompt=prompt)
    input_ids = torch.tensor(tokenizer.encode(MSG)).to(model.device)
    input_ids = input_ids.unsqueeze(0)
    past_key_values = HybridCache(
        config = model.config,
        max_cache_len=1024,
        max_batch_size=1,
        device=model.device,
        dtype=torch.bfloat16
    )
    eos_token_ids = [tokenizer.eos_token_id, 106]

    output_len = 32768
    res = list()
    for _ in range(output_len):
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                use_cache=True,
                past_key_values=past_key_values,
            )
            logits = outputs.logits
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            token_id = next_token.item()
            if token_id in eos_token_ids:
                break
            res += [tokenizer.decode(token_id)]
            print(res[-1], end="", flush=True)
            input_ids = next_token
    return "".join(res)

In [3]:
import gc
msg = """<start_of_turn>user
{prompt}<end_of_turn><eos>
<start_of_turn>model
"""
prompt = "你可以做什麼事情？"
try:
    MSG = msg.format(prompt = prompt)
    input_ids = tokenizer.encode(MSG, return_tensors="pt").to(model.device)
    eos_token_ids = {tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")}
    res = list()
    past_key_values = HybridCache(
        config = model.config,
        max_cache_len=32796,
        max_batch_size=1,
        device=model.device,
        dtype=torch.bfloat16
    )
    with torch.no_grad():
        for _ in range(32796):
            outputs = model(input_ids=input_ids, use_cache=True, past_key_values=past_key_values)
            next_token = torch.argmax(outputs.logits[:, -1, :], dim=-1, keepdim=True)
            token_id = next_token.item()
            if token_id in eos_token_ids:
                break
            input_ids = next_token
            # input_ids = token_id
            res += [tokenizer.decode(token_id)]
            print(res[-1], end="", flush = True)

    
finally:
    del input_ids
    del outputs
    del next_token
    del token_id
    gc.collect()
    torch.cuda.empty_cache()

我是一個大型語言模型，由 Google DeepMind 訓練。我可以執行各種任務，包括：

*   **生成不同創意文本格式的內容：** 例如詩歌、程式碼、劇本、音樂作品、電子郵件、信件等。我會盡力滿足你的所有要求。
*   **回答你的問題，即使是開放式、具有挑戰性或奇怪的問題。** 我會盡我所能提供有用的資訊。
*   **翻譯語言。**
*   **總結文本。**
*   **遵循你的指示並完成你的請求，我會盡力按照你的要求執行。**

我還在不斷學習和改進中！

總而言之，我可以幫助你完成各種文字相關的任務。 你想讓我做些什麼呢？


# Gemma3N模型多模態

In [1]:
import torch
from PIL import ImageGrab
from transformers import AutoProcessor, BitsAndBytesConfig, HybridCache, Gemma3nForConditionalGeneration, Cache

PATH = "C:/Users/user/LLM/gemma3n"
# quantization_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.bfloat16,
#     )
model = Gemma3nForConditionalGeneration.from_pretrained(
    PATH,
    # quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True
    )
model = model.eval()
processor = AutoProcessor.from_pretrained(PATH)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


In [2]:
import numpy as np
def gemma_resp(prompt):
    image = ImageGrab.grab().convert("RGB")
    prompt = processor.apply_chat_template(
        [
            {"role": "user",  "content": [{"type": "image", "image": image},
                                        {"type": "text", "text": prompt}]}
        ]
    )
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    pixel_values = inputs["pixel_values"].to(dtype=model.dtype)
    seq_len = input_ids.shape[-1]
    
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            use_cache=True,
        )
    logits = outputs.logits
    past_key_values = outputs.past_key_values
    next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
    input_ids = next_token

    eos_token_ids = {processor.tokenizer.eos_token_id, 106}

    output_len = 32768
    res = list()
    current_pos = seq_len
    for _ in range(output_len):
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                use_cache=True,
                past_key_values=past_key_values,
                pixel_values=None
            )
            logits = outputs.logits
            past_key_values = outputs.past_key_values
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            token_id = next_token.item()
            if token_id in eos_token_ids:
                break
            
            res += [processor.decode(token_id, skip_special_tokens=True)]
            print(res[-1], end="", flush=True)
            input_ids = next_token
            current_pos += 1
    return "".join(res)

In [3]:
gemma_resp("[繁體中文回答] 這張圖內容是什麼 ?")

張圖顯示了一個 Python 程式碼編輯器，程式碼正在執行一個使用 `transformers` 庫的程式。程式碼似乎正在使用一個名為 `LLM_Code` 的模型，並進行一些與生成文本相關的操作。

以下是程式碼中一些關鍵部分的解釋：

* **`model = model.eval()`**: 這行程式碼將模型設定為評估模式，這意味著它將不執行任何訓練相關的操作。
* **`processor = AutoProcessor.from_pretrained("path/to/model")`**: 這行程式碼從指定的目錄中載入一個 `AutoProcessor` 物件，用於處理輸入文本。
* **`prompt = "user: [text], [type], [image], [image], [text], [text], prompt"`**: 這行程式碼定義了一個提示，用於指示模型生成文本。
* **`inputs = processor(prompt, image_images=True, return_tensors="pt")`**: 這行程式碼將提示和圖像作為輸入傳遞給 `processor`，並將輸出轉換為 PyTorch  tensors。
* **`outputs = model.generate(...)`**: 這行程式碼使用模型生成文本。
* **`with torch.no_grad(): ...`**: 這行程式碼禁用梯度計算，以提高程式碼的效率。

總體而言，這段程式碼正在使用一個大型語言模型生成文本，並使用圖像作為輸入。

'張圖顯示了一個 Python 程式碼編輯器，程式碼正在執行一個使用 `transformers` 庫的程式。程式碼似乎正在使用一個名為 `LLM_Code` 的模型，並進行一些與生成文本相關的操作。\n\n以下是程式碼中一些關鍵部分的解釋：\n\n* **`model = model.eval()`**: 這行程式碼將模型設定為評估模式，這意味著它將不執行任何訓練相關的操作。\n* **`processor = AutoProcessor.from_pretrained("path/to/model")`**: 這行程式碼從指定的目錄中載入一個 `AutoProcessor` 物件，用於處理輸入文本。\n* **`prompt = "user: [text], [type], [image], [image], [text], [text], prompt"`**: 這行程式碼定義了一個提示，用於指示模型生成文本。\n* **`inputs = processor(prompt, image_images=True, return_tensors="pt")`**: 這行程式碼將提示和圖像作為輸入傳遞給 `processor`，並將輸出轉換為 PyTorch  tensors。\n* **`outputs = model.generate(...)`**: 這行程式碼使用模型生成文本。\n* **`with torch.no_grad(): ...`**: 這行程式碼禁用梯度計算，以提高程式碼的效率。\n\n總體而言，這段程式碼正在使用一個大型語言模型生成文本，並使用圖像作為輸入。'

# Sentence Embedding

In [5]:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('C:/Users/user/LLM/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)
embeddings.shape


  return forward_call(*args, **kwargs)


(2, 384)

# Small Gemma3

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HybridCache, Gemma3ForCausalLM, GemmaTokenizerFast

PATH = "C:/Users/user/LLM/smallgemma3"

quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

model = Gemma3ForCausalLM.from_pretrained(
    PATH,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True
    )
model = model.eval()
tokenizer = GemmaTokenizerFast.from_pretrained(PATH, truncate = True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

def gemma_resp(prompt, repetition_penalty=1.2): # 1. 新增 repetition_penalty 參數
    MSG = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
    input_ids = torch.tensor(tokenizer.encode(MSG)).to(model.device)
    
    generated_token_ids = input_ids.tolist()

    input_ids = input_ids.unsqueeze(0)
    past_key_values = HybridCache(
        config=model.config,
        max_cache_len=1024,
        max_batch_size=1,
        device=model.device,
        dtype=torch.bfloat16
    )
    eos_token_ids = [tokenizer.eos_token_id] + tokenizer.encode('<end_of_turn>')

    output_len = 32768
    res = list()
    for _ in range(output_len):
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                use_cache=True,
                past_key_values=past_key_values,
            )
            logits = outputs.logits
            next_token_logits = logits[:, -1, :]

            # --- Repetition Penalty 核心邏輯開始 ---
            if repetition_penalty != 1.0 and len(generated_token_ids) > 0:
                unique_generated_ids = torch.tensor(
                    list(set(generated_token_ids)), 
                    device=model.device
                )
                
                # 對這些 token ID 對應的 logits 施加懲罰
                # 分數 > 0 時，除以 penalty (降低概率)
                # 分數 < 0 時，乘以 penalty (使其更負，進一步降低概率)
                # 這裡使用 gather 和 scatter_ 來高效地操作
                score = torch.gather(next_token_logits, 1, unique_generated_ids.unsqueeze(0))
                
                score[score > 0] /= repetition_penalty
                score[score < 0] *= repetition_penalty

                next_token_logits.scatter_(1, unique_generated_ids.unsqueeze(0), score)
            # --- Repetition Penalty 核心邏輯結束 ---
            
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            token_id = next_token.item()
            if token_id in eos_token_ids:
                break
            generated_token_ids.append(token_id)
            res += [tokenizer.decode(token_id)]
            print(res[-1], end="", flush=True)
            
            input_ids = next_token
    return "".join(res)

In [None]:
question = "幫我思考一下，想要做FastAPI的設計，實現串流效果？"
res = gemma_resp(question)

FastAPI 旨在提供一個簡單、快速且高效的 API 接口，它主要集中在以下幾個核心功能：

*   **簡單的 API 接口:**  這就是 FastAPI 的核心。它允許開發者簡單地測試和使用，並在短時間內快速測試。
*   **快速的 Stream Processing:**  Stream Processing 整合了 Stream 處理技術，可以快速地將數據轉換為 Stream 格式，然後