In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import BitsAndBytesConfig
import os


# --- Generator model: Hub identifier and local export directory ---
GENERATOR_MODEL_NAME = "Qwen/Qwen1.5-1.8B-Chat"
LOCAL_GENERATOR_SAVE_PATH = "../models/generator_qwen"
# Create the export directory up front; exist_ok makes re-running this cell safe.
os.makedirs(LOCAL_GENERATOR_SAVE_PATH, exist_ok=True)

# --- Quantization switches ---
USE_QUANTIZATION = True
QUANTIZATION_TYPE = "int8"

# --- Compute device: use CUDA when torch reports it as available, else CPU ---
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


In [None]:
# Load the tokenizer that matches the generator checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    GENERATOR_MODEL_NAME,
    use_fast=True,
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally; confirm it is actually needed for this checkpoint.
    trust_remote_code=True,
)
print("Tokenizer loaded.")

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    print(f"Set pad_token_id to eos_token_id: {tokenizer.eos_token_id}")

# Build the quantization config from the notebook-level switches instead of
# hard-coding 8-bit loading. `trust_remote_code` is intentionally NOT passed
# here: it is a `from_pretrained` argument, not a BitsAndBytesConfig option.
quantization_config = None
if USE_QUANTIZATION:
    if QUANTIZATION_TYPE == "int8":
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    elif QUANTIZATION_TYPE == "int4":
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    else:
        raise ValueError(
            f"Unsupported QUANTIZATION_TYPE: {QUANTIZATION_TYPE!r} (expected 'int8' or 'int4')"
        )

print(f"Loading model {GENERATOR_MODEL_NAME}")
model_kwargs = {
    "device_map": "auto"
}

# Only request quantized loading when it was enabled above.
if quantization_config is not None:
    model_kwargs["quantization_config"] = quantization_config

model = AutoModelForCausalLM.from_pretrained(
    GENERATOR_MODEL_NAME,
    **model_kwargs
)
print("Generator loaded.")

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Tokenizer loaded.
Loading model Qwen/Qwen1.5-1.8B-Chat


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

Generator loaded.


In [None]:
# Write the model weights/config and the tokenizer files into the same local directory.
# NOTE(review): the model was loaded with a quantization config in the loading cell;
# confirm that the checkpoint written here reloads the way you expect.
model.save_pretrained(LOCAL_GENERATOR_SAVE_PATH)
tokenizer.save_pretrained(LOCAL_GENERATOR_SAVE_PATH)
print("Model and tokenizer saved locally.")


Zapisywanie modelu i tokenizatora do: ../models/generator_qwen
Model i tokenizer zapisane lokalnie.
UWAGA: Model został załadowany z kwantyzacją. Standardowe `save_pretrained`
zapisuje wagi modelu (potencjalnie bez kwantyzacji lub w pełnej precyzji, jeśli to możliwe).
Aby ponownie załadować model z kwantyzacją z lokalnej ścieżki,
nadal będziesz musiał przekazać `quantization_config` do `from_pretrained`.
Lepszym podejściem może być poleganie na cache Hugging Face dla modeli kwantyzowanych,
lub badanie specyficznych metod zapisu/ładowania dla `bitsandbytes`.


In [5]:
def generate_response(prompt, max_new_tokens=200, temperature=0.7, top_p=0.9, do_sample=True):
    """Generate a reply to a single user prompt with the notebook-level model.

    The prompt is wrapped as a one-message chat, rendered with the tokenizer's
    chat template, tokenized, and passed to ``model.generate``.

    Args:
        prompt: User message text.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature; only forwarded when ``do_sample`` is True.
        top_p: Nucleus-sampling threshold; only forwarded when ``do_sample`` is True.
        do_sample: Whether to sample; when False, ``temperature`` and ``top_p``
            are replaced by the neutral value 1.0.

    Returns:
        The decoded text of the generated continuation only (prompt tokens are
        sliced off and special tokens are skipped).
    """
    # format prompt to model prompt format
    messages = [{"role": "user", "content": prompt}]
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # tokenize prompt
    # The chat template already inserted the special tokens, so do not let the
    # tokenizer add them a second time. Inputs follow the model's own device
    # because the model is placed with device_map="auto", which need not match DEVICE.
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=False,
        truncation=True,
        add_special_tokens=False,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature if do_sample else 1.0,
            top_p=top_p if do_sample else 1.0,
            do_sample=do_sample,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # skip prompt tokens in response
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response_text

In [7]:
# Demo prompts covering factual Q&A, explanation, creative writing and translation.
prompt1 = "What is the capital of France?"
prompt2 = "Explain the concept of black holes in simple terms."
prompt3 = "Write a short story about a friendly robot who discovers a hidden garden."
prompt4 = "Translate the following English sentence to Polish: 'Hello, how are you today?'"

# Each entry pairs a prompt with the keyword overrides passed to generate_response.
demo_runs = [
    (prompt1, {"max_new_tokens": 50}),
    (prompt2, {"max_new_tokens": 150}),
    (prompt3, {"max_new_tokens": 250, "temperature": 0.8}),
    (prompt4, {"max_new_tokens": 30}),
]

for run_index, (demo_prompt, overrides) in enumerate(demo_runs):
    # Print a separator between consecutive answers (none before the first).
    if run_index > 0:
        print("\n" + "="*50 + "\n")
    response = generate_response(demo_prompt, **overrides)
    print(response)


Paris


Black holes are regions in space where the gravitational pull is so strong that not even light can escape from it. They are formed when massive stars, such as our sun, exhaust their nuclear fuel and collapse under their own gravity, causing them to explode in a supernova explosion. The core of the star then塌缩 into a incredibly dense point, known as a singularity, which possesses an infinite amount of mass and energy.

In this singularity, gravity becomes so strong that not even the smallest particles, like quarks and leptons, can escape. This creates what is called spacetime curvature, or event horizon, which marks the boundary of the black hole's influence radius. As you get further away from the black hole, the gravitational force becomes


In the bustling city of New York, there lived a friendly robot named Robo. Robo had been designed to assist humans in various tasks, from cleaning and cooking to providing entertainment and companionship. Despite his many duties, Robo alwa