# Chat sanity check (Unsloth)

This notebook loads the LoRA adapter + tokenizer from `outputs_chess_distill` and lets you
quickly test whether chat capabilities were preserved after vocabulary changes.


In [3]:
from pathlib import Path

# Unsloth's startup message asks for it to be imported at the top of the file,
# so it is imported before torch. Keep this order if imports are auto-sorted.
from unsloth import FastLanguageModel  # isort: skip
import torch

MODEL_DIR = Path("outputs_chess_distill")
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = torch.cuda.is_available()  # Only request 4-bit loading when CUDA is present
DTYPE = None  # Let Unsloth pick a sensible dtype

# Fail early with a clear message instead of handing a non-existent path to
# from_pretrained and getting a harder-to-read loader error.
if not MODEL_DIR.is_dir():
    raise FileNotFoundError(
        f"Expected the fine-tuned adapter and tokenizer in '{MODEL_DIR}', "
        "but that directory does not exist (check the working directory)."
    )

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=str(MODEL_DIR),
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)

# Ensure embeddings match the extended tokenizer vocabulary.
input_emb = model.get_input_embeddings()
if input_emb is not None and input_emb.weight.shape[0] != len(tokenizer):
    # A mismatch is worth surfacing in a sanity check: the loaded embedding
    # matrix does not have one row per tokenizer entry.
    print(
        "Resizing embeddings:",
        input_emb.weight.shape[0],
        "->",
        len(tokenizer),
    )
    model.resize_token_embeddings(len(tokenizer))
    if hasattr(model, "tie_weights"):
        model.tie_weights()

# Switch the model to inference/eval mode before generating.
FastLanguageModel.for_inference(model)
model.eval()

print("Tokenizer vocab size:", len(tokenizer))
print("Model embedding size:", model.get_input_embeddings().weight.shape[0])



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.2: Fast Qwen2 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2026.1.2 patched 24 layers with 24 QKV layers, 24 O layers and 24 MLP layers.


Tokenizer vocab size: 153635
Model embedding size: 153635


In [4]:
def _format_chat(messages):
    if hasattr(tokenizer, "apply_chat_template"):
        return tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    # Fallback for tokenizers without a chat template.
    prompt_lines = [f"{m['role']}: {m['content']}" for m in messages]
    return "\n".join(prompt_lines) + "\nassistant:"

@torch.inference_mode()
def chat(messages, max_new_tokens=256, temperature=0.7, top_p=0.9):
    """Generate one assistant reply for a conversation.

    Args:
        messages: Chat messages, passed to ``_format_chat`` to build the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. ``None`` or a value <= 0 selects
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only forwarded when sampling.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off). Special tokens are kept in the text
        (``skip_special_tokens=False``).
    """
    prompt = _format_chat(messages)
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    do_sample = temperature is not None and temperature > 0
    generate_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling-only knobs are forwarded only when sampling, so a value such
        # as temperature=0 never reaches generate() for greedy decoding.
        generate_kwargs["temperature"] = temperature
        generate_kwargs["top_p"] = top_p

    outputs = model.generate(**encoded, **generate_kwargs)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = encoded["input_ids"].shape[1]
    gen_ids = outputs[0][prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=False)


In [8]:
# chat() samples by default (temperature=0.7), so seed torch's RNG first to make
# the recorded reply repeatable when the notebook is re-run in the same environment.
SEED = 0
torch.manual_seed(SEED)

messages = [
    {"role": "system", "content": "You are a concise, friendly assistant."},
    {"role": "user", "content": "Say hello and tell me one fun fact about chess."},
]
response = chat(messages, max_new_tokens=128)
print(response)


When you get up to your knees on the floor,g7h8biving it all to the winds, you can't moveg7h8bore thanc4f4g5f6h4g3g6g5g5g4h6h5h7h6g7g5f4f3f7f6g7g6g6f6f6f5h7h5g7g6h7g8h6h5g6g7h4h3g7g6g5g4g4g3h7h5g6f7g6g5h7h66g6g5g4g3g6h5g5g4h6h5g6g5g7g6g6g5f5f4g6g5f6g6g6g5g5g4h6h5g6g5g5g4f7f6h6g6g7g6h6g5h6h5h5h4f6f7g6h7h5g4g6g5f4f3g5g4g6f6g7g6f7f6h5h4g5g6h6h5f4g5g5g4g6f5h5h4f4f3g6f6g7g6h5h4g6f6g5g4g6g5f4f3g6g5f4f3g6g5h5h6f4g4g6g5f5f4h6h7g7g6f7f6g6g5g5g4g7g6hÂÖ≠ÂÖ≠ÂÖ≠g6g5g6g5h5h4g7g6g6h6ÂÖ≠ÂçÅÂÖ≠‰∏™ÂÖ≠ÂÖ≠ÂÖ≠ÂçÅg6g5ÂÖ≠‰∏™


In [6]:
# Optional quick chat loop (type 'exit' to stop).
def _strip_trailing_special_tokens(text):
    """Drop end-of-sequence / padding markers (and whitespace) from the end of ``text``."""
    # Only the tokenizer's EOS and PAD strings are considered, so ordinary
    # content, including any added vocabulary, is never removed.
    markers = [
        tok
        for tok in (
            getattr(tokenizer, "eos_token", None),
            getattr(tokenizer, "pad_token", None),
        )
        if tok
    ]
    cleaned = text.rstrip()
    trimmed = True
    while trimmed:
        trimmed = False
        for tok in markers:
            if cleaned.endswith(tok):
                cleaned = cleaned[: -len(tok)].rstrip()
                trimmed = True
    return cleaned


def chat_loop(system_prompt="You are a helpful assistant."):
    """Run an interactive multi-turn chat on stdin/stdout.

    The loop ends on blank (or whitespace-only) input, on ``exit`` / ``quit``
    (case-insensitive), or when the prompt is closed or interrupted.

    Args:
        system_prompt: Optional system message placed at the start of the
            conversation; skipped when empty or ``None``.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    while True:
        try:
            user_text = input("User: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt ends the session quietly.
            break
        if user_text.strip().lower() in {"", "exit", "quit"}:
            break
        messages.append({"role": "user", "content": user_text})
        reply = chat(messages)
        # Show the raw reply (special tokens included) for inspection...
        print("Assistant:", reply)
        # ...but store it without trailing end markers: chat() decodes with
        # skip_special_tokens=False, and feeding those markers back through the
        # prompt formatter would put them into the next turn's prompt text.
        messages.append(
            {"role": "assistant", "content": _strip_trailing_special_tokens(reply)}
        )


In [None]:
# Same smoke-test prompt as the earlier cell: one system instruction plus a
# single user request, capped at 128 new tokens.
messages = [
    dict(role="system", content="You are a concise, friendly assistant."),
    dict(role="user", content="Say hello and tell me one fun fact about chess."),
]
response = chat(messages, max_new_tokens=128)
print(response)
