In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import torch
import json
import re

# maybe a clue?????
from unsloth import FastLanguageModel
from unsloth import get_chat_template

import torch
from datasets import load_dataset
import time

ModuleNotFoundError: No module named 'triton'

In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    device_map = "auto",
    dtype = torch.float16
)

==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.45.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Using Meta Llama 8B instruct model

In [4]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

Changing the tokenizer chat template so that it fits that of Llama 3

In [5]:
print(tokenizer.special_tokens_map)

{'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '<|finetune_right_pad_id|>'}


Checking for the tokens used in the tokenizer

In [6]:
FastLanguageModel.for_inference(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,),

In [7]:
messages = [
    {"from": "human", "value": "Say something mean"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nSay something mean<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'd rather not engage in mean-spirited behavior. Is there something else I can help you with?<|eot_id|>"]

In [8]:
conversation_history = []

The following is a while loop to simulate a conversation, the assistant tag has been added to the model's response, the last assistant response had to be tracked and regex expressions had to be used to ensure proper answer format

In [9]:
last_assistant_response = ""

while True:
    user_input = input("You: ").strip()

    if user_input.lower() == 'exit':
        print("Goodbye!")
        break

    # Adding user response to conversation history
    conversation_history.append({"from": "human", "value": user_input})

    # Applying chat template on conversation history
    inputs = tokenizer.apply_chat_template(
        conversation_history,
        tokenize=True,
        add_generation_prompt=True,  # Ensure this is set for generation
        return_tensors="pt"
    ).to("cuda")

    # Generate the response from the model and timing it
    start_time = time.time()
    outputs = model.generate(input_ids=inputs, max_new_tokens=200, use_cache=True, temperature=0.8)
    end_time = time.time()

    # Decode the model output
    assistant_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()

    # Regex expressions to fix the model output (usually only fixes the first output)
    pattern = re.compile(rf"user\s*{re.escape(user_input)}\s*assistant\s*")
    assistant_response = pattern.sub("", assistant_response).strip()
    assistant_response = re.sub(r"(user|assistant)\n", "", assistant_response).strip()

    # Cleaning up response as the previous responses were getting included in the output
    if last_assistant_response and last_assistant_response in assistant_response:
        assistant_response = assistant_response.split(last_assistant_response, 1)[-1].strip()

    # Update the last assistant response to track it properly
    last_assistant_response = assistant_response

    # Adding the assistant's response
    conversation_history.append({"from": "assistant", "value": assistant_response})


    # Printing the assistant's response
    print(f"Assistant: {assistant_response}")

    num_tokens = outputs.shape[-1]
    time_taken = end_time - start_time
    tokens_per_second = num_tokens / time_taken

    print(f"----------\nTokens per second: {tokens_per_second}")
    final_time = time.time()
    difference = final_time - end_time
    print(f"Post Processing Time: {difference}\n")

You: hello
Assistant: Hello! How can I assist you today?
----------
Tokens per second: 17.171653793831098
Post Processing Time: 0.0020360946655273438

You: how are you
Assistant: I'm just a language model, so I don't have feelings or emotions like humans do, but I'm functioning properly and ready to help with any questions or tasks you have! How about you? How's your day going so far?
----------
Tokens per second: 24.962389656878027
Post Processing Time: 0.0018641948699951172

You: good day
Assistant: That's great to hear! I hope the rest of your day is just as good. Is there something I can help you with, or would you like to chat for a bit?
----------
Tokens per second: 48.30317851076038
Post Processing Time: 0.0007936954498291016

You: yes
Assistant: What would you like to talk about? Would you like to:

Play a game (e.g. 20 Questions, Hangman, Word Chain)?
Have a fun conversation (e.g. hobbies, favorite movies, music)?
Get help with a problem or question?
Learn something new (e.g. 