<a href="https://colab.research.google.com/github/Maziger/master-generative-ai-with-llm/blob/main/Notebooks/Mixtral_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [None]:
import torch, transformers, gradio as gr
from transformers import AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
from threading import Thread

def format_messages_from_gradio(history):
    """
    Flatten gr.Chatbot history into a role/content message list.

    history is an iterable of (user, assistant) pairs. For every pair the user
    text is emitted first, followed by the assistant text; a falsy entry
    (None or an empty string) is skipped, so a turn whose reply is still
    pending contributes only its user message.
    """
    return [
        {"role": speaker, "content": text}
        for user_text, assistant_text in history
        for speaker, text in (("user", user_text), ("assistant", assistant_text))
        if text
    ]

def format_chat_history(pipe, history) -> str:
    """
    Render the Gradio chat history as one prompt string.

    The (user, assistant) pairs are flattened with format_messages_from_gradio
    and rendered by the chat template of pipe.tokenizer. The template is asked
    for text (tokenize=False) with the generation prompt appended.
    """
    # A pending turn has no assistant text yet, so the flattened conversation
    # ends on that turn's user message.
    conversation = format_messages_from_gradio(history)
    template_options = {"tokenize": False, "add_generation_prompt": True}
    return pipe.tokenizer.apply_chat_template(conversation, **template_options)

def model_loading_pipeline():
    """
    Create the 4-bit Mixtral text-generation pipeline and its token streamer.

    Returns:
        (pipe, streamer): the transformers text-generation pipeline and a
        TextIteratorStreamer built on the same tokenizer.
    """
    checkpoint = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    tok = AutoTokenizer.from_pretrained(checkpoint)

    # 4-bit bitsandbytes quantization with float16 as the compute dtype.
    four_bit_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    load_options = {
        "torch_dtype": torch.float16,
        "quantization_config": four_bit_cfg,
    }

    generator = transformers.pipeline(
        task="text-generation",
        model=checkpoint,
        tokenizer=tok,
        device_map="auto",
        model_kwargs=load_options,
    )

    # The streamer drops the echoed prompt and special tokens; no timeout is configured.
    token_stream = TextIteratorStreamer(
        tok,
        skip_prompt=True,
        skip_special_tokens=True,
        timeout=None,
    )
    return generator, token_stream

def launch_gradio_app(pipe, streamer):
    """
    Build and launch the Gradio chat UI around a text-generation pipeline.

    Args:
        pipe: Text-generation pipeline. It is called with the rendered prompt
            plus the sampling options below, and its tokenizer renders the
            chat template.
        streamer: Text streamer (a TextIteratorStreamer in this notebook) that
            is passed to ``pipe`` on every call and iterated to fill the chat
            window. The same instance is reused for every request.

    The UI is launched with ``share=True`` (public link) and ``debug=True``.

    Raises (inside the UI callback):
        gr.Error: if the generation call fails; the original exception is
            chained as the cause.
    """
    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(height=450)
        msg = gr.Textbox(placeholder="Type your message…", scale=1)
        clear = gr.Button("Clear")

        def user(user_message, history):
            """Clear the textbox and open a new turn with an empty assistant slot."""
            history = history or []
            if not user_message or not user_message.strip():
                # Blank submission: open no turn, so bot() has nothing to answer.
                return "", history
            return "", history + [[user_message, None]]

        def bot(history):
            """Stream the model's reply into the assistant slot of the newest turn."""
            if not history or history[-1][1]:
                # No open turn (blank submission or empty chat): nothing to generate.
                yield history
                return

            # Build a proper chat prompt using the tokenizer's chat template
            prompt = format_chat_history(pipe, history)

            # Prepare UI slot for streaming tokens
            history[-1][1] = ""

            gen_kwargs = dict(
                max_new_tokens=512,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                streamer=streamer,   # pass streamer at CALL time
                return_full_text=False,
            )

            failures = []

            def generate():
                """Run the pipeline; on failure, record the error and end the stream."""
                try:
                    pipe(prompt, **gen_kwargs)
                except Exception as exc:
                    failures.append(exc)
                    # Without an end-of-stream marker the consumer loop below
                    # would wait forever on a streamer that has no timeout.
                    # NOTE(review): if the pipeline fails only after generation
                    # already finished, this queues a second end marker on the
                    # shared streamer - consider a per-request streamer.
                    streamer.end()

            thread = Thread(target=generate)
            thread.start()

            for token in streamer:
                history[-1][1] += token
                yield history

            # Make sure the worker is done before the next request reuses the streamer.
            thread.join()
            if failures:
                raise gr.Error(f"Generation failed: {failures[0]}") from failures[0]

        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
            bot, chatbot, chatbot
        )
        clear.click(lambda: None, None, chatbot, queue=False)

    demo.queue()
    demo.launch(share=True, debug=True)

if __name__ == "__main__":
    # Load the model once, then hand the pipeline and streamer to the chat UI.
    pipe, streamer = model_loading_pipeline()
    launch_gradio_app(pipe=pipe, streamer=streamer)


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

Device set to use cuda:0
  chatbot = gr.Chatbot(height=450)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://13dd47baa228f3e17a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://13dd47baa228f3e17a.gradio.live
