<a href="https://colab.research.google.com/github/HamidShojanazeri/LLM_fine_tuning/blob/main/llama_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers>=4.31.0
!pip install bitsandbytes>=0.40.2
!pip install accelerate>=0.21.0
!pip install gradio
!pip install scipy
!pip install sentencePiece
!pip install peft
!pip install peft



In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive (force_remount=True remounts even if it
# is already mounted), then change the working directory.
# NOTE: 'PATH to your Folder' is a placeholder and must be replaced before
# running; presumably it should be the Drive folder that contains the adapter
# directory referenced by the later cells (they use a relative path).
drive.mount('/content/drive', force_remount=True)
os.chdir('PATH to your Folder')

Mounted at /content/drive




In [None]:
# Relative path of the PEFT adapter directory (it holds adapter_config.json and
# adapter_model.bin); later passed to build_generator(peft_model=...).
peft_model = 'llama-boz-3e4lr-fullparameter-llama-7b'

In [None]:
!ls llama-boz-3e4lr-fullparameter-llama-7b

adapter_config.json  adapter_model.bin	README.md


In [None]:
import os
from queue import Queue
from threading import Thread
import textwrap

import gradio as gr
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel, PeftConfig

class StreamHandler:
    """Queue-backed sink exposing the ``put``/``end`` methods of a streamer.

    Every call enqueues a dict with a ``"type"`` and a ``"content"`` key so a
    consumer reading ``self.queue`` can tell payloads from the end marker.
    """

    def __init__(self):
        # Unbounded queue, so the non-blocking puts below never raise ``Full``.
        self.queue = Queue()

    def put(self, item):
        """Enqueue ``item`` as a ``"content"`` message without blocking."""
        message = dict(type="content", content=item)
        self.queue.put_nowait(message)

    def end(self):
        """Enqueue the ``"termination"`` marker (content ``None``) without blocking."""
        sentinel = dict(type="termination", content=None)
        self.queue.put_nowait(sentinel)


def format_prompt(history, message, system_prompt):
    """Assemble a single prompt string from a system prompt, past turns and a new message.

    Args:
        history: Iterable of ``(user_msg, asst_msg)`` pairs; each element is
            converted with ``str`` and stripped of surrounding whitespace.
        message: The new user message; converted with ``str`` and stripped.
        system_prompt: Text placed between the ``<<SYS>>`` markers, inserted as-is.

    Returns:
        The prompt: the system block, then every past turn wrapped in
        ``[INST]`` / ``[/INST]`` and separated by ``</s><s>``, then the new
        message followed by ``[/INST]`` and a trailing space.
    """
    inst_open, inst_close = "[INST]", "[/INST]"
    sys_open, sys_close = "<<SYS>>\n", "\n<</SYS>>\n\n"

    # Collect the fragments and join once instead of growing a string in place.
    pieces = [f"{inst_open} {sys_open}{system_prompt}{sys_close} "]
    pieces.extend(
        f"{str(user_turn).strip()} {inst_close} {str(assistant_turn).strip()} </s><s> {inst_open} "
        for user_turn, assistant_turn in history
    )
    pieces.append(f"{str(message).strip()} {inst_close} ")
    return "".join(pieces)


def build_generator(
    model_name, auth_token, peft_model=None, temperature=0.6, top_p=0.9, max_gen_len=200,
    use_chat_prompt=False,
):
    """Load a Llama model (optionally with a PEFT adapter) and return a streaming chat callback.

    Args:
        model_name: Model id or path handed to ``from_pretrained`` for both the
            tokenizer and the model.
        auth_token: Hugging Face access token. An empty value is forwarded as
            ``None`` rather than as an empty string, so the library's default
            token lookup applies instead of sending an empty credential.
        peft_model: Optional adapter id/path; when truthy the base model is
            wrapped with ``PeftModel.from_pretrained``.
        temperature: Sampling temperature. A value ``<= 0`` (or ``None``)
            disables sampling and decodes greedily.
        top_p: Nucleus-sampling threshold; only used when sampling is enabled.
        max_gen_len: Maximum number of new tokens generated per reply.
        use_chat_prompt: When ``False`` (default, the previous behavior) only
            the raw user message is tokenized. When ``True`` the system prompt
            and chat history are included via ``format_prompt``.

    Returns:
        ``stream_response(message, history)``: a generator function that yields
        the decoded reply-so-far each time a new token arrives.

    Raises:
        Inside the returned generator, any exception raised by
        ``model.generate`` is re-raised after the stream is drained.
    """
    SYSTEM_PROMPT = """\
    You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

    If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
    SYSTEM_PROMPT = textwrap.dedent(SYSTEM_PROMPT).strip()

    # Treat "" the same as "no token supplied".
    token = auth_token or None

    tokenizer = LlamaTokenizer.from_pretrained(model_name, token=token)
    model = LlamaForCausalLM.from_pretrained(
        model_name, token=token, load_in_8bit=True, device_map="auto"
    ).eval()
    if peft_model:
        model = PeftModel.from_pretrained(model, peft_model)

    # temperature/top_p only take effect when sampling is switched on, so make
    # that explicit instead of relying on the checkpoint's generation config.
    sampling = temperature is not None and temperature > 0
    generation_kwargs = {"max_new_tokens": max_gen_len, "do_sample": sampling}
    if sampling:
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    def generate_process(inputs, stream_handler, errors):
        """Run generation in a worker thread, feeding tokens to ``stream_handler``."""
        try:
            model.generate(**inputs, streamer=stream_handler, **generation_kwargs)
        except Exception as exc:  # surfaced to the consumer below
            # Record the failure first, then unblock the consumer: on an error
            # nothing else would enqueue the termination marker and the
            # consuming loop would wait on the queue forever.
            errors.append(exc)
            stream_handler.end()

    def stream_response(message, history):
        """Yield the progressively decoded reply for ``message``."""
        if use_chat_prompt:
            text = format_prompt(history, message, SYSTEM_PROMPT)
        else:
            text = message
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        stream_handler = StreamHandler()
        errors = []

        t = Thread(target=generate_process, args=(inputs, stream_handler, errors))
        t.start()  # Running in background

        token_ids = []
        prompt_skipped = False
        while True:
            item = stream_handler.queue.get(block=True)
            if item["type"] == "termination":
                break
            if not prompt_skipped:
                # The first content item is the prompt itself, not a new token.
                # It is skipped inside the loop so that a termination arriving
                # before any content is still honored.
                prompt_skipped = True
                continue
            token_ids.append(item["content"][0].item())
            yield tokenizer.decode(token_ids, skip_special_tokens=True)

        # Wait for the thread to finish
        t.join()
        if errors:
            raise errors[0]

    return stream_response

In [None]:
print("Building generator...")
# Read the Hugging Face access token from the environment instead of pasting it
# into the notebook, where it would be saved and shared together with the file.
# Falls back to the previous empty placeholder when HF_TOKEN is not set.
auth_token = os.environ.get("HF_TOKEN", "")
model_name = "meta-llama/Llama-2-7b-hf"
respond = build_generator(model_name=model_name, auth_token=auth_token, peft_model=peft_model)

print("Starting server...")
# UI strings consumed by the launch cell below.
title = model_name.split("/")[-1].replace("-", " ") + " local"
desc = f"This Space demonstrates [{model_name}](https://huggingface.co/{model_name}) by Meta."
# Hides elements with the `toast-wrap` class in the rendered page.
css = """.toast-wrap { display: none !important } """

Building generator...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Starting server...


In [None]:
# Wrap the streaming callback in a chat UI and serve it inline in the notebook.
# NOTE: share=True also publishes the app on a public URL (see the
# "Running on public URL" line in this cell's output); set share=False to
# keep the demo private.
ci = gr.ChatInterface(respond, title=title.title(), description=desc, css=css)
ci.queue().launch(inline=True, share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://33f344f78c57d449a4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


