In [None]:
!pip install transformers gradio bitsandbytes sentencepiece accelerate

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio
  Downloading gradio-3.44.3-py3-none-any.whl (20.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install hf_transfer
!HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir-use-symlinks False \
--local-dir chinese-alpaca-2-7b hfl/chinese-alpaca-2-7b --exclude *.pth

### import

In [None]:
import gradio as gr
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
from threading import Thread
import os

# Publish CUDA_VISIBLE_DEVICES=0 in this process's environment before any model is loaded.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

### load model

In [None]:
# Directory holding the checkpoint.
# NOTE(review): presumably the folder written by the huggingface-cli download
# step, which uses a relative --local-dir; this assumes the cwd is /content.
base_model_path = '/content/chinese-alpaca-2-7b'

tokenizer = LlamaTokenizer.from_pretrained(base_model_path, legacy=True)

# Loading options kept together so they are easy to scan and tweak.
_model_load_options = dict(
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map='auto',
    load_in_8bit=True,
)
model = LlamaForCausalLM.from_pretrained(base_model_path, **_model_load_options)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# System prompt used when the caller does not supply one.
DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""
# Turn layout that carries a system prompt in a <<SYS>> section.
TEMPLATE_WITH_SYSTEM_PROMPT = (
    "[INST] <<SYS>>\n"
    "{system_prompt}\n"
    "<</SYS>>\n\n"
    "{instruction} [/INST]"
)
# Turn layout without a system prompt.
TEMPLATE_WITHOUT_SYSTEM_PROMPT = "[INST] {instruction} [/INST]"

def generate_prompt(instruction, response="", with_system_prompt=True, system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Format one conversation turn as prompt text.

    Args:
        instruction: the user's message for this turn.
        response: the assistant's reply to append after the instruction.
            An empty string or ``None`` means "no reply yet" and nothing is
            appended.
        with_system_prompt: when truthy, render the turn with
            ``TEMPLATE_WITH_SYSTEM_PROMPT``; otherwise with
            ``TEMPLATE_WITHOUT_SYSTEM_PROMPT``.
        system_prompt: text placed in the ``<<SYS>>`` section; only used when
            ``with_system_prompt`` is truthy.

    Returns:
        The formatted turn as a string.
    """
    # Plain truthiness (not `is True`) so values such as 1 are honoured too.
    if with_system_prompt:
        prompt = TEMPLATE_WITH_SYSTEM_PROMPT.format_map(
            {'instruction': instruction, 'system_prompt': system_prompt}
        )
    else:
        prompt = TEMPLATE_WITHOUT_SYSTEM_PROMPT.format_map({'instruction': instruction})
    # `if response` also covers None, which len() would reject.
    if response:
        prompt += " " + response
    return prompt

class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the newest token is one of the stop ids.

    Args:
        stop_ids: iterable of token ids that end generation. When omitted, the
            historical defaults ``(29, 0)`` are used.

    NOTE(review): the tokenizer inspected later in this notebook reports
    ``eos_token_id == 2``, which is not among the defaults; pass ``stop_ids``
    explicitly when using this class with that tokenizer.
    """

    DEFAULT_STOP_IDS = (29, 0)

    def __init__(self, stop_ids=None):
        self.stop_ids = tuple(self.DEFAULT_STOP_IDS if stop_ids is None else stop_ids)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last id of the first sequence is a stop id."""
        # Only row 0 of input_ids is inspected; convert once to a Python int so
        # the membership test is a plain integer comparison.
        last_token_id = int(input_ids[0][-1])
        return last_token_id in self.stop_ids

class Stream(StoppingCriteria):
    """Stopping criterion that never stops generation; it only reports progress.

    Args:
        callback_func: optional callable invoked on every check with the ids
            of the first sequence (``input_ids[0]``). ``None`` disables the
            callback.
    """

    def __init__(self, callback_func=None):
        self.callback_func = callback_func

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        """Forward the current ids to the callback, then always return False."""
        # **kwargs is accepted (and ignored) so the signature matches
        # StopOnTokens and callers that pass extra keyword arguments.
        if self.callback_func is not None:
            self.callback_func(input_ids[0])
        return False

### predict

In [None]:
def _build_chat_prompt(message, history, turn_separator):
    """Render the finished turns plus the new user message as one prompt string."""
    turns = []
    for index, past_turn in enumerate(history):
        # Only the very first turn of a conversation carries the system prompt.
        # `or ""` tolerates a turn whose reply is None.
        turns.append(generate_prompt(
            past_turn[0],
            response=past_turn[1] or "",
            with_system_prompt=(index == 0),
            system_prompt=DEFAULT_SYSTEM_PROMPT,
        ))
    # The pending user message has no response yet; it opens the conversation
    # (and so gets the system prompt) only when there is no history.
    turns.append(generate_prompt(
        message,
        response="",
        with_system_prompt=not history,
        system_prompt=DEFAULT_SYSTEM_PROMPT,
    ))
    return turn_separator.join(turns)


def predict(message, history):
    """Stream the model's reply to `message` for a Gradio chat interface.

    Args:
        message: the current user's input.
        history: a 2D array of earlier turns, ``[[user1, sys1], [user2, sys2], ...]``.

    Yields:
        The reply accumulated so far, growing with every chunk the streamer emits.
    """
    # Close each finished turn with EOS and open the next one with BOS rather
    # than gluing "[/INST] answer" directly onto the following "[INST]".
    # NOTE(review): assumes the tokenizer maps the literal eos/bos token text
    # inside the prompt to the corresponding special ids - confirm.
    turn_separator = tokenizer.eos_token + tokenizer.bos_token
    messages = _build_chat_prompt(message, history, turn_separator)

    # Debug trace of each request; this is what produces the conversation log
    # in the launch cell's output.
    print(message)
    print(history)
    print(messages)
    print('----')

    # Place the inputs on the device the model reports, not a hard-coded "cuda".
    model_inputs = tokenizer([messages], return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=512,
        do_sample=True,
        top_p=0.9,
        top_k=40,
        temperature=0.2,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList([Stream(callback_func=None)])
        )
    # generate() runs in a worker thread so this generator can consume the
    # streamer while tokens are still being produced.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    partial_message = ""
    for new_text in streamer:
        # Every chunk is forwarded verbatim: special tokens are already removed
        # by skip_special_tokens=True, so a literal '<' is real output text.
        partial_message += new_text
        yield partial_message

    # The streamer is exhausted, so generation has finished; reap the worker.
    worker.join()


In [None]:
# Display the end-of-sequence token id of the loaded tokenizer.
tokenizer.eos_token_id

2

### launch

In [None]:
# Wrap `predict` in Gradio's chat UI, turn on the request queue and serve it.
# share=True publishes a public URL; debug=True keeps this cell running so
# errors and logs stay visible.
chat_ui = gr.ChatInterface(predict)
chat_ui.queue().launch(share=True, debug=True)
# Alternative without a public link, bound to a fixed host and port:
#gr.ChatInterface(predict).queue().launch(share=False, inbrowser=True, server_name='0.0.0.0', server_port=8765)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://6b66f533e663af200f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


你好
[]
[INST] <<SYS>>
You are a helpful assistant. 你是一个乐于助人的助手。
<</SYS>>

你好 [/INST]
----
请你帮我购物
[['你好', '你好！很高兴见到你。我是 Assistant，一个大型语言模型，由 OpenAI 训练。有什么我可以帮助你的吗？']]
[INST] <<SYS>>
You are a helpful assistant. 你是一个乐于助人的助手。
<</SYS>>

你好 [/INST] 你好！很高兴见到你。我是 Assistant，一个大型语言模型，由 OpenAI 训练。有什么我可以帮助你的吗？[INST] 请你帮我购物 [/INST]
----
我要买最新款iphone
[['你好', '你好！很高兴见到你。我是 Assistant，一个大型语言模型，由 OpenAI 训练。有什么我可以帮助你的吗？'], ['请你帮我购物', '当然可以！请告诉我你需要购买什么，我可以帮你搜索并提供购买选项。']]
[INST] <<SYS>>
You are a helpful assistant. 你是一个乐于助人的助手。
<</SYS>>

你好 [/INST] 你好！很高兴见到你。我是 Assistant，一个大型语言模型，由 OpenAI 训练。有什么我可以帮助你的吗？[INST] 请你帮我购物 [/INST] 当然可以！请告诉我你需要购买什么，我可以帮你搜索并提供购买选项。[INST] 我要买最新款iphone [/INST]
----
我需要在官网买iphone 15 pro max
[['你好', '你好！很高兴见到你。我是 Assistant，一个大型语言模型，由 OpenAI 训练。有什么我可以帮助你的吗？'], ['请你帮我购物', '当然可以！请告诉我你需要购买什么，我可以帮你搜索并提供购买选项。'], ['我要买最新款iphone', '好的，最新款的 iPhone 是 iPhone 13。以下是购买 iPhone 13 的选项：\n\n1. 在苹果官网上购买：您可以在苹果官网上购买 iPhone 13，选择您喜欢的颜色和存储容量。\n\n2. 在运营商处购买：您可以在运营商处购买 iPhone 13，例如 AT&T、Verizon、T-Mobile 或 S