In [7]:
from typing import Optional, List, Tuple
from tqdm.notebook import tqdm
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline, Conversation, LlamaForCausalLM, LlamaTokenizer, LlamaTokenizerFast
from langchain import HuggingFaceHub
from langchain.llms import HuggingFacePipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
import json
import textwrap


# Loading the model and tokenizer, and defining global variables

In [2]:
# Local directories holding the model weights and the tokenizer files.
MODEL_PATH = 'local_model/tiny_llama'
TOKENIZER_PATH = 'local_tokenizer/tiny_llama'
# Generation settings. MAX_ANSWER_LEN is reused below as the tokenizer's
# model_max_length and the model's max_position_embeddings, and later as the
# cap on newly generated tokens.
MAX_ANSWER_LEN = 128
# MIN_ANSWER_LEN and BATCH_SIZE are not referenced in the visible cells.
MIN_ANSWER_LEN = 12
RETURN_SENTENCE = 1
BATCH_SIZE = 1
# Seed torch's global RNG so sampling-based generation is repeatable.
torch.manual_seed(21)
# NOTE(review): model_max_length is set to MAX_ANSWER_LEN (128), i.e. the
# tokenizer's input limit is tied to the desired answer length — confirm this
# is intended, since prompts (system prompt + chat history) count against it.
TOKENIZER = LlamaTokenizerFast.from_pretrained(TOKENIZER_PATH, model_max_length=MAX_ANSWER_LEN)
# NOTE(review): max_position_embeddings overrides the checkpoint's config value
# with MAX_ANSWER_LEN — presumably prompt + answer must then fit in 128
# positions; verify against the checkpoint's original setting.
MODEL = LlamaForCausalLM.from_pretrained(MODEL_PATH,
                                        device_map='auto',
                                        torch_dtype=torch.float16,
                                        low_cpu_mem_usage=True,
                                        max_position_embeddings=MAX_ANSWER_LEN
                                        )
# Device string used when moving tokenized inputs and for autocast.
DEVICE =  'cuda' if torch.cuda.is_available() else 'cpu'

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [6]:
# End-of-sequence token id taken from the tokenizer; passed to the
# text-generation pipeline as its eos_token_id.
TOKEN_EOS = TOKENIZER.eos_token_id

In [3]:
# Show the loaded model's module tree as the cell output.
MODEL

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [4]:
# Show the tokenizer's configuration (vocab size, max length, special tokens).
TOKENIZER

LlamaTokenizerFast(name_or_path='local_tokenizer/tiny_llama', vocab_size=32000, model_max_length=128, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
# Interactive Hugging Face Hub login; renders a login widget in the notebook.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# YouTube guide

In [9]:
# Text-generation pipeline wrapping the already-loaded MODEL and TOKENIZER.
# Sampling is enabled and restricted to the 5 most likely tokens per step.
chat_bot = pipeline(task='text-generation',
                    model=MODEL,
                    tokenizer=TOKENIZER,
                    torch_dtype=torch.float16,
                    # `max_new_tokens` is the generation argument that caps the
                    # reply length; the previous `max_new_token_length` is not a
                    # recognised generation argument.
                    max_new_tokens=MAX_ANSWER_LEN,
                    do_sample=True,
                    top_k=5,
                    # `num_return_sequences` is the recognised name for the number
                    # of candidates returned per prompt (was `num_return_sentences`).
                    num_return_sequences=RETURN_SENTENCE,
                    eos_token_id=TOKEN_EOS)

## prompt variables

In [2]:
# Delimiters used to assemble the chat prompt: instruction begin/end markers
# and system-prompt begin/end markers (Llama-2 chat style).
B_INST = '[INST]'
E_INST = '[/INST]'
B_SYS = '<<SYS>>\n'
E_SYS = '\n<</SYS>>\n\n'
# Default system prompt. Written as a plain literal so it no longer starts with
# a stray backslash + newline (the old `'''\\` was an escaped backslash, not a
# line continuation). The wording and trailing " \n" are kept as they were.
DEFAULT_PROMPT = 'Answer user complains as tech support worker \n'

In [5]:
def get_prompt(instruct:str, new_sys_prompt:str=DEFAULT_PROMPT) -> str:
    """Wrap an instruction and a system prompt in the chat template.

    The result has the shape
    ``B_INST + B_SYS + <system prompt> + E_SYS + <instruction> + E_INST``.

    Args:
        instruct: The user-turn text (may contain ``{placeholders}`` that a
            downstream template fills in).
        new_sys_prompt: System prompt placed between the system markers.

    Returns:
        The assembled prompt string.
    """
    sys_block = B_SYS + new_sys_prompt + E_SYS
    # Close the instruction with E_INST; it was previously closed with a second
    # B_INST, which left the instruction block unterminated.
    return B_INST + sys_block + instruct + E_INST

def cut_off_text(text:str, prompt:str) -> str:
    """Return ``text`` truncated just before the first occurrence of ``prompt``.

    Args:
        text: The string to truncate.
        prompt: The marker at which to cut.

    Returns:
        Everything in ``text`` before the first ``prompt``; ``text`` unchanged
        when the marker is absent or empty.
    """
    # An empty marker matches at position 0; treat it as "nothing to cut".
    if not prompt:
        return text
    # partition() yields the whole string as the head when the marker is absent,
    # which avoids the -1 / 0 truthiness pitfalls of testing str.find() directly.
    head, _, _ = text.partition(prompt)
    return head

def remove_sub(string:str, sub:str) -> str:
    """Return ``string`` with every occurrence of ``sub`` removed.

    Args:
        string: The text to clean.
        sub: The substring to delete.

    Returns:
        A new string without ``sub``; ``string`` unchanged if ``sub`` is absent
        or empty.
    """
    # str has no .sub() method (that is re.sub); str.replace does the removal.
    return string.replace(sub, '')

def generator(text:str) -> str:
    """Generate the model's reply to a single user message.

    The message is wrapped with ``get_prompt``, tokenized, and passed to
    ``MODEL.generate`` with at most ``MAX_ANSWER_LEN`` new tokens.

    Args:
        text: The user's message.

    Returns:
        The decoded reply, with special tokens removed and the prompt excluded.
    """
    prompt = get_prompt(text)
    with torch.autocast(DEVICE, dtype=torch.float16):
        inputs = TOKENIZER(prompt, return_tensors='pt').to(DEVICE)
        dirty_output = MODEL.generate(**inputs,
                                      max_new_tokens=MAX_ANSWER_LEN,
                                      eos_token_id=TOKENIZER.eos_token_id,
                                      pad_token_id=TOKENIZER.pad_token_id)
    # A causal LM's generate() returns the prompt tokens followed by the new
    # ones. Dropping the prompt at token level is reliable, whereas removing it
    # by string match fails whenever decoding does not reproduce the prompt
    # text exactly.
    prompt_len = inputs['input_ids'].shape[-1]
    new_tokens = dirty_output[0, prompt_len:]
    # skip_special_tokens already removes the EOS marker, so no further
    # trimming at '</s>' is needed.
    return TOKENIZER.decode(new_tokens, skip_special_tokens=True)

# langchain chatbot

In [6]:
from langchain.memory import ConversationBufferMemory
from langchain import LLMChain, PromptTemplate
from langchain import HuggingFacePipeline

In [None]:
# User-turn template; {chat_history} and {user_input} are placeholders that are
# declared as input variables of the PromptTemplate built further down.
instruct = 'Chat History:\n\n{chat_history} \n\nUser: {user_input}'
# Wrap the transformers pipeline so LangChain can use it as an LLM.
# NOTE(review): temperature 0 is supplied here while the pipeline was created
# with do_sample=True — confirm whether this model_kwargs value is applied at
# generation time and, if so, that the combination is intended.
llm = HuggingFacePipeline(pipeline=chat_bot, model_kwargs={'temperature':0})

In [None]:
# System prompt used when building the chain's template.
sys_prompt = DEFAULT_PROMPT

In [None]:
# Assemble the full prompt template string and print it for inspection.
template = get_prompt(instruct, sys_prompt)
print(template)

In [None]:
# Prompt template whose two variables match the placeholders in `instruct`.
prompt = PromptTemplate(
    input_variables=['chat_history', 'user_input'],
    template=template
)
# Conversation memory; memory_key matches the template's {chat_history} slot.
memory = ConversationBufferMemory(memory_key='chat_history')

In [None]:
# Chain combining the LLM, the prompt template and the conversation memory.
# verbose=True is enabled, presumably to log the formatted prompt on each call.
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True,
    memory=memory
)

In [None]:
# Run one conversational turn with a sample customer complaint.
llm_chain.predict(user_input="The estimated delivery time keeps changing! Now it says it'll be another hour?")