In [1]:
import json
with open('pass.json') as pas:
    hugging_face_token = json.load(pas)['hugging_face_token']

In [2]:
from huggingface_hub import login, notebook_login
login(hugging_face_token, add_to_git_credential=True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\jrtit\.cache\huggingface\token
Login successful


In [3]:
from typing import Optional, List, Tuple
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers import pipeline,  LlamaForCausalLM, LlamaTokenizerFast
from langchain import HuggingFaceHub
from langchain.llms import HuggingFacePipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
import textwrap


# loading model and tokenizer add global variables

In [8]:
MODEL_PATH = 'local_model/tiny_llama'
TOKENIZER_PATH = 'local_tokenizer/tiny_llama'
MAX_ANSWER_LEN = 128
MIN_ANSWER_LEN = 12
RETURN_SENTENCE = 1
BATCH_SIZE = 1
torch.manual_seed(21)
TOKENIZER = LlamaTokenizerFast.from_pretrained(TOKENIZER_PATH, model_max_length=MAX_ANSWER_LEN)
MODEL = LlamaForCausalLM.from_pretrained(MODEL_PATH,
                                        device_map='auto',
                                        torch_dtype=torch.half,
                                        low_cpu_mem_usage=True,
                                        max_position_embeddings=MAX_ANSWER_LEN,
                                        
                                        )
DEVICE =  'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
TOKEN_EOS = TOKENIZER.eos_token_id

In [None]:
MODEL

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [None]:
TOKENIZER

LlamaTokenizerFast(name_or_path='local_tokenizer/tiny_llama', vocab_size=32000, model_max_length=128, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# YouTube guild  

In [None]:
chat_bot = pipeline(task='text-generation',
                    model=MODEL, # type: ignore
                    tokenizer=TOKENIZER,
                    torch_dtype=torch.float16,
                    #max_new_token_length=MAX_ANSWER_LEN,
                    do_sample = True,
                    top_k=5,
                    #num_return_sentences=RETURN_SENTENCE,
                    eos_token_id=TOKEN_EOS)

## prompt variables

In [None]:
B_INST = '[INST]'
E_INST = '[/INST]'
B_SYS = '<<SYS>>\n'
E_SYS = '\n<</SYS>>\n\n'
DEFAULT_PROMPT = '''\\
Answer user complains as tech support worker 
'''

In [None]:
def get_prompt(instruct:str, new_sys_prompt:str=DEFAULT_PROMPT) -> str:
    SYS_PROMPT = B_SYS + new_sys_prompt + E_SYS
    return B_INST + SYS_PROMPT + instruct + B_INST

def cut_off_text(text:str, prompt:str) -> str:
    if idx := text.find(prompt):
        text = text[:idx]
    
    return text

def remove_sub(string:str, subs:str) -> str:
    return string.replace(subs, '')

def generator(text:str) -> str:
    prompt = get_prompt(text)
    with torch.autocast(DEVICE, dtype=torch.float16):
        inputs = TOKENIZER(prompt, return_tensors='pt').to(DEVICE)
        dirty_output = MODEL.generate(**inputs, # type: ignore
                                        max_new_tokens=MAX_ANSWER_LEN,
                                        eos_token_id=TOKENIZER.eos_token_id,
                                        pad_token_id=TOKENIZER.pad_token_id)
        output = TOKENIZER.batch_decode(dirty_output, skip_special_tokens=True)[0]
        output = cut_off_text(output, '</s>')
        output = remove_sub(output, prompt)
    return output

# langchain chatbot

In [None]:
from langchain.memory import ConversationBufferMemory
from langchain import LLMChain, PromptTemplate
from langchain import HuggingFacePipeline

In [None]:
instruct = 'Chat History:\n{chat_history} \nUser: {user_input}'
llm = HuggingFacePipeline(pipeline=chat_bot, model_kwargs={'temperature':0})

In [None]:
sys_prompt = DEFAULT_PROMPT

In [None]:
template = get_prompt(instruct, sys_prompt)
print(template)

[INST]<<SYS>>
\
Answer user complains as tech support worker 

<</SYS>>

Chat History:

{chat_history} 

User: {user_input}[INST]


In [None]:
prompt = PromptTemplate(
    input_variables=['chat_history', 'user_input'],
    template=template
)
memory = ConversationBufferMemory(memory_key='chat_history')

In [None]:
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True,
    memory=memory
)

In [None]:
llm_chain.predict(user_input="The estimated delivery time keeps changing! Now it says it'll be another hour?")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]<<SYS>>
\
Answer user complains as tech support worker 

<</SYS>>

Chat History:

 

User: The estimated delivery time keeps changing! Now it says it'll be another hour?[INST][0m

[1m> Finished chain.[0m


"[INST]<<SYS>>\n\\\nAnswer user complains as tech support worker \n\n<</SYS>>\n\nChat History:\n\n \n\nUser: The estimated delivery time keeps changing! Now it says it'll be another hour?[INST]<<SYS>>\n\nChatbot: I'm sorry to disappoint you, I can't control the delivery timings for every individual customer. However, our delivery team is constantly monitoring the delivery schedule, and they will try to keep the estimated delivery time consistent with the current delivery schedules. If any issue or delay ar"

In [51]:
sting = "[INST]<<SYS>>\n\\\nAnswer user complains as tech support worker \n\n<</SYS>>\n\nChat History:\n\n \n\nUser: The estimated delivery time keeps changing! Now it says it'll be another hour?[INST]<<SYS>>\n\nChatbot: I'm sorry to disappoint you, I can't control the delivery timings for every individual customer. However, our delivery team is constantly monitoring the delivery schedule, and they will try to keep the estimated delivery time consistent with the current delivery schedules. If any issue or delay ar"

In [52]:
sting

"[INST]<<SYS>>\n\\\nAnswer user complains as tech support worker \n\n<</SYS>>\n\nChat History:\n\n \n\nUser: The estimated delivery time keeps changing! Now it says it'll be another hour?[INST]<<SYS>>\n\nChatbot: I'm sorry to disappoint you, I can't control the delivery timings for every individual customer. However, our delivery team is constantly monitoring the delivery schedule, and they will try to keep the estimated delivery time consistent with the current delivery schedules. If any issue or delay ar"