In [1]:
from langchain.memory import ConversationSummaryMemory
from langchain_community.llms.ctransformers import CTransformers
from langchain.prompts import PromptTemplate
from langchain.chains.api.base import LLMChain
from langchain_community.chat_models.llamacpp import ChatLlamaCpp
from langchain_community.chat_message_histories import ChatMessageHistory
import warnings

warnings.filterwarnings("ignore")

In [2]:
class StreamChain(LLMChain):
    """LLMChain subclass whose ``stream`` yields the reply piece by piece.

    The chain formats its prompt with the caller's inputs plus a ``history``
    transcript rebuilt from ``self.memory.chat_memory.messages``, streams the
    LLM output, and finally records the exchange in ``self.memory``.
    """

    def stream(self, inputs: dict[str, str]):
        """Stream the model's reply to ``inputs["input"]``.

        Args:
            inputs: Prompt variables. Must contain ``"input"``; any ``"history"``
                entry is overridden by the transcript built here. The dict is
                not modified.

        Yields:
            The ``content`` of each chunk produced by ``self.llm.stream``.

        Raises:
            KeyError: if ``inputs`` has no ``"input"`` key once the generator
                has been exhausted (raised while saving the turn), or earlier
                if the prompt template requires a missing variable.

        Note:
            The turn is saved to memory only after the stream is fully
            consumed; abandoning the generator early skips the save.
        """
        # Messages are labelled by position: even index -> Human, odd -> AI.
        # This presumably relies on the memory storing turns strictly as
        # alternating human/AI pairs -- verify if messages are added elsewhere.
        # NOTE(review): this is the raw transcript, not the summary kept by a
        # summary-style memory -- confirm that is what the prompt expects.
        history = "".join(
            ("\nAI: " if index % 2 != 0 else "\nHuman: ") + message.content
            for index, message in enumerate(self.memory.chat_memory.messages)
        )
        # Format from a copy so the caller's dict does not gain a "history" key.
        prompt_inputs = {**inputs, "history": history}
        prompt_formatted = self.prompt.format(**prompt_inputs)

        reply_parts = []
        for token in self.llm.stream(prompt_formatted):
            reply_parts.append(token.content)
            yield token.content

        self.memory.save_context(
            inputs={"input": inputs["input"]},
            outputs={"output": "".join(reply_parts)},
        )

In [3]:
# Settings for the local GGUF chat model; the path is relative to the
# directory the notebook is run from.
llm_settings = {
    "model_path": "./tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf",
    # Strings passed as stop sequences for generation.
    "stop": ["User", "\nUser", "\n\n", "\nHuman", "</s>", " Human:"],
    "temperature": 0,
    "top_p": 0.9,
    "max_tokens": 256,
    "streaming": True,
    "verbose": False,
}
llm = ChatLlamaCpp(**llm_settings)

In [4]:
# Template handed to the summary memory: it receives the current summary and
# the new conversation lines, and ends where the new summary should be written.
summary_template_text = "".join(
    [
        "<|system|>Progressively summarize the lines of conversation provided, adding onto the previous summary returning a new summary.\n",
        "Current summary:\n",
        "{summary}</s>\n\n",
        "<|user|>\nNew lines of conversation:\n",
        "{new_lines}</s>\n\n",
        "New summary:\n",
    ]
)
summary_prompt = PromptTemplate.from_template(summary_template_text)

# Conversation memory backed by an in-memory message history; it is given the
# same `llm` as the chat itself together with the summary template above.
chat_history = ChatMessageHistory()
memory = ConversationSummaryMemory(
    llm=llm,
    chat_memory=chat_history,
    prompt=summary_prompt,
)

# Template for a chat turn: expects `history` and `input`.
chat_template_text = "".join(
    [
        "<|system|>This is a conversation between human and AI. Reply to the user in NO MORE THAN 30 words.\n",
        "Previous Conversation Summary: {history}</s>",
        "<|user|>Human: {input}</s>\n",
        "<|assistant|>",
    ]
)
prompt_template = PromptTemplate.from_template(chat_template_text)

In [5]:
# Wire the model, the conversation memory and the chat prompt into one chain.
chain = StreamChain(llm=llm, memory=memory, prompt=prompt_template)

In [6]:
# First user turn; "input" is the variable name the chat prompt expects.
inputs = {"input": "Hi"}

In [7]:
# Print each piece as it arrives; end="" keeps the pieces on one line and
# flush=True shows them immediately instead of waiting for a newline.
for token in chain.stream(inputs=inputs):
    print(token, end="", flush=True)

Response: Good morning! How can I help you today?

In [8]:
# Second turn: the user states a name that a later turn asks the model to recall.
inputs = {"input": "Nice to meet you. My name is Min Khant."}
for token in chain.stream(inputs=inputs):
    print(token, end="", flush=True)

Response: Good afternoon, Min Khant. It's a pleasure to meet you as well. How can I assist you today? AI: Of course! I'm glad we could connect. What can I do for you today?

In [9]:
# Third turn: repeat the name on its own.
inputs = {"input": "My name is Min Khant."}
for token in chain.stream(inputs=inputs):
    print(token, end="", flush=True)

AI: Response: Hi, Min Khant. How can I help you today?

In [10]:
# Recall check: the answer is only available from the earlier turns.
inputs = {"input": "What is my name?"}
for token in chain.stream(inputs=inputs):
    print(token, end="", flush=True)

AI: Response: Your name is Min Khant.

In [None]:
# Display the variables the chain's memory currently exposes (cell output).
chain.memory.load_memory_variables({})

In [5]:
def get_stream_response(input_text: str, show_prompt: bool = False):
    """Stream the model's reply to ``input_text`` and record the turn in memory.

    Uses the notebook-level ``memory``, ``prompt_template`` and ``llm`` objects:
    the variables returned by ``memory.load_memory_variables`` fill the prompt
    alongside ``input``, the formatted prompt is streamed through ``llm``, and
    the exchange is saved back to ``memory`` once the stream is exhausted.

    Args:
        input_text: The user's message for this turn.
        show_prompt: When True, print the fully formatted prompt before
            streaming (debugging aid). Off by default so the prompt does not
            get mixed into the reply the caller prints.

    Yields:
        The ``content`` of each streamed chunk, except chunks whose content is
        exactly ``"Response"``.

    Note:
        The turn is saved only after the generator is fully consumed.
    """
    inputs = {"input": input_text}
    # setdefault: memory variables never override the user's "input".
    for key, value in memory.load_memory_variables({}).items():
        inputs.setdefault(key, value)
    prompt = prompt_template.format(**inputs)
    if show_prompt:
        print(prompt)

    reply_parts = []
    for token in llm.stream(input=prompt):
        # Only a chunk that is exactly "Response" is dropped -- presumably to
        # hide a "Response:" prefix emitted by the model; any following ":"
        # chunk is still passed through.
        if token.content != "Response":
            reply_parts.append(token.content)
            yield token.content
    memory.save_context(inputs={"Human": input_text}, outputs={"AI": "".join(reply_parts)})
    

In [None]:
# Stream a reply through the function-based variant, printing pieces as they arrive.
for token in get_stream_response("Hi"):
    print(token, end="", flush=True)

In [None]:
# Follow-up turn; the prompt now includes whatever the memory returns as history.
for token in get_stream_response("What are you doing today?"):
    print(token, end="", flush=True)

In [None]:
# Third turn, referring back to the previous reply.
for token in get_stream_response("Great to hear that. May I know what kind of data you are working with?"):
    print(token, end="", flush=True)

In [None]:
# Display the memory's `buffer` attribute -- presumably the running summary
# kept by the summary memory; confirm against the library docs.
memory.buffer