In [1]:
import os, torch

#model_id = "microsoft/Phi-3.5-mini-instruct"
model_id = "microsoft/Phi-3-mini-4k-instruct"

os.environ['CUDA_VISIBLE_DEVICES'] = str(5)

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype="auto",
            trust_remote_code=True,
            #quantization_config=bnb_config
        ).to('cuda')

tokenizer = AutoTokenizer.from_pretrained(model_id)

from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer, 
                #device=0,
                max_new_tokens=1024)

llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


## Setup Loaders, VectorStore e Retriever

In [2]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

vectorstore = Chroma.from_documents(
    splits,
    embedding=embeddings_model,
)
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

USER_AGENT environment variable not set, consider setting it to identify your requests.


#### Test Retriever

In [3]:
retriever.invoke("What is Task Decomposition?")

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.'),
 Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content="(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI a

In [4]:
(retriever | format_docs).invoke("What is Task Decomposition?")

'Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\n\n(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI assistant needs to describe the process and results. The previous stages can be formed as - User Input: {{ User Input }}, Task Planning: {{ Tasks }}, Model Selection: {{ Model Assignment }}, Task Execut

## RAG

#### Processing Inputs 

In [5]:
#Define the input processing part
input_processor = {"context": retriever | format_docs, "question": RunnablePassthrough()}

In [6]:
from langchain_core.runnables import RunnableParallel
# Define the input processing part as a RunnableParallel
input_processor = RunnableParallel(
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
)

# Run just this part to see intermediate results
intermediate_result = input_processor.invoke("What is Task Decomposition")
print("Intermediate result:")
print(intermediate_result)

Intermediate result:
{'context': 'Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\n\n(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI assistant needs to describe the process and results. The previous stages can be formed as - User Input: {{ User Input }}, Task Planning: {{ Tasks }}, Model Selection: {{

In [7]:
next_result = prompt.invoke(intermediate_result)
print(f"{len(next_result.messages)} messages")
for m in next_result.messages:
    print(m.content)

1 messages
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: What is Task Decomposition 
Context: Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.
Task decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.

(3) Task execution: Expert models execute on the specific 

In [8]:
# Then you can run the rest of the chain if you want
final_result = (prompt | llm | StrOutputParser()).invoke(intermediate_result)
print("Final result:")
print(final_result)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


Final result:
Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: What is Task Decomposition 
Context: Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.
Task decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.

(3) Task execution: Expert models execute on the

## Back to the usual RAG
RunnablePassthrough allows the questions to get into the rest of the chain. Still unsure how that works

### Phi 3 template
**Had to use my own template** -> https://github.com/microsoft/Phi-3CookBook/blob/main/md/02.QuickStart/Huggingface_QuickStart.md

In [9]:
from langchain_core.prompts import PromptTemplate

template = """<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<|end|>
<|user|>
Question: {question}
Context: {context}<|end|>
<|assistant|>"""

prompt = PromptTemplate.from_template(template)

prompt.invoke({"context": "TEST CONTEXT", "question": "TEST QUESTION"}).text

"<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<|end|>\n<|user|>\nQuestion: TEST QUESTION\nContext: TEST CONTEXT<|end|>\n<|assistant|>"

In [10]:
from langchain_core.runnables import RunnableParallel
# Define the input processing part as a RunnableParallel
input_processor = RunnableParallel(
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
)

# Run just this part to see intermediate results
intermediate_result = input_processor.invoke("What is Task Decomposition")
print("Intermediate result:")
print(intermediate_result)

Intermediate result:
{'context': 'Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\n\n(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI assistant needs to describe the process and results. The previous stages can be formed as - User Input: {{ User Input }}, Task Planning: {{ Tasks }}, Model Selection: {{

In [11]:
# Parse the last assistant message
def parse_assistant_phi_response(model_response: str):
    return model_response.split("<|assistant|>")[-1]

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
    | parse_assistant_phi_response
)

rag_chain.invoke("What is Task Decomposition?")

' Task decomposition is the process of breaking down a complex problem into smaller, more manageable sub-problems or steps. In the context of the Tree of Thoughts (Yao et al. 2023), task decomposition involves creating a tree structure where each node represents a thought step, and multiple thoughts can be generated for each step. This approach allows for exploring various reasoning possibilities at each stage of the problem-solving process.\n\nTo perform task decomposition, one can use a large language model (LLM) with simple prompting, task-specific instructions, or human inputs. For example, to decompose a problem into multiple thought steps, one could use prompts like "Steps for XYZ." or "What are the subgoals for achieving XYZ?" Additionally, task-specific instructions such as "Write a story outline" for writing a novel can also be used for task decomposition.\n\nIn the process of task execution, expert models execute the specific tasks and log the results. The AI assistant would 

## Using other methods

In [12]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain


template = """<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<|end|>
<|user|>
Question: {input}
Context: {context}<|end|>
<|assistant|>"""

prompt = PromptTemplate.from_template(template)

question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


response = rag_chain.invoke({"input": "What is Task Decomposition?"})

In [13]:
response.keys()

dict_keys(['input', 'context', 'answer'])

In [14]:
response['context']

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.'),
 Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content="(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI a

In [15]:
system_command = "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don\'t know the answer, just say that you don\'t know. Use three sentences maximum and keep the answer concise"

user_command = f"Question: {intermediate_result['question']}\nContext: {intermediate_result['context']}"

messages = [{"role": "system", "content": system_command},
		        {"role": "user", "content": user_command}]

generation_args = {
        "use_cache": True,
        "max_new_tokens": 1024,
        #"temperature": 0.7,
        #"repetition_penalty": 1.1,
        #"do_sample": True  # Enable sampling to use temperature
    }
pipe = pipeline(
		"text-generation",
		model=model,
		tokenizer=tokenizer,
	)
	    
output = pipe(messages, **generation_args)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [16]:
print(output[-1]['generated_text'][-1]['content'])

 User Input: "Explain the concept of task decomposition."

Task Planning: Task decomposition involves breaking down a complex problem into smaller, more manageable sub-tasks.

Model Selection: "id": "LLM", "reason": "The user's request requires a general explanation of task decomposition, which is best handled by an LLM due to its ability to generate human-like text."

Task Execution: "Predictions": "Task decomposition is a process of breaking down a complex problem into smaller, more manageable sub-tasks. This approach allows for easier problem-solving and can be applied in various fields such as project management, software development, and cognitive science. By focusing on individual sub-tasks, one can tackle each component more effectively, leading to a more efficient and organized problem-solving process."
