In [1]:
# Cell purpose: download the blog post, split it into chunks, embed the chunks
# with a local llama.cpp model and index them in a Chroma vector store.
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import LlamaCppEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# --- Load the source document -------------------------------------------------
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()

# --- Split into retrieval-sized chunks ----------------------------------------
# 500-character chunks with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

# --- llama.cpp settings (reused by the LLM cell further down) -----------------
n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
use_mlock = True  # Force system to keep model in RAM.

# Make sure the model path is correct for your system!
model_path = "./models/llama-2-7b-chat.Q4_K_M.gguf"

# --- Embed and index ----------------------------------------------------------
# f16_kv is intentionally not passed, so the library default applies.
embedding = LlamaCppEmbeddings(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=2048,
    # Pass the setting declared above instead of a second hard-coded literal,
    # so changing `use_mlock` in one place actually takes effect.
    use_mlock=use_mlock,
)

vectorstore = Chroma.from_documents(documents=all_splits, embedding=embedding)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ./models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]

In [2]:
# Cell purpose: load the local llama.cpp chat model, wire it into a RetrievalQA
# chain over `vectorstore`, and answer a single question.
# Relies on `model_path`, `n_gpu_layers`, `n_batch`, `use_mlock` and
# `vectorstore` defined in the previous cell.
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate

# Stream generated tokens to stdout as they are produced.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Context window for generation. The retrieved chunks are inserted into the
# prompt, so the window has to hold prompt + answer. With the previous value of
# 512 a recorded run used 344 prompt tokens plus ~152 generated tokens, leaving
# almost no headroom; 2048 matches the embedding model's setting.
llm_n_ctx = 2048

# Make sure the model path is correct for your system!
# f16_kv is intentionally not passed, so the library default applies.
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=llm_n_ctx,
    callback_manager=callback_manager,
    verbose=True,
    # Reuse the shared setting rather than a separate hard-coded literal.
    use_mlock=use_mlock,
)

user_question = "What are the approaches to Task Decomposition?"

# Prompt with two placeholders: {context} receives the retrieved text and
# {question} receives the user's query.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": PROMPT},
)

# Last expression of the cell: the returned dict is shown as the cell output.
qa_chain({"query": user_question})

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ./models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]

 Task decomposition is breaking down of complex tasks into smaller sub-tasks that can be performed independently or in parallel. There are several approaches to task decomposition, including:
1. Functional Decomposition: Breaking down a task into its constituent functions or steps and identifying the resources required for each function.
2. Process Decomposition: Divide a process into smaller, more manageable processes that can be performed independently or in parallel.
3. Work Breakdown Structure (WBS): A hierarchical decomposition of a project into smaller, more manageable tasks and sub-tasks.
4. Activity-Centered Decomposition: Breaking down a task into smaller activities or actions that are required to complete the task.


llama_print_timings:        load time =    3501.32 ms
llama_print_timings:      sample time =      16.11 ms /   153 runs   (    0.11 ms per token,  9496.03 tokens per second)
llama_print_timings: prompt eval time =    3501.06 ms /   344 tokens (   10.18 ms per token,    98.26 tokens per second)
llama_print_timings:        eval time =   12818.13 ms /   152 runs   (   84.33 ms per token,    11.86 tokens per second)
llama_print_timings:       total time =   16632.72 ms


{'query': 'What are the approaches to Task Decomposition?',
 'result': ' Task decomposition is breaking down of complex tasks into smaller sub-tasks that can be performed independently or in parallel. There are several approaches to task decomposition, including:\n1. Functional Decomposition: Breaking down a task into its constituent functions or steps and identifying the resources required for each function.\n2. Process Decomposition: Divide a process into smaller, more manageable processes that can be performed independently or in parallel.\n3. Work Breakdown Structure (WBS): A hierarchical decomposition of a project into smaller, more manageable tasks and sub-tasks.\n4. Activity-Centered Decomposition: Breaking down a task into smaller activities or actions that are required to complete the task.'}