In [1]:
# Split: load the source web page and cut it into chunks.
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Fetch the blog post through the web loader.
loader = WebBaseLoader(
    "https://lilianweng.github.io/posts/2023-06-23-agent/"
)
data = loader.load()

# Cut the loaded documents into chunks (chunk_size=500, no overlap between
# neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=500,
)
all_splits = text_splitter.split_documents(data)

In [2]:
# Embedding: load a local llama.cpp model to use for embeddings.
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings

# Build the embedding model from the local GGML weights via the llama.cpp wrapper.
embedding = LlamaCppEmbeddings(
    model_path="./models/llama-2-7b-chat.ggmlv3.q8_0.bin",
)

# Vector-store construction is left disabled in this notebook; the intended call
# is kept here for reference:
# vectorstore = Chroma.from_documents(documents=all_splits, embedding=embedding)


llama.cpp: loading model from ./models/llama-2-7b-chat.ggmlv3.q8_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 7 (mostly Q8_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 7130.73 MB (+  512.00 MB per state)
llama_new_context_with_model: kv self size  =  512.00 MB
AVX =

In [3]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Callback manager handed to the LLM; it wraps one StreamingStdOutCallbackHandler.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# With Metal, setting this to 1 is enough.
n_gpu_layers = 1
# Should be between 1 and n_ctx; consider the amount of RAM of your
# Apple Silicon chip when choosing it.
n_batch = 512

In [4]:
# Instantiate the llama.cpp-backed LLM.
# Make sure model_path points at the weights file on your system!
llm = LlamaCpp(
    model_path="./models/llama-2-7b-chat.ggmlv3.q8_0.bin",
    # Context size and batch size.
    n_ctx=2048,
    n_batch=n_batch,
    # Number of layers to offload to the GPU.
    n_gpu_layers=n_gpu_layers,
    # MUST stay True, otherwise you will run into problems after a couple of calls.
    f16_kv=True,
    # Callbacks and verbosity.
    callback_manager=callback_manager,
    verbose=True,
)

llama.cpp: loading model from ./models/llama-2-7b-chat.ggmlv3.q8_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 7 (mostly Q8_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 7130.73 MB (+ 1024.00 MB per state)
llama_new_context_with_model: kv self size  = 1024.00 MB
AVX 

In [5]:
# Build a one-line "Question:" prompt (with a leading and trailing newline) and
# run it through the model. The bare call on the last line makes the returned
# string the cell's displayed output.
prompt = "\nQuestion: A rap battle between Stephen Colbert and John Oliver\n"
llm(prompt)


Stephen Colbert: "Yo, John Oliver, you're always talkin' 'bout politics, but you never really get in the game. You just sit there, lookin' all smug and British, like a bloated penguin on a iceberg."
John Oliver: "Ha! That's rich, coming from the man who uses his platform to spread hate speech and misinformation. At least I actually have something to say, you faux-gressive fool!"
Stephen Colbert: "Faux-gressive? Ha! You're just mad 'cause you can't keep up with my quick wit and clever wordplay, like a giraffe tryin' to sprint with a stubbed toe."
John Oliver: "Quick wit and clever wordplay? You mean the same tired jokes about Trump's comb-over and Melania's accent? Get original, Stephen! I'm the one bringin' real issues to the table, like income inequality and climate change. You're just a circus clown, entertaining the masses with your juvenile antics."
Stephen Colbert: "Circus clown


llama_print_timings:        load time =   744.36 ms
llama_print_timings:      sample time =   161.02 ms /   256 runs   (    0.63 ms per token,  1589.83 tokens per second)
llama_print_timings: prompt eval time =   744.18 ms /    16 tokens (   46.51 ms per token,    21.50 tokens per second)
llama_print_timings:        eval time = 41317.85 ms /   255 runs   (  162.03 ms per token,     6.17 tokens per second)
llama_print_timings:       total time = 42937.77 ms


'\nStephen Colbert: "Yo, John Oliver, you\'re always talkin\' \'bout politics, but you never really get in the game. You just sit there, lookin\' all smug and British, like a bloated penguin on a iceberg."\nJohn Oliver: "Ha! That\'s rich, coming from the man who uses his platform to spread hate speech and misinformation. At least I actually have something to say, you faux-gressive fool!"\nStephen Colbert: "Faux-gressive? Ha! You\'re just mad \'cause you can\'t keep up with my quick wit and clever wordplay, like a giraffe tryin\' to sprint with a stubbed toe."\nJohn Oliver: "Quick wit and clever wordplay? You mean the same tired jokes about Trump\'s comb-over and Melania\'s accent? Get original, Stephen! I\'m the one bringin\' real issues to the table, like income inequality and climate change. You\'re just a circus clown, entertaining the masses with your juvenile antics."\nStephen Colbert: "Circus clown'