## Prep

In [None]:
# https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html

# # INSTALL
# conda create -n llamaindex python=3.12.2
# pip install llama-index
# CMAKE_ARGS="-DLLAMA_METAL=on" pip install -U llama-cpp-python --no-cache-dir
# pip install 'llama-cpp-python[server]'
# pip install llama-index-llms-llama-cpp
# pip install llama-index-embeddings-huggingface
# pip install llama-index-llms-ollama
# pip install llama-index-llms-openai

In [None]:
import logging
import sys
import os.path
import time
from pprint import pprint

# Send INFO-level log records to stdout so they appear in the notebook cell output.
# force=True makes this idempotent: handlers already on the root logger are removed first,
# so re-running the cell never stacks handlers. Exactly one stdout handler is attached,
# which means each record is printed once.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
    set_global_tokenizer,
    Settings,
)
from llama_index.core.embeddings import resolve_embed_model
from llama_index.core.llms import ChatMessage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from transformers import AutoTokenizer
import tiktoken

In [None]:
def timed_stream_print(resp):
    """Print a streamed response chunk by chunk and report the streaming rate.

    Args:
        resp: Object exposing a ``response_gen`` iterable of text chunks
            (for example the result of a streaming query engine).

    Returns:
        None. The chunks and a final rate line are written to stdout.

    Each item yielded by ``response_gen`` is counted as one token, so the
    reported figure is really "chunks per second".
    """
    # perf_counter is monotonic, so the interval cannot be distorted by clock adjustments.
    start_time = time.perf_counter()
    token_count = 0
    for token in resp.response_gen:
        # flush so each chunk is shown immediately instead of when the buffer fills
        print(token, end="", flush=True)
        token_count += 1
    time_elapsed = time.perf_counter() - start_time
    # An empty or instantly exhausted stream can produce a zero interval;
    # report 0.00 instead of raising ZeroDivisionError.
    tokens_per_second = token_count / time_elapsed if time_elapsed > 0 else 0.0

    print(f"\n\nStreamed output at {tokens_per_second:.2f} tokens/s")

## Init LLM
### local - llama_cpp

In [None]:
# GGUF weights of the Llama 2 13B chat model, hosted on Hugging Face.
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

Settings.llm = LlamaCPP(
    # --- model source ---
    # The file behind model_url is downloaded automatically;
    # set model_path instead to use a pre-downloaded model.
    model_path=None,
    model_url=model_url,
    # --- prompt formatting ---
    # Convert chat messages / plain completions into the Llama 2 prompt format.
    completion_to_prompt=completion_to_prompt,
    messages_to_prompt=messages_to_prompt,
    # --- generation limits ---
    max_new_tokens=256,
    temperature=0.1,
    # Llama 2 has a 4096-token context window; a lower value is used here
    # (3900 is the other value that was tried).
    context_window=1000,
    # --- extra keyword arguments ---
    # generate_kwargs is passed to __call__().
    # model_kwargs (passed to __init__()) is left unset; use
    # model_kwargs={"n_gpu_layers": 1} (at least 1) to run on the GPU.
    generate_kwargs={},
    verbose=False,
)

### local - ollama

In [None]:
# Model to request from Ollama ("llama2" is the other option that was tried).
# Only one assignment is kept so the cell states directly which model is used.
model = "mistral"

# request_timeout is given in seconds.
Settings.llm = Ollama(model=model, request_timeout=30.0)

### API - OpenAI

In [None]:
# Use the OpenAI API as the global LLM. No arguments are passed, so the client's
# defaults apply (presumably the API key is read from the environment — TODO confirm).
# Note: this replaces whatever LLM an earlier cell stored in Settings.llm.
Settings.llm = OpenAI()

## Simple query without RAG

In [None]:
# Prompts tried with this cell. Keeping them in a list avoids overwriting `query`
# several times in a row; change the index below to pick a different prompt.
simple_queries = [
    "Hello! Can you tell me a poem about cats and dogs?",
    "Can you write me a extremely short poem about slow cars?",
    "Who wrote the text?",
]
query = simple_queries[-1]

# Streamed completion: print every increment as soon as it arrives.
# (Non-streaming alternative: print(Settings.llm.complete(query).text))
response_iter = Settings.llm.stream_complete(query)
for response in response_iter:
    print(response.delta, end="", flush=True)

In [None]:
# Question sent as the user turn ("Who wrote the text?" is another one to try).
query = "What is your name?"

# streamed chat: the system message sets the persona, the user message carries the question
system_prompt = "You are a pirate with a colorful personality. The answers are always as short as possible."
messages = [
    ChatMessage(role="system", content=system_prompt),
    ChatMessage(role="user", content=query),
]

# Print each streamed delta immediately.
response_iter = Settings.llm.stream_chat(messages)
for response in response_iter:
    print(response.delta, end="", flush=True)

## Create RAG

In [None]:
# init tokenizer
# Settings.tokenizer is given the encoder's `encode` callable, not the tokenizer object.
# Only one tokenizer can be active, so just the selected one is assigned; assigning
# several in a row would load each of them only to overwrite it immediately.
Settings.tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo").encode

# Alternatives for the local models (enable at most one):
# Settings.tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode
# Settings.tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta").encode

In [None]:
# init embedding

# Use a Hugging Face embedding model (BAAI/bge-small-en-v1.5) as the global embedding model.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# # Alternative: resolve the same bge-small-en-v1.5 model through the "local:" shorthand
# Settings.embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")

In [None]:
# init vector store

# Directory containing the source documents to index.
DATA_DIR = "./data"

# The storage directory name includes the embedding model's name.
# NOTE(review): a model name containing "/" presumably yields a nested directory — confirm this is intended.
PERSIST_DIR = f"./storage_{Settings.embed_model.model_name}"

if os.path.exists(PERSIST_DIR):
    # A persisted index is already on disk: load it instead of rebuilding.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
    print("imported vectors store")
else:
    # Nothing persisted yet: read the documents, build the index, and save it.
    documents = SimpleDirectoryReader(DATA_DIR).load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print("created vectors store")

## Query RAG

In [None]:
# Questions tried against the index (English and German). Keeping them in a list avoids
# overwriting `query` several times in a row; change the index below to pick another one.
rag_queries = [
    "What did the author do growing up?",
    "What happened at interleaf? Just answer in a short sentence.",
    "Who wrote the text?",
    "What did the author do growing up? Give an extremely short answer.",
    "Wer ist der Author?",
    "Von wann ist der Text?",
    "Welche Farbe hat das Eichhörnchen?",
]
query = rag_queries[-1]

# Streamed query engine: the answer is printed as it arrives, followed by the streaming rate.
# (Non-streaming alternative: print(index.as_query_engine().query(query)))
query_engine = index.as_query_engine(streaming=True)
response = query_engine.query(query)

timed_stream_print(response)

In [None]:
# chat with context

# Build a chat engine on top of the index (default arguments) and send it the
# `query` defined in the previous cell as a single chat turn.
chat_engine = index.as_chat_engine()
response = chat_engine.chat(query)
# pprint displays the response object itself rather than only its text.
pprint(response)