In [7]:
import logging
import os
import sys

import torch
# Use 'cuda' as the default device for tensors created without an explicit device.
# NOTE(review): no CPU fallback here — presumably a GPU is always present; confirm.
torch.set_default_device('cuda')

# Send log records to stdout so they appear in the notebook cell output.
# Root level: logging.ERROR shows only errors, logging.INFO shows all logs.
# basicConfig already installs one stdout handler on the root logger, so no
# second handler is added (that would print every record twice). force=True
# replaces handlers left over from earlier runs of this cell, keeping it
# idempotent when re-executed.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING, force=True)
# Silence everything below ERROR from the sentence_transformers package.
logging.getLogger('sentence_transformers').setLevel(logging.ERROR)

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM

In [6]:
# Read every file found under the local "data" directory into documents.
reader = SimpleDirectoryReader("data")
documents = reader.load_data()

In [2]:
# Instruction text supplied to the LLM as its system prompt.
system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."
# This will wrap the default prompts that are internal to llama-index.
# Mistral-7B-Instruct expects the user turn between [INST] and [/INST];
# the <|USER|>/<|ASSISTANT|> markers belong to a different model family and
# would be treated as plain text by this model.
query_wrapper_prompt = "[INST] {query_str} [/INST]"

In [3]:
# Location of the local Mistral-7B-Instruct checkpoint. The LLM_PATH
# environment variable overrides the default so the notebook is not tied to
# one machine's home directory; when unset, the original path is used.
llm_path = os.environ.get(
    "LLM_PATH", "/home/whatx/SusGen/ckpts/Mistral-7B-Instruct-v0.2-hf"
)
# Build the llama-index wrapper; model and tokenizer both load from llm_path.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # Low-temperature sampling; output is not seeded, so answers may vary slightly.
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=llm_path,
    model_name=llm_path,
    device_map="auto",
    # Same value as context_window above.
    tokenizer_kwargs={"max_length": 4096},
    # Load the weights in half precision.
    model_kwargs={"torch_dtype": torch.float16}
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [4]:
from langchain.embeddings import HuggingFaceEmbeddings
# from llama_index.core import ServiceContext, set_global_service_context

# Embedding model used to vectorise document chunks and queries.
# NOTE(review): HuggingFaceEmbeddings comes from `langchain.embeddings` here;
# newer LangChain releases may relocate it — confirm on upgrade.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [8]:
from llama_index.core import Settings

# Register the LLM, embedding model and chunk size on llama-index's global
# Settings object.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 1024

# Build the vector index from the loaded documents. The global Settings are
# used by default, so nothing is passed explicitly: `settings=` is not a
# parameter of from_documents.
# NOTE(review): the removed `settings=Settings` appears to be swallowed by
# **kwargs and ignored — confirm against the installed llama-index version.
index = VectorStoreIndex.from_documents(documents)

In [9]:
# Ask one question against the index and print the generated answer.
query_engine = index.as_query_engine()
# Typo fixed ("revenuew" -> "revenue"): this text is what gets embedded for
# retrieval and sent to the LLM.
question = "what was the revenue from aws in 2022?"
response = query_engine.query(question)
print(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


The revenue from AWS in 2022 was $80,096 million.
