In [1]:
#!pip install llama-index transformers accelerate bitsandbytes llama-cpp

# Building high level sample

In [2]:
# huggingface api token for downloading llama2
hf_token = "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"

In [3]:
import torch
from transformers import BitsAndBytesConfig
from llama_index.prompts import PromptTemplate
from llama_index.llms import HuggingFaceLLM

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

llm = HuggingFaceLLM(
    model_name="meta-llama/Llama-2-7b-chat-hf",
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    query_wrapper_prompt=PromptTemplate("<s> [INST] {query_str} [/INST] "),
    context_window=3900,
    model_kwargs={"token": hf_token, "quantization_config": quantization_config},
    tokenizer_kwargs={"token": hf_token},
    device_map="auto",
)

RuntimeError: No GPU found. A GPU is needed for quantization.

# Create service context

In [None]:
from llama_index import ServiceContext
service_context = ServiceContext.from_defaults(llm=llm, embed_model="local:BAAI/bge-small-en-v1.5")

# Indexing

In [None]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex, StorageContext, load_index_from_storage
import os

if not os.path.exists('./storage'):
    # Reading files in directory
    documents = SimpleDirectoryReader('data').load_data()
    # Indexing data in embedding of Vector store
    index = VectorStoreIndex.from_documents(
        documents,
        service_context=service_context
    )
    
    # Creates storage of indexes that we do not have to vectorise them once again
    index.storage_context.persist()
else:
    # Load stored indexes
    storage_context = StorageContext.from_defaults(persist_dir='./storage')
    index = load_index_from_storage(
        storage_context=storage_context,
        service_context=service_context,
    )
 

In [None]:
from llama_index.response.notebook_utils import display_response

In [None]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Creating an engine of llm
Query engine has several types of use: simple querying, chat mode, stream chat. 
All of those can be used asynchronously. 

In [None]:
query_engine = index.as_chat_engine()

In [None]:
response = query_engine.chat("Give me unit structure of Introduction to big data")
print(response)

In [None]:
response = query_engine.chat("What is main objective of Big Data")
print(response)

In [None]:
response = query_engine.chat("What is Key roles of the new big data ecosystems")
print(response)

In [None]:
# Building 