^C


# Building a high-level sample

In [4]:
import torch
from transformers import BitsAndBytesConfig
from llama_index.prompts import PromptTemplate
from llama_index.llms import HuggingFaceLLM

# 4-bit loading options for the model, expressed as a bitsandbytes config.
# NOTE: this object is only defined here; the `model_kwargs` line that would
# hand it to HuggingFaceLLM is commented out in the call further down.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,        # enable double quantization
    bnb_4bit_quant_type="nf4",             # "nf4" 4-bit quantization type
    bnb_4bit_compute_dtype=torch.float16,  # run computations in float16
    load_in_4bit=True,                     # load the weights in 4-bit
)


def messages_to_prompt(messages):
  """Render a chat history in the tagged prompt layout used by this notebook.

  Each message whose ``role`` equals 'system', 'user' or 'assistant' becomes
  ``<|role|>\\n{content}</s>\\n``; messages with any other role are skipped.
  If the rendered text does not begin with a system turn, an empty one is
  put in front. The result always ends with an open ``<|assistant|>`` tag so
  that the model continues from there.

  Args:
    messages: iterable of objects exposing ``role`` and ``content``.

  Returns:
    The complete prompt string.
  """
  segments = []
  for msg in messages:
    role = msg.role
    # Roles are compared with == (not used as dict keys) so that role objects
    # which merely compare equal to these strings are still recognised.
    if role == 'system':
      segments.append(f"<|system|>\n{msg.content}</s>\n")
    elif role == 'user':
      segments.append(f"<|user|>\n{msg.content}</s>\n")
    elif role == 'assistant':
      segments.append(f"<|assistant|>\n{msg.content}</s>\n")

  body = "".join(segments)

  # The prompt must open with a system turn; supply a blank one if missing.
  if not body.startswith("<|system|>\n"):
    body = "<|system|>\n</s>\n" + body

  # Leave an open assistant turn for the model to complete.
  return body + "<|assistant|>\n"


# Wrap the Zephyr-7B-beta model from the Hugging Face Hub as a LlamaIndex LLM.
llm = HuggingFaceLLM(
    model_name="HuggingFaceH4/zephyr-7b-beta",
    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
    # Plain (non-chat) queries get the same tag layout that messages_to_prompt
    # produces, with an empty system turn in front of the user query.
    query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
    context_window=3900,
    max_new_tokens=256,
    # Uncomment to load the weights with the 4-bit `quantization_config` defined above.
    # model_kwargs={"quantization_config": quantization_config},
    # tokenizer_kwargs={},
    # `do_sample=True` is needed for temperature/top_k/top_p to have any effect:
    # transformers defaults to greedy decoding and ignores the sampling knobs otherwise.
    generate_kwargs={"do_sample": True, "temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    device_map="auto",
)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

KeyboardInterrupt: 

# Create service context

In [None]:
from llama_index import ServiceContext

# Bundle the LLM defined above with the BAAI/bge-small-en-v1.5 embedding model
# (requested via the "local:" prefix) into one context shared by later cells.
service_context = ServiceContext.from_defaults(
    embed_model="local:BAAI/bge-small-en-v1.5",
    llm=llm,
)

# Indexing

In [None]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex, StorageContext, load_index_from_storage
import os

# Single source of truth for where the vector index lives on disk, so the
# existence check, the save and the reload can never point at different places
# (previously the save relied on persist()'s implicit default directory).
PERSIST_DIR = './storage'

if not os.path.exists(PERSIST_DIR):
    # First run: read the files under `data` and embed them into a vector index.
    documents = SimpleDirectoryReader('data').load_data()
    index = VectorStoreIndex.from_documents(
        documents,
        service_context=service_context
    )

    # Persist the index so the documents do not have to be embedded again next time.
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # Later runs: reload the previously persisted index instead of re-embedding.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(
        storage_context=storage_context,
        service_context=service_context,
    )
 

In [None]:
from llama_index.response.notebook_utils import display_response

In [None]:
import logging
import sys

# Send INFO-and-above log records to stdout so they appear in the cell output.
# Exactly one handler is installed: stacking a separate stdout StreamHandler on
# top of basicConfig would print every record twice. `force=True` replaces any
# handlers already on the root logger, so re-running this cell stays idempotent.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

# Creating an LLM engine
An engine can be used in several ways: simple querying, chat mode, and streaming chat.
All of these can also be used asynchronously.

In [None]:
# Note: despite the variable name, this is a chat engine (built with
# as_chat_engine); the cells below talk to it through .chat().
query_engine = index.as_chat_engine()

In [None]:
# Ask the chat engine about the unit structure and print its reply.
question = "Give me unit structure of Introduction to big data"
response = query_engine.chat(question)
print(response)

In [None]:
# Ask the chat engine about the main objective and print its reply.
question = "What is main objective of Big Data"
response = query_engine.chat(question)
print(response)

In [None]:
# Ask the chat engine about the key roles and print its reply.
question = "What is Key roles of the new big data ecosystems"
response = query_engine.chat(question)
print(response)

In [None]:
# Building 