In [None]:
#1
!pip install -q llama-index
!pip install -q openai
!pip install -q transformers
!pip install -q accelerate
!pip install -q llama-index-llms-huggingface

In [None]:
#2
import os
from getpass import getpass

import openai

# Prefer a key that is already exported in the environment; otherwise prompt for
# it without echoing, so the secret never lives in the notebook source.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
if not OPENAI_API_KEY:
    raise ValueError("An OpenAI API key is required to run this notebook.")

openai.api_key = OPENAI_API_KEY
# Export it as well so code that reads the environment variable can find it.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Confirm configuration without printing the key: cell output is saved and
# shared together with the notebook.
print("OpenAI API key configured.")

In [None]:
#3
from llama_index.core.llms import LLM
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from IPython.display import Markdown, display

In [None]:
# Load the contents of the local "data" directory (relative to the working directory).
documents = SimpleDirectoryReader("data").load_data()

In [None]:
# Build a vector-store index over the loaded documents; no options are passed,
# so every setting is left at the library default.
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Wrap the index in a query engine (default options) used for question answering below.
query_engine = index.as_query_engine()

In [None]:
# Ask a question against the indexed documents; the result is kept in `response`
# so the next cell can render it.
response = query_engine.query("What is the specific address University of California, Irvine?")

In [None]:
# Render the response as rich output, wrapped in an HTML <b> tag to make it bold.
display(Markdown(f"<b>{response}</b>"))

### Storing and Loading the Index

In [None]:
# Write the index to disk. "./storage" is the library's default location; it is
# spelled out here so it visibly matches the directory the loading cell reads.
index.storage_context.persist(persist_dir="./storage")

In [None]:
from llama_index.core import StorageContext, load_index_from_storage

# Point a storage context at the persisted files under ./storage ...
storage_context = StorageContext.from_defaults(persist_dir="./storage")
# ... and load the index back from it, replacing the in-memory `index`.
index = load_index_from_storage(storage_context)

### How to Customize

In [None]:
from llama_index.core import ServiceContext, set_global_service_context

In [None]:
# define LLM: https://gpt-index.readthedocs.io/en/latest/core_modules/model_modules/llms/usage_custom.html
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter

llm = OpenAI(model="gpt-3.5-turbo", temperature=0, max_tokens=256)

# ServiceContext is deprecated and `ServiceContext.from_defaults` raises on
# recent llama-index releases, so the configuration goes through `Settings`.
# NOTE: `Settings.llm` is global; engines created after this cell use this LLM
# until `Settings.llm` is reassigned.
Settings.llm = llm

# Chunking is passed locally so it applies to this index only.
index = VectorStoreIndex.from_documents(
    documents,
    transformations=[SentenceSplitter(chunk_size=768, chunk_overlap=128)],
)

In [None]:
# !pip install llama-index-llms-palm
# from llama_index.llms.palm import PaLM
# service_context = ServiceContext.from_defaults(llm=PaLM())

In [None]:
# Request a streaming query engine so the answer can be emitted incrementally.
query_engine = index.as_query_engine(streaming=True)
response = query_engine.query("How to set your UCI email?")
# Print the streamed answer to the cell output.
response.print_response_stream()

In [None]:
# Keep the chat engine under its own name instead of rebinding `query_engine`:
# the chat engine is driven through `.chat(...)`, while the other cells call
# `query_engine.query(...)`, so sharing one name breaks out-of-order re-runs.
chat_engine = index.as_chat_engine()
response = chat_engine.chat("What is this document about?")
# Render the chat reply in bold as rich output.
display(Markdown(f"<b>{response}</b>"))

#### Using a HuggingFace LLM

This will NOT work on the free tier of Google Colab; you will need Colab Pro. A video on the quantized models is coming soon.

In [None]:
import torch
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

system_prompt = """
<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # Greedy decoding. `temperature` only takes effect when sampling, so it is
    # not passed together with do_sample=False (doing so only triggers a
    # warning). To sample instead, set do_sample=True and add a temperature.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="StabilityAI/stablelm-tuned-alpha-3b",
    model_name="StabilityAI/stablelm-tuned-alpha-3b",
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 1024},
    # uncomment this if using CUDA to reduce memory usage
    # model_kwargs={"torch_dtype": torch.float16}
)

# ServiceContext is deprecated and `ServiceContext.from_defaults` raises on
# recent llama-index releases, so the LLM is registered through `Settings`.
# NOTE: this is global; engines created after this cell use the HuggingFace LLM.
Settings.llm = llm

# Rebuild the index with 1024-token chunks; the splitter is passed locally so
# it applies to this index only.
index = VectorStoreIndex.from_documents(
    documents,
    transformations=[SentenceSplitter(chunk_size=1024)],
)

In [None]:
# Query the most recently built index with streaming enabled.
query_engine = index.as_query_engine(streaming=True)
response = query_engine.query("How to set your UCI email?")
# Print the streamed answer to the cell output.
response.print_response_stream()