In [None]:
import os
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

os.environ['OPENAI_API_KEY'] = "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"

# Setup your LLM

from llama_index import (
    GPTVectorStoreIndex, 
    SimpleDirectoryReader,
    PromptHelper,
    LLMPredictor,
    ServiceContext,
    StorageContext, 
    load_index_from_storage,
    download_loader
)
from langchain.llms import OpenAI
from pathlib import Path
from IPython.display import Markdown, display


In [None]:
# Basic LLM indexing

documents = SimpleDirectoryReader('./data').load_data()

#2 docx
# DocxReader = download_loader("DocxReader")
# loader = DocxReader()
# documents = loader.load_data(file=Path('./data/ac_open.docx'))

# Create an index of your documents
index = GPTVectorStoreIndex.from_documents(documents)



In [None]:
# Customized LLM indexing
documents = SimpleDirectoryReader('./data').load_data()

# Create an index of your documents by customized LLM 

# define prompt helper
# set maximum input size
max_input_size = 4096
# set number of output tokens
num_output = 2048
# set maximum chunk overlap
max_chunk_overlap = 100
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

# define LLM
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=num_output))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)

# build index
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)



In [12]:
# save index to disk
index.set_index_id("vector_index")
index.storage_context.persist('storage')

In [5]:
# rebuild storage context

storage_context = StorageContext.from_defaults(persist_dir='storage')
# load index
index = load_index_from_storage(storage_context, index_id="vector_index")

# Query your index!
query_engine = index.as_query_engine()

response = query_engine.query("野村優質基金的投資範圍?")
print(response)

#response = query_engine.query("野村優質基金的投資範圍?")
#print(response)
#response = query_engine.query("野村優質基金的成立日期?")
#print(response)
#response = query_engine.query("野村全球品牌基金的投資特色，以表格形式呈現")
#print(response)
#response = query_engine.query("野村全球品牌基金的最近五年度基金費用率，以表格形式呈現")
#print(response)




INFO:llama_index.indices.loading:Loading indices with ids: ['vector_index']
Loading indices with ids: ['vector_index']
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 28 tokens
> [retrieve] Total embedding token usage: 28 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 2156 tokens
> [get_response] Total LLM token usage: 2156 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens

野村優質基金的投資範圍包括美國、歐洲、日本、中國、新加坡、台灣、香港、澳洲等地區的股票、債券、基金、指數型基金等。


In [6]:
# Customized query engine

from llama_index import ResponseSynthesizer
from llama_index.query_engine import RetrieverQueryEngine
# Retrievers 
from llama_index.retrievers import BaseRetriever, VectorIndexRetriever, KeywordTableSimpleRetriever

# configure retriever
#retriever = index.as_retriever(retriever_mode='embedding')

# default
#query_engine = RetrieverQueryEngine.from_args(retriever, response_mode='default')
#query_engine = RetrieverQueryEngine(retriever)

query_engine = index.as_query_engine()

#response = query_engine.query("基金申購價款要如何繳付呢?")
response = query_engine.query("what is chatgpt?")

print(response)
#print(response.get_formatted_sources())


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 6 tokens
> [retrieve] Total embedding token usage: 6 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 2076 tokens
> [get_response] Total LLM token usage: 2076 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens

Chatgpt is a chatbot service that provides automated customer service and answers to frequently asked questions. It is powered by natural language processing and machine learning algorithms to provide accurate and timely responses to customer inquiries.
> Source (Doc id: 61e22a1b-8eaa-47bc-affe-a4d736edb7c9): ( 或 150 美元 / 歐元 / 澳幣 / 加幣 / 英
鎊、 15,000 ⽇圓、 1,500 南非幣 ) 。
2023/5/12 中午 12:09 Nomura_QA_all.html
f...



# Wikipedia Example

In [162]:
from llama_index import download_loader

WikipediaReader = download_loader("WikipediaReader")

loader = WikipediaReader()
wikidocs = loader.load_data(pages=['Cyclone Freddy'])

# https://en.wikipedia.org/wiki/Cyclone_Freddy

In [163]:
wiki_index = GPTSimpleVectorIndex(wikidocs)

INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 4103 tokens


In [165]:
response = wiki_index.query("What is cyclone freddy?")
print(response)

INFO:root:> [query] Total LLM token usage: 3844 tokens
INFO:root:> [query] Total embedding token usage: 8 tokens






# Customer Support Example

In [150]:
documents = SimpleDirectoryReader('./asos').load_data()

In [151]:
index = GPTSimpleVectorIndex(documents)

INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 12584 tokens


In [153]:
response = index.query("What premier service options do I have in the UAE?")
print(response)

INFO:root:> [query] Total LLM token usage: 1317 tokens
INFO:root:> [query] Total embedding token usage: 11 tokens



In the United Arab Emirates, you have the option of signing up for ASOS Premier, which gives you free Standard and Express delivery all year round when you spend over 150 AED. It costs 200 AED and is valid on the order you purchase it on.


# YouTube Video Example

In [154]:
YoutubeTranscriptReader = download_loader("YoutubeTranscriptReader")

loader = YoutubeTranscriptReader()
documents = loader.load_data(ytlinks=['https://www.youtube.com/watch?v=K7Kh9Ntd8VE&ab_channel=DaveNick'])

In [159]:
index = GPTSimpleVectorIndex(documents)

INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 18181 tokens


In [157]:
response = index.query("What some YouTube automation mistakes to avoid?")
print(response)

INFO:root:> [query] Total LLM token usage: 4024 tokens
INFO:root:> [query] Total embedding token usage: 8 tokens




1. Re-uploading other people's content without permission.
2. Using copyrighted music.
3. Not understanding how the YouTube algorithm works.
4. Not researching the best niche for YouTube automation.
5. Not optimizing the About section with relevant keywords.
6. Not creating a logo and channel art that is professional and attractive.


# Chatbot Class - Just include your index

In [2]:
import openai
import json

class Chatbot:
    def __init__(self, api_key, index):
        self.index = index
        openai.api_key = api_key
        self.chat_history = []

    def generate_response(self, user_input):
        prompt = "\n".join([f"{message['role']}: {message['content']}" for message in self.chat_history[-5:]])
        prompt += f"\nUser: {user_input}"
        response = index.query(user_input)

        message = {"role": "assistant", "content": response.response}
        self.chat_history.append({"role": "user", "content": user_input})
        self.chat_history.append(message)
        return message
    
    def load_chat_history(self, filename):
        try:
            with open(filename, 'r') as f:
                self.chat_history = json.load(f)
        except FileNotFoundError:
            pass

    def save_chat_history(self, filename):
        with open(filename, 'w') as f:
            json.dump(self.chat_history, f)


In [None]:
documents = SimpleDirectoryReader('./data').load_data()
index = GPTSimpleVectorIndex(documents)

In [None]:
# Swap out your index below for whatever knowledge base you want
bot = Chatbot("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", index=index)
bot.load_chat_history("chat_history.json")

while True:
    user_input = input("You: ")
    if user_input.lower() in ["bye", "goodbye"]:
        print("Bot: Goodbye!")
        bot.save_chat_history("chat_history.json")
        break
    response = bot.generate_response(user_input)
    print(f"Bot: {response['content']}")