# Custom Knowledge Chatbot w/ LlamaIndex
By Liam Ottley - YouTube: https://www.youtube.com/@LiamOttley

Examples:
- https://gita.kishans.in/
- https://www.chatpdf.com/

In [7]:
!pip install llama_index
!pip install langchain
!pip install docx2txt

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (pyproject.toml): started
  Building wheel for docx2txt (pyproject.toml): finished with status 'done'
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3970 sha256=4e942b98b6ca1cb43cb5c6f3b40bd7abc9c9749737b4601f147b992399bc2cad
  Stored in directory: c:\users\hunterfin

# Basic LlamaIndex Usage Pattern

In [10]:
import os

os.environ['OPENAI_API_KEY'] = "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"

In [11]:
# Load you data into 'Documents' a custom type by LlamaIndex

# Setup your LLM

from llama_index import (
    GPTVectorStoreIndex, 
    SimpleDirectoryReader,
    PromptHelper,
    LLMPredictor,
    ServiceContext
)
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from IPython.display import Markdown, display

documents = SimpleDirectoryReader('./data').load_data()

In [12]:
# Create an index of your documents

# index = GPTVectorStoreIndex.from_documents(documents)


In [13]:
# Query your index!

#query_engine = index.as_query_engine()
#response = query_engine.query("如何線上開戶?")
#print(response)


# Customize your LLM for different output

In [14]:


# define LLM
#llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.1, model_name="text-davinci-002"))
llm_predictor_gpt3 = LLMPredictor(llm=OpenAI(temperature=0.1, model_name="text-davinci-002")) 

# define prompt helper
# set maximum input size
max_input_size = 4096
# set number of output tokens
num_output = 4096
# set maximum chunk overlap
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

service_context_gpt3 = ServiceContext.from_defaults(llm_predictor=llm_predictor_gpt3, prompt_helper=prompt_helper)


In [15]:
# Create an index of your documents
index = GPTVectorStoreIndex.from_documents(documents)

# Store index to local disk
index.storage_context.persist()

from llama_index import StorageContext, load_index_from_storage

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir='./storage')
# load index
index = load_index_from_storage(storage_context)


In [16]:
# Query your index!

query_engine = index.as_query_engine()
response = query_engine.query("開戶申請需先準備甚麼資料?")
print(response)
response = query_engine.query("美國人可以申請開戶嗎?")
print(response)


答案：

開戶申請需先準備的資料包括：身分證正本正反面、第二證件正本之照片圖檔(如:護照、健保卡、駕照)、手寫簽名拍照圖檔、受益人身分證正反面影本及第二證件影本、約定帳戶存摺封面影本、未成年子女(未滿18歲)開戶需本人及雙法代填投資人風險屬性

答案：不可以。


# Wikipedia Example

In [162]:
from llama_index import download_loader

WikipediaReader = download_loader("WikipediaReader")

loader = WikipediaReader()
wikidocs = loader.load_data(pages=['Cyclone Freddy'])

# https://en.wikipedia.org/wiki/Cyclone_Freddy

In [163]:
wiki_index = GPTSimpleVectorIndex(wikidocs)

INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 4103 tokens


In [165]:
response = wiki_index.query("What is cyclone freddy?")
print(response)

INFO:root:> [query] Total LLM token usage: 3844 tokens
INFO:root:> [query] Total embedding token usage: 8 tokens






# Customer Support Example

In [150]:
documents = SimpleDirectoryReader('./asos').load_data()

In [151]:
index = GPTSimpleVectorIndex(documents)

INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 12584 tokens


In [153]:
response = index.query("What premier service options do I have in the UAE?")
print(response)

INFO:root:> [query] Total LLM token usage: 1317 tokens
INFO:root:> [query] Total embedding token usage: 11 tokens



In the United Arab Emirates, you have the option of signing up for ASOS Premier, which gives you free Standard and Express delivery all year round when you spend over 150 AED. It costs 200 AED and is valid on the order you purchase it on.


# YouTube Video Example

In [154]:
YoutubeTranscriptReader = download_loader("YoutubeTranscriptReader")

loader = YoutubeTranscriptReader()
documents = loader.load_data(ytlinks=['https://www.youtube.com/watch?v=K7Kh9Ntd8VE&ab_channel=DaveNick'])

In [159]:
index = GPTSimpleVectorIndex(documents)

INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 18181 tokens


In [157]:
response = index.query("What some YouTube automation mistakes to avoid?")
print(response)

INFO:root:> [query] Total LLM token usage: 4024 tokens
INFO:root:> [query] Total embedding token usage: 8 tokens




1. Re-uploading other people's content without permission.
2. Using copyrighted music.
3. Not understanding how the YouTube algorithm works.
4. Not researching the best niche for YouTube automation.
5. Not optimizing the About section with relevant keywords.
6. Not creating a logo and channel art that is professional and attractive.


# Chatbot Class - Just include your index

In [2]:
import openai
import json

class Chatbot:
    def __init__(self, api_key, index):
        self.index = index
        openai.api_key = api_key
        self.chat_history = []

    def generate_response(self, user_input):
        prompt = "\n".join([f"{message['role']}: {message['content']}" for message in self.chat_history[-5:]])
        prompt += f"\nUser: {user_input}"
        response = index.query(user_input)

        message = {"role": "assistant", "content": response.response}
        self.chat_history.append({"role": "user", "content": user_input})
        self.chat_history.append(message)
        return message
    
    def load_chat_history(self, filename):
        try:
            with open(filename, 'r') as f:
                self.chat_history = json.load(f)
        except FileNotFoundError:
            pass

    def save_chat_history(self, filename):
        with open(filename, 'w') as f:
            json.dump(self.chat_history, f)


In [None]:
documents = SimpleDirectoryReader('./data').load_data()
index = GPTSimpleVectorIndex(documents)

In [None]:
# Swap out your index below for whatever knowledge base you want
bot = Chatbot("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", index=index)
bot.load_chat_history("chat_history.json")

while True:
    user_input = input("You: ")
    if user_input.lower() in ["bye", "goodbye"]:
        print("Bot: Goodbye!")
        bot.save_chat_history("chat_history.json")
        break
    response = bot.generate_response(user_input)
    print(f"Bot: {response['content']}")