In [8]:
# ====================================
# prepare docs.index & doc_faiss_store.pkl
# ====================================

import glob
import codecs
import pickle

import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter
from langchain.vectorstores import FAISS

# Read every Markdown file under the content directory.
# `data[i]` holds the text of the file whose path is `sources[i]`.
data = []
sources = []
# glob returns matches in a filesystem-dependent order; sorting keeps the
# document order (and therefore the chunk order fed to the index) stable
# from one run to the next.
for f in sorted(glob.iglob("/home/laisky/download/doc/content/*.md")):
    with codecs.open(f, "r", "utf8") as fp:
        data.append(fp.read())
    # Record the path only after the read succeeded, so the two lists can
    # never get out of step if a file fails to open or decode.
    sources.append(f)
        
        
# Here we split the documents, as needed, into smaller chunks.
# We do this due to the context limits of the LLMs.
# `docs[k]` is one chunk of text and `metadatas[k]` records which file it
# came from.
text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
docs = []
metadatas = []
for source, d in zip(sources, data):
    splits = text_splitter.split_text(d)
    docs.extend(splits)
    # Build a fresh dict per chunk: `[{...}] * n` would put the *same* dict
    # object in the list n times, so a later in-place change to one chunk's
    # metadata would silently show up on every chunk of that file.
    metadatas.extend({"source": source} for _ in splits)
    
    
# Here we create a vector store from the documents and save it to disk.
# The FAISS index and the rest of the store are persisted as two files:
# the index through faiss.write_index, everything else through pickle.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss.write_index(store.index, "/home/laisky/download/langchain/docs.index")

# The index is detached while pickling (it is already saved above;
# presumably the raw FAISS index object is not meant to be pickled) and
# re-attached afterwards so `store` stays usable in the running kernel
# instead of being left with `index = None`.
index = store.index
store.index = None
try:
    with open("/home/laisky/download/langchain/doc_faiss_store.pkl", "wb") as f:
        pickle.dump(store, f)
finally:
    store.index = index

In [1]:
# ====================================
# load docs.index & doc_faiss_store.pkl
# ====================================

import os
import textwrap
from sys import path

# Extend the module search path before importing `prd`
# (presumably `prd` lives in this directory; verify on the target host).
path.append("/opt/configs/ramjet")
import prd
# Export the key through the environment; this runs before the OpenAI /
# langchain objects further down in this cell are constructed.
# NOTE(review): `OPENAI_TOEN` looks like a misspelling of `OPENAI_TOKEN`, but it
# must match whatever attribute `prd` actually defines, which is not visible
# here. Confirm before renaming.
os.environ["OPENAI_API_KEY"] = prd.OPENAI_TOEN

import faiss
from langchain import OpenAI
from langchain.chains import VectorDBQAWithSourcesChain
import pickle
import argparse


# Rebuild the vector store from its two on-disk parts: the FAISS index file
# and the pickled store object.
index = faiss.read_index("/home/laisky/download/langchain/docs.index")
# SECURITY: pickle.load executes arbitrary code embedded in the file. Only load
# a .pkl that was produced by a trusted run of the preparation cell.
with open("/home/laisky/download/langchain/doc_faiss_store.pkl", "rb") as f:
    store = pickle.load(f)

# Attach the index read above to the unpickled store before handing the store
# to the chain.
store.index = index
# Question-answering chain that also returns sources; temperature=0 is passed
# to the OpenAI LLM wrapper.
chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(temperature=0), vectorstore=store)

def pretty_print(text: str, width: int = 80, subsequent_indent: str = "    ") -> str:
    """Wrap ``text`` into a single paragraph for terminal-style display.

    Leading and trailing whitespace is stripped first. The text is then
    re-flowed with ``textwrap.fill``, which collapses internal runs of
    whitespace (including newlines) into single spaces, so multi-paragraph
    input comes back as one paragraph.

    Args:
        text: The string to format.
        width: Maximum length of each output line, indent included.
        subsequent_indent: Prefix added to every line except the first.

    Returns:
        The wrapped text, with lines joined by ``"\\n"``. An empty or
        whitespace-only input yields ``""``.
    """
    text = text.strip()
    return textwrap.fill(text, width=width, subsequent_indent=subsequent_indent)

In [9]:
# ====================================
# chatbot as service
# ====================================

# Send one question through the QA chain and show the exchange:
# the question, the wrapped answer, then the wrapped list of sources.
question = "啥是 TEE，整点例子展开讲讲？"
result = chain({"question": question})

print(f"🤔️: {question}\n")

answer_text = pretty_print(result["answer"])
print(f"🤖️: {answer_text}\n")

sources_text = pretty_print(result["sources"])
print(f"📖: {sources_text}")

🤔️: 啥是 TEE，整点例子展开讲讲？

🤖️: TEE (Trusted Execution Environment) is a form of trusted computing that provides
    a secure environment to run trusted applications. Examples of such
    applications include encryption/decryption, digital signing, authentication,
    etc. TEE can protect the data and code of applications from being accessed
    by unauthorized third parties.

📖: /home/laisky/download/doc/content/terms.md
