In [2]:
# ====================================
# openai tokens
# ====================================

import os
import glob
import codecs
import pickle
import textwrap

import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter
from langchain.vectorstores import FAISS
from pymongo import MongoClient

# Extend the import search path with /opt/configs/ramjet, import `prd`, and copy
# its OPENAI_TOKEN attribute into the OPENAI_API_KEY environment variable.
# NOTE(review): presumably `prd` lives in that directory and the OpenAI-backed
# langchain classes used below read OPENAI_API_KEY -- confirm.
from sys import path
path.append("/opt/configs/ramjet")
import prd
os.environ["OPENAI_API_KEY"] = prd.OPENAI_TOKEN


def pretty_print(text: str) -> str:
    """Return ``text`` re-flowed for console display.

    Surrounding whitespace is removed, then the text is wrapped to at most
    80 columns; every line after the first is indented by four spaces.
    """
    wrapper = textwrap.TextWrapper(width=80, subsequent_indent='    ')
    return wrapper.fill(text.strip())

In [None]:
# ====================================
# prepare pdf documents docs.index & docs.store
# 
# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/pdf.html#retain-elements
# ====================================

from urllib.parse import quote

from langchain.document_loaders import PyPDFLoader
# from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter

# Splitter used on the text of every PDF page: separates on newlines, chunk_size=1500.
text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
# Every *.pdf below the security directory; recursive=True lets "**" match nested folders.
pdf_files = glob.glob("/home/laisky/data/langchain/pdf/security/**/*.pdf", recursive=True)


def scan_pdfs(store, scaned_files, batch_size=6):
    """Embed one batch of not-yet-scanned PDFs into ``store``.

    Iterates over the module-level ``pdf_files`` list, skipping paths already
    present in ``scaned_files``. Each page of a new file is loaded with
    ``PyPDFLoader``, split with the module-level ``text_splitter``, and every
    chunk is tagged with a ``source`` URL that points at the public copy of
    the PDF plus its 1-based page number and chunk index.

    Args:
        store: vector store exposing ``add_texts(texts, metadatas=...)``.
        scaned_files: mutable set-like collection of already processed paths;
            newly processed paths are added to it in place.
        batch_size: maximum number of files handled per call (default 6, the
            limit that used to be hard-coded). Values below 1 behave like 1.

    Returns:
        The number of files processed by this call (0 when nothing is left).
    """
    local_prefix = "/home/laisky/data/langchain/pdf/security/"
    url_prefix = "https://s3.laisky.com/public/papers/security/"

    scanned = 0
    docs = []
    metadatas = []
    for file in pdf_files:
        if file in scaned_files:
            continue

        # str.lstrip() removes any leading characters found in its argument (it
        # treats it as a character set), so it also ate the start of names such
        # as "passwordless/..." -> "wordless/...". Cut the exact prefix instead.
        if file.startswith(local_prefix):
            relative_name = file[len(local_prefix):]
        else:
            relative_name = file
        # The URL only depends on the file, so build it once per file.
        url = url_prefix + quote(relative_name, safe="")

        loader = PyPDFLoader(file)
        for page, data in enumerate(loader.load()):
            splits = text_splitter.split_text(data.page_content)
            docs.extend(splits)
            metadatas.extend(
                {"source": f"{url}#page={page+1}?chunk={ichunk}"}
                for ichunk in range(len(splits))
            )

        scaned_files.add(file)
        print(f"scaned {file}")
        scanned += 1
        if scanned >= batch_size:
            break

    # Nothing new (or only empty PDFs): skip the embedding call entirely rather
    # than sending an empty batch to the store.
    if docs:
        store.add_texts(docs, metadatas=metadatas)
    return scanned
    
            
def load_store():
    """Restore the persisted vector store and the record of scanned files.

    Reads the FAISS index file, unpickles the store object, attaches the index
    to it, and unpickles the scanned-files collection. All three files must
    already exist under /home/laisky/data/langchain/pdf/index/.

    NOTE(review): pickle.load executes code embedded in the file; only use
    this on files this notebook wrote itself.

    Returns:
        A ``(store, scaned_files)`` tuple.
    """
    faiss_index = faiss.read_index("/home/laisky/data/langchain/pdf/index/security.index")

    with open("/home/laisky/data/langchain/pdf/index/security.store", "rb") as store_fp:
        restored = pickle.load(store_fp)
    # The index is persisted separately from the pickled store; reattach it.
    restored.index = faiss_index

    with open("/home/laisky/data/langchain/pdf/index/security.scanedfile", "rb") as scanned_fp:
        already_scanned = pickle.load(scanned_fp)

    return restored, already_scanned
        
    
def new_store(docs, metadatas):
    """Create a fresh FAISS vector store from ``docs`` and their ``metadatas``.

    The texts are embedded with a newly constructed ``OpenAIEmbeddings``
    instance and the resulting store is returned.
    """
    embeddings = OpenAIEmbeddings()
    return FAISS.from_texts(docs, embeddings, metadatas=metadatas)
    
def save_store(store, scaned_files):
    """Persist the vector store and the record of scanned files to disk.

    The FAISS index is written with ``faiss.write_index``; the store object is
    then pickled with its ``index`` attribute temporarily set to ``None`` and
    the scanned-files collection is pickled to its own file.

    Args:
        store: vector store whose ``index`` attribute holds the FAISS index.
        scaned_files: collection of already processed file paths.

    The caller's ``store`` always gets its index back, even when writing the
    pickle fails, so the in-memory store stays usable after an error.
    """
    index = store.index
    faiss.write_index(index, "/home/laisky/data/langchain/pdf/index/security.index")

    # Detach the index while pickling (it is persisted separately above) and
    # guarantee it is reattached no matter how the dump ends.
    store.index = None
    try:
        with open("/home/laisky/data/langchain/pdf/index/security.store", "wb") as f:
            pickle.dump(store, f)
    finally:
        store.index = index

    with open("/home/laisky/data/langchain/pdf/index/security.scanedfile", "wb") as f:
        pickle.dump(scaned_files, f)
        
# Incrementally index the PDFs: every pass reloads the persisted store from disk,
# lets scan_pdfs() embed a batch of not-yet-scanned files, and writes the store
# and the scanned-file record back, so finished batches are kept on disk.
# The loop ends once a pass reports that no new file was processed.
# load_store() only reads existing files, so they must be present before the first pass.
while True: 
    store, scaned_files = load_store()
    n = scan_pdfs(store, scaned_files)
    save_store(store, scaned_files)
    
    if n == 0:
        break
    
# Build the question-answering chain on the store left over from the final pass.
from langchain import OpenAI
from langchain.chains import VectorDBQAWithSourcesChain
chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(temperature=0), vectorstore=store)

scaned /home/laisky/data/langchain/pdf/security/NIST_SP-800-207_Zero Trust Architecture.pdf
scaned /home/laisky/data/langchain/pdf/security/NIST_SP-800-57-Part 1 Rev. 5_Recommendation for Key Management-General.pdf
scaned /home/laisky/data/langchain/pdf/security/NIST_SP-800-116r1-Guidelines for the Use of PIV Credentials in Facility Access.pdf
scaned /home/laisky/data/langchain/pdf/security/passwordless/FIDO_The-State-of-Authentication-Report.pdf
scaned /home/laisky/data/langchain/pdf/security/passwordless/FIDO_Alliance White Paper- Leveraging FIDO Standards to Extend the PKI Security Model in United States Government Agencies.pdf
scaned /home/laisky/data/langchain/pdf/security/passwordless/FIDO_Alliance and Asia PKI Consortium White Paper- FIDO UAF and PKI in Asia – Case Study and Recommendations.pdf


In [21]:
# ====================================
# ask pdf embeddings
# ====================================
# Question sent to the chain; it is an empty string here, so fill it in before running.
question = ""
# The chain is called with a {"question": ...} dict; the "answer" and "sources"
# entries of its result are printed below.
result = chain({"question": question})

print(f"🤔️: {question}\n")
# Only the answer is wrapped with pretty_print; the sources are printed as returned.
print(f"🤖️: {pretty_print(result['answer'])}\n")
print(f"📖: {result['sources']}")

🤔️: AMD SNP 是什么

🤖️: AMD SNP stands for Advanced Micro Devices Secure Nested Paging.

📖: https://www.amd.com/en/technologies/secure-nested-paging


In [13]:
# ====================================
# prepare basebit documents docs.index & docs.store
# ====================================

import os
import glob
import codecs
import pickle

import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter
from langchain.vectorstores import FAISS
from pymongo import MongoClient


# NOTE(review): `host` is not used anywhere in this cell; the Mongo connection
# string comes from the `prd` config module instead.
host = "100.97.108.34" # ubuntu
dbconn = MongoClient(prd.OPENAI_EMBEDDING_QA["basebit"]["mongo_source"])

# text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
markdown_splitter = MarkdownTextSplitter(chunk_size=1000)

# Split every document of the basebit.docus collection into chunks and remember
# the document URL as the source of each chunk.
docs = []
metadatas = []
cursor = dbconn["basebit"]["docus"].find().batch_size(50)
for doc in cursor:
    # splits = text_splitter.split_text(doc["text"])
    splits = markdown_splitter.split_text(doc["text"])
    docs.extend(splits)
    # One dict per chunk: `[d] * n` would make every chunk share a single dict
    # object, so changing one chunk's metadata would change them all.
    metadatas.extend({"source": doc["url"]} for _ in splits)


# Here we create a vector store from the documents and save it to disk.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss_index = store.index
faiss.write_index(faiss_index, "/home/laisky/data/langchain/basebit/docs.index")
# The index is persisted separately, so detach it while pickling the store and
# always reattach it afterwards; this mirrors save_store() and keeps `store`
# usable in later cells.
store.index = None
try:
    with open("/home/laisky/data/langchain/basebit/docs.store", "wb") as f:
        pickle.dump(store, f)
finally:
    store.index = faiss_index

In [1]:
# ====================================
# load docs.index & faiss_store.pkl
# ====================================

import os
import textwrap

# Extend the import search path with /opt/configs/ramjet, import `prd`, and copy
# its OPENAI_TOKEN attribute into the OPENAI_API_KEY environment variable.
# NOTE(review): presumably `prd` lives in that directory and the OpenAI-backed
# langchain classes used below read OPENAI_API_KEY -- confirm.
from sys import path
path.append("/opt/configs/ramjet")
import prd
os.environ["OPENAI_API_KEY"] = prd.OPENAI_TOKEN

import faiss
from langchain import OpenAI
from langchain.chains import VectorDBQAWithSourcesChain
import pickle
import argparse


# Read the FAISS index from disk, unpickle the store object, attach the index to
# it, and build the question-answering chain on top of the result.
# NOTE(review): pickle.load executes code embedded in the file -- only load
# files from a trusted source.
# NOTE(review): these /home/laisky/download/... paths differ from the locations the
# indexing cells above write to -- confirm they are the intended artifacts.
index = faiss.read_index("/home/laisky/download/langchain/docs.index")
with open("/home/laisky/download/langchain/doc_faiss_store.pkl", "rb") as f:
    store = pickle.load(f)
    
store.index = index
chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(temperature=0), vectorstore=store)

def pretty_print(text: str) -> str:
    """Strip ``text`` and wrap it for printing.

    Lines are limited to 80 columns and continuation lines are indented by
    four spaces; the wrapped string is returned, nothing is printed here.
    """
    trimmed = text.strip()
    formatter = textwrap.TextWrapper(width=80, subsequent_indent='    ')
    return formatter.fill(trimmed)

In [9]:
# ====================================
# chatbot as service
# ====================================

# Question sent to the chain built in the previous cell.
question = "啥是 TEE，整点例子展开讲讲？"
# The chain is called with a {"question": ...} dict; the "answer" and "sources"
# entries of its result are printed below.
result = chain({"question": question})

print(f"🤔️: {question}\n")
# Both the answer and the sources are wrapped to 80 columns by pretty_print.
print(f"🤖️: {pretty_print(result['answer'])}\n")
print(f"📖: {pretty_print(result['sources'])}")

🤔️: 啥是 TEE，整点例子展开讲讲？

🤖️: TEE (Trusted Execution Environment) is a form of trusted computing that provides
    a secure environment to run trusted applications. Examples of such
    applications include encryption/decryption, digital signing, authentication,
    etc. TEE can protect the data and code of applications from being accessed
    by unauthorized third parties.

📖: /home/laisky/download/doc/content/terms.md
