In [32]:
# ====================================
# openai tokens
# ====================================

import os
import glob
import codecs
import pickle
import re
import textwrap
from collections import namedtuple

import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter
from langchain.vectorstores import FAISS
from pymongo import MongoClient

from sys import path

# Extend the module search path before importing `prd`; the import below is
# placed after this line on purpose (presumably `prd` lives in that
# directory — confirm).
path.append("/opt/configs/ramjet")
import prd

# Publish the token held by the `prd` config module as OPENAI_API_KEY so that
# the key is read from configuration instead of being hardcoded here.
os.environ["OPENAI_API_KEY"] = prd.OPENAI_TOKEN_ME


# Bundle of the vector store and the set of file names already embedded into it.
#   store:        the vector store object (built by FAISS.from_texts in new_store)
#   scaned_files: set of base file names that have already been scanned
# The typename matches the bound name so repr() and pickling by qualified name
# resolve to this class.
Index = namedtuple("Index", ["store", "scaned_files"])


def pretty_print(text: str) -> str:
    """Strip ``text`` and wrap it to 60 columns.

    Every line after the first is indented by four spaces.

    Args:
        text: the text to format.

    Returns:
        The wrapped text as a single newline-joined string.
    """
    wrapper = textwrap.TextWrapper(width=60, subsequent_indent="    ")
    return wrapper.fill(text.strip())



In [33]:
# ==============================================================
# prepare pdf documents docs.index & docs.store
#
# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/pdf.html#retain-elements
#
# Common function definitions
# ==============================================================

from urllib.parse import quote

from langchain.document_loaders import PyPDFLoader

# from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter

# Splitter applied to the text of each loaded PDF document in embedding_pdfs;
# it cuts on newlines with a requested chunk size of 500. The captured run
# output contains "Created a chunk of size 857, which is longer than the
# specified 500", so chunks can evidently exceed that size.
text_splitter = CharacterTextSplitter(chunk_size=500, separator="\n")
# Splitter applied to markdown file contents in embedding_markdowns
# (chunk_overlap=50 is presumably measured in characters — confirm).
markdown_splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=50)

# Upper bound the embedding_* functions compare their processed-file counter
# against to decide when to stop one scan pass.
# NOTE(review): the name looks like a typo of N_BATCH_FILES; it is referenced
# by other cells, so renaming must be done everywhere at once.
N_BACTCH_FILES = 5


def is_file_scaned(index: Index, fpath):
    """Tell whether a file has already been recorded as scanned.

    Only the last path component of ``fpath`` is compared, so a bare file
    name and a full path to the same file give the same answer.

    Args:
        index: the Index whose ``scaned_files`` set is consulted.
        fpath: file path (or bare file name) to look up.

    Returns:
        True when the base name of ``fpath`` is in ``index.scaned_files``.
    """
    _, basename = os.path.split(fpath)
    return basename in index.scaned_files


def embedding_pdfs(index: Index, fpaths, url, replace_by_url):
    """Embed one batch of not-yet-scanned PDF files into ``index.store``.

    Files whose base name is already in ``index.scaned_files`` are skipped.
    At most ``N_BACTCH_FILES`` new files are processed per call; their text
    chunks are added to the store in a single ``add_texts`` call at the end.

    Args:
        index: Index holding the vector store and the scanned-file set; both
            are mutated in place.
        fpaths: iterable of PDF file paths.
        url: URL prefix prepended to the quoted file path to form the source.
        replace_by_url: local path prefix stripped from each file path before
            it is URL-quoted and appended to ``url``.

    Returns:
        The number of files processed by this call (0 when nothing was new).
    """
    n_scanned = 0
    docs = []
    metadatas = []
    for fpath in fpaths:
        fname = os.path.split(fpath)[1]
        if is_file_scaned(index, fpath):
            continue

        # The public URL depends only on the file, so build it once per file
        # instead of once per chunk.
        furl = url + quote(fpath.removeprefix(replace_by_url), safe="")

        loader = PyPDFLoader(fpath)
        for seq, data in enumerate(loader.load_and_split()):
            # load_and_split() may cut one page into several documents, in
            # which case the enumeration index runs ahead of the real page.
            # Prefer the 0-based "page" entry from the document metadata
            # (assumes PyPDFLoader records it — confirm for the installed
            # version) and fall back to the sequence index when it is absent.
            page = data.metadata.get("page", seq)
            source = f"{furl}#page={page + 1}"

            splits = text_splitter.split_text(data.page_content)
            docs.extend(splits)
            metadatas.extend({"source": source} for _ in splits)

        index.scaned_files.add(fname)
        print(f"scaned {fpath}")
        n_scanned += 1
        # ">=" keeps the batch at exactly N_BACTCH_FILES files; ">" let one
        # extra file through.
        if n_scanned >= N_BACTCH_FILES:
            break

    # Files without extractable text produce no chunks; skip the store call
    # rather than handing it an empty batch.
    if docs:
        index.store.add_texts(docs, metadatas=metadatas)

    return n_scanned


def embedding_markdowns(index: Index, fpaths, url, replace_by_url):
    """Embed one batch of not-yet-scanned markdown files into ``index.store``.

    Files whose base name is already in ``index.scaned_files`` are skipped.
    At most ``N_BACTCH_FILES`` new files are processed per call. Each chunk's
    source is ``<base>#<title>``, where the title is the URL-quoted first line
    of the chunk and the base is either the public URL of the file (when
    ``url`` is truthy) or the bare file name.

    Args:
        index: Index holding the vector store and the scanned-file set; both
            are mutated in place.
        fpaths: iterable of markdown file paths.
        url: URL prefix for the source; when falsy the file name is used.
        replace_by_url: local path prefix stripped from each file path before
            it is URL-quoted and appended to ``url``.

    Returns:
        The number of files processed by this call (0 when nothing was new).
    """
    n_scanned = 0
    docs = []
    metadatas = []
    for fpath in fpaths:
        fname = os.path.split(fpath)[1]
        if is_file_scaned(index, fpath):
            continue

        # The source prefix is the same for every chunk of a file.
        if url:
            source_base = url + quote(fpath.removeprefix(replace_by_url), safe="")
        else:
            source_base = fname

        with codecs.open(fpath, "rb", "utf8") as fp:
            content = fp.read()

        for docu in markdown_splitter.create_documents([content]):
            docs.append(docu.page_content)
            # First line of the chunk, URL-quoted, serves as the fragment.
            title = quote(docu.page_content.strip().split("\n", maxsplit=1)[0])
            metadatas.append({"source": f"{source_base}#{title}"})

        index.scaned_files.add(fname)
        print(f"scaned {fpath}")
        n_scanned += 1
        # ">=" keeps the batch at exactly N_BACTCH_FILES files; ">" let one
        # extra file through.
        if n_scanned >= N_BACTCH_FILES:
            break

    # Empty files produce no chunks; skip the store call rather than handing
    # it an empty batch.
    if docs:
        index.store.add_texts(docs, metadatas=metadatas)

    return n_scanned


def load_store(dirpath, name) -> Index:
    """Rebuild an Index from the three files written by ``save_store``.

    Reads ``<dirpath>/<name>.index`` with faiss, unpickles
    ``<dirpath>/<name>.store`` and re-attaches the faiss index to it, then
    unpickles ``<dirpath>/<name>.scanedfile``.

    Args:
        dirpath: directory that holds the index files.
        name: project/file name shared by the three files.

    Returns:
        An Index with the restored store and scanned-file collection.
    """
    prefix = os.path.join(dirpath, name)

    faiss_index = faiss.read_index(f"{prefix}.index")

    # NOTE(review): pickle.load can execute arbitrary code; only point this
    # at index files you created yourself.
    with open(f"{prefix}.store", "rb") as store_file:
        store = pickle.load(store_file)
    store.index = faiss_index

    with open(f"{prefix}.scanedfile", "rb") as scanned_file:
        scaned_files = pickle.load(scanned_file)

    return Index(store=store, scaned_files=scaned_files)


def new_store() -> Index:
    """Create a fresh Index seeded with a single placeholder text.

    The store is built with ``FAISS.from_texts`` from the text ``"world"``
    (source ``"hello"``) using ``OpenAIEmbeddings``; the scanned-file set
    starts out empty.

    Returns:
        A new Index with the seeded store and an empty ``scaned_files`` set.
    """
    seed_store = FAISS.from_texts(
        ["world"],
        OpenAIEmbeddings(),
        metadatas=[{"source": "hello"}],
    )
    return Index(store=seed_store, scaned_files=set())


def save_store(index: Index, dirpath, name):
    """Persist an Index as three files sharing the prefix ``<dirpath>/<name>``.

    Writes ``.index`` (the faiss index, via ``faiss.write_index``), ``.store``
    (the pickled store with its faiss index detached) and ``.scanedfile``
    (the pickled scanned-file collection).

    Args:
        index: the Index to persist; left unchanged when the call returns or
            raises.
        dirpath: directory to write the files into.
        name: project/file name used as the common file prefix.
    """
    store_index = index.store.index
    fpath_prefix = os.path.join(dirpath, name)
    print(f"save store to {fpath_prefix}")
    faiss.write_index(store_index, f"{fpath_prefix}.index")

    # The faiss index is saved separately above, so detach it while the rest
    # of the store is pickled (presumably because it cannot be pickled
    # directly — confirm). Restore it in `finally` so a failed dump does not
    # leave the in-memory store without its index.
    index.store.index = None
    try:
        with open(f"{fpath_prefix}.store", "wb") as f:
            pickle.dump(index.store, f)
    finally:
        index.store.index = store_index

    with open(f"{fpath_prefix}.scanedfile", "wb") as f:
        pickle.dump(index.scaned_files, f)


In [34]:
# incremental scan pdfs

def gen_pdfs():
    """Yield the path of every ``*.pdf`` found recursively under the security PDF directory."""
    matches = glob.glob("/home/laisky/data/langchain/pdf/security/**/*.pdf", recursive=True)
    for pdf_path in matches:
        yield pdf_path

def run_scan_pdfs():
    """Incrementally embed all security PDFs into the persisted "security" index.

    Loads the existing index once, then repeatedly embeds one batch of
    not-yet-scanned PDFs and saves the index after every batch that added
    files, so progress survives an interruption. Stops after the first pass
    that finds nothing new; that pass does not rewrite the unchanged index.

    The index files must already exist; to bootstrap, create one with
    ``new_store()`` and persist it with ``save_store()`` first.
    """
    index_dir = "/home/laisky/data/langchain/index"
    index_name = "security"

    index = load_store(dirpath=index_dir, name=index_name)
    total = 0
    while True:
        n = embedding_pdfs(
            index=index,
            fpaths=gen_pdfs(),
            url="https://s3.laisky.com/public/papers/security/",
            replace_by_url="/home/laisky/data/langchain/pdf/security/",
        )
        total += n
        if n:
            save_store(index=index, dirpath=index_dir, name=index_name)

        print(f"scanned {total} files")
        if n == 0:
            return
        
# Run the incremental PDF scan; this loads and saves the "security" index
# files under /home/laisky/data/langchain/index.
run_scan_pdfs()

scaned /home/laisky/data/langchain/pdf/security/TEE/comparison_between_SEV_and_SGX.pdf
scaned /home/laisky/data/langchain/pdf/security/TEE/Confidential_High-Performance_Computing_in_the_Public_Cloud.pdf
scaned /home/laisky/data/langchain/pdf/security/TEE/cocoTPM.pdf


Created a chunk of size 857, which is longer than the specified 500
Created a chunk of size 986, which is longer than the specified 500
Created a chunk of size 546, which is longer than the specified 500
Created a chunk of size 549, which is longer than the specified 500
Created a chunk of size 555, which is longer than the specified 500
Created a chunk of size 538, which is longer than the specified 500
Created a chunk of size 510, which is longer than the specified 500


scaned /home/laisky/data/langchain/pdf/security/TEE/A survey of Intel SGX and its applications.pdf
scaned /home/laisky/data/langchain/pdf/security/TEE/Towards_A_Secure_Joint_Cloud_With_Confidential_Computing.pdf
save store to /home/laisky/data/langchain/index/security
scanned 5 files
save store to /home/laisky/data/langchain/index/security
scanned 5 files


In [None]:
# incremental scan markdowns

def gen_markdowns():
    """Yield the markdown files to embed.

    The fixed ``terms.md`` path comes first (yielded whether or not it
    exists), followed by every ``*.md`` found recursively under the
    ``research`` directory.
    """
    yield "/home/laisky/data/langchain/basebit/doc/content/terms.md"
    research_docs = glob.glob(
        "/home/laisky/data/langchain/basebit/doc/content/research/**/*.md",
        recursive=True,
    )
    for md_path in research_docs:
        yield md_path
    

def run_scan_markdowns():
    """Incrementally embed the markdown documents into the persisted "security" index.

    Loads the existing index once, then repeatedly embeds one batch of
    not-yet-scanned markdown files and saves the index after every batch that
    added files. Stops after the first pass that finds nothing new; that pass
    does not rewrite the unchanged index.

    The index files must already exist; to bootstrap, create one with
    ``new_store()`` and persist it with ``save_store()`` first.
    """
    index_dir = "/home/laisky/data/langchain/index"
    index_name = "security"

    index = load_store(dirpath=index_dir, name=index_name)
    while True:
        n = embedding_markdowns(
            index=index,
            fpaths=gen_markdowns(),
            url="https://s3.laisky.com/public/papers/security/",
            # NOTE(review): this prefix is the PDF directory, which is not a
            # prefix of the markdown paths yielded by gen_markdowns(), so
            # nothing is stripped and the full local path ends up URL-quoted
            # in the source. Confirm the intended url/prefix pair.
            replace_by_url="/home/laisky/data/langchain/pdf/security/",
        )
        if n:
            save_store(index=index, dirpath=index_dir, name=index_name)

        print(f"{n=}")
        if n == 0:
            return
        
        
# Run the incremental markdown scan; this loads and saves the "security"
# index files under /home/laisky/data/langchain/index.
run_scan_markdowns()

In [35]:
# ====================================
# Build the query chain used for question answering
# ====================================

from langchain import OpenAI
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# Load the persisted "security" index from disk; `index.store` is the vector
# store that the chain below searches.
index = load_store(
    dirpath="/home/laisky/data/langchain/index",
    name="security",
)
# Question-answering chain that returns an answer together with its sources
# (the ask cell reads the 'answer' and 'sources' keys of its result).
# The LLM is configured with temperature=0 and max_tokens=1000.
chain = VectorDBQAWithSourcesChain.from_llm(
    llm=OpenAI(
        temperature=0, 
        max_tokens=1000,
        model_name="text-davinci-003",
        streaming=False,
    ), 
    # Disabled alternative: pass a filtered retriever instead of the raw store.
#     retriever=VectorStoreRetriever(vectorstore=index.store, search_kwargs={"filter":{"type":"filter"},"k":3},),
    vectorstore=index.store,
    # presumably drops retrieved documents until they fit the chain's token
    # limit — confirm against the langchain documentation
    reduce_k_below_max_tokens=True,
)


In [40]:
# ====================================
# ask pdf embeddings
# ====================================
# The question, in Chinese: "When deriving keys with the HKDF algorithm, what
# length should the master key used as the derivation root be set to?"
question = "使用 HKDF 算法派生密钥时，作为派生根的 master key 的长度该设置为多少"
# Call the chain built in the previous cell; only its outputs are requested.
result = chain(
    {
        "question": question,
    },
    return_only_outputs=True,
)

# Show the question, the wrapped answer, and the sources reported by the chain.
print(f"🤔️: {question}\n")
print(f"🤖️: {pretty_print(result['answer'])}\n")
print(f"📖: {result['sources']}")

🤔️: 使用 HKDF 算法派生密钥时，作为派生根的 master key 的长度该设置为多少

🤖️: The length of the master key used for deriving keys with the
    HKDF algorithm should be at least 256 bits.

📖: 
https://s3.laisky.com/public/papers/security/RFC8446_The%20Transport%20Layer%20Security%20%28TLS%29%20Protocol%20Version%201.3.pdf#page=145
https://s3.laisky.com/public/papers/security/Computer%20Security%20Art%20And%20Science_Matt%20Bishop%2C%20Elisabeth%20Sullivan%20etc.pdf#page=448
https://s3.laisky.com/public/papers/security/NIST_SP-800-131Ar2_Transitioning%20the%20Use%20of%20Cryptographic%20Algorithms%20and%20Key%20Lengths.pdf#page=22
