In [7]:
# ====================================
# openai tokens
# ====================================

import os
import glob
import codecs
import pickle
import re
import textwrap
from collections import namedtuple

import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter
from langchain.vectorstores import FAISS
from pymongo import MongoClient

from sys import path

path.append("/opt/configs/ramjet")
import prd

# Export the OpenAI key held in the local `prd` config module through the process
# environment. OpenAIEmbeddings() / OpenAI() are constructed later in this notebook without
# an explicit key, so they presumably pick it up from this variable -- TODO confirm.
os.environ["OPENAI_API_KEY"] = prd.OPENAI_TOKEN_ME


# Bundle passed between the embedding / load / save helpers:
#   store        -- vector store object that receives the embedded text chunks
#   scaned_files -- set of file basenames that have already been embedded
# The typename matches the bound name so that repr() reads "Index(...)" and instances can be
# resolved by name (e.g. when pickled).
Index = namedtuple("Index", ["store", "scaned_files"])


def pretty_print(text: str) -> str:
    """Re-flow ``text`` into lines of at most 60 columns.

    Leading and trailing whitespace is removed first. Every line after the
    first is indented by four spaces.

    Args:
        text: the string to wrap.

    Returns:
        The wrapped text as a single newline-joined string.
    """
    wrapper = textwrap.TextWrapper(width=60, subsequent_indent="    ")
    return wrapper.fill(text.strip())



In [4]:
# ==============================================================
# prepare pdf documents docs.index & docs.store
#
# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/pdf.html#retain-elements
#
# 通用的函数定义
# ==============================================================

from urllib.parse import quote

from langchain.document_loaders import PyPDFLoader

# from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter

# Splitter applied by embedding_pdfs to the text of each loaded PDF document. It splits on
# newlines only with chunk_size=500; the captured run output ("Created a chunk of size 533,
# which is longer than the specified 500") shows that oversized chunks are still emitted.
text_splitter = CharacterTextSplitter(chunk_size=500, separator="\n")
# Splitter applied by embedding_markdowns to whole Markdown files (chunk_size=500 with an
# overlap of 50 between neighbouring chunks).
markdown_splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=50)

# Batch-size threshold consulted by embedding_pdfs / embedding_markdowns to decide when to
# stop embedding files within a single call. The spelling is kept as-is because other
# cells reference this exact name.
N_BACTCH_FILES = 5


def is_file_scaned(index: Index, fpath):
    """Tell whether a file has already been embedded into ``index``.

    Only the final path component is compared, so a full path and a bare
    file name are treated the same.

    Args:
        index: Index whose ``scaned_files`` collection is consulted.
        fpath: path (or bare name) of the file to look up.

    Returns:
        True when the file's basename is present in ``index.scaned_files``.
    """
    basename = os.path.basename(fpath)
    return basename in index.scaned_files


def embedding_pdfs(index: Index, fpaths, url, replace_by_url):
    """Embed not-yet-scanned PDF files into ``index.store``, one batch per call.

    Each new PDF is loaded with ``PyPDFLoader``, its documents are split with the
    module-level ``text_splitter``, and every chunk is queued together with a
    ``source`` metadata entry of the form ``<url><quoted path>#page=<n>``.
    Processed files are recorded in ``index.scaned_files`` by basename. At most
    ``N_BACTCH_FILES`` files are processed per call; call again to continue.

    Args:
        index: Index whose ``store`` receives the chunks and whose
            ``scaned_files`` set is updated in place.
        fpaths: iterable of PDF file paths.
        url: URL prefix put in front of the percent-encoded file path.
        replace_by_url: local path prefix stripped from ``fpath`` before encoding.

    Returns:
        Number of files embedded by this call (0 when nothing new was found).
    """
    n_scanned = 0
    docs = []
    metadatas = []
    for fpath in fpaths:
        if is_file_scaned(index, fpath):
            continue

        fname = os.path.split(fpath)[1]
        # The link target is identical for every chunk of a file, so build it once.
        furl = url + quote(fpath.removeprefix(replace_by_url), safe="")

        loader = PyPDFLoader(fpath)
        for seq, data in enumerate(loader.load_and_split()):
            # load_and_split() runs a splitter over the per-page documents, so one page
            # can yield several documents and the enumeration index may drift away from
            # the real page. Prefer the 0-based "page" the loader stores in the metadata
            # and fall back to the enumeration index when it is absent.
            page = (data.metadata or {}).get("page", seq)
            splits = text_splitter.split_text(data.page_content)
            docs.extend(splits)
            metadatas.extend({"source": f"{furl}#page={page+1}"} for _ in splits)

        index.scaned_files.add(fname)
        print(f"scaned {fpath}")
        n_scanned += 1
        # `>=` keeps the batch at N_BACTCH_FILES files; the former `>` let one extra through.
        if n_scanned >= N_BACTCH_FILES:
            break

    # Nothing to send when no text was extracted (e.g. image-only PDFs) or no file was new.
    if docs:
        index.store.add_texts(docs, metadatas=metadatas)

    return n_scanned


def embedding_markdowns(index: Index, fpaths, url, replace_by_url):
    """Embed not-yet-scanned Markdown files into ``index.store``, one batch per call.

    Each new file is read as UTF-8 and split with the module-level
    ``markdown_splitter``. Every chunk is queued with a ``source`` metadata entry
    ``<base>#<title>``, where ``<title>`` is the percent-encoded first line of the
    chunk. ``<base>`` is ``url`` plus the percent-encoded file path when ``url`` is
    truthy, and the bare file name otherwise. Processed files are recorded in
    ``index.scaned_files`` by basename. At most ``N_BACTCH_FILES`` files are
    processed per call; call again to continue.

    Args:
        index: Index whose ``store`` receives the chunks and whose
            ``scaned_files`` set is updated in place.
        fpaths: iterable of Markdown file paths.
        url: URL prefix for the ``source`` link; falsy to use the file name instead.
        replace_by_url: local path prefix stripped from ``fpath`` before encoding
            (only used when ``url`` is truthy).

    Returns:
        Number of files embedded by this call (0 when nothing new was found).
    """
    n_scanned = 0
    docs = []
    metadatas = []
    for fpath in fpaths:
        if is_file_scaned(index, fpath):
            continue

        fname = os.path.split(fpath)[1]
        # The part of the source before '#' is the same for every chunk of a file.
        if url:
            source_base = url + quote(fpath.removeprefix(replace_by_url), safe="")
        else:
            source_base = fname

        # Read the whole file first so the handle is closed before splitting starts.
        with codecs.open(fpath, "rb", "utf8") as fp:
            content = fp.read()

        for docu in markdown_splitter.create_documents([content]):
            docs.append(docu.page_content)
            # The first line of the chunk doubles as the URL fragment.
            title = quote(docu.page_content.strip().split("\n", maxsplit=1)[0])
            metadatas.append({"source": f"{source_base}#{title}"})

        index.scaned_files.add(fname)
        print(f"scaned {fpath}")
        n_scanned += 1
        # `>=` keeps the batch at N_BACTCH_FILES files; the former `>` let one extra through.
        if n_scanned >= N_BACTCH_FILES:
            break

    # Nothing to send when the scanned files were empty or no file was new.
    if docs:
        index.store.add_texts(docs, metadatas=metadatas)

    return n_scanned


def load_store(dirpath, name) -> Index:
    """Rebuild an Index from the files written under ``<dirpath>/<name>``.

    Three sibling files are read, in this order:
      * ``<name>.index``      -- FAISS index, read with ``faiss.read_index``
      * ``<name>.store``      -- pickled store object; the FAISS index is
                                 attached to it as ``store.index``
      * ``<name>.scanedfile`` -- pickled collection of already scanned files

    Args:
        dirpath: directory holding the index files.
        name: project/file name shared by the three files.

    Returns:
        An Index carrying the restored store and scanned-file collection.
    """
    prefix = os.path.join(dirpath, name)

    faiss_index = faiss.read_index(f"{prefix}.index")

    # NOTE: pickle.load runs whatever the file encodes; only point this at files
    # produced by this notebook.
    with open(f"{prefix}.store", "rb") as store_fh:
        store = pickle.load(store_fh)
    store.index = faiss_index

    with open(f"{prefix}.scanedfile", "rb") as scanned_fh:
        scaned_files = pickle.load(scanned_fh)

    return Index(store=store, scaned_files=scaned_files)


def new_store() -> Index:
    """Create a brand-new Index with nothing marked as scanned.

    The FAISS store is built from a single placeholder text ("world", with
    source "hello") embedded via ``OpenAIEmbeddings``; presumably the
    placeholder exists only because the store has to be built from at least
    one text -- TODO confirm.

    Returns:
        An Index with the freshly built store and an empty ``scaned_files`` set.
    """
    seed_texts = ["world"]
    embeddings = OpenAIEmbeddings()
    seed_metadatas = [{"source": "hello"}]
    fresh_store = FAISS.from_texts(seed_texts, embeddings, metadatas=seed_metadatas)
    return Index(store=fresh_store, scaned_files=set())


def save_store(index: Index, dirpath, name):
    """Persist an Index as three files sharing the ``<dirpath>/<name>`` prefix.

    Files written:
      * ``<name>.index``      -- the FAISS index, via ``faiss.write_index``
      * ``<name>.store``      -- the store object pickled with its ``index``
                                 attribute detached
      * ``<name>.scanedfile`` -- the pickled ``scaned_files`` collection

    Args:
        index: Index to persist; it is left unchanged after the call.
        dirpath: directory to write the files into.
        name: project/file name used as the common file stem.
    """
    store_index = index.store.index
    fpath_prefix = os.path.join(dirpath, name)
    print(f"save store to {fpath_prefix}")
    faiss.write_index(store_index, f"{fpath_prefix}.index")

    # The FAISS index is saved separately above, so it is detached while the rest of the
    # store is pickled. The finally clause guarantees it is re-attached even if opening
    # the file or pickling fails; otherwise the in-memory store would be left unusable.
    index.store.index = None
    try:
        with open(f"{fpath_prefix}.store", "wb") as f:
            pickle.dump(index.store, f)
    finally:
        index.store.index = store_index

    with open(f"{fpath_prefix}.scanedfile", "wb") as f:
        pickle.dump(index.scaned_files, f)


In [None]:
# incremental scan pdfs

def gen_pdfs():
    """Yield the path of every ``*.pdf`` file found recursively under the security PDF directory."""
    for pdf_path in glob.glob("/home/laisky/data/langchain/pdf/security/**/*.pdf", recursive=True):
        yield pdf_path

def run_scan_pdfs():
    """Embed all pending PDFs into the saved "security" index, batch by batch.

    Every round reloads the index from disk, lets ``embedding_pdfs`` process one
    batch of files from ``gen_pdfs()``, writes the index back and prints the
    running total. The loop ends after the first round that embeds nothing.

    The index files must already exist, because each round starts with
    ``load_store``. The original cell kept a commented-out ``index = new_store()``,
    apparently as the bootstrap for a first run.
    """
    total = 0
    n = None
    while n != 0:
        index = load_store(name="security", dirpath="/home/laisky/data/langchain/index")
        n = embedding_pdfs(
            index=index,
            fpaths=gen_pdfs(),
            url="https://s3.laisky.com/public/papers/security/",
            replace_by_url="/home/laisky/data/langchain/pdf/security/",
        )
        total += n
        save_store(index=index, name="security", dirpath="/home/laisky/data/langchain/index")

        print(f"scanned {total} files")
        
run_scan_pdfs()

Created a chunk of size 533, which is longer than the specified 500
Created a chunk of size 523, which is longer than the specified 500
Created a chunk of size 602, which is longer than the specified 500
Created a chunk of size 598, which is longer than the specified 500
Created a chunk of size 611, which is longer than the specified 500
Created a chunk of size 648, which is longer than the specified 500
Created a chunk of size 609, which is longer than the specified 500
Created a chunk of size 547, which is longer than the specified 500
Created a chunk of size 527, which is longer than the specified 500
Created a chunk of size 647, which is longer than the specified 500
Created a chunk of size 642, which is longer than the specified 500
Created a chunk of size 559, which is longer than the specified 500
Created a chunk of size 742, which is longer than the specified 500
Created a chunk of size 508, which is longer than the specified 500
Created a chunk of size 528, which is longer tha

scaned /home/laisky/data/langchain/pdf/security/Brian Ward - How Linux Works (2021, No Starch Press).pdf
scaned /home/laisky/data/langchain/pdf/security/Computer Security Art And Science_Matt Bishop, Elisabeth Sullivan etc.pdf


Created a chunk of size 514, which is longer than the specified 500
Created a chunk of size 551, which is longer than the specified 500
Created a chunk of size 706, which is longer than the specified 500
Created a chunk of size 733, which is longer than the specified 500
Created a chunk of size 947, which is longer than the specified 500
Created a chunk of size 567, which is longer than the specified 500
Created a chunk of size 760, which is longer than the specified 500
Created a chunk of size 528, which is longer than the specified 500
Created a chunk of size 644, which is longer than the specified 500
Created a chunk of size 664, which is longer than the specified 500
Created a chunk of size 547, which is longer than the specified 500
Created a chunk of size 713, which is longer than the specified 500
Created a chunk of size 1035, which is longer than the specified 500
Created a chunk of size 796, which is longer than the specified 500
Created a chunk of size 651, which is longer th

Created a chunk of size 530, which is longer than the specified 500
Created a chunk of size 534, which is longer than the specified 500
Created a chunk of size 525, which is longer than the specified 500
Created a chunk of size 558, which is longer than the specified 500
Created a chunk of size 567, which is longer than the specified 500
Created a chunk of size 571, which is longer than the specified 500
Created a chunk of size 581, which is longer than the specified 500
Created a chunk of size 628, which is longer than the specified 500
Created a chunk of size 618, which is longer than the specified 500
Created a chunk of size 607, which is longer than the specified 500
Created a chunk of size 543, which is longer than the specified 500
Created a chunk of size 800, which is longer than the specified 500
Created a chunk of size 652, which is longer than the specified 500
Created a chunk of size 554, which is longer than the specified 500
Created a chunk of size 645, which is longer tha

scaned /home/laisky/data/langchain/pdf/security/CSAPP-Computer.Systems.A.Programmers.Perspective.3rd.Global.Edition.2015.pdf
scaned /home/laisky/data/langchain/pdf/security/TEE/SGX/Remote Attestation for Multi-Package Platforms using Intel®SGX Datacenter Attestation Primitives (DCAP).pdf
scaned /home/laisky/data/langchain/pdf/security/TEE/SGX/Supporting Third Party Attestation for Intel® SGX with Intel® Data Center Attestation Primitives.pdf
scaned /home/laisky/data/langchain/pdf/security/TEE/SGX/Moat- Verifying Confidentiality of Enclave Programs.pdf


In [None]:
# incremental scan markdowns

def gen_markdowns():
    """Yield the Markdown files to embed: ``terms.md`` first, then every ``*.md`` under ``research/`` (recursive)."""
    yield "/home/laisky/data/langchain/basebit/doc/content/terms.md"
    for md_path in glob.glob("/home/laisky/data/langchain/basebit/doc/content/research/**/*.md", recursive=True):
        yield md_path
    

def run_scan_markdowns():
    """Embed all pending Markdown files into the saved "security" index, batch by batch.

    Every round reloads the index from disk, lets ``embedding_markdowns`` process
    one batch of files from ``gen_markdowns()``, writes the index back and prints
    the batch size. The loop ends after the first round that embeds nothing.

    The index files must already exist, because each round starts with
    ``load_store``. The original cell kept a commented-out ``index = new_store()``,
    apparently as the bootstrap for a first run.
    """
    n = None
    while n != 0:
        index = load_store(name="security", dirpath="/home/laisky/data/langchain/index")
        # NOTE(review): replace_by_url is the PDF directory, while gen_markdowns() yields
        # paths under .../basebit/doc/content, so the prefix is never stripped and the
        # whole local path ends up percent-encoded in the source link -- confirm intent.
        n = embedding_markdowns(
            index=index,
            fpaths=gen_markdowns(),
            url="https://s3.laisky.com/public/papers/security/",
            replace_by_url="/home/laisky/data/langchain/pdf/security/",
        )
        save_store(index=index, name="security", dirpath="/home/laisky/data/langchain/index/")

        print(f"{n=}")
        
        
run_scan_markdowns()

In [25]:
# ====================================
# 生成用于问答的 query chain
# ====================================

from langchain import OpenAI
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# Load the persisted "security" index (the files written by save_store in the scan cells).
index = load_store(
    dirpath="/home/laisky/data/langchain/index",
    name="security",
)
# Question-answering chain over index.store. The cell below reads the "answer" and
# "sources" keys from its result. The LLM is text-davinci-003 with temperature=0,
# at most 1000 completion tokens and streaming disabled.
# reduce_k_below_max_tokens=True presumably trims the retrieved documents so the prompt
# fits the model's token limit -- TODO confirm against the langchain docs.
chain = VectorDBQAWithSourcesChain.from_llm(
    llm=OpenAI(
        temperature=0, 
        max_tokens=1000,
        model_name="text-davinci-003",
        streaming=False,
    ), 
#     retriever=VectorStoreRetriever(vectorstore=index.store, search_kwargs={"filter":{"type":"filter"},"k":3},),
    vectorstore=index.store,
    reduce_k_below_max_tokens=True,
)


In [27]:
# ====================================
# ask pdf embeddings
# ====================================
# The question is asked in Chinese: "What properties should a secure hash function have?"
question = "安全的哈希函数应该具备哪些性质"
# Run the QA chain built in the previous cell. The result is used below as a mapping with
# "answer" and "sources" keys. return_only_outputs=True presumably drops the echoed inputs
# from the result -- TODO confirm.
result = chain(
    {
        "question": question,
    },
    return_only_outputs=True,
)

# Show the question, the answer wrapped by pretty_print, and the cited sources.
print(f"🤔️: {question}\n")
print(f"🤖️: {pretty_print(result['answer'])}\n")
print(f"📖: {result['sources']}")

🤔️: 安全的哈希函数应该具备哪些性质

🤖️: A secure hash function should have the properties of pre-
    image resistance, second pre-image resistance, and
    collision resistance, as well as high security to
    prevent attacks, a non-negative constant for the system
    to learn after the algorithm has determined the system
    synchronization, and a neural network synchronization
    determination algorithm based on the output of the
    hidden unit.

📖: https://en.wikipedia.org/wiki/Cryptographic_hash_function#Security_requirements, https://s3.laisky.com/public/papers/security/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C%E5%90%8C%E6%AD%A5%E7%9A%84%E5%88%A4%E5%AE%9A%E5%8F%8A%E5%9C%A8%E7%A5%9E%E7%BB%8F%E5%AF%86%E7%A0%81%E4%B8%AD%E7%9A%84%E5%BA%94%E7%94%A8.pdf#page=3
