In [11]:
# ====================================
# openai tokens
# ====================================

import os
import glob
import codecs
import pickle
import re
import textwrap
from collections import namedtuple

import openai
import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter
from langchain.vectorstores import FAISS
from pymongo import MongoClient
from kipp.utils import setup_logger

from sys import path

path.append("/opt/configs/ramjet")
import prd

# ----------------------------------------------
# Azure
# ----------------------------------------------
os.environ['OPENAI_API_TYPE'] = "azure"
os.environ['OPENAI_API_VERSION'] = prd.OPENAI_AZURE_VERSION
os.environ['OPENAI_API_BASE'] = prd.OPENAI_AZURE_API
os.environ['OPENAI_API_KEY'] = prd.OPENAI_AZURE_TOKEN

openai.api_type = os.environ['OPENAI_API_TYPE']
openai.api_version = os.environ['OPENAI_API_VERSION']
openai.api_base = os.environ['OPENAI_API_BASE']
openai.api_key = os.environ['OPENAI_API_KEY']

azure_embeddings_deploymentid = prd.OPENAI_AZURE_DEPLOYMENTS["embeddings"].deployment_id
azure_gpt_deploymentid = prd.OPENAI_AZURE_DEPLOYMENTS["chat"].deployment_id
# ----------------------------------------------

# ----------------------------------------------
# OpenAI
# ----------------------------------------------
# os.environ["OPENAI_API_KEY"] = prd.OPENAI_TOKEN
# ----------------------------------------------

Index = namedtuple("index", ["store", "scaned_files"])


logger = setup_logger("security")

def pretty_print(text: str) -> str:
    text = text.strip()
    return textwrap.fill(text, width=60, subsequent_indent="    ")


# =============================
# 定义文件路径
# =============================

index_dirpath = "/home/laisky/data/langchain/index-azure/"


In [33]:
# ==============================================================
# prepare pdf documents docs.index & docs.store
#
# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/pdf.html#retain-elements
#
# 通用的函数定义
# ==============================================================

from urllib.parse import quote

from langchain.document_loaders import PyPDFLoader

# from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=500, separator="\n")
markdown_splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=50)

N_BACTCH_FILES = 5


def is_file_scaned(index: Index, fpath):
    return os.path.split(fpath)[1] in index.scaned_files


def embedding_pdfs(index: Index, fpaths, url, replace_by_url):
    i = 0
    docs = []
    metadatas = []
    for fpath in fpaths:
        fname = os.path.split(fpath)[1]
        if is_file_scaned(index, fname):
            continue

        try:
            loader = PyPDFLoader(fpath)
            for page, data in enumerate(loader.load_and_split()):
                splits = text_splitter.split_text(data.page_content)
                docs.extend(splits)
                for ichunk, _ in enumerate(splits):
                    fnameurl = quote(fpath.removeprefix(replace_by_url), safe="")
                    furl = url + fnameurl
                    metadatas.append({"source": f"{furl}#page={page+1}"})
        except Exception as err:
            logger.error(f"skip file {fpath}: {err}")
            continue

        index.scaned_files.add(fname)
        print(f"scaned {fpath}")
        i += 1
        if i > N_BACTCH_FILES:
            break

    if i != 0:
        index.store.add_texts(docs, metadatas=metadatas)

    return i


def embedding_markdowns(index: Index, fpaths, url, replace_by_url):
    i = 0
    docs = []
    metadatas = []
    for fpath in fpaths:
        fname = os.path.split(fpath)[1]
        if is_file_scaned(index, fpath):
            continue

        with codecs.open(fpath, "rb", "utf8") as fp:
            docus = markdown_splitter.create_documents([fp.read()])
            for ichunk, docu in enumerate(docus):
                docs.append(docu.page_content)
                title = quote(docu.page_content.strip().split("\n", maxsplit=1)[0])
                if url:
                    fnameurl = quote(fpath.removeprefix(replace_by_url), safe="")
                    furl = url + fnameurl
                    metadatas.append({"source": f"{furl}#{title}"})
                else:
                    metadatas.append({"source": f"{fname}#{title}"})
                    
        index.scaned_files.add(fname)
        print(f"scaned {fpath}")
        i += 1
        if i > N_BACTCH_FILES:
            break

    if i != 0:
        index.store.add_texts(docs, metadatas=metadatas)

    return i


def load_store(dirpath, name) -> Index:
    """
    Args:
        dirpath: dirpath to store index files
        name: project/file name
    """
    index = faiss.read_index(f"{os.path.join(dirpath, name)}.index")
    with open(f"{os.path.join(dirpath, name)}.store", "rb") as f:
        store = pickle.load(f)
    store.index = index

    with open(f"{os.path.join(dirpath, name)}.scanedfile", "rb") as f:
        scaned_files = pickle.load(f)

    return Index(
        store=store,
        scaned_files=scaned_files,
    )


def new_store() -> Index:
    embedding_model = OpenAIEmbeddings(
        model="text-embedding-ada-002",
        deployment=azure_embeddings_deploymentid,
    )
    store = FAISS.from_texts(["world"], embedding_model, metadatas=[{"source": "hello"}])
    return Index(
        store=store,
        scaned_files=set([]),
    )


def save_store(index: Index, dirpath, name):
    store_index = index.store.index
    fpath_prefix = os.path.join(dirpath, name)
    print(f"save store to {fpath_prefix}")
    faiss.write_index(store_index, f"{fpath_prefix}.index")
    index.store.index = None
    with open(f"{fpath_prefix}.store", "wb") as f:
        pickle.dump(index.store, f)
    index.store.index = store_index

    with open(f"{fpath_prefix}.scanedfile", "wb") as f:
        pickle.dump(index.scaned_files, f)


In [13]:
# incremental scan pdfs

def gen_pdfs():
    yield from glob.glob("/home/laisky/data/langchain/pdf/security/**/*.pdf", recursive=True)

def run_scan_pdfs():
#     index = new_store()
#     save_store(
#         index=index, 
#         dirpath=index_dirpath, 
#         name="security",
#     )
    
    total = 0
    while True:
        index = load_store(
            dirpath=index_dirpath,
            name="security",
        )
        n = embedding_pdfs(
            index=index,
            fpaths=gen_pdfs(),
            url="https://s3.laisky.com/public/papers/security/",
            replace_by_url="/home/laisky/data/langchain/pdf/security/",
        )
        total += n
        save_store(
            index=index, 
            dirpath=index_dirpath, 
            name="security",
        )
        
#         return
        print(f"scanned {total} files")
        if n == 0:
            return
        
run_scan_pdfs()

[2023-06-15 03:00:14,366 - ERROR - /tmp/ipykernel_567492/578690748.py:47 - security] - skip file /home/laisky/data/langchain/pdf/security/RFC2986_Certification Request Syntax Specification.pdf: EOF marker not found
scaned /home/laisky/data/langchain/pdf/security/TEE/TIO/Intel® TDX Connect TEE-IO Device Guide.pdf
scaned /home/laisky/data/langchain/pdf/security/TEE/TIO/PCIe Security Webinar_Aug 2020_PDF.pdf
scaned /home/laisky/data/langchain/pdf/security/TEE/TIO/AMD SEV-TIO- Trusted I:O for Secure Encrypted Virtualization.pdf


Created a chunk of size 1415, which is longer than the specified 500
Created a chunk of size 759, which is longer than the specified 500
Created a chunk of size 921, which is longer than the specified 500
Created a chunk of size 526, which is longer than the specified 500
Created a chunk of size 1661, which is longer than the specified 500
Created a chunk of size 1349, which is longer than the specified 500
Created a chunk of size 960, which is longer than the specified 500


scaned /home/laisky/data/langchain/pdf/security/TEE/TIO/Software Enabling for Intel® TDX in Support of TEE-I:O.pdf
[2023-06-15 03:00:19,088 - ERROR - /tmp/ipykernel_567492/578690748.py:47 - security] - skip file /home/laisky/data/langchain/pdf/security/国标/GB_T39204-2022 信息安全技术 关键信息基础设施安全保护要求.pdf: EOF marker not found
save store to /home/laisky/data/langchain/index-azure/security
scanned 4 files
[2023-06-15 03:07:22,269 - ERROR - /tmp/ipykernel_567492/578690748.py:47 - security] - skip file /home/laisky/data/langchain/pdf/security/RFC2986_Certification Request Syntax Specification.pdf: EOF marker not found
[2023-06-15 03:07:22,485 - ERROR - /tmp/ipykernel_567492/578690748.py:47 - security] - skip file /home/laisky/data/langchain/pdf/security/国标/GB_T39204-2022 信息安全技术 关键信息基础设施安全保护要求.pdf: EOF marker not found
save store to /home/laisky/data/langchain/index-azure/security
scanned 4 files


In [None]:
# incremental scan markdowns

def gen_markdowns():
    yield "/home/laisky/data/langchain/basebit/doc/content/terms.md"
    yield from glob.glob("/home/laisky/data/langchain/basebit/doc/content/research/**/*.md", recursive=True)
    

def run_scan_markdowns():
#         index = new_store()
    while True:
        index = load_store(
            dirpath="/home/laisky/data/langchain/index",
            name="security",
        )
        files = gen_markdowns()
        n = embedding_markdowns(
            index=index,
            fpaths=files,
            url="https://s3.laisky.com/public/papers/security/",
            replace_by_url="/home/laisky/data/langchain/pdf/security/",
        )
        save_store(
            index=index,
            dirpath="/home/laisky/data/langchain/index/", 
            name="security",
        )
        
        print(f"{n=}")
        if n == 0:
            return
        
        
run_scan_markdowns()

In [47]:
# ====================================
# merge FAISS index
# ====================================

from typing import List

def merge_index(indexs: List[Index]) -> Index:
    new_index = new_store()
    
    for coming_idx in indexs:
        new_ids = faiss.IDSelectorRange(0, new_index.store.ntotal)
        coming_ids = faiss.IDSelectorRange(0, coming_idx.ntotal)
        _, dups = faiss.bitset_remove_pairwise_and_reorder(new_ids, coming_ids)
        new_index = faiss.reconstruct_from_subset(new_index, dups)

        new_index.store.merge_from(coming_idx.store)
        new_index.scaned_files.add(coming_idx.scaned_files)
        
    return new_index
        

new_index = new_store()
old_index = load_store(
    dirpath="/home/laisky/data/langchain/index",
    name="security",
)
      
    
# merge_index([new_index, old_index])

In [49]:
new_index.store.merge_from(old_index.store)
print(new_index.store.index.ntotal)
print(old_index.store.index.ntotal)

170563
0


In [50]:
# ====================================
# 生成用于问答的 query chain
# ====================================

from langchain.chains import VectorDBQAWithSourcesChain, RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chains.question_answering import load_qa_chain

system_template="""Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

# index = load_store(
#     dirpath=index_dirpath,
#     name="security",
# )
index = new_index

chain_type_kwargs = {"prompt": prompt}

# llm = ChatOpenAI(
#     model_name="gpt-3.5-turbo", 
#     temperature=0, 
#     max_tokens=1000)  

llm = AzureChatOpenAI(
    deployment_name=azure_gpt_deploymentid,
    model_name="gpt-3.5-turbo",
    max_tokens=2000,
)

# chain = VectorDBQAWithSourcesChain.from_chain_type(
#     llm=llm,
#     vectorstore=index.store,
#     return_source_documents=True,
#     chain_type_kwargs=chain_type_kwargs,
#     reduce_k_below_max_tokens=True,
# )



# chain = RetrievalQAWithSourcesChain.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=index.store.as_retriever(),
#     return_source_documents=True,
#     chain_type_kwargs=chain_type_kwargs,
#     reduce_k_below_max_tokens=True,
# )


query = "list tpm's features"
related_docs = index.store.similarity_search(
    query=query,
    k=10,
)
chain = load_qa_chain(llm, chain_type="stuff")
response = chain.run(
    input_documents=related_docs, 
    question=query,
)
print(response)

The TPM (Trusted Platform Module) provides several security features, including:
- Support for bulk (symmetric) encryption in the platform
- High-quality random numbers
- Cryptographic services
- A protected persistent store for small amounts of data, sticky-bits, monotonic counters and extendable registers
- A protected pseudo-persistent store for unlimited amounts of keys and data
- An extensive choice of authorization methods
- Provision of monotonic counters that can provide a secure mechanism to prevent replay attacks
- Provision of time-stamping
- Algorithm agility, the ability to implement new cryptographic algorithms as needed
- Support for multiple TPM instances
- Feature API to make the most-used facilities of the TPM 2.0 easily available to programmers
- User-defined indexes that can hold unstructured data with various read and write locks
- Attestation, which allows the TPM to provide a fundamental set of security features that have been defined by the TCG, including protec

In [8]:
# ====================================
# ask pdf embeddings
# ====================================
question = "how virtual tpm help measured guest system boot, explain in step by step"
result = chain(
    {
        "question": question,
    },
    return_only_outputs=True,
)

print(f"🤔️: {question}\n")
print(f"🤖️: {pretty_print(result['answer'])}\n")
print(f"📖: {result['sources']}")

ValueError: Missing some input keys: {'input_documents'}