In [13]:
# ====================================
# openai tokens
# ====================================

import os
import glob
import codecs
import pickle
import re
import textwrap
from collections import namedtuple

import openai
import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter
from langchain.vectorstores import FAISS
from pymongo import MongoClient
from kipp.utils import setup_logger

from sys import path

path.append("/opt/configs/ramjet")
import prd

# ----------------------------------------------
# Azure
# ----------------------------------------------
# Export the Azure OpenAI settings from the `prd` config module as OPENAI_*
# environment variables.
os.environ.update(
    {
        "OPENAI_API_TYPE": "azure",
        "OPENAI_API_VERSION": prd.OPENAI_AZURE_VERSION,
        "OPENAI_API_BASE": prd.OPENAI_AZURE_API,
        "OPENAI_API_KEY": prd.OPENAI_AZURE_TOKEN,
    }
)

# Deployment ids of the embedding and chat deployments declared in `prd`.
azure_embeddings_deploymentid = prd.OPENAI_AZURE_DEPLOYMENTS["embedding"].deployment_id
azure_gpt_deploymentid = prd.OPENAI_AZURE_DEPLOYMENTS["chat"].deployment_id
# ----------------------------------------------

# ----------------------------------------------
# OpenAI
# ----------------------------------------------
# os.environ["OPENAI_API_KEY"] = prd.OPENAI_TOKEN
# ----------------------------------------------

# Bundle of a vector store and the set of file names already embedded into it.
# The typename matches the bound name so repr() and pickling by qualified name
# resolve to this class.
Index = namedtuple("Index", ["store", "scaned_files"])


# Notebook-wide logger named "security"; embedding_pdfs uses it to report
# files it had to skip.
logger = setup_logger("security")

def pretty_print(text: str) -> str:
    """Return ``text`` re-wrapped for display (nothing is printed here).

    Leading/trailing whitespace is stripped, then the text is wrapped to
    60 columns; every line after the first is indented by four spaces.
    """
    wrap_options = {"width": 60, "subsequent_indent": "    "}
    return textwrap.fill(text.strip(), **wrap_options)


In [18]:
# ==============================================================
# prepare pdf documents docs.index & docs.store
#
# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/pdf.html#retain-elements
#
# 通用的函数定义
# ==============================================================

from urllib.parse import quote

from langchain.document_loaders import PyPDFLoader

# from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter

# Splitter applied to PDF text: splits on newlines with chunk_size=500.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=500)
# Splitter applied to markdown files: chunk_size=500, chunk_overlap=50.
markdown_splitter = MarkdownTextSplitter(chunk_overlap=50, chunk_size=500)

# Per-call file budget consulted by embedding_pdfs / embedding_markdowns.
N_BACTCH_FILES = 5


def is_file_scaned(index: Index, fpath):
    """Tell whether the file at ``fpath`` has already been embedded.

    Only the final path component is compared against
    ``index.scaned_files``, so a bare file name works as well as a full path.
    """
    _, fname = os.path.split(fpath)
    return fname in index.scaned_files


def embedding_pdfs(index: Index, fpaths, url, replace_by_url):
    """Embed a batch of not-yet-scanned PDFs into ``index.store``.

    At most ``N_BACTCH_FILES`` files are embedded per call; callers are
    expected to call again until 0 is returned.

    Args:
        index: Index whose ``store`` receives the text chunks and whose
            ``scaned_files`` set is updated with the base name of every
            successfully processed file.
        fpaths: iterable of PDF file paths.
        url: URL prefix used to build each chunk's ``source`` metadata.
        replace_by_url: local path prefix stripped from ``fpath``; the
            remainder is percent-encoded and appended to ``url``.

    Returns:
        Number of files embedded by this call (0 when nothing new was done).
        Files that fail to load are logged and skipped; they are not marked
        as scanned, so a later call will try them again.
    """
    n_scanned = 0
    docs = []
    metadatas = []
    for fpath in fpaths:
        fname = os.path.split(fpath)[1]
        if is_file_scaned(index, fpath):
            continue

        # Collect per file first so that a failure half-way through a PDF
        # does not leave partial chunks of an unscanned file in the batch.
        file_docs = []
        file_metadatas = []
        try:
            furl = url + quote(fpath.removeprefix(replace_by_url), safe="")
            loader = PyPDFLoader(fpath)
            for ichunk, data in enumerate(loader.load_and_split()):
                # load_and_split() may cut one PDF page into several
                # documents, so the enumerate index is a chunk index. Prefer
                # the 0-based page number the loader records in the metadata
                # and fall back to the chunk index only if it is missing.
                page = data.metadata.get("page", ichunk)
                splits = text_splitter.split_text(data.page_content)
                file_docs.extend(splits)
                file_metadatas.extend(
                    {"source": f"{furl}#page={page+1}"} for _ in splits
                )
        except Exception as err:
            logger.error(f"skip file {fpath}: {err}")
            continue

        docs.extend(file_docs)
        metadatas.extend(file_metadatas)
        index.scaned_files.add(fname)
        print(f"scaned {fpath}")
        n_scanned += 1
        # `>=` keeps the batch at exactly N_BACTCH_FILES files.
        if n_scanned >= N_BACTCH_FILES:
            break

    # Scanned files may yield no text at all; skip the embedding call then.
    if docs:
        index.store.add_texts(docs, metadatas=metadatas)

    return n_scanned


def embedding_markdowns(index: Index, fpaths, url, replace_by_url):
    """Embed a batch of not-yet-scanned markdown files into ``index.store``.

    At most ``N_BACTCH_FILES`` files are embedded per call; callers are
    expected to call again until 0 is returned.

    Args:
        index: Index whose ``store`` receives the text chunks and whose
            ``scaned_files`` set is updated with the base name of every
            successfully processed file.
        fpaths: iterable of markdown file paths.
        url: URL prefix for the ``source`` metadata. When falsy, the bare
            file name is used as the source instead.
        replace_by_url: local path prefix stripped from ``fpath``; the
            remainder is percent-encoded and appended to ``url``.

    Returns:
        Number of files embedded by this call (0 when nothing new was done).
        Files that cannot be read or decoded are logged and skipped; they
        are not marked as scanned.
    """
    n_scanned = 0
    docs = []
    metadatas = []
    for fpath in fpaths:
        fname = os.path.split(fpath)[1]
        # NOTE(review): files are tracked by base name only, so two files
        # with the same name in different directories are treated as one.
        if is_file_scaned(index, fpath):
            continue

        try:
            with codecs.open(fpath, "rb", "utf8") as fp:
                content = fp.read()
        except (OSError, UnicodeError) as err:
            # Same policy as embedding_pdfs: one bad file must not abort
            # the whole batch.
            logger.error(f"skip file {fpath}: {err}")
            continue

        # The source prefix is the same for every chunk of a file.
        if url:
            source = url + quote(fpath.removeprefix(replace_by_url), safe="")
        else:
            source = fname

        for docu in markdown_splitter.create_documents([content]):
            docs.append(docu.page_content)
            # The first line of the chunk serves as the URL fragment.
            title = quote(docu.page_content.strip().split("\n", maxsplit=1)[0])
            metadatas.append({"source": f"{source}#{title}"})

        index.scaned_files.add(fname)
        print(f"scaned {fpath}")
        n_scanned += 1
        # `>=` keeps the batch at exactly N_BACTCH_FILES files.
        if n_scanned >= N_BACTCH_FILES:
            break

    # Empty markdown files produce no chunks; skip the embedding call then.
    if docs:
        index.store.add_texts(docs, metadatas=metadatas)

    return n_scanned


def load_store(dirpath, name) -> Index:
    """Load a persisted index from disk.

    Three sibling files sharing the prefix ``os.path.join(dirpath, name)``
    are read: ``.index`` (FAISS index), ``.store`` (pickled vector store)
    and ``.scanedfile`` (pickled collection of scanned file names).

    Args:
        dirpath: dirpath to store index files
        name: project/file name

    Returns:
        An ``Index`` whose store has the FAISS index re-attached.
    """
    prefix = os.path.join(dirpath, name)

    faiss_index = faiss.read_index(f"{prefix}.index")

    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by a trusted source.
    with open(f"{prefix}.store", "rb") as store_fp:
        store = pickle.load(store_fp)
    store.index = faiss_index

    with open(f"{prefix}.scanedfile", "rb") as scanned_fp:
        scaned_files = pickle.load(scanned_fp)

    return Index(store=store, scaned_files=scaned_files)


def new_store() -> Index:
    """Create a fresh ``Index`` backed by a new FAISS store.

    The store is built with the Azure embedding deployment and seeded with
    one placeholder text ("world", source "hello"); no file is marked as
    scanned.
    """
    embeddings = OpenAIEmbeddings(
        deployment=azure_embeddings_deploymentid,
        model="text-embedding-ada-002",
    )
    # presumably from_texts needs some content to build the index from;
    # verify before removing the placeholder
    seed_texts = ["world"]
    seed_metadatas = [{"source": "hello"}]
    return Index(
        store=FAISS.from_texts(seed_texts, embeddings, metadatas=seed_metadatas),
        scaned_files=set(),
    )


def save_store(index: Index, dirpath, name):
    """Persist ``index`` to three files under ``dirpath``.

    Using the prefix ``os.path.join(dirpath, name)`` this writes
    ``.index`` (via ``faiss.write_index``), ``.store`` (pickled vector store
    without its FAISS index) and ``.scanedfile`` (pickled scanned-file set).

    Args:
        index: the Index to persist; it stays usable after the call.
        dirpath: directory receiving the files.
        name: project/file name used as the file name stem.
    """
    fpath_prefix = os.path.join(dirpath, name)
    print(f"save store to {fpath_prefix}")

    store_index = index.store.index
    faiss.write_index(store_index, f"{fpath_prefix}.index")

    # The FAISS index is saved separately above, so detach it while the rest
    # of the store is pickled. Re-attach it in `finally`: otherwise a failed
    # dump would leave the in-memory store without its index.
    index.store.index = None
    try:
        with open(f"{fpath_prefix}.store", "wb") as f:
            pickle.dump(index.store, f)
    finally:
        index.store.index = store_index

    with open(f"{fpath_prefix}.scanedfile", "wb") as f:
        pickle.dump(index.scaned_files, f)


In [22]:
# =============================
# 定义文件路径
# =============================

# Directory holding the persisted index files; passed to load_store /
# save_store by the PDF scan and by the query-chain cell.
index_dirpath = "/home/laisky/data/langchain/index-azure/"

In [None]:
# incremental scan pdfs

def gen_pdfs():
    """Yield every ``*.pdf`` path found recursively under the security PDF directory."""
    pattern = "/home/laisky/data/langchain/pdf/security/**/*.pdf"
    for pdf_path in glob.glob(pattern, recursive=True):
        yield pdf_path

def run_scan_pdfs():
    """Incrementally embed all security PDFs into the "security" index.

    Each pass loads the index from ``index_dirpath``, embeds one batch of
    new PDFs, saves the index back and prints the running total. The loop
    ends after the first pass that embeds no file.
    """
    # To bootstrap an empty index, run this once beforehand:
    #     index = new_store()
    #     save_store(
    #         index=index,
    #         dirpath=index_dirpath,
    #         name="security",
    #     )

    total = 0
    n = None
    while n != 0:
        index = load_store(dirpath=index_dirpath, name="security")
        n = embedding_pdfs(
            index=index,
            fpaths=gen_pdfs(),
            url="https://s3.laisky.com/public/papers/security/",
            replace_by_url="/home/laisky/data/langchain/pdf/security/",
        )
        total += n
        # Saved after every pass, including the final empty one.
        save_store(index=index, dirpath=index_dirpath, name="security")
        print(f"scanned {total} files")
        
# Run the incremental PDF scan; it returns once a pass embeds no new file.
run_scan_pdfs()

[2023-06-04 03:45:41,545 - ERROR - /tmp/ipykernel_1424395/578690748.py:47 - security] - skip file /home/laisky/data/langchain/pdf/security/RFC2986_Certification Request Syntax Specification.pdf: EOF marker not found
scaned /home/laisky/data/langchain/pdf/security/NIST_SP-800-78-4-Cryptographic Algorithms and Key Sizes for Personal Identity Verification.pdf
scaned /home/laisky/data/langchain/pdf/security/NIST_SP-800-73-4_Interfaces for Personal Identity Verification.pdf
scaned /home/laisky/data/langchain/pdf/security/ISOIEC 27001-2022_Information security, cybersecurity and privacy protection - Information security management systems - Requirements.pdf
scaned /home/laisky/data/langchain/pdf/security/Practical Threshold Signatures.pdf
scaned /home/laisky/data/langchain/pdf/security/NIST_SP-800-57-Part 3_Application-Specific Key Management Guidance.pdf
scaned /home/laisky/data/langchain/pdf/security/RFC5280_Internet X.509 Public Key Infrastructure Certificate and Certificate Revocation Li

Multiple definitions in dictionary at byte 0xb57a for key /Ascent
Multiple definitions in dictionary at byte 0x303c0 for key /Ascent
Multiple definitions in dictionary at byte 0x4556f for key /Ascent


scaned /home/laisky/data/langchain/pdf/security/Computer Security Art And Science_Matt Bishop, Elisabeth Sullivan etc.pdf


Multiple definitions in dictionary at byte 0x79331 for key /Ascent
Created a chunk of size 676, which is longer than the specified 500
Overwriting cache for 0 2166


scaned /home/laisky/data/langchain/pdf/security/神经网络同步的判定及在神经密码中的应用.pdf
scaned /home/laisky/data/langchain/pdf/security/NIST_FIPS-198-1_The Keyed-Hash Message Authentication Code.pdf
scaned /home/laisky/data/langchain/pdf/security/NIST_FIPS-180-4_Secure Hash Standard.pdf
scaned /home/laisky/data/langchain/pdf/security/NIST_SP-800-89_Recommendation for Obtaining Assurances for Digital Signature Applications.pdf
scaned /home/laisky/data/langchain/pdf/security/whitepaper_Security of AWS CloudHSM Backups.pdf


In [None]:
# incremental scan markdowns

def gen_markdowns():
    """Yield the markdown files to embed: ``terms.md`` first, then every ``*.md`` under ``research/``."""
    content_root = "/home/laisky/data/langchain/basebit/doc/content"
    yield "/home/laisky/data/langchain/basebit/doc/content/terms.md"
    for md_path in glob.glob(
        "/home/laisky/data/langchain/basebit/doc/content/research/**/*.md",
        recursive=True,
    ):
        yield md_path
    

def run_scan_markdowns():
    """Incrementally embed the markdown documents into the "security" index.

    Each pass loads the index, embeds one batch of new markdown files, saves
    the index back and prints the batch size. The loop ends after the first
    pass that embeds no file.
    """
    # To start from an empty index instead:
    #         index = new_store()

    # NOTE(review): this uses ".../langchain/index", not `index_dirpath`
    # (".../index-azure/") like the PDF scan and the query chain do. Confirm
    # this is intended.
    n = None
    while n != 0:
        index = load_store(
            dirpath="/home/laisky/data/langchain/index",
            name="security",
        )
        # NOTE(review): `url` / `replace_by_url` are the PDF values, while the
        # markdown files live under ".../basebit/doc/content/", so the prefix
        # is never stripped. This looks like a copy-paste leftover; verify.
        n = embedding_markdowns(
            index=index,
            fpaths=gen_markdowns(),
            url="https://s3.laisky.com/public/papers/security/",
            replace_by_url="/home/laisky/data/langchain/pdf/security/",
        )
        save_store(
            index=index,
            dirpath="/home/laisky/data/langchain/index/",
            name="security",
        )
        print(f"{n=}")
        
        
# Run the incremental markdown scan; it returns once a pass embeds no new file.
run_scan_markdowns()

In [None]:
# ====================================
# 生成用于问答的 query chain
# ====================================

from langchain.chains import VectorDBQAWithSourcesChain, RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# System prompt for the QA chain. `{summaries}` is the placeholder for the
# retrieved context; the model is told to list its sources after "SOURCES:".
system_template="""Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""
# Chat prompt = system message (instructions + context) followed by the
# user's question.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

# Load the persisted "security" index that the scan cells maintain.
index = load_store(
    dirpath=index_dirpath,
    name="security",
)

# Forwarded to the chain so it uses the custom prompt above.
chain_type_kwargs = {"prompt": prompt}

# Alternative: talk to the OpenAI API directly instead of Azure.
# llm = ChatOpenAI(
#     model_name="gpt-3.5-turbo",
#     temperature=0,
#     max_tokens=1000)

# Chat model served by the Azure deployment configured in the first cell.
llm = AzureChatOpenAI(
    deployment_name=azure_gpt_deploymentid,
    model_name="gpt-3.5-turbo",
)

# Earlier variant kept for reference (VectorDBQAWithSourcesChain):
# chain = VectorDBQAWithSourcesChain.from_chain_type(
#     llm=llm,
#     vectorstore=index.store,
#     return_source_documents=True,
#     chain_type_kwargs=chain_type_kwargs,
#     reduce_k_below_max_tokens=True,
# )

# Retrieval QA chain ("stuff" chain type) over the loaded vector store; the
# source documents are returned alongside the answer.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=index.store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
    reduce_k_below_max_tokens=True,
)



In [34]:
# ====================================
# ask pdf embeddings
# ====================================
# Question sent to the QA chain.
question = "TLS 能干啥"
# Only the chain outputs are returned; the input question is not echoed back.
result = chain({"question": question}, return_only_outputs=True)

# Show the question, the wrapped answer and the cited sources.
print(f"🤔️: {question}\n")
print(f"🤖️: {pretty_print(result['answer'])}\n")
print(f"📖: {result['sources']}")

🤔️: TLS 能干啥

🤖️: TLS 是一种传输层安全协议，主要用于提供数据的加密和认证。它可以在网络连接中提供端到端的安全性，包括在客户端和服务器之
    间传输的数据的机密性、完整性和身份验证。TLS
    能够防止数据在传输过程中被窃听、篡改和伪造。在网络安全方面，TLS 是一个非常重要的协议，应该学习和了解。

📖: https://s3.laisky.com/public/papers/security/Brian%20Ward%20-%20How%20Linux%20Works%20%282021%2C%20No%20Starch%20Press%29.pdf#page=316
