In [1]:
# ====================================
# openai tokens
# ====================================

import os
import glob
import codecs
import pickle
import re
import textwrap
from collections import namedtuple
from concurrent.futures import ThreadPoolExecutor

import openai
import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter
from langchain.vectorstores import FAISS
from pymongo import MongoClient
from kipp.utils import setup_logger

from sys import path

path.append("/opt/configs/ramjet")
import prd

# ----------------------------------------------
# Azure
# ----------------------------------------------
# os.environ['OPENAI_API_TYPE'] = "azure"
# os.environ['OPENAI_API_VERSION'] = prd.OPENAI_AZURE_VERSION
# os.environ['OPENAI_API_BASE'] = prd.OPENAI_AZURE_API
# os.environ['OPENAI_API_KEY'] = prd.OPENAI_AZURE_TOKEN

# openai.api_type = os.environ['OPENAI_API_TYPE']
# openai.api_version = os.environ['OPENAI_API_VERSION']
# openai.api_base = os.environ['OPENAI_API_BASE']
# openai.api_key = os.environ['OPENAI_API_KEY']

# azure_embeddings_deploymentid = prd.OPENAI_AZURE_DEPLOYMENTS["embeddings"].deployment_id
# azure_gpt_deploymentid = prd.OPENAI_AZURE_DEPLOYMENTS["chat"].deployment_id
# ----------------------------------------------

# ----------------------------------------------
# OpenAI
# ----------------------------------------------
# Publish the credentials/endpoint declared in `prd` through the process
# environment first, then mirror the same values onto the `openai` module.
os.environ.update(
    {
        "OPENAI_API_KEY": prd.OPENAI_TOKEN,
        "OPENAI_API_BASE": prd.OPENAI_API,
    }
)

openai.api_key, openai.api_base = (
    os.environ["OPENAI_API_KEY"],
    os.environ["OPENAI_API_BASE"],
)
# ----------------------------------------------

# Pair bundling a vector store (`store`) with the collection of file names
# already embedded into it (`scaned_files`, consulted by `is_file_scaned`).
Index = namedtuple("index", ["store", "scaned_files"])


def pretty_print(text: str) -> str:
    text = text.strip()
    return textwrap.fill(text, width=60, subsequent_indent="    ")


# =============================
# Define file paths
# =============================
name = "immigrate"
logger = setup_logger(name)

index_dirpath = "/home/laisky/data/langchain/index-azure"
pdf_dirpath = f"/home/laisky/data/langchain/pdf/{name}"

# Create the working directories, including any missing parents
# (`pdf_dirpath` is nested, so a plain `os.mkdir` fails on a fresh machine).
# The loop variable is intentionally not named `path`, so the `sys.path`
# list imported at the top of this cell is not rebound to a string.
for dirpath in (index_dirpath, pdf_dirpath):
    os.makedirs(dirpath, exist_ok=True)



In [2]:
# ==============================================================
# prepare pdf documents docs.index & docs.store
#
# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/pdf.html#retain-elements
#
# Common function definitions
# ==============================================================

from urllib.parse import quote

from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, MarkdownTextSplitter
from ramjet.tasks.gptchat.llm.embeddings import reset_eof_of_pdf

# Plain-text splitter configured with chunk_size=500 and "\n" as separator.
text_splitter = CharacterTextSplitter(chunk_size=500, separator="\n")
# Markdown splitter configured with chunk_size=500 and chunk_overlap=50.
markdown_splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=50)

# Per-call file limit; only referenced by the commented-out embedding helpers
# further down in this cell.
N_BACTCH_FILES = 5


def is_file_scaned(index: Index, fpath):
    """Tell whether the file at ``fpath`` is already recorded in ``index``.

    Only the last path component (the file name) is looked up in
    ``index.scaned_files``; the directory part of ``fpath`` is ignored.
    """
    _, fname = os.path.split(fpath)
    return fname in index.scaned_files


# def embedding_pdfs(index: Index, fpaths, url, replace_by_url):
#     i = 0
#     docs = []
#     metadatas = []
#     for fpath in fpaths:
#         fname = os.path.split(fpath)[1]
#         if is_file_scaned(index, fname):
#             continue

#         try:
#             reset_eof_of_pdf(fpath)
#             loader = PyPDFLoader(fpath)
#             for page, data in enumerate(loader.load_and_split()):
#                 splits = text_splitter.split_text(data.page_content)
#                 docs.extend(splits)
#                 for ichunk, _ in enumerate(splits):
#                     fnameurl = quote(fpath.removeprefix(replace_by_url), safe="")
#                     furl = url + fnameurl
#                     metadatas.append({"source": f"{furl}#page={page+1}"})
#         except Exception:
#             logger.exception(f"skip file {fpath}")
#             continue

#         index.scaned_files.add(fname)
#         print(f"scaned {fpath}")
#         i += 1
#         if i > N_BACTCH_FILES:
#             break

#     if i != 0:
#         # fix stupid compatability issue in langchain faiss
#         if not getattr(index.store, "_normalize_L2", None):
#             index.store._normalize_L2 = False
            
#         index.store.add_texts(docs, metadatas=metadatas)

#     return i


# def embedding_markdowns(index: Index, fpaths, url, replace_by_url):
#     i = 0
#     docs = []
#     metadatas = []
#     for fpath in fpaths:
#         fname = os.path.split(fpath)[1]
#         if is_file_scaned(index, fpath):
#             continue

#         with codecs.open(fpath, "rb", "utf8") as fp:
#             docus = markdown_splitter.create_documents([fp.read()])
#             for ichunk, docu in enumerate(docus):
#                 docs.append(docu.page_content)
#                 title = quote(docu.page_content.strip().split("\n", maxsplit=1)[0])
#                 if url:
#                     fnameurl = quote(fpath.removeprefix(replace_by_url), safe="")
#                     furl = url + fnameurl
#                     metadatas.append({"source": f"{furl}#{title}"})
#                 else:
#                     metadatas.append({"source": f"{fname}#{title}"})
                    
#         index.scaned_files.add(fname)
#         print(f"scaned {fpath}")
#         i += 1
#         if i > N_BACTCH_FILES:
#             break

#     if i != 0:
#         index.store.add_texts(docs, metadatas=metadatas)

#     return i


def load_store(dirpath, name) -> Index:
    """Load a previously saved index from disk.

    Three files sharing the prefix ``<dirpath>/<name>`` are read:
    ``.index`` (FAISS index), ``.store`` (pickled store object) and
    ``.scanedfile`` (pickled collection of scanned file names).

    Args:
        dirpath: dirpath to store index files
        name: project/file name

    Returns:
        An ``Index`` whose store has the FAISS index re-attached and whose
        ``embedding_function`` is taken from a freshly built ``OpenAIEmbeddings``.
    """
    embedding_kwargs = {"client": None, "model": "text-embedding-ada-002"}
    if os.environ.get("OPENAI_API_TYPE") == "azure":
        # Azure additionally needs the deployment id declared in `prd`.
        embedding_kwargs["deployment"] = prd.OPENAI_AZURE_DEPLOYMENTS[
            "embeddings"
        ].deployment_id
    embedding_model = OpenAIEmbeddings(**embedding_kwargs)

    fpath_prefix = os.path.join(dirpath, name)

    faiss_index = faiss.read_index(f"{fpath_prefix}.index")
    with open(f"{fpath_prefix}.store", "rb") as store_fp:
        store = pickle.load(store_fp)
    # The store is saved without its FAISS index (see save_store); plug it back in.
    store.index = faiss_index

    with open(f"{fpath_prefix}.scanedfile", "rb") as scaned_fp:
        scaned_files = pickle.load(scaned_fp)

    # Rebind the embedding function so the same saved store works with
    # either the Azure or the OpenAI embeddings client.
    store.embedding_function = embedding_model.embed_query

    return Index(store=store, scaned_files=scaned_files)


def new_store() -> Index:
    """Create a brand-new index seeded with a single placeholder document.

    The FAISS store is built from the text ``"world"`` with metadata
    ``{"source": "hello"}``; the set of scanned files starts empty.

    Returns:
        A fresh ``Index`` that can be handed to ``save_store``.
    """
    embedding_kwargs = {"client": None, "model": "text-embedding-ada-002"}
    if os.environ.get("OPENAI_API_TYPE") == "azure":
        # Resolve the deployment id locally, the same way load_store() does.
        # The module-level `azure_embeddings_deploymentid` assignment is
        # commented out, so referring to that global raised NameError here.
        embedding_kwargs["deployment"] = prd.OPENAI_AZURE_DEPLOYMENTS[
            "embeddings"
        ].deployment_id
    embedding_model = OpenAIEmbeddings(**embedding_kwargs)

    store = FAISS.from_texts(["world"], embedding_model, metadatas=[{"source": "hello"}])
    return Index(
        store=store,
        scaned_files=set(),
    )


def save_store(index: Index, dirpath, name):
    """Persist ``index`` as three files sharing the prefix ``<dirpath>/<name>``.

    Writes ``.index`` (via ``faiss.write_index``), ``.store`` (pickled store
    with its FAISS index detached) and ``.scanedfile`` (pickled
    ``index.scaned_files``).

    Args:
        index: the index to save
        dirpath: directory to write the files into
        name: project/file name used as the file-name prefix
    """
    store_index = index.store.index
    fpath_prefix = os.path.join(dirpath, name)
    print(f"save store to {fpath_prefix}")
    faiss.write_index(store_index, f"{fpath_prefix}.index")

    # The FAISS index is written separately above, so it is detached while the
    # rest of the store is pickled. `finally` guarantees it is re-attached
    # even when pickling fails, so the in-memory store stays usable.
    index.store.index = None
    try:
        with open(f"{fpath_prefix}.store", "wb") as f:
            pickle.dump(index.store, f)
    finally:
        index.store.index = store_index

    with open(f"{fpath_prefix}.scanedfile", "wb") as f:
        pickle.dump(index.scaned_files, f)


Using 16 process workers and 20 thread workers


In [4]:
# incremental scan pdfs
# /home/laisky/data/langchain/pdf/security

from ramjet.tasks.gptchat.llm.embeddings import _embedding_pdf

def gen_pdfs():
    """Yield the path of every ``*.pdf`` found recursively under ``pdf_dirpath``."""
    pattern = f"{pdf_dirpath}/**/*.pdf"
    for fpath in glob.glob(pattern, recursive=True):
        yield fpath

def run_scan_pdfs():
    """Embed every not-yet-scanned PDF under ``pdf_dirpath`` into the saved index.

    Loads the index, embeds each new PDF with ``_embedding_pdf``, merges the
    resulting store into the main one, records the file name as scanned and
    finally saves the index back to ``index_dirpath``. Files that fail to
    parse are logged and skipped, so they are retried on the next run.
    """
    # To bootstrap an empty index on first use, run once:
    #     save_store(index=new_store(), dirpath=index_dirpath, name=name)

    index = load_store(
        dirpath=index_dirpath,
        name=name,
    )

    total = 0
    for fpath in gen_pdfs():
        fname = os.path.split(fpath)[1]
        if is_file_scaned(index, fname):
            continue

        metadata_name = f"https://s3.laisky.com/public/papers/{fpath.removeprefix('/home/laisky/data/langchain/pdf/')}"
        logger.info(f"scan pdf {metadata_name=}")

        try:
            file_index = _embedding_pdf(
                fpath=fpath,
                metadata_name=metadata_name,
                apikey=os.environ['OPENAI_API_KEY'],
                max_chunks=1000,
            )
            index.store.merge_from(file_index.store)
        except Exception:
            logger.exception(f"failed parse file {fpath=}")
            continue

        # Remember the file so the scan really is incremental. Without this,
        # `scaned_files` never grows and every run re-embeds all PDFs and
        # merges duplicate chunks into the store.
        index.scaned_files.add(fname)
        total += 1

    save_store(
        index=index,
        dirpath=index_dirpath,
        name=name,
    )

    print(f"scanned {total} files")


run_scan_pdfs()

save store to /home/laisky/data/langchain/index-azure/immigrate
[2023-09-18 06:27:25,451 - INFO - /tmp/ipykernel_2399029/2716387378.py:29 - immigrate] - scan pdf metadata_name='https://s3.laisky.com/public/papers/immigrate/canada/Welcome to Canada What you should know.pdf'
[2023-09-18 06:27:36,704 - INFO - /tmp/ipykernel_2399029/2716387378.py:29 - immigrate] - scan pdf metadata_name='https://s3.laisky.com/public/papers/immigrate/canada/Apply for permanent residence- Documents for Express Entry.pdf'
[2023-09-18 06:27:39,485 - INFO - /tmp/ipykernel_2399029/2716387378.py:29 - immigrate] - scan pdf metadata_name='https://s3.laisky.com/public/papers/immigrate/canada/Eligibility to apply for the Canadian Experience Class (Express Entry).pdf'
[2023-09-18 06:27:43,573 - INFO - /tmp/ipykernel_2399029/2716387378.py:29 - immigrate] - scan pdf metadata_name='https://s3.laisky.com/public/papers/immigrate/canada/immigration_and_refugee_law.pdf'
[2023-09-18 06:27:52,255 - INFO - /tmp/ipykernel_239902

In [None]:
# incremental scan markdowns

def gen_markdowns():
    """Yield the markdown files to embed.

    The fixed ``terms.md`` path comes first, followed by every ``*.md`` file
    found recursively under the ``research`` directory.
    """
    yield "/home/laisky/data/langchain/basebit/doc/content/terms.md"

    research_pattern = "/home/laisky/data/langchain/basebit/doc/content/research/**/*.md"
    for fpath in glob.glob(research_pattern, recursive=True):
        yield fpath
    

def run_scan_markdowns():
    """Embed markdown files round by round until nothing new is left.

    Each round reloads the saved index, hands the markdown paths to
    ``embedding_markdowns``, saves the index again and prints the number that
    call returned; the loop stops once that number is 0.

    NOTE(review): ``embedding_markdowns`` only exists as commented-out code in
    this notebook, so it has to be provided from elsewhere before this cell
    can run — TODO confirm.
    """
    n = None
    while n != 0:
        index = load_store(
            dirpath="/home/laisky/data/langchain/index",
            name=name,
        )
        n = embedding_markdowns(
            index=index,
            fpaths=gen_markdowns(),
            url=f"https://s3.laisky.com/public/papers/{name}/",
            replace_by_url=f"/home/laisky/data/langchain/pdf/{name}/",
        )
        save_store(
            index=index,
            dirpath="/home/laisky/data/langchain/index/",
            name=name,
        )
        print(f"{n=}")


run_scan_markdowns()

In [6]:
# ====================================
# Build the query chain used for question answering
# ====================================

from langchain.chains import VectorDBQAWithSourcesChain, RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import LLMChain

# System prompt for a "with sources" style answer; `{summaries}` is the slot
# for the retrieved context.
system_template="""Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
# NOTE: `prompt` is built here but is not passed to `load_qa_chain` below;
# only the commented-out `LLMChain` alternative would use it.
prompt = ChatPromptTemplate.from_messages(messages)

# Load the saved index for the current project `name`.
index = load_store(
    dirpath=index_dirpath,
    name=name,
)

# Chat model shared by the cells below (temperature 0, non-streaming).
llm = ChatOpenAI(
    client=None,
    model="gpt-3.5-turbo", 
    temperature=0, 
    max_tokens=2000,
    streaming=False,
)  

# Azure alternative (needs `azure_gpt_deploymentid`, which is commented out
# in the first cell):
# llm = AzureChatOpenAI(
#     deployment_name=azure_gpt_deploymentid,
#     model_name="gpt-3.5-turbo",
#     max_tokens=2000,
# )

# "stuff" question-answering chain; it is later called with
# `input_documents` and `question`.
chain = load_qa_chain(llm, chain_type="stuff")
# chain = LLMChain(llm=llm, prompt=prompt)

In [13]:
# ====================================
# ask pdf embeddings
# ====================================
# The user's question, in its original language.
query = "提交 FSW 前需要准备什么考试"


# Ask the chat model to translate/polish the question into English; the
# English version is what gets used for the similarity search below.
translated = llm.predict(f"""I want you to act as an English translator, spelling corrector and improver. 
I will speak to you in any language and you will detect the language, 
translate it and answer in the corrected and improved version of my text, in English. 
I want you to replace my simplified A0-level words and sentences with more beautiful and elegant, 
upper level English words and sentences. Keep the meaning same, but make them more literary. 
I want you to only reply the correction, the improvements and nothing else, do not write explanations. 
following is the sentence that need to be translate:
---
{query}""")

# Retrieve the 5 chunks most similar to the *translated* question.
related_docs = index.store.similarity_search(
    query=translated,
    k=5,
)

# Answer with the QA chain; note that the chain is asked the original
# (untranslated) `query`, not `translated`.
response = chain.run(
#     context=';'.join([d.page_content for d in related_docs]), 
    input_documents=related_docs,
    question=query,
)

# Show the question followed by the model's answer.
print(f"🤔️: {query}\n")
print(f"📖: {response}\n")

🤔️: 提交 FSW 前需要准备什么考试

📖: 根据提供的上下文，没有明确提到提交FSW（Federal Skilled Worker）申请前需要参加任何考试。然而，根据加拿大移民规定，申请FSW需要满足一些基本要求，包括语言能力测试（如IELTS或CELPIP）和教育背景评估（如学历认证）。此外，根据个人情况，可能还需要通过其他专业资格考试或技能评估。建议您查阅加拿大移民部官方网站或咨询专业移民顾问以获取最准确和最新的信息。



In [None]:
# ====================================
# manual QA based on embeddings, step by step
# ====================================
from typing import List
import re

from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain


# System prompt for the manual, multi-round QA loop. It rebinds
# `system_template`, `messages` and `prompt` from the earlier chain-setup cell.
# The "I need more informations about: ..." sentence is the signal the
# follow-up loop in this cell looks for in the model's reply.
system_template="""Use the following pieces of context to answer the users question.
If you don't know the answer, or you think more information is needed to provide a better answer, 
just say in this strict format: "I need more informations about: [list keywords that will be used to search more informations]" to ask more informations, 
don't try to make up an answer.
----------------
context: {summaries}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)


def query_for_more_info(query: str) -> str:
    """Return the text of the chunks most similar to ``query``.

    Runs a similarity search (k=5) against the global ``index`` and joins the
    ``page_content`` of the hits with ``"; "``.
    """
    hits = index.store.similarity_search(query=query, k=5)
    return "; ".join(doc.page_content for doc in hits)


# NOTE: this rebinds the global `chain` (previously the "stuff" QA chain) to
# a plain LLMChain driven by the prompt defined above.
chain = LLMChain(llm=llm, prompt=prompt)


query = "how to measure host os in vm by vtpm"


n = 0
last_sub_query = ""
# The system prompt asks the model to reply with
#   I need more informations about: [keywords]
# while the sample reply in the scratch cell below has the form
#   I need more information about "keywords"
# The previous pattern only matched the quoted form, so the format requested
# by the prompt was never recognised. This pattern accepts both, plus the
# colon form without brackets, and still exposes a single capture group.
regexp = re.compile(r'I need more informations? about(?::\s*["\[]?|\s*["\[])([^"\]\n]+)')
while n<3:
    n += 1
    resp = chain.run({
        "summaries": query_for_more_info(query),
        "question": query,
    })
    matched = regexp.findall(resp)
    if len(matched) == 0:
        break

    sub_query = matched[0].strip()
    # Stop when the model asks for nothing, or for the same thing twice in a row.
    if not sub_query or sub_query == last_sub_query:
        break
    last_sub_query = sub_query

    print(f"require more informations about: {sub_query}")
    # Append the extra context to the question for the next round.
    query += f"; {query_for_more_info(sub_query)}"

print(resp)

In [None]:
# Sample model reply used to sanity-check the follow-up pattern.
cnt = 'I need more information about "Amber GPU-CC" to provide an accurate answer. Could you please provide more context or clarify your question?'

# Single capture group holding the requested keywords. Accepts the quoted form
# (`about "x"`), the bracketed form (`about: [x]`) and the bare colon form
# (`about: x`). The previous `[^\)]+` ran up to the LAST double quote in the
# reply, so replies with several quoted spans were over-captured.
regexp = re.compile(r'I need more informations? about(?::\s*["\[]?|\s*["\[])([^"\]\n]+)')

regexp.findall(cnt)

In [None]:
# ====================================
# use vector store in functions (agents)
# ====================================

from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.tools import BaseTool
from langchain.llms import OpenAI
from langchain import LLMMathChain, SerpAPIWrapper


# NOTE(review): `query` here is whatever an earlier cell left behind; the
# assignment `query = "what is tee-io"` only happens further down in this cell.
related_docs = index.store.similarity_search(
    query=query,
    k=5,
)


def query_for_agent(query: str) -> str:
    """Tool function: return the text of the chunks most similar to ``query``.

    Runs a similarity search (k=5) against the global ``index`` and joins the
    ``page_content`` of the hits with newlines.
    """
    hits = index.store.similarity_search(query=query, k=5)
    return "\n".join(doc.page_content for doc in hits)

def context_for_agent(query: str) -> str:
    """Answer ``query`` by running the global ``chain`` over similar chunks.

    The chunks come from a similarity search (k=5) on the global ``index`` and
    are passed to ``chain.run`` as ``input_documents`` alongside ``question``.

    NOTE(review): an earlier cell rebinds ``chain`` to an ``LLMChain``; this
    call presumably expects the "stuff" QA chain — TODO confirm.
    """
    hits = index.store.similarity_search(query=query, k=5)
    return chain.run(input_documents=hits, question=query)
    
    
# Single tool exposed to the agent: a vector-store lookup.
tools = [
    Tool(
        name="Search",
        func=query_for_agent,
        description="useful for when you need to answer questions, this function takes a string as input and returns a string. This function is capable of vectorizing the input string and searching for similar information in a vector database. Your AI can call this function to retrieve the data it needs based on its requirements.",
    ),
]

query = "what is tee-io"

agent = initialize_agent(
#     tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True
    tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True,
)

# The agent executor takes one input, the user's question. Passing
# `input_documents=` / `question=` (the QA-chain calling convention) fails
# input validation because neither key is the agent's `input`. The agent
# fetches whatever context it needs itself through the "Search" tool.
agent.run(query)



In [None]:
agent.run??