In [None]:
import os, sys

# Make the parent directory importable (presumably where the project-local
# `utils` package used by later cells lives — confirm).
# The path is made absolute so it does not depend on later working-directory
# changes, and it is added only once so re-running this cell does not pile up
# duplicate sys.path entries.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

## 读取数据

In [None]:
from utils.document_loader import auto_read_pdf
# Location handed to the project-local PDF reader (relative to the working directory).
path = r"./data"
# The next cell reads `.page_content` and `.metadata["source"]` from each item,
# so `documents` is a sequence of Document-like objects.
documents = auto_read_pdf(path)

### 写入文件

In [None]:
import pandas as pd

# Flatten each loaded document into a (text, origin) record, then dump them to CSV.
records = []
for doc in documents:
    records.append({'page_content': doc.page_content, 'source': doc.metadata["source"]})

df = pd.DataFrame(records)
df.to_csv("stkj_ocr.csv", index=False)

# Markdown读取

In [None]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
# Load test.md with the loader's "elements" mode.
loader = UnstructuredMarkdownLoader('test.md', mode="elements")

# NOTE(review): this rebinds `documents` (previously the PDF reader output), and the
# following cell rebinds it again to the raw file text before anything reads this value.
documents = loader.load()

In [None]:
# Read the whole markdown file as a single string (the header splitter's split_text is fed this value).
# NOTE(review): `documents` is a str from here on; any later cell that expects a list of
# Document objects under this name will break on a top-to-bottom run — consider a separate name.
with open('test.md', 'r', encoding='utf-8') as file:
    documents = file.read()

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# (marker, label) pairs for heading levels 1-3, handed to MarkdownHeaderTextSplitter.
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in (1, 2, 3)]

# First pass: split the raw markdown text on its headings.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(documents)

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Second pass: cut each heading section into chunks with a small overlap.
chunk_size, chunk_overlap = 250, 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

splits = text_splitter.split_documents(md_header_splits)
splits

## Chroma Embedding

In [None]:
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings

# Defined here so the cell runs on a fresh kernel; this is the same value the
# FAISS section assigns to `local_model` further down.
local_model = "llama2-chinese:13b"
embeddings = OllamaEmbeddings(model=local_model, show_progress=True)

# Index the chunked Document objects produced by the splitting cell.
# `documents` holds the raw markdown string at this point, which
# Chroma.from_documents cannot consume.
vectorstore = Chroma.from_documents(
    splits,
    embedding=embeddings,
)

In [None]:
# vectorstore.similarity_search("公司主要产品")
from langchain_core.runnables import RunnableLambda

# Wrap the store's similarity_search in a Runnable with k=1 bound,
# then try it on a batch containing a single query.
retriever = RunnableLambda(vectorstore.similarity_search).bind(k=1)  # select top result
retriever.batch(["公司主要产品"])

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

message = """
根据Context回答问题,如果没有相关信息,则回答没有.用中文回答.
{question}
Context:
{context}
"""

prompt = ChatPromptTemplate.from_messages([("human", message)])

# `llm` is otherwise first created in the MapReduce section further down, so on a
# fresh top-to-bottom run it does not exist yet. Create it here with the same
# settings that section uses.
# NOTE(review): confirm this is the model intended for this step.
from langchain_community.llms import Ollama
llm = Ollama(model="llama2-chinese:13b", temperature=0)


def _join_page_contents(docs):
    """Join the retrieved documents' text so the prompt receives plain text, not a list repr."""
    return "\n\n".join(doc.page_content for doc in docs)


# The question is forwarded unchanged; the context is the text of the retrieved documents.
rag_chain = (
    {"context": retriever | _join_page_contents, "question": RunnablePassthrough()}
    | prompt
    | llm
)
response = rag_chain.invoke("公司主营产品是什么")
print(response)

## 简单的将文本进行总结

In [None]:
from langchain.chains import SimpleSequentialChain, LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

def print_wrapped_text(text, width=80):
    """Print `text`, hard-wrapping every line into pieces of at most `width` characters.

    Lines are split on '\n'. An empty line is printed as an empty line.
    """
    for raw_line in text.split('\n'):
        offsets = range(0, len(raw_line), width)
        # With no pieces print() still emits a bare newline, same as printing ''.
        print(*(raw_line[offset:offset + width] for offset in offsets), sep='\n')

# Summarize chain built with load_summarize_chain's defaults.
summarize_chain = load_summarize_chain(llm)

# Prompt template for the translation step
translation_prompt = PromptTemplate(input_variables=["text"], template="将以下文本翻译成中文：\n\n{text}")
# Build the translation chain
translation_chain = LLMChain(llm=llm, prompt=translation_prompt)
# Combine the summarize chain and the translation chain
def summarize_and_translate(docs):
    """Run `docs` through `summarize_chain`, then feed the summary to `translation_chain`.

    Returns whatever `translation_chain.run` returns for the summary text.
    """
    return translation_chain.run({"text": summarize_chain.run(docs)})

# Summarize one chunk and translate the summary.
# On a top-to-bottom run `documents` is the raw markdown string read earlier, so
# `documents[2:3]` would hand a one-character str to the chain. Take the slice
# from the chunked Document list `splits` instead.
# NOTE(review): confirm the markdown chunks (rather than the PDF pages) are the intended input.
docs = splits[2:3]
# print(docs[0].page_content)
translated_summary = summarize_and_translate(docs)
print_wrapped_text(translated_summary)

## [Method 1: Stuffing](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/document-summarization/summarization_large_documents_langchain.ipynb)

Stuffing is the simplest method to pass data to a language model. It "stuffs" text into the prompt as context in a way that all of the relevant information can be processed by the model to get what you want.

In LangChain, you can use `StuffDocumentsChain` through the `load_summarize_chain` helper: all you need to do is set `chain_type` to `"stuff"`.

In [None]:
from langchain.document_loaders import PyPDFLoader
from utils.chain.load_translate_chain import load_translate_chain
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import SimpleSequentialChain

# Load the announcement PDF and split it into documents.
pdf_loader = PyPDFLoader('./data/688169_石头科技_北京石头世纪科技股份有限公司第二届董事会第二十五次会议决议公告_1220086493.pdf')
docs = pdf_loader.load_and_split()

# Two steps run back to back: "stuff" summarization, then translation.
s_chain = load_summarize_chain(llm, chain_type="stuff")
t_chain = load_translate_chain(llm)
combined_chain = SimpleSequentialChain(chains=[s_chain, t_chain])

result = combined_chain.invoke({"input": docs})
print(result)

## 自定义的公告总结

In [None]:
from utils.chain.custom_load_summarize_chain import custom_load_summarize_chain

# Build the project's custom summarize chain and run it over `docs`.
# NOTE(review): the next cell defines a function with the same name, which
# shadows this import once that cell has run.
chain = custom_load_summarize_chain(llm)
chain.run({"input_documents": docs})

In [None]:
from typing import Any, Mapping, Optional, Protocol

from langchain_core.callbacks import Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import BasePromptTemplate
from langchain_core.prompts import PromptTemplate

from langchain.chains import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

# Summary prompt: the stuffed document text is substituted for {text}. The trailing
# lines of the template lay out the answer skeleton the model is asked to fill in.
prompt_template = """用中文简要概括以下内容:

"{text}"

简要总结:
<公司简称><股票代码><公告标题>
<公告主要内容>
"""
# Default prompt of custom_load_summarize_chain below.
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])

def custom_load_summarize_chain(
    llm: BaseLanguageModel,
    prompt: BasePromptTemplate = PROMPT,
    document_variable_name: str = "text",
    verbose: Optional[bool] = None,
    **kwargs: Any,
) -> StuffDocumentsChain:
    """Build a StuffDocumentsChain that summarizes documents with `prompt`.

    Args:
        llm: Language model wrapped in the inner LLMChain.
        prompt: Prompt for the inner LLMChain; defaults to the module-level PROMPT.
        document_variable_name: Prompt variable the documents are placed in.
        verbose: Forwarded to both the inner LLMChain and the StuffDocumentsChain.
        **kwargs: Extra keyword arguments forwarded to StuffDocumentsChain.

    Returns:
        The assembled StuffDocumentsChain.
    """
    # TODO: document prompt
    stuff_chain_args = dict(
        llm_chain=LLMChain(llm=llm, prompt=prompt, verbose=verbose),  # type: ignore[arg-type]
        document_variable_name=document_variable_name,
        verbose=verbose,  # type: ignore[arg-type]
    )
    return StuffDocumentsChain(**stuff_chain_args, **kwargs)

# Reload the board-resolution announcement PDF and print every split document for inspection.
pdf_loader = PyPDFLoader('./data/688169_石头科技_北京石头世纪科技股份有限公司第二届董事会第二十五次会议决议公告_1220086493.pdf')
docs = pdf_loader.load_and_split()
for doc in docs:
    print(doc)
# chain = custom_load_summarize_chain(llm)
# chain.run({"input_documents": docs})

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.chains.summarize import load_summarize_chain
# Same announcement PDF as the cells above, this time summarized with the stock "stuff" chain.
file_path = './data/688169_石头科技_北京石头世纪科技股份有限公司第二届董事会第二十五次会议决议公告_1220086493.pdf'
pdf_loader = PyPDFLoader(file_path)
docs = pdf_loader.load_and_split()

chain = load_summarize_chain(llm, chain_type="stuff")
result = chain.run({"input_documents": docs})
# Bare last expression so the notebook displays the summary.
result

In [None]:
from transformers import GPT2Tokenizer

# Initialise the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Count the tokens of the first loaded document's text.
# NOTE(review): this is the GPT-2 vocabulary; counts for the Ollama model used elsewhere may differ.
tokens = tokenizer.encode(docs[0].page_content)
token_length = len(tokens)
print(f"Token数量: {token_length}")

In [None]:
from transformers import GPT2Tokenizer

# Initialise the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Text to measure
text = "年报、半年报、一季报、三季报、业绩预告、权益分派、董事会、监事会、股东大会、日常经营、公司治理、中介报告、首发、增发、股权激励、配股、解禁、公司债、可转债、其他融资、股权变动、补充更正、澄清致歉、风险提示、特别处理和退市、退市整理期"
# Encode the text
tokens = tokenizer.encode(text)
print(tokens)
# Count the tokens
token_length = len(tokens)
print(f"Token数量: {token_length}")

## Method 2: MapReduce
The MapReduce method implements a multi-stage summarization. It is a technique for summarizing large pieces of text by first summarizing smaller chunks of text and then combining those summaries into a single summary.

In LangChain, you can use `MapReduceDocumentsChain` through the `load_summarize_chain` helper: all you need to do is set `chain_type` to `"map_reduce"`.

### Prompt design with MapReduce chain
In our example, you have a 32-page document that you need to summarize.

With LangChain, the map_reduce chain breaks the document down into 1024 token chunks max. Then it runs the initial prompt you define on each chunk to generate a summary of that chunk. In the example below, you use the following first stage or map prompt.
```
Write
'''{text}'''. BULLET POINT SUMMARY:
```

Once summaries for all of the chunks are generated, it runs a different prompt to combine those summaries into a single summary. In the example below, you use the following second stage or combine prompt.

```
Write a summary of the entire document that includes the main points from all of the individual summaries.
```

In [None]:
from langchain.document_loaders import PyPDFLoader
# Load the PDF used by the MapReduce section and split it into documents.
pdf_loader = PyPDFLoader('./data/688169_石头科技_中信证券股份有限公司关于北京石头世纪科技股份有限公司2023年度募集资金存放与实际使用情况的专项核查意见_1219442733.pdf')
docs = pdf_loader.load_and_split()
# print(pages[3].page_content)
# Print each document with its type as a quick sanity check.
for page in docs:
    print(type(page), page)

In [None]:
from langchain.chains import LLMChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain, StuffDocumentsChain
from langchain_text_splitters import CharacterTextSplitter
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate

# Local Ollama model shared by the map and reduce steps below.
llm = Ollama(model="llama2-chinese:13b", temperature=0)

# Map
# Prompt applied by the map step; {docs} is the variable the documents are placed in.
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes 
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce
# Prompt that merges summaries into a single consolidated answer.
reduce_template = """The following is set of summaries:
{docs}
Take these and distill it into a final, consolidated summary of the main themes. 
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

In [None]:
# Assemble the map-reduce pipeline from the map/reduce LLMChains defined in the previous cell.
# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

# Re-split the loaded PDF documents with a tiktoken-based length function
# (chunk_size=1000, no overlap) before feeding them to the chain.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(docs)

In [None]:
# Run the map-reduce chain over the chunks; the summary is read from the "output_text" key.
result = map_reduce_chain.invoke(split_docs)

print(result["output_text"])

In [None]:
# Test code
from transformers import GPT2TokenizerFast
# Load the pretrained GPT-2 tokenizer from the local "../gpt2" path
tokenizer = GPT2TokenizerFast.from_pretrained("../gpt2")

## [Method 3: Refine](https://python.langchain.com/v0.2/docs/tutorials/summarization/#refine)

In [None]:
from langchain.chains.summarize import load_summarize_chain
# Summarize the same chunks with the "refine" chain type; the result is read from "output_text".
chain = load_summarize_chain(llm, chain_type="refine")
result = chain.invoke(split_docs)

print(result["output_text"])

In [None]:
import os

# Read the value of the HF_HOME environment variable (None when it is not set)
os.getenv('HF_HOME')

## 生成 embedding

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

# split_documents needs Document objects, but on a top-to-bottom run the Markdown
# section has rebound `documents` to a raw string. In that case reload the PDFs with
# the reader and path from the first section; otherwise use `documents` as it is.
# NOTE(review): confirm the PDF documents are the intended input here.
source_documents = auto_read_pdf(path) if isinstance(documents, str) else documents

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(source_documents)
local_model = "llama2-chinese:13b"

# Embed the chunks with the local Ollama model and build the FAISS index.
embeddings = OllamaEmbeddings(model=local_model, show_progress=True)
db = FAISS.from_documents(docs, embeddings)

In [None]:
# Probe the FAISS index: fetch the 4 nearest chunks for the query and print them.
query_result = db.similarity_search("石头科技2023年年度报告，管理层分析", k=4)
for doc in query_result:
    print(doc)
    # print(str(doc.metadata["page"]) + ":", doc.page_content[:300])

In [None]:
# Expose the FAISS index as a retriever returning the 4 most similar chunks.
# This rebinds `retriever`, which the Chroma section defined earlier.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [None]:
# Step 5: Set up the local model:

from langchain_community.chat_models import ChatOllama

# Chat model capped at 400 predicted tokens. This rebinds `llm`.
# NOTE(review): the stop strings look like Llama 3 chat-template markers while
# `local_model` is "llama2-chinese:13b" — confirm they are intended for this model.
llm = ChatOllama(model=local_model, num_predict=400,
                 stop=["<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>"])

In [None]:
# Step 6: Set up the RAG chain:

from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Prompt for the RAG chain: {question} and {context} are filled in by the chain below.
# NOTE(review): the header/eot markers are written into the prompt text itself —
# confirm they match the chat template of the model in use.
prompt_template = """
<|start_header_id|>user<|end_header_id|>
Answer the user's question using provided context. Stick to the facts, do not draw your own conclusions.
Question: {question}
Context: {context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

def format_docs(docs):
    """Return the `page_content` of every item in `docs`, separated by blank lines."""
    contents = [item.page_content for item in docs]
    return "\n\n".join(contents)

# RAG pipeline: retrieved documents are joined into the context, the input is passed
# through unchanged as the question, and the model output is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Ask the RAG chain a question; the bare last expression displays the answer.
question = "石头科技2023年年报，管理层讨论一节都说了些什么"
rag_chain.invoke(question)

In [None]:
from langchain.vectorstores import Annoy
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
import time
from langchain import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings

def create_index(documents, vector_store_path=r"./storage4", model_name='moka-ai/m3e-large'):
    """Split `documents`, embed the chunks and save an Annoy index to disk.

    Args:
        documents: Documents to index; passed to CharacterTextSplitter.split_documents.
        vector_store_path: Directory the index is saved to. Defaults to the path
            that was previously hard-coded.
        model_name: HuggingFace embedding model name. Defaults to the model that
            was previously hard-coded.

    Returns:
        The Annoy vector store that was built. Earlier versions returned None,
        so callers that ignore the result are unaffected.
    """
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    split_docs = text_splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # NOTE(review): persistence is done by save_local below; confirm that
    # Annoy.from_documents actually makes use of `persist_directory`.
    docsearch = Annoy.from_documents(documents=split_docs,
                                    embedding=embeddings,
                                    persist_directory=vector_store_path)
    docsearch.save_local(vector_store_path)
    return docsearch

def search(txt, vector_store_path=r"./storage4", model_name='moka-ai/m3e-large'):
    """Answer `txt` with a RetrievalQA chain over the saved Annoy index.

    Args:
        txt: The user query.
        vector_store_path: Directory the index is loaded from. Defaults to the
            path that was previously hard-coded.
        model_name: HuggingFace embedding model name. Defaults to the model that
            was previously hard-coded.

    Returns:
        The answer produced by the chain. It is still printed, followed by the
        elapsed seconds; earlier versions returned None.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # SECURITY NOTE(review): loading a saved vector store may deserialize pickled
    # data — only load indexes created by this project, and check whether the
    # installed LangChain version requires an explicit opt-in flag here.
    docsearch = Annoy.load_local(vector_store_path, embeddings=embeddings)

    # Timing covers building and running the QA chain, not loading the index.
    start = time.time()
    prompt_template = """请注意：请谨慎评估query与提示的Context信息的相关性，只根据本段输入文字信息的内容进行回答，如果query与提供的材料无关，请回答"对不起，我不知道"，另外也不要回答无关答案：
    Context: {context}
    Question: {question}
    Answer:"""
    qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    qa = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"),
        chain_type="stuff",
        retriever=docsearch.as_retriever(search_kwargs={"k": 8}),
        chain_type_kwargs={"prompt": qa_prompt},
    )

    result = qa.run(txt)
    print(result)
    print(time.time() - start)
    return result

参考文档
https://github.com/ollama/ollama/blob/main/examples/langchain-python-rag-document/main.py

In [None]:

from langchain.document_loaders import OnlinePDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain import PromptTemplate
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
import sys
import os

class SuppressStdout:
    """Context manager that silences both stdout and stderr inside the `with` body.

    Both streams are pointed at os.devnull on entry and restored on exit.
    Exceptions raised inside the body are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        # One devnull handle serves both streams, so there is only one file to close.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        sys.stderr = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore the real streams first, then release the devnull handle.
        # Previously only the stdout handle was closed and the stderr one leaked.
        sys.stdout = self._original_stdout
        sys.stderr = self._original_stderr
        self._devnull.close()

# load the pdf and split it into chunks
loader = OnlinePDFLoader("https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf")
data = loader.load()

from langchain.text_splitter import RecursiveCharacterTextSplitter
# 500-character chunks without overlap. This rebinds `text_splitter`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

# Build the Chroma store with stdout/stderr silenced. This rebinds `vectorstore`.
with SuppressStdout():
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=GPT4AllEmbeddings())

# The prompt, model and chain do not depend on the query, so they are built once
# here instead of on every loop iteration.
# Prompt
template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Use three sentences maximum and keep the answer as concise as possible.
    {context}
    Question: {question}
    Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# A streaming stdout handler is attached to the model, so `result` is not printed here.
llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

# Interactive loop: type "exit" to stop; blank input is ignored.
while True:
    # Strip first so that " exit " also ends the loop and padded queries are cleaned.
    query = input("\nQuery: ").strip()
    if query == "exit":
        break
    if query == "":
        continue

    result = qa_chain.invoke({"query": query})