## [How to build RAG](https://www.notion.so/d75758fbb9f0473f87f47febd40dd9dd?v=b7715a608c494e35a5663e203701f3f3&p=5566dedeb23e404797303f399b2ed559&pm=s)

In [1]:

def print_all_items(obj):
    """Print every name reported by ``dir(obj)`` together with its value.

    Debugging aid: one ``name: value`` line is written to stdout for each
    attribute or method of ``obj``.

    Args:
        obj: Any object to inspect.

    Names whose lookup raises ``AttributeError`` are skipped silently. Any
    other exception raised while reading or formatting an attribute (for
    example a property that raises ``ValueError``) is reported on that
    attribute's own line instead of aborting the whole dump.
    """
    for attr in dir(obj):
        try:
            value = getattr(obj, attr)
            print(f"{attr}: {value}")
        except AttributeError:
            # dir() may list names that cannot actually be read; skip them.
            continue
        except Exception as exc:
            # Best-effort dump: note the failure and keep going.
            print(f"{attr}: <unreadable: {exc!r}>")
        
def single_document_vscode(document):
    """Pretty-print one document's text and metadata to stdout.

    Args:
        document: Object exposing a string ``page_content`` attribute and a
            ``metadata`` attribute.

    Output layout: a ``Page Content:`` header line followed by the text, a
    row of 50 dashes, ``str(metadata)``, then a row of 50 ``=`` characters
    and one blank line.
    """
    separator_in_page = "-" * 50
    separator = "=" * 50

    print("Page Content:\n" + document.page_content)
    print(separator_in_page)
    print(str(document.metadata))
    print(separator + "\n")


def format_documents_vscode(documents):
    """Pretty-print each document in ``documents`` to stdout.

    Args:
        documents: Iterable of objects accepted by
            ``single_document_vscode``. An empty iterable prints nothing.
    """
    # Delegate so both helpers always produce the same layout.
    for doc in documents:
        single_document_vscode(doc)

# 创建RAG

## 创建 LLM 和 embedding
选用 llama2-chinese:13b 作为 LLM

选用 M3E 作为 embedding

In [1]:
from langchain_community.llms.ollama import Ollama
from langchain_huggingface import HuggingFaceEmbeddings

# Filesystem locations: raw filings directory, vector-store directory and
# the annual-report PDF used by the PDF cells.
DATA_PATH = "data/601919_中远海控"
DB_PATH = "vectorstores/db/601919"
PDF_PATH = "data/601919_中远海控/601919_中远海控_中远海控2023年年度报告_1219449961.pdf"

# Chat model reached through the Ollama wrapper, with temperature fixed at 0.
model_name = "llama2-chinese:13b"
llm = Ollama(temperature=0, model=model_name)

# Embedding model loaded from a local directory and placed on the CUDA device.
model_kwargs = dict(device="cuda")
embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    model_name="../ai_models/m3e-base",
)


  from tqdm.autonotebook import tqdm, trange


Generate Document From Markdown

In [3]:
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document

markdown_path = "test.md"

# The file is read with TextLoader; the element-wise markdown loader below
# is an alternative that is currently switched off.
# loader = UnstructuredMarkdownLoader(markdown_path, mode="elements")
loader = TextLoader(markdown_path)
docs = loader.load()

# Uncomment to inspect what was loaded.
# format_documents_vscode(docs)

ParentDocumentRetriever

In [14]:
from langchain.vectorstores import Chroma

from langchain.storage import InMemoryStore
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain_text_splitters import MarkdownTextSplitter
from utils.CustomMarkdownHeaderTextSplitter import CustomMarkdownHeaderTextSplitter

# (markdown prefix, metadata key) pairs for the heading levels to split on.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]

# Child chunks are cut at all three heading levels; parent chunks only at
# the first entry (top-level "#" headings).
child_splitter = CustomMarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
parent_splitter = CustomMarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on[:1])

# Backing stores handed to the retriever: an in-memory docstore and a Chroma
# collection that embeds with `embeddings`.
store = InMemoryStore()
vectorstore = Chroma(embedding_function=embeddings, collection_name="split_parents")

retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

# Split and index the loaded markdown documents.
retriever.add_documents(docs)

In [15]:
# Run one query through the current `retriever` and pretty-print the
# documents it returns.
question = "公司从事的业务情况"

biubiu = retriever.invoke(question)
format_documents_vscode(biubiu)

Page Content:
# 三、报告期内公司从事的业务情况
## 1、主要业务
上海贝岭是集成电路设计企业，提供模拟和数模混合集成电路及系统解决方案。公司专注于
集成电路芯片设计和产品应用开发，是国内集成电路产品主要供应商之一。报告期内，公司集成
电路产品业务布局在电源管理、信号链产品和功率器件 3大产品领域，包含电源管理、电机驱动、
数据转换器、电力专用芯片、物联网前端、非挥发存储器、标准信号产品业务、功率器件共 8个
细分产品业务，产品客户主要集中在汽车电子、工控、光伏、储能、能效监测、电力设备、光通
讯、家电、短距离交通工具、高端及便携式医疗设备市场以及手机摄像头模组等其它消费类应用
市场。报告期内，公司积极拓展销售渠道，与主要的整机厂商、销售渠道保持良好的合作关系，
建立了完善的营销网络体系。公司对生产经营和质量保障拥有丰富的实践经验，拥有运营保障核
心竞争力。
## 2、经营模式
公司主业属于集成电路设计业，采用无晶圆生产线的集成电路设计模式（Fabless）。公司进
行集成电路的设计和销售，将晶圆加工、电路封装和测试等生产环节分别外包给专业的晶圆加工  
企业、封装和测试企业来完成。公司与国内主要的晶圆加工、封装测试厂商长期保持了良好的合
作关系，保证了公司产品业务产业链的有效运转和产品质量。
半导体产业链示意图
--------------------------------------------------
{'source': 'test.md', 'Header 1': '三、报告期内公司从事的业务情况'}

Page Content:
# 五、报告期内主要经营情况
2023 年公司共实现营业收入 213,711.08 万元，较上年增长 4.54%。其中：主营业务收入为
208,178.58 万元，较上年增长 4.42%；其他业务收入为 5,532.50 万元，较上年增长 9.28%。2023
年公司共实现毛利 62,970.30万元。其中：主营业务毛利为 58,169.64万元，较上年减少 6,966.21
万元，降幅为10.69%；其他业务毛利为4,800.66万元，较上年增长247.71万元，增幅为5.44%。
2023 年公司实现归属于上市公司股东的扣除非经常性损益的净利润为 17,013.88 万元，较上
年减少 16,027.6

## Regular retriever

In [None]:
# Plain retriever over `vectorstore`, created with no extra search options.
# Note: this rebinds the notebook-level names `retriever` and `docs`.
retriever = vectorstore.as_retriever()
docs = retriever.invoke("公司从事的业务情况")
format_documents_vscode(docs)

## [Maximum marginal relevance retrieval](https://python.langchain.com/v0.2/docs/how_to/vectorstore_retriever/#maximum-marginal-relevance-retrieval)

In [None]:
# Same vector store, but request maximum-marginal-relevance ("mmr") search.
retriever = vectorstore.as_retriever(search_type="mmr")
# The result is left as the cell's last expression instead of being passed
# to format_documents_vscode.
retriever.invoke("公司的主要客户有哪些，前五客户集中度如何")

## [MultiQueryRetriever](https://python.langchain.com/v0.2/docs/how_to/MultiQueryRetriever/)

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

# Wrap a plain vector-store retriever in a MultiQueryRetriever driven by
# `llm`, then run one question through it and print the documents returned.
question = "报告期内公司从事的业务情况"
retriever = vectorstore.as_retriever()
retriever_from_llm = MultiQueryRetriever.from_llm(llm=llm, retriever=retriever)

unique_docs = retriever_from_llm.invoke(question)
format_documents_vscode(unique_docs)

## [Add scores to retriever results](https://python.langchain.com/v0.2/docs/how_to/add_scores_retriever/)

In [None]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> List[Document]:
    """Similarity-search ``vectorstore`` and tag each hit with its score.

    Args:
        query: Text to search for.

    Returns:
        The matched documents as a list, each with the score reported by
        ``similarity_search_with_score`` stored under ``metadata["score"]``.
        An empty list when the search returns no hits.
    """
    scored_hits = vectorstore.similarity_search_with_score(query)

    # Iterate the (document, score) pairs directly: unpacking zip(*pairs)
    # raises ValueError when there are no hits and yields a tuple, not a list.
    docs = []
    for doc, score in scored_hits:
        doc.metadata["score"] = score
        docs.append(doc)
    return docs

# Run the score-annotating retriever and print ITS results; each document's
# metadata now carries a "score" entry.
docs = retriever.invoke("公司从事的业务情况")
format_documents_vscode(docs)

## [contextual compression](https://python.langchain.com/v0.2/docs/how_to/contextual_compression/)

In [None]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

# Load the single annual-report PDF. The loader returns one Document per
# page (the page index is kept in each document's metadata), so the count
# printed below is a page count, not a file count.
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

print(f"Loaded {len(documents)} pages from the PDF")

# Re-chunk the pages into pieces of at most 1000 characters with a
# 50-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

# Base Retriever

In [None]:
from langchain_community.vectorstores import FAISS
    
# Embed the split PDF chunks (`texts`) into a FAISS index and use it as the
# baseline retriever that the compression cells wrap.
retriever = FAISS.from_documents(texts, embeddings).as_retriever()
docs = retriever.invoke("公司从事的业务情况")
format_documents_vscode(docs)

## Compression Retriever
**目前实验下来效果较差**

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Compressor built from `llm` via LLMChainExtractor, layered on top of the
# base retriever.
compressor = LLMChainExtractor.from_llm(llm=llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

compressed_docs = compression_retriever.invoke("公司从事的业务情况")
format_documents_vscode(compressed_docs)

## More built-in compressors: filters
### LLMChainFilter

In [None]:
from langchain.retrievers.document_compressors import LLMChainFilter

# Same wrapper as before, but the compressor is an LLMChainFilter built
# from `llm`.
_filter = LLMChainFilter.from_llm(llm=llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=_filter,
)

compressed_docs = compression_retriever.invoke("公司从事的业务情况")
format_documents_vscode(compressed_docs)

### EmbeddingsFilter

In [None]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

# Embedding-based filter configured with a similarity threshold of 0.76,
# used as the compressor in place of an LLM call.
embeddings_filter = EmbeddingsFilter(
    similarity_threshold=0.76,
    embeddings=embeddings,
)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=embeddings_filter,
)

compressed_docs = compression_retriever.invoke("公司从事的业务情况")
format_documents_vscode(compressed_docs)

In [None]:
# Print every chunk whose metadata says it came from page 14.
# NOTE(review): the loader's page numbers may be 0-based - confirm which
# physical page this selects.
keyword = "公司从事的业务情况"
for text in texts:
    if text.metadata['page'] != 14:
        continue
    single_document_vscode(text)
    # Alternative filter (disabled): select chunks by content instead.
    # if keyword in text.page_content:
    #     single_document_vscode(text)