## [How to build RAG](https://www.notion.so/d75758fbb9f0473f87f47febd40dd9dd?v=b7715a608c494e35a5663e203701f3f3&p=5566dedeb23e404797303f399b2ed559&pm=s)

In [None]:
def single_document_vscode(document):
    """Print one document's text followed by its page/source metadata.

    Args:
        document: Object exposing ``page_content`` (str) and ``metadata``
            (a mapping), such as a LangChain ``Document``.

    The output is the "Page Content:" header and text, a rule of 50 dashes,
    the metadata block, then a rule of 50 "=" characters and a blank line.
    Missing ``page`` / ``source`` metadata keys are shown as ``N/A``.
    """
    separator_in_page = "-" * 50
    separator = "=" * 50

    metadata = document.metadata

    # Use .get() with a placeholder: documents that lack a "page" or "source"
    # key would otherwise abort the whole printout with a KeyError.
    page = metadata.get("page", "N/A")
    source = metadata.get("source", "N/A")
    formatted_metadata = f"Metadata:\nPage: {page}\nSource: {source}"

    # The content is printed as-is; splitting on "\n" and re-joining with
    # "\n" (as done previously) returns the very same string.
    print("Page Content:\n" + document.page_content)
    print(separator_in_page)
    print(formatted_metadata)
    print(separator + "\n")
    
def format_documents_vscode(documents):
    """Print every document in *documents*, one formatted block per document.

    Args:
        documents: Iterable of objects exposing ``page_content`` and
            ``metadata``, such as a list of LangChain ``Document`` objects.
            An empty iterable prints nothing.
    """
    # Delegate to the single-document printer so the output layout is
    # defined in one place instead of being copied into this loop.
    for doc in documents:
        single_document_vscode(doc)

# 创建RAG

## 创建 LLM 和 Embedding
选用llama2-chinese:13b作为LLM

选用 M3E 作为 embedding 模型

In [None]:
from langchain_community.llms.ollama import Ollama
from langchain_huggingface import HuggingFaceEmbeddings

# Directory handed to the PDF directory loader when the index is built.
DATA_PATH="data/601919_中远海控"
# Directory used as Chroma's persist_directory (written on build, read on reopen).
DB_PATH = "vectorstores/db/601919"

# LLM served through Ollama; temperature=0 is passed straight to the wrapper.
model_name = "llama2-chinese:13b"
llm = Ollama(model=model_name, temperature=0)
# NOTE(review): the device is hard-coded to CUDA; presumably this fails on a
# machine without a GPU — confirm before sharing the notebook.
model_kwargs = {'device': 'cuda'}
# Embedding model loaded from a path relative to the current working directory.
embeddings = HuggingFaceEmbeddings(model_name="../ai_models/m3e-base", model_kwargs=model_kwargs)


使用Chroma创建VectorDB

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader
from langchain.vectorstores import Chroma
from langchain_text_splitters import MarkdownHeaderTextSplitter

import os

# (markdown prefix, metadata label) pairs for header levels 1-3.
# Only the commented-out Markdown path in this cell refers to this list;
# the active PDF pipeline does not use it.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

def create_vector_db():
    """Build the Chroma vector store from the PDFs under ``DATA_PATH``.

    Loads the directory with ``PyPDFDirectoryLoader``, splits the loaded
    documents into 1000-character chunks with a 50-character overlap, embeds
    them with the module-level ``embeddings`` and stores the result under
    ``DB_PATH``. Returns nothing.
    """
    loader = PyPDFDirectoryLoader(DATA_PATH)
    documents = loader.load()
    # Alternative Markdown input path, kept for reference:
    # with open('test.md', 'r', encoding='utf-8') as file:
    #     documents = file.read()
    # markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    # documents = markdown_splitter.split_text(documents)

    # The loader returns one Document per PDF page, so this count is pages,
    # not files; the message now says so.
    print(f"Processed {len(documents)} pdf pages")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)

    vectorstore = Chroma.from_documents(documents=texts, embedding=embeddings, persist_directory=DB_PATH)
    # NOTE(review): newer Chroma releases persist automatically and deprecate
    # this call — confirm against the installed version before removing it.
    vectorstore.persist()

# Build the index now: every run of this cell reloads, re-embeds and writes to DB_PATH.
# NOTE(review): re-running against an existing DB_PATH presumably adds duplicate chunks — confirm.
create_vector_db()

# [How to use a vectorstore as a retriever](https://python.langchain.com/v0.2/docs/how_to/vectorstore_retriever/)

In [None]:
from langchain.vectorstores import Chroma
# Reopen the store persisted under DB_PATH, using the same embeddings object for queries.
vectorstore = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)

## Regular retriever

In [None]:
# Retriever with default settings. `retriever` is rebound by several later
# cells, so its meaning depends on which cell ran last.
retriever = vectorstore.as_retriever()
docs = retriever.invoke("公司从事的业务情况")
format_documents_vscode(docs)

## [Maximum marginal relevance retrieval](https://python.langchain.com/v0.2/docs/how_to/vectorstore_retriever/#maximum-marginal-relevance-retrieval)

In [None]:
# Same vector store, but asking the retriever for the "mmr" search type.
retriever = vectorstore.as_retriever(search_type="mmr")
# Bare expression: the result is not stored or formatted; it is only
# visible as the cell's output value.
retriever.invoke("公司的主要客户有哪些，前五客户集中度如何")

## [MultiQueryRetriever](https://python.langchain.com/v0.2/docs/how_to/MultiQueryRetriever/)

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

# Query sent through the MultiQueryRetriever below.
question = "报告期内公司从事的业务情况"
# Base retriever (default settings) wrapped by the LLM-driven retriever.
retriever = vectorstore.as_retriever()
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=retriever, llm=llm
)
# `unique_docs` stays in notebook scope and is read again by a later cell.
unique_docs = retriever_from_llm.invoke(question)
format_documents_vscode(unique_docs)

## [Add scores to retriever results](https://python.langchain.com/v0.2/docs/how_to/add_scores_retriever/)

In [None]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> List[Document]:
    """Return the documents matching *query*, each tagged with its score.

    Each score returned by ``vectorstore.similarity_search_with_score`` is
    written to ``metadata["score"]`` of the corresponding document.

    Args:
        query: Text to search the module-level ``vectorstore`` for.

    Returns:
        A list of the matched documents; empty when the search finds nothing.
    """
    scored_docs = []
    # Walk the (document, score) pairs directly. Unpacking zip(*results)
    # raises ValueError when the search returns no hits and produces a
    # tuple rather than the annotated list.
    for doc, score in vectorstore.similarity_search_with_score(query):
        doc.metadata["score"] = score
        scored_docs.append(doc)

    return scored_docs

# Run the score-annotating retriever defined above.
docs = retriever.invoke("公司从事的业务情况")
# Print the documents just retrieved. This used to print `unique_docs`, the
# stale result of the MultiQueryRetriever cell, so the output never
# reflected this query.
format_documents_vscode(docs)

## [contextual compression](https://python.langchain.com/v0.2/docs/how_to/contextual_compression/)

In [None]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

# Single annual-report PDF used for the contextual-compression experiments.
PDF_PATH = "data/601919_中远海控/601919_中远海控_中远海控2023年年度报告_1219449961.pdf"
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

# PyPDFLoader returns one Document per page, so this count is pages of the
# one PDF, not a number of files; the message now says so.
print(f"Processed {len(documents)} pdf pages")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
# `texts` (the chunked pages) is reused by the FAISS and page-inspection cells.
texts = text_splitter.split_documents(documents)

# Base Retriever

In [None]:
from langchain_community.vectorstores import FAISS
    
# Build a FAISS index over `texts` and rebind `retriever` to it; the
# compression cells below wrap whatever `retriever` currently refers to.
retriever = FAISS.from_documents(texts, embeddings).as_retriever()
docs = retriever.invoke("公司从事的业务情况")
format_documents_vscode(docs)

## Compress Retriever
**目前实验下来效果较差**

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Compressor built from the module-level `llm`.
compressor = LLMChainExtractor.from_llm(llm)
# Wraps the current `retriever`; assumes the FAISS cell ran last, since
# `retriever` is rebound in several cells — verify the execution order.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

compressed_docs = compression_retriever.invoke("公司从事的业务情况")
format_documents_vscode(compressed_docs)

## More built-in compressors: filters
### LLMChainFilter

In [None]:
from langchain.retrievers.document_compressors import LLMChainFilter

# Filter-style compressor built from the module-level `llm`.
_filter = LLMChainFilter.from_llm(llm)
# Rebinds `compression_retriever`, replacing the one from the previous cell.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=_filter, base_retriever=retriever
)

compressed_docs = compression_retriever.invoke("公司从事的业务情况")
format_documents_vscode(compressed_docs)

### EmbeddingsFilter

In [None]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

# Embedding-based filter using the same `embeddings` model as the index.
# NOTE(review): presumably documents scoring below 0.76 similarity to the
# query are dropped — confirm against the EmbeddingsFilter documentation.
embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
# Rebinds `compression_retriever`, replacing the one from the previous cell.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=embeddings_filter, base_retriever=retriever
)

compressed_docs = compression_retriever.invoke("公司从事的业务情况")
format_documents_vscode(compressed_docs)

In [None]:
# Only the commented-out keyword search below uses `keyword`; the active
# loop ignores it.
keyword = "公司从事的业务情况"
# Print every chunk whose metadata page value equals 14.
# NOTE(review): PyPDFLoader page numbers appear to be 0-based, which would
# make this the 15th page of the PDF — confirm.
for text in texts:
    if text.metadata['page'] == 14:
        single_document_vscode(text)
    # if keyword in text.page_content:
    #     # format_documents_vscode(text)
    #     single_document_vscode(text)