## Extract PDF

In [8]:
from typing import Iterator, Union
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument
from docling.document_converter import DocumentConverter

class DoclingPDFLoader(BaseLoader):
    """LangChain loader that turns documents into markdown text via docling.

    Each source is converted with a shared ``DocumentConverter`` and exposed
    as one ``LCDocument`` whose ``page_content`` is the markdown export of the
    converted document. No metadata is attached to the produced documents.
    """

    def __init__(self, file_path: Union[str, list[str]]) -> None:
        """Remember the source(s) to convert and create the converter.

        Args:
            file_path: A single source or a list of sources. A list is stored
                as given (not copied); any other value is wrapped in a
                one-element list.
        """
        if isinstance(file_path, list):
            self._file_paths = file_path
        else:
            self._file_paths = [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Convert the sources one at a time, yielding a document per source.

        Yields:
            One ``LCDocument`` per entry in the stored source list, in order,
            carrying the markdown export of the converted document.
        """
        for pdf_source in self._file_paths:
            conversion = self._converter.convert(pdf_source)
            markdown = conversion.document.export_to_markdown()
            yield LCDocument(page_content=markdown)

In [9]:
# Location of the PDF manual that is converted, chunked and indexed below.
path = "/Users/jyp/Desktop/manual/samsung/A15.pdf"

### Load and Split

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Loader for the PDF at `path`; conversion only happens once it is loaded.
loader = DoclingPDFLoader(path)

# Splitter configured for chunks of up to 1000 characters, with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [11]:
# Run the conversion: `load()` is not defined on DoclingPDFLoader itself, so it
# comes from BaseLoader and collects what `lazy_load()` yields.
docs = loader.load()
# Cut the converted documents into chunks using the splitter configured above.
splits = text_splitter.split_documents(docs)

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 82601.17it/s]
2024-11-27 09:07:50.710 ( 672.774s) [          490127]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-27 09:07:50.710 ( 672.775s) [          490127]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-27 09:07:50.712 ( 672.777s) [          490127]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-27 09:07:50.712 ( 672.777s) [          490127]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-27 09:07:50.715 ( 672.780s) [          490127]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
2024-11-27 09:07:50.715 ( 672.780s) [          490127]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected


## Embedding

In [12]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Hugging Face model id of the embedding model.
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
# Embedding object handed to the vector store together with the chunks.
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)

## VectorStore

In [13]:
# `os` is imported here so this cell works when the notebook is run top to
# bottom on a fresh kernel (no earlier cell in document order imports it).
import os
from tempfile import TemporaryDirectory

from langchain_milvus import Milvus

# Use the Milvus instance named by the MILVUS_URI environment variable when it
# is set; otherwise fall back to a database file in a temporary directory.
MILVUS_URI = os.environ.get("MILVUS_URI")
if MILVUS_URI is None:
    # Only create the temporary directory when it is actually needed, and keep
    # a module-level reference so it is not cleaned up while the store is used.
    tmp_dir = TemporaryDirectory()
    MILVUS_URI = f"{tmp_dir.name}/milvus_demo.db"

# Embed the chunks and index them.
# NOTE(review): drop_old=True presumably discards an existing collection of the
# same name before inserting — confirm against the langchain_milvus docs.
vectorstore = Milvus.from_documents(
    splits,
    embeddings,
    connection_args={"uri": MILVUS_URI},
    drop_old=True,
)

## LLM

In [20]:
# `os` is imported here so this cell works when the notebook is run top to
# bottom on a fresh kernel (no earlier cell in document order imports it).
import os

from langchain_huggingface import HuggingFaceEndpoint

# Read the Hugging Face token from the HF_API_KEY environment variable.
# The argument to `os.environ.get` is the variable NAME, never the secret
# itself: a literal token here both leaks the credential and makes the lookup
# return None. Any token that was ever pasted into this notebook should be
# revoked and replaced.
HF_API_KEY = os.environ.get("HF_API_KEY")
HF_LLM_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Hosted inference endpoint for the chat model used to answer questions.
llm = HuggingFaceEndpoint(
    repo_id=HF_LLM_MODEL_ID,
    huggingfacehub_api_token=HF_API_KEY,
)

## Prompt and RAG Chain

In [21]:
from typing import Iterable

from langchain_core.documents import Document as LCDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs: Iterable[LCDocument]):
    """Join the ``page_content`` of every document, separated by a blank line.

    Args:
        docs: Documents whose text should be concatenated, in iteration order.

    Returns:
        A single string; empty when ``docs`` yields nothing.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


# Retriever over the indexed chunks, using the vector store's default settings.
retriever = vectorstore.as_retriever()

# Prompt with two placeholders: {context} (retrieved text) and {question}.
prompt = PromptTemplate.from_template(
    "Context information is below.\n"
    "---------------------\n"
    "{context}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {question}\n"
    "Answer:\n"
)

# The input question is passed through unchanged as "question" and also sent
# to the retriever, whose documents are joined by format_docs into "context";
# the filled prompt goes to the LLM and the reply is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt | llm | StrOutputParser()
)

## Usage

In [22]:
# Ask the chain a sample question; as the cell's last expression, the returned
# answer string is displayed as the cell output.
rag_chain.invoke("How to turn off wifi?")

'To turn off Wi-Fi on your device, go to Settings > Connections > Wi-Fi, and then toggle off the Wi-Fi switch. You can also go to Settings > Connections > Wi-Fi and select the network you\'re currently connected to, then tap the "Forget network" option and toggle off the Wi-Fi switch. This will disconnect you from the network and turn off Wi-Fi on your device.'

In [4]:
import os

def _load_files_from_directories(directories) -> list[str]:
        """
        Read all files from the specified directories.

        Args:
            directories (list[str]): A list of folder paths to search for files.

        Returns:
            list[str]: A list of file paths found in the given directories.
        """
        
        all_files = []
        for directory in directories:
            print(f"Loading files from {directory}")
            for root, _, files in os.walk(directory):
                for file in files:
                    all_files.append(os.path.join(root, file))
                    print(f"{file} added")
        return all_files

# Directories to load files from.
# NOTE(review): these are absolute paths on one machine; adjust them (or derive
# them from a configurable base directory) before running elsewhere.
directories = ["/Users/jyp/Documents/GitHub/BKMS2/apple", "/Users/jyp/Documents/GitHub/BKMS2/samsung"]
# Collect every file path under those directories (progress is printed).
files = _load_files_from_directories(directories)



Loading files from /Users/jyp/Documents/GitHub/BKMS2/apple
x.pdf added
iphone-16-manual.pdf added
Loading files from /Users/jyp/Documents/GitHub/BKMS2/samsung
Fold6.pdf added
ZFold6.pdf added
S24.pdf added
A15.pdf added
SNote20.pdf added
