# Read Files

In [None]:
# Install third-party packages with pip. `-i` points pip at the Tsinghua
# PyPI mirror instead of the default index.
# NOTE(review): rapidocr_onnxruntime is presumably needed for OCR inside the
# document loaders — confirm it is still required.
!pip install rapidocr_onnxruntime -i https://pypi.tuna.tsinghua.edu.cn/simple
# The "all-docs" extra installs unstructured together with its optional
# per-document-format dependencies.
!pip install "unstructured[all-docs]" -i https://pypi.tuna.tsinghua.edu.cn/simple
# NOTE(review): presumably the backend for PyMuPDFLoader used in the PDF cell.
!pip install pyMuPDF -i https://pypi.tuna.tsinghua.edu.cn/simple

## Read PDF

In [None]:
from langchain.document_loaders import PyMuPDFLoader

# Load the PDF. The result is kept in `pdf_pages` because the chunking cells
# further down read it.
pdf_loader = PyMuPDFLoader(
    "/workdir/data_base/knowledge_db/pumkin_book/pumpkin_book.pdf"
)
pdf_pages = pdf_loader.load()

# Preview: print at most the first loaded entry (nothing is printed when the
# loader returned an empty list).
for p in pdf_pages[:1]:
    print(p)

## Read Markdown

In [None]:
from langchain.document_loaders import UnstructuredMarkdownLoader

# Markdown file to load.
md_path = "/workdir/data_base/knowledge_db/prompt_engineering/1. 简介 Introduction.md"

# The loader is built with its default options. The author left
# mode="elements" and strategy="fast" as optional keyword arguments to try.
md_loader = UnstructuredMarkdownLoader(md_path)
md_pages = md_loader.load()

# Dump everything the loader returned.
print(md_pages)

## Read MP4

In [None]:
!pip install opencv-python==4.9.0.80-i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install opencv-contrib-python==4.9.0.80 -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
# Imports are grouped at the top of the cell. cv2 is imported here but not
# referenced by any statement in this cell.
import cv2
from langchain.document_loaders import UnstructuredFileLoader

# Load the plain-text file; the result is kept in `txt_pages`.
txt_loader = UnstructuredFileLoader("/workdir/data_base/knowledge_db/easy_rl/强化学习入门指南.txt")
txt_pages = txt_loader.load()

# Preview: print at most the first loaded entry.
for p in txt_pages[:1]:
    print(p)

# Chunk Datasets

## Split Text

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings, forwarded below as chunk_size / chunk_overlap.
chunk_size, overlap_size = 500, 50

# Sample input: the entry at index 1 of the loaded PDF pages.
pdf_page = pdf_pages[1]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)

# Try the splitter on the first 1000 characters of the sample page. The bare
# expression is left last so the notebook displays the resulting chunks.
sample_text = pdf_page.page_content[:1000]
text_splitter.split_text(sample_text)


## Split Documents

In [None]:
# Split every loaded PDF page with the splitter configured above, then report
# how many pages went in and how many chunks came out.
split_docs = text_splitter.split_documents(pdf_pages)
page_count, chunk_count = len(pdf_pages), len(split_docs)
print(page_count, chunk_count)

In [None]:
# Total length of page_content summed over all chunks in split_docs.
total_chars = sum(len(sp_doc.page_content) for sp_doc in split_docs)
print(f"Number of characters: {total_chars}")

# Text to Embedding

In [3]:
import os
import openai
from dotenv import find_dotenv, load_dotenv

# Locate a .env file and load it so its entries are available through
# os.environ in the cells below. The return value is bound to `_` and
# otherwise ignored.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Credentials and endpoint are read from the environment; a missing variable
# raises KeyError here rather than failing later inside the client.
embedding = OpenAIEmbeddings(
    api_key=os.environ["OPENAI_SECRET_KEY"],
    base_url=os.environ["OPENAI_API_BASE"],
)

# Three short queries to embed and compare.
query1, query2, query3 = "机器学习", "强化学习", "大语言模型"

# Embed the queries one at a time, in order.
emb1, emb2, emb3 = [
    embedding.embed_query(text) for text in (query1, query2, query3)
]

# Show the first 30 components of the first embedding.
print(f"emb[:30]: {emb1[:30]}")

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity  # imported but not used in this cell

# Convert the three embeddings to NumPy arrays.
emb1_np, emb2_np, emb3_np = (np.array(vec) for vec in (emb1, emb2, emb3))

# Print the dot product for each pair of queries, labelled with the query text.
labelled_pairs = (
    (query1, emb1_np, query2, emb2_np),
    (query1, emb1_np, query3, emb3_np),
    (query3, emb3_np, query2, emb2_np),
)
for left_text, left_vec, right_text, right_vec in labelled_pairs:
    print(f"{left_text} * {right_text} : {np.dot(left_vec, right_vec)}")
