# PDF + HuggingFace + LangChain

In [1]:

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from pathlib import Path
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def load_all_pdfs_from_folder(folder_path):
    """Load every PDF found under ``folder_path`` (searched recursively).

    Files are visited in sorted path order so the returned list is in the same
    order on every run, instead of whatever order the filesystem happens to
    enumerate entries in. Directories whose name ends in ``.pdf`` are skipped
    rather than being handed to the loader.

    Args:
        folder_path: Root folder to search; a ``str`` or path-like object.

    Returns:
        One flat list holding everything each ``PyPDFLoader(...).load()`` call
        returned, concatenated in file order. Empty when no PDF file is found.
    """
    pdf_files = sorted(
        path for path in Path(folder_path).rglob("*.pdf") if path.is_file()
    )
    all_docs = []
    for pdf_file in pdf_files:
        # The loader is given a plain string path, as in the original code.
        all_docs.extend(PyPDFLoader(str(pdf_file)).load())
    return all_docs

folder_path = "./data"  # folder containing the PDF files
documents = load_all_pdfs_from_folder(folder_path)
print(f"Loaded {len(documents)} documents")


Loaded 4 documents


In [3]:

# Break the loaded documents into chunks (chunk_size=500, chunk_overlap=50)
# and report how many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=500,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(documents)
print(f"Split into {len(chunks)} chunks")


Split into 15 chunks


In [4]:
# Quick sanity check: show the first chunk produced by the splitter.
print(chunks[0])

page_content='Bachelor in ICT Program
1. PROGRAM INTRODUCTION
The ICT formation in USTH prepares graduates for careers in the fields of
Software Engineering and Computer Science. It focuses on applied research
and emerging technologies. The ICT program equips students with the
fundamental understanding of computing and system administration, then
the professional knowledge of information systems, advanced databases,
and eventually management of projects.' metadata={'producer': 'WeasyPrint 53.3', 'creator': 'PyPDF', 'creationdate': '', 'source': 'data\\usth_ict_program.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}


In [5]:
# Print every chunk whose text contains "ICT", each followed by a divider line.
for chunk in chunks:
    if 'ICT' not in chunk.page_content:
        continue
    print(chunk, "_________", sep="\n")

page_content='Bachelor in ICT Program
1. PROGRAM INTRODUCTION
The ICT formation in USTH prepares graduates for careers in the fields of
Software Engineering and Computer Science. It focuses on applied research
and emerging technologies. The ICT program equips students with the
fundamental understanding of computing and system administration, then
the professional knowledge of information systems, advanced databases,
and eventually management of projects.' metadata={'producer': 'WeasyPrint 53.3', 'creator': 'PyPDF', 'creationdate': '', 'source': 'data\\usth_ict_program.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}
_________
page_content='and eventually management of projects.
After graduation, ICT students process a solid foundation in mathematics,
informatics, software engineering, and communication. They also have core
knowledge of computer science for higher education. Additionally, students
can specialize in particular areas such as mobile and web development,
security and s

In [7]:

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
persist_dir = "./chroma_pdf_db"

# Open (or create) the persisted store first and decide whether to index by
# looking at what it actually contains. Checking only that the directory exists
# would treat an existing-but-empty store as a valid cache and leave the
# retriever with nothing to search.
vectorstore = Chroma(persist_directory=persist_dir, embedding_function=embedding_model)
if vectorstore._collection.count() == 0:
    vectorstore.add_documents(chunks)
    vectorstore.persist()
# NOTE(review): a non-empty store is reused as-is, so changes to the PDFs are not
# picked up until the persist directory is deleted.

retriever = vectorstore.as_retriever()


  vectorstore = Chroma(persist_directory=persist_dir, embedding_function=embedding_model)


In [9]:
# Optional sanity check: print(vectorstore._collection.count())
collection  = vectorstore._collection
# Fetch a single stored record with its embedding included and keep only the vector.
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
print(vectorstore._collection)
# Check the dimensionality (number of elements in the vector).
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

Collection(name=langchain)
The vectors have 384 dimensions


In [None]:
# Display the raw values of the sample embedding vector.
sample_embedding

array([-6.22113198e-02,  5.80043606e-02,  4.63337526e-02, -4.63679992e-02,
       -2.97737923e-02, -5.56303784e-02,  4.90446016e-02,  2.17754114e-02,
       -3.69764157e-02,  3.05962302e-02, -1.27523288e-01, -5.88759594e-02,
        1.20368756e-01, -7.87599161e-02, -9.57339443e-03, -1.99690163e-02,
       -8.17926042e-03, -1.73477575e-01,  1.03767693e-01, -7.89336637e-02,
       -3.97640653e-03, -1.35015824e-03,  2.11512335e-02, -1.59017995e-01,
        1.83615535e-02,  3.40211368e-03,  3.37740518e-02, -7.36168176e-02,
        1.64607505e-03,  2.36419290e-02,  1.25774257e-02, -1.26344841e-02,
        7.72454813e-02,  8.66943970e-02, -1.75137203e-02,  4.15405398e-03,
        7.46792555e-02, -2.60252561e-02, -1.11188805e-02,  2.16082428e-02,
       -8.37049857e-02,  2.73499712e-02, -1.34544494e-02, -2.44980073e-03,
        4.03113775e-02, -2.27804724e-02, -2.08231565e-02,  5.72956121e-03,
       -7.19641820e-02, -1.04785408e-03, -3.92183550e-02, -2.04335302e-02,
        4.77338471e-02,  

In [None]:

# Load the tokenizer and seq2seq model for the chosen checkpoint, wrap them in a
# transformers text2text-generation pipeline, and hand that pipeline to LangChain.
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
)
llm = HuggingFacePipeline(pipeline=pipe)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)


In [None]:

# Prompt that receives the retrieved context and the user's question.
prompt_template = PromptTemplate.from_template(
    "Context: {context}\n\nQuestion: {question}\n\nAnswer:"
)

# Conversation memory registered under the "chat_history" key.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Wire the LLM, retriever, memory and custom prompt into one conversational chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt_template},
)


  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [None]:

# Chat with the system.
query = "i only need email of ICT department"
# `Chain.run` is deprecated in LangChain; `invoke` takes the inputs as a dict and
# returns a dict of outputs, from which the generated text is read via "answer".
response = conversation_chain.invoke({"question": query})["answer"]
print("Bot:", response)


Bot: ict_dept@usth.edu.vn
