Install packages

In [1]:
# Install the notebook's dependencies into the kernel's environment.
# `%pip` (rather than `!pip`) targets the environment the kernel is running in.
# NOTE(review): no versions are pinned, so a re-run may resolve different releases.

# PDF reader integration for LlamaIndex plus the PyMuPDF backend.
%pip install llama-index-readers-file pymupdf
# Chroma vector-store integration for LlamaIndex.
%pip install llama-index-vector-stores-chroma
# LlamaIndex itself.
%pip install llama-index
# Hugging Face embedding and LLM integrations for LlamaIndex.
%pip install llama-index-embeddings-huggingface
%pip install llama-index-llms-huggingface
# Transformers with the torch extra.
%pip install transformers[torch]
# Chroma database client.
%pip install chromadb

Collecting llama-index-readers-file
  Using cached llama_index_readers_file-0.4.11-py3-none-any.whl.metadata (5.3 kB)
Collecting pymupdf
  Using cached pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting beautifulsoup4<5,>=4.12.3 (from llama-index-readers-file)
  Using cached beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting defusedxml>=0.7.1 (from llama-index-readers-file)
  Using cached defusedxml-0.7.1-py2.py3-none-any.whl.metadata (32 kB)
Collecting llama-index-core<0.13,>=0.12.0 (from llama-index-readers-file)
  Using cached llama_index_core-0.12.52.post1-py3-none-any.whl.metadata (2.5 kB)
Collecting pandas<2.3.0 (from llama-index-readers-file)
  Using cached pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting pypdf<6,>=5.1.0 (from llama-index-readers-file)
  Using cached pypdf-5.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting striprtf<0.0.27,>=0.0.26 (from llama-index-readers-file)
  Using cached striprtf-0.0.26-py3-none-any.whl.

Import libraries

In [2]:
import os
import requests

# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# slm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from llama_index.llms.huggingface import HuggingFaceLLM
# load data
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader

# split documents
from llama_index.core.node_parser import SentenceSplitter

# create node
from llama_index.core.schema import TextNode

# Vector store
from IPython.display import Markdown, display
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

  from .autonotebook import tqdm as notebook_tqdm


Load Data

In [3]:
# Create the "data" directory if it does not already exist.
os.makedirs("data", exist_ok=True)

# Download the PDF.
url = "https://arxiv.org/pdf/2307.09288.pdf"
headers = {"User-Agent": "Chrome"}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=60)
# Fail loudly on an HTTP error status (4xx/5xx) instead of silently saving
# an error page under a ".pdf" name, which would only break later on parsing.
response.raise_for_status()

# Save the downloaded bytes into the "data" directory.
with open("data/llama2.pdf", "wb") as f:
    f.write(response.content)

print("Tải xong!")

Tải xong!


Embedding model

In [4]:
# Embedding model used both when building the index and when loading it back.
embed_model = HuggingFaceEmbedding(
    model_name="all-MiniLM-L6-v2",
)

SLM Phi-2

In [5]:
# Wrap the microsoft/phi-2 checkpoint as the LLM used by the query engine.
# NOTE(review): the recorded run warned that a generation call would exceed
# the model's maximum length (2048) — consider leaving headroom between
# context_window and max_new_tokens; confirm against the model config.
llm = HuggingFaceLLM(
    model_name="microsoft/phi-2",
    tokenizer_name="microsoft/phi-2",
    context_window=2048,
    max_new_tokens=256,
    is_chat_model=False,  # phi-2 is not a chat model
    device_map="auto",
    model_kwargs={"torch_dtype": "auto"},
    tokenizer_kwargs={"padding_side": "left"},
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  8.93it/s]
Some parameters are on the meta device because they were offloaded to the disk and cpu.


Data Loader

In [6]:
# Read the downloaded PDF into `documents` using the PyMuPDF-based reader.
loader = PyMuPDFReader()
documents = loader.load(
    file_path="./data/llama2.pdf",
)

Text Splitter

In [7]:
# Splitter that breaks document text into chunks (chunk_size=1024).
text_parser = SentenceSplitter(chunk_size=1024)


Text Chunks

In [8]:
# Split every document's text into chunks. `doc_idxs` runs parallel to
# `text_chunks`: entry i holds the position in `documents` of the document
# that chunk i came from, so its metadata can be looked up afterwards.
text_chunks, doc_idxs = [], []
for source_idx, source_doc in enumerate(documents):
    pieces = text_parser.split_text(source_doc.text)
    text_chunks += pieces
    doc_idxs += [source_idx] * len(pieces)

Create Nodes

In [9]:
# Build one TextNode per chunk and attach the metadata of the document the
# chunk was split from (found through the parallel `doc_idxs` list).
nodes = []
for text_chunk, doc_idx in zip(text_chunks, doc_idxs):
    src_doc = documents[doc_idx]
    node = TextNode(text=text_chunk)
    node.metadata = src_doc.metadata
    nodes.append(node)

Save DB

In [10]:
# Open (or create) the on-disk Chroma database and the "Test" collection,
# then expose it to LlamaIndex as a vector store.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("Test")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The collection persists across kernel restarts, so unconditionally indexing
# `nodes` would insert every chunk again on each re-run and retrieval would
# start returning duplicates. Only embed and insert when the collection is
# still empty; otherwise reuse what is already stored.
# To force a rebuild (e.g. after changing the chunking or the embedding
# model), delete the "./chroma_db" directory and run this cell again.
if chroma_collection.count() == 0:
    index = VectorStoreIndex(
        nodes,
        storage_context=storage_context,
        embed_model=embed_model,
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )

Load DB

In [11]:
# load from disk
db2 = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db2.get_or_create_collection("Test")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embed_model,
)

Query + Retrieve

In [12]:
# Query Data
query_engine = index.as_query_engine(llm)
response = query_engine.query("What is llama2?")
display(Markdown(f"<b>{response}</b>"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<b>
Llama 2 is a language model developed by Meta AI. It is an auto-regressive language model that uses an optimized transformer architecture. Llama 2 comes in a range of parameter sizes—7B, 13B, and 70B—as well as pretrained and fine-tuned variations. Llama 2-Chat models outperform open-source models by a significant margin on both single turn and multi-turn prompts. Particularly, Llama 2-Chat 7B model outperforms MPT-7B-chat on 60% of the prompts. Llama 2-Chat 34B has an overall win rate of more than 75% against equivalently sized Vicuna-33B and Falcon 40B models.
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: 
Llama 2 is a language model developed by Meta AI. It is an auto-regressive language model that uses an optimized transformer architecture. Llama 2 comes in a range of parameter sizes—7B, 13B, and 70B—as well as pretrained and fine-tuned variations. Llama 2-Chat models outperform open-</b>