Install package

In [None]:
# %pip install llama-index-readers-file pymupdf
# %pip install llama-index-vector-stores-chroma
# %pip install llama-index
# %pip install llama-index-embeddings-huggingface
# %pip install chromadb
# %pip install llama-index-llms-groq
# %pip install fastapi
# %pip install uvicorn

Collecting fastapi
  Downloading fastapi-0.116.1-py3-none-any.whl.metadata (28 kB)
Collecting starlette<0.48.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.47.2-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.116.1-py3-none-any.whl (95 kB)
Downloading starlette-0.47.2-py3-none-any.whl (72 kB)
Installing collected packages: starlette, fastapi

   ---------------------------------------- 0/2 [starlette]
   ---------------------------------------- 0/2 [starlette]
   ---------------------------------------- 0/2 [starlette]
   ---------------------------------------- 0/2 [starlette]
   -------------------- ------------------- 1/2 [fastapi]
   -------------------- ------------------- 1/2 [fastapi]
   -------------------- ------------------- 1/2 [fastapi]
   -------------------- ------------------- 1/2 [fastapi]
   -------------------- ------------------- 1/2 [fastapi]
   ---------------------------------------- 2/2 [fastapi]

Successfully installed fastapi-0.116.1 starlette-0.4

Import library

In [None]:
import os
import requests

# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# llm api
from llama_index.llms.groq import Groq
# load data
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader

# split documents
from llama_index.core.node_parser import SentenceSplitter

# create node
from llama_index.core.schema import TextNode

# Vector store
from IPython.display import Markdown, display
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

#Load api key
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


Load Data

In [3]:
# Tạo thư mục "data" nếu chưa tồn tại
os.makedirs("data", exist_ok=True)

# Tải file PDF
url = "https://arxiv.org/pdf/2307.09288.pdf"
headers = {"User-Agent": "Chrome"}

response = requests.get(url, headers=headers)

# Lưu file vào thư mục "data"
with open("data/llama2.pdf", "wb") as f:
    f.write(response.content)

print("Tải xong!")

Tải xong!


Embedding model

In [2]:
embed_model = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")

LLM QWEN API

In [3]:
load_dotenv()  # load biến môi trường từ file .env
API_KEY = os.getenv("MY_API_KEY")
llm = Groq(model="qwen/qwen3-32b", api_key=API_KEY)

Data Loader

In [6]:
loader = PyMuPDFReader()
documents = loader.load(file_path="./data/llama2.pdf")

Text Splitter

In [7]:
text_parser = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)


Text Chunks

In [8]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_parser.split_text(doc.text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

Create Nodes

In [9]:
nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc = documents[doc_idxs[idx]]
    node.metadata = src_doc.metadata
    nodes.append(node)

Save DB

In [10]:
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("Test")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(nodes,
                         storage_context=storage_context,
                         embed_model=embed_model)

Load DB

In [4]:
# load from disk
db2 = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db2.get_or_create_collection("Test")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model, storage_context=storage_context)

Query + Retrieve

In [5]:
# Query Data
query_engine = index.as_query_engine(llm)
response = query_engine.query("What is llama2?")
display(Markdown(f"<b>{response}</b>"))

<b><think>
Okay, let's tackle this query. The user is asking, "What is llama2?" I need to use the provided context information to form an answer.

First, I'll look through the context. There's a model card mentioned in the first part. It says Llama 2 comes in different parameter sizes: 7B, 13B, and 70B. It's developed by Meta AI and has both pretrained and fine-tuned versions. The input is text-only, and the output is text-only. The architecture is an autoregressive transformer with SFT and RLHF for alignment. Training dates are between January and July 2023. The license is a custom commercial one, and there's info on intended use for commercial and research in English.

The second part of the context has a figure showing the evolution of Llama 2-Chat through different versions, using RLHF and SFT. It mentions evaluations against ChatGPT and other models, with Llama 2-Chat performing well in helpfulness and harmlessness. Human evaluations also show it outperforms open-source models.

Putting this together, the answer should mention the different versions (pretrained vs. fine-tuned), parameter sizes, training methods (SFT, RLHF), intended uses, and performance highlights. Also, note the ethical considerations and licensing. Avoid mentioning the context directly, just the facts from it. Make sure to structure it clearly and concisely.
</think>

Llama 2 is a series of large language models developed by Meta AI, available in three parameter sizes: 7B, 13B, and 70B. It includes both **pretrained** and **fine-tuned** variants, with the latter optimized for chat-like interactions using techniques like supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF). The models are designed for text-only input and output, leveraging an optimized transformer architecture. 

Key features include:
- **Training**: Pretrained on 2 trillion tokens (data cutoff September 2022) and fine-tuned with over 1 million human-annotated examples.
- **Performance**: Evaluations show Llama 2-Chat variants outperform open-source and closed-source competitors in helpfulness and safety metrics, with versions like Llama 2-Chat 7B and 34B achieving high win rates in comparisons.
- **Use Cases**: Intended for commercial and research applications in English, with ethical guidelines emphasizing safety testing and adherence to licensing terms.
- **Availability**: Released under a custom commercial license, with access details provided by Meta. 

The models were trained between January and July 2023, with a focus on aligning outputs to human preferences for safety and helpfulness. Meta also highlights efforts to offset the carbon footprint of training.</b>