In [1]:
import os
import json
from dotenv import load_dotenv
from llama_index.core import (
    VectorStoreIndex,
    Settings,
    Document,
    StorageContext,
)
from llama_index.vector_stores.chroma import ChromaVectorStore

from langchain_huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding

# Load environment variables
load_dotenv()
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

In [2]:
def split_text_sliding_window(text, max_tokens=512, overlap=50):
    words = text.split()  # Giả sử mỗi từ là 1 token
    chunks = []
    start = 0

    while start < len(words):
        chunk = words[start : start + max_tokens]
        chunks.append(" ".join(chunk))
        start += max_tokens - overlap  # Trượt cửa sổ với độ chồng lấn

    return chunks

def load_and_process_json(json_path):  
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    documents = []
    
    for item in data:
        content = item.get("content", "")
        unit_info = f"Unit: {item.get('unit')}" if item.get('unit') else ""
        section_info = f"Section: {item.get('section')}" if item.get('section') else ""
        type_info = f"Type: {item.get('type')}" if item.get('type') else ""
        
        # Add "passage: " prefix for document content
        enriched_text = f"passage: {unit_info}\n{section_info}\n{type_info}\n{content}".strip()
        
        metadata = item.get("metadata", {})
        metadata.update({
            "id": item.get("id"),
            "unit": item.get("unit"),
            "section": item.get("section"),
        })
        
        # Split text if it's too long
        chunks = split_text_sliding_window(enriched_text, max_tokens=500, overlap=50)
        
        if len(chunks) > 1:
            for i, chunk in enumerate(chunks):
                chunk_metadata = metadata.copy()
                chunk_metadata.update({
                    "id": f"{metadata['id']}_chunk_{i}",
                })
                doc = Document(text=chunk, metadata=chunk_metadata)
                documents.append(doc)
        else:
            doc = Document(text=enriched_text, metadata=metadata)
            documents.append(doc)
    
    return documents

def process_all_json_files(directory):
    documents = []
    
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            file_path = os.path.join(directory, filename)
            documents.extend(load_and_process_json(file_path))
    
    return documents

In [3]:
json_dir = "/home/buma04/ai-tutor/data/book_content/"
documents = process_all_json_files(json_dir)
lc_embed_model = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-small"
)
embed_model = LangchainEmbedding(lc_embed_model)
Settings.embed_model = embed_model

In [6]:
file_path = "/home/buma04/ai-tutor/data/book_content/unit1_getting_started.json"
documents.extend(load_and_process_json(file_path))

In [7]:
import chromadb

# create client and a new collection
# chromadb.EphemeralClient saves data in-memory.
chroma_client = chromadb.PersistentClient(path="../chromadb")

# Delete collection if it exists
try:
    chroma_client.delete_collection("unit1_db")
except:
    pass

chroma_collection = chroma_client.create_collection("unit1_db")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Create index
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
    storage_context=storage_context,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/90 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/91 [00:00<?, ?it/s]

In [16]:
# Truy xuất các tài liệu tương tự với một câu truy vấn
retriever = index.as_retriever(similarity_top_k=10)
retrieved_nodes = retriever.retrieve("query: " + "The dialogue in Section Getting Started Unit 1")

print("\nCác tài liệu tương tự:")
for i, node in enumerate(retrieved_nodes):
    print(f"Tài liệu #{i+1} (Score: {node.score}):")
    print(f"Metadata: {node.metadata}")
    print(f"Nội dung: {node.text[:500]}...\n")
    print("-"*200)

# Kiểm tra thông tin về index
print(f"\nTổng số tài liệu trong index: {len(documents)}")

Number of requested results 10 is greater than number of elements in index 8, updating n_results = 8



Các tài liệu tương tự:
Tài liệu #1 (Score: 0.7601767550815584):
Metadata: {'page': '9', 'chunk_type': 'paragraph', 'related_chunks': 'unit_1_page_8_2, unit_1_page_9_1', 'id': 'unit_1_page_9_4', 'unit': '1. Life stories we admire', 'section': 'Getting Started'}
Nội dung: passage: Unit: 1. Life stories we admire
Section: Getting Started
Type: text
4 Complete the sentences based on the conversation.
Dang Thuy Tram was a young surgeon. She (1) _________ her diary while she (2) _________ in a field hospital during the war. One day, she (3) _________ while she (4) _________ in the jungle. She was only 27 then. An American soldier (5) _________ her diary for many years before returning a copy to her family....

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Tài liệu #2 (Score: 0.7550540918152124):
Metadata: {'page': '9', 'chunk_type': 'list

In [17]:
# ... existing code ...
from transformers import AutoTokenizer

def analyze_token_count(text, tokenizer):
    """
    Analyze token count for a given text.
    
    Args:
        text (str): Input text to analyze
        tokenizer: HuggingFace tokenizer instance
        
    Returns:
        int: Number of tokens
    """
    tokens = tokenizer.encode(text)
    return len(tokens)

def load_and_process_json(json_path, tokenizer=None):
    """
    Load a JSON file and process its content to extract text data.
    
    Args:
        json_path (str): Path to the JSON file.
        tokenizer: Optional tokenizer for token count analysis
    
    Returns:
        list: A list of Document objects containing processed text and metadata.
    """
    # Load JSON data
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    documents = []
    
    for item in data:
        content = item.get("content", "")
        unit_info = f"Unit: {item.get('unit')}" if item.get('unit') else ""
        section_info = f"Section: {item.get('section')}" if item.get('section') else ""
        type_info = f"Type: {item.get('type')}" if item.get('type') else ""
        enriched_text = f"{unit_info}\n{section_info}\n{type_info}\n{content}".strip()
        
        metadata = item.get("metadata", {})
        metadata.update({
            "id": item.get("id"),
            "unit": item.get("unit"),
            "section": item.get("section"),
        })
        
        # Add token count to metadata if tokenizer is provided
        if tokenizer:
            token_count = analyze_token_count(enriched_text, tokenizer)
            if token_count > 512:
                print(item)
            metadata["token_count"] = token_count
            
        # Create a Document object
        doc = Document(text=enriched_text, metadata=metadata)
        documents.append(doc)
    
    return documents

def process_all_json_files(directory, tokenizer):
    """
    Process all JSON files in the specified directory.
    
    Args:
        directory (str): Path to the directory containing JSON files.
    
    Returns:
        list: A list of Document objects from all processed JSON files.
    """
    documents = []
    
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            file_path = os.path.join(directory, filename)
            documents.extend(load_and_process_json(file_path, tokenizer))
    
    return documents

# Example usage:
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small")
documents = process_all_json_files(json_dir, tokenizer)

# Analyze token distribution
token_counts = [doc.metadata["token_count"] for doc in documents]
print(f"Total chunks: {len(token_counts)}")
print(f"Average tokens per chunk: {sum(token_counts)/len(token_counts):.2f}")
print(f"Max tokens: {max(token_counts)}")
print(f"Min tokens: {min(token_counts)}")

Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors


{'id': 'unit_1_page_11_6', 'unit': 'Life stories we admire', 'section': 'READING', 'type': 'text', 'content': '2 Read the article. Choose the words or phrases with the closest meaning to the highlighted words or phrases in the text.\nSTEVE JOBS’ LIFE AND ACHIEVEMENTS\nA.\nSteven Paul Jobs was born on 24 February, 1955 in San Francisco, USA. His biological parents were not married and gave him up for adoption. He was **adopted** by Clara and Paul Jobs. In 1971, Jobs met Steve Wozniak, who was five years older than him, but they bonded over their love of electronics. After high school, Jobs attended Reed College in Oregon, but found the classes boring and **dropped out** after six months.\nB.\nWhen Jobs was 21, he and Wozniak started Apple Computers in Jobs’ family garage with money they got by selling Jobs’ van and Wozniak’s scientific calculator. By making computers smaller, cheaper, and accessible to everyday users, their company became a huge success and sales quickly increased.\nAlt