In [2]:
import json
import os
from typing import List, Dict
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

def load_json_file(file_path: str) -> Dict:
    """Load a JSON file and return its contents as a dictionary."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def clean_text(text: str) -> str:
    """Clean the text by removing extra whitespace and certain patterns."""
    # Remove extra whitespace and unnecessary characters
    text = re.sub(r'\s+', ' ', text.strip())
    return text

def split_text(text: str, max_tokens: int = 512, chunk_overlap: int = 100) -> List[str]:
    """Split the text into chunks based on token length."""
    # Assuming 1 token is approximately 4 characters
    chunk_size = max_tokens * 4
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", "!", "?", ",", " ", ""],
        keep_separator=False,
    )
    
    return text_splitter.split_text(text)

def create_enriched_chunks(pdf_data: Dict, max_tokens: int = 512, chunk_overlap: int = 100) -> List[Dict]:
    """Create enriched chunks from PDF data."""
    text_content = clean_text(pdf_data['text'])  # Cleaned text from the PDF file
    chunks = split_text(text_content, max_tokens, chunk_overlap)
    
    enriched_chunks = []
    for i, chunk in enumerate(chunks):
        enriched_chunk = {
            "content": chunk,
            "metadata": {
                "file_name": pdf_data["file_name"],
                "title": pdf_data.get("title", "N/A"),
                "author": pdf_data.get("author", "N/A"),
                "creation_date": pdf_data.get("creation_date", "N/A"),
                "chunk_number": f"{i+1} of {len(chunks)}"
            }
        }
        enriched_chunks.append(enriched_chunk)
    
    return enriched_chunks

def process_all_pdfs(input_directory: str, output_directory: str, max_tokens: int = 512, chunk_overlap: int = 100):
    """Process all PDF JSON files in the input directory and save enriched chunks."""
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for filename in os.listdir(input_directory):
        if filename.endswith('.json'):
            input_path = os.path.join(input_directory, filename)
            output_path = os.path.join(output_directory, f"enriched_{filename}")
            
            pdf_data = load_json_file(input_path)
            enriched_chunks = create_enriched_chunks(pdf_data, max_tokens, chunk_overlap)
            
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(enriched_chunks, f, ensure_ascii=False, indent=2)

    print(f"Processed files saved in {output_directory}")

# Example usage
if __name__ == "__main__":
    input_dir = r"C:/Code/doan2/data/pdf"  # Đường dẫn tới thư mục chứa các file JSON đã scraping từ PDF
    output_dir = r"C:/Code/doan2/data/pdf/chunks"  # Đường dẫn lưu file JSON đã xử lý
    max_tokens = 512  # Số lượng tokens tối đa cho mỗi chunk
    chunk_overlap = 100  # Số lượng tokens trùng lặp giữa các chunk

    process_all_pdfs(input_dir, output_dir, max_tokens, chunk_overlap)


Processed files saved in C:/Code/doan2/data/pdf/chunks


In [4]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [8]:
import os
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
import json
import time

# Initialize Pinecone
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = "pdf-chunks"

# Check if the index already exists
if index_name not in pc.list_indexes().names():
    print(f"Creating new index: {index_name}")
    pc.create_index(
        name=index_name,
        dimension=768,  # Dimension cho 'multi-qa-mpnet-base-dot-v1'
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print("Waiting for index to be ready...")
    time.sleep(60)  # Chờ 60 giây để đảm bảo index đã sẵn sàng
else:
    print(f"Index {index_name} already exists")

# Get the index
index = pc.Index(index_name)

# Initialize the embedding model
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

def generate_embedding(text):
    """Generate embedding for the given text using the model."""
    return model.encode(text)

def process_and_store_pdf_embeddings(input_dir):
    """Process all JSON files containing PDF chunks and store their embeddings in Pinecone."""
    total_vectors = 0
    for filename in os.listdir(input_dir):
        if filename.endswith('.json'):
            try:
                with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as f:
                    chunks = json.load(f)
                
                print(f"Processing {filename} with {len(chunks)} chunks")
                
                vectors_to_upsert = []
                for chunk in chunks:
                    embedding = generate_embedding(chunk['content'])
                    vector_id = f"{chunk['metadata']['file_name']}_{chunk['metadata']['chunk_number']}"
                    vectors_to_upsert.append((vector_id, embedding.tolist(), chunk['metadata']))
                
                # Upsert to Pinecone in batches
                batch_size = 100
                for i in range(0, len(vectors_to_upsert), batch_size):
                    batch = vectors_to_upsert[i:i+batch_size]
                    try:
                        upsert_response = index.upsert(vectors=batch)
                        print(f"Upserted batch {i//batch_size + 1}. Response: {upsert_response}")
                        total_vectors += len(batch)
                    except Exception as e:
                        print(f"Error upserting batch: {str(e)}")
                
                print(f"Processed and stored embeddings for {filename}")
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

    print(f"Finished processing all chunks in {input_dir}")
    print(f"Total vectors upserted: {total_vectors}")
    return total_vectors

def count_total_vectors():
    """Count the total number of vectors in the Pinecone index."""
    stats = index.describe_index_stats()
    return stats['total_vector_count']

# Usage
input_dir = r"C:/Code/doan2/data/pdf/chunks"  # Đường dẫn tới thư mục chứa các file JSON đã chia nhỏ từ PDF
total_upserted = process_and_store_pdf_embeddings(input_dir)

print("Waiting for 60 seconds before checking total vectors...")
time.sleep(60)  # Chờ 60 giây để đảm bảo tất cả cập nhật đã được phản ánh

total_vectors = count_total_vectors()
print(f"Total vectors in index: {total_vectors}")

if total_vectors != total_upserted:
    print(f"Warning: Upserted {total_upserted} vectors, but index shows {total_vectors} vectors")


Creating new index: pdf-chunks
Waiting for index to be ready...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Processing enriched_Encyclopedia of Foods A Guide to Healthy Nutrition.json with 662 chunks
Upserted batch 1. Response: {'upserted_count': 100}
Upserted batch 2. Response: {'upserted_count': 100}
Upserted batch 3. Response: {'upserted_count': 100}
Upserted batch 4. Response: {'upserted_count': 100}
Upserted batch 5. Response: {'upserted_count': 100}
Upserted batch 6. Response: {'upserted_count': 100}
Upserted batch 7. Response: {'upserted_count': 62}
Processed and stored embeddings for enriched_Encyclopedia of Foods A Guide to Healthy Nutrition.json
Processing enriched_Encyclopedia of Human Nutrition.json with 4888 chunks
Upserted batch 1. Response: {'upserted_count': 100}
Upserted batch 2. Response: {'upserted_count': 100}
Upserted batch 3. Response: {'upserted_count': 100}
Upserted batch 4. Response: {'upserted_count': 100}
Upserted batch 5. Response: {'upserted_count': 100}
Upserted batch 6. Response: {'upserted_count': 100}
Upserted batch 7. Response: {'upserted_count': 100}
Upsert