## Setting up the environment

In [1]:
%load_ext autoreload
%autoreload 2

import os
from dotenv import load_dotenv
load_dotenv()

# Setup embedding model
from llama_index.core import Settings

# Use local embedding models served by LM Studio
# Use fake API key (LM Studio doesn't validate it)
from llama_index.embeddings.openai import OpenAIEmbedding
# embed_model = OpenAIEmbedding(
#     api_base = os.getenv("LM_STUDIO_API_BASE"),
#     api_key = "whatever-it-is",
#     model_name = os.getenv("LM_STUDIO_EMBED_MODEL"),
#     embed_batch_size = 2
# )

# # Embedding model verification
# Settings.embed_model = embed_model
# embed = embed_model.get_text_embedding("The quick brown fox jumps over the lazy dog.")
# print(embed[:5])  # Should print a list of floats

In [2]:
# Import the embedding solution
from embedding_solution import create_embedding_model

# Use raw string to avoid escape sequence warnings
model_path = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

print(f"Loading model from: {model_path}")

# Create the embedding model with fallback
try:
    embed_model = create_embedding_model(model_path, fallback=True)
    print(f"‚úì Successfully created embedding model: {type(embed_model).__name__}")
    
    # Test the embedding
    test_text = "Silicon battery technology is promising."
    embedding = embed_model.get_text_embedding(test_text)
    print(f"‚úì Test embedding generated successfully")
    print(f"Embedding dimension: {len(embedding)}")
    
    # Set as default for llama-index
    try:
        from llama_index.core import Settings
        Settings.embed_model = embed_model
        print("‚úì Successfully set as default embedding model for llama-index")
        
        # Test llama-index integration
        test_embedding = Settings.embed_model.get_text_embedding("Test query")
        print(f"‚úì Llama-index integration working")
        
    except ImportError:
        print("llama-index not available, but embedding model works")
    except Exception as e:
        print(f"Llama-index integration failed: {e}")
    
except Exception as e:
    print(f"‚úó Failed to create embedding model: {e}")
    print("Trying fallback models...")
    
    try:
        embed_model = create_embedding_model(fallback=True)
        print(f"‚úì Created fallback embedding model: {type(embed_model).__name__}")
        
        # Set as default for llama-index
        try:
            from llama_index.core import Settings
            Settings.embed_model = embed_model
            print("‚úì Successfully set fallback model as default for llama-index")
        except ImportError:
            print("llama-index not available, but fallback embedding model works")
            
    except Exception as e2:
        print(f"‚úó Even fallback failed: {e2}")
        embed_model = None

print("\n" + "="*50)
if embed_model:
    print("SUCCESS: Embedding model is ready to use!")
    print("You can now use llama-index with this embedding model")
else:
    print("FAILED: No working embedding model found")
    print("Please check your dependencies and model path")
print("="*50) 

‚úì Llama-index embeddings available


  from .autonotebook import tqdm as notebook_tqdm


‚úì Transformers version: 4.51.3
‚úó Sentence-transformers not available
Loading model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
‚úì Qwen3 model loaded successfully (Method 1)
‚úì Successfully created embedding model: Qwen3Embedding
‚úì Test embedding generated successfully
Embedding dimension: 1024
‚úì Successfully set as default embedding model for llama-index
‚úì Llama-index integration working

SUCCESS: Embedding model is ready to use!
You can now use llama-index with this embedding model


In [3]:
# Embedding model verification
Settings.embed_model = embed_model
embed = embed_model.get_text_embedding("The quick brown fox jumps over the lazy dog.")
print(embed[:5])  # Should print a list of floats

[2.673593044281006, -6.904200553894043, -0.5414437055587769, 1.5437066555023193, 0.8829537630081177]


In [4]:
# ‰ΩøÁî®Êô∫Ë∞±ÂÖçË¥πÊ®°ÂûãÔºåÊèêÂèñËäÇÁÇπÂÖ≥Á≥ª
from llama_index.llms.zhipuai import ZhipuAI
ZHIPU_API_KEY = "41b29e65745d4110a018c5d616b0012f.A6CEwmornnYXSVLC"
ZHIPU_LLM_MODEL_NAME = "glm-4-flash"
llm_extraction = ZhipuAI(
    api_key=ZHIPU_API_KEY,
    model=ZHIPU_LLM_MODEL_NAME
)
Settings.llm = llm_extraction

# Verify the LLM
print(llm_extraction.complete("\nBriefly introduce yourself in 50 Chinese characters."))

ÊàëÊòØ‰∫∫Â∑•Êô∫ËÉΩÂä©ÊâãÔºå‰∏∫ÊÇ®ÊúçÂä°„ÄÇ


## Pipeline 1: Collecting & preparing the documents

In [None]:
# from llama_index.core import SimpleDirectoryReader

# # Load documents
# documents = SimpleDirectoryReader("../papers").load_data()
# print(documents[2])

Doc ID: 1447df38-7d1d-491b-9f4a-8cbd6a401164
Text: Deren Yang Editor Handbook of Photovoltaic Silicon With 578
Figures and 71 Tables


In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
from llama_index.core import Document

pdf_files = glob.glob("../zotero/*.pdf")
all_docs = []

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", "„ÄÇ", "ÔºÅ", "Ôºü", "!", "?"]
)

for pdf in pdf_files:
    loader = PyPDFLoader(pdf)
    documents = loader.load()
    docs = splitter.split_documents(documents)
    all_docs.extend(docs)

print(f"ÂÖ±Âä†ËΩΩÂàÜÂâ≤ {len(all_docs)} ‰∏™ÊñáÊ°£Âùó")
print(all_docs[0].page_content[:500])

# ËΩ¨‰∏∫ llama_index Document
documents = [Document(text=doc.page_content) for doc in all_docs]

ÂÖ±Âä†ËΩΩÂàÜÂâ≤ 13528 ‰∏™ÊñáÊ°£Âùó
Handbook of 
Photovoltaic 
Silicon
Deren Yang
Editor


In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
from llama_index.core import Document

import pdfplumber
import fitz  # PyMuPDF
import os
from datetime import datetime
import hashlib

import warnings
warnings.filterwarnings("ignore")

pdf_files = glob.glob("../zotero/*.pdf")
all_docs = []

# ÂàÜÂâ≤Âô®
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", "„ÄÇ", "ÔºÅ", "Ôºü", "!", "?"]
)

def extract_pdf_basic_info(pdf_path):
    """ÊèêÂèñPDFÂü∫Êú¨‰ø°ÊÅØ‰Ωú‰∏∫metadata"""
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata
        doc.close()
        
        # Ëé∑ÂèñÊñá‰ª∂‰ø°ÊÅØ
        file_stats = os.stat(pdf_path)
        creation_time = datetime.fromtimestamp(file_stats.st_ctime).isoformat()
        modification_time = datetime.fromtimestamp(file_stats.st_mtime).isoformat()
        file_size = file_stats.st_size
        
        return {
            'title': metadata.get('title', ''),
            'author': metadata.get('author', ''),
            'subject': metadata.get('subject', ''),
            'creator': metadata.get('creator', ''),
            'producer': metadata.get('producer', ''),
            'creation_date': metadata.get('creationDate', ''),
            'modification_date': metadata.get('modDate', ''),
            'file_creation_time': creation_time,
            'file_modification_time': modification_time,
            'file_size_bytes': file_size
        }
    except Exception as e:
        print(f"Warning: Could not extract metadata from {pdf_path}: {e}")
        return {}

for pdf in pdf_files:
    file_name = os.path.basename(pdf)
    file_path = os.path.abspath(pdf)
    
    # ÊèêÂèñPDFÂü∫Êú¨‰ø°ÊÅØ
    pdf_metadata = extract_pdf_basic_info(pdf)
    
    # 1. ÊñáÊú¨ÂùóÊèêÂèñÔºàÂ¢ûÂº∫ÁâàÔºöÂåÖÂê´ÂÆåÊï¥metadataÔºâ
    loader = PyPDFLoader(pdf)
    documents = loader.load()
    docs = splitter.split_documents(documents)
    
    for chunk_idx, doc in enumerate(docs):
        # ÂàõÂª∫ËØ¶ÁªÜÁöÑmetadata
        chunk_metadata = {
            # Ê∫êÊñá‰ª∂‰ø°ÊÅØ
            'source_file': file_name,
            'source_path': file_path,
            'file_type': 'pdf',
            
            # PDFÊñáÊ°£‰ø°ÊÅØ
            'pdf_title': pdf_metadata.get('title', ''),
            'pdf_author': pdf_metadata.get('author', ''),
            'pdf_subject': pdf_metadata.get('subject', ''),
            'pdf_creator': pdf_metadata.get('creator', ''),
            'pdf_creation_date': pdf_metadata.get('creation_date', ''),
            
            # Êñá‰ª∂Á≥ªÁªü‰ø°ÊÅØ
            'file_size_bytes': pdf_metadata.get('file_size_bytes', 0),
            'file_creation_time': pdf_metadata.get('file_creation_time', ''),
            'file_modification_time': pdf_metadata.get('file_modification_time', ''),
            
            # Âùó‰ø°ÊÅØ
            'chunk_type': 'text',
            'chunk_index': chunk_idx,
            'page_number': doc.metadata.get('page', 'unknown'),
            'chunk_size': len(doc.page_content),
            
            # ÂéüÂßãPyPDFLoader metadata
            'original_metadata': doc.metadata,
            
            # Â§ÑÁêÜÊó∂Èó¥Êà≥
            'processing_timestamp': datetime.now().isoformat(),
            
            # ÂÜÖÂÆπÂìàÂ∏åÔºàÁî®‰∫éÂéªÈáçÂíåÁâàÊú¨ÊéßÂà∂Ôºâ
            'content_hash': hashlib.md5(doc.page_content.encode()).hexdigest(),
            
            # ÁªüËÆ°‰ø°ÊÅØ
            'word_count': len(doc.page_content.split()),
            'char_count': len(doc.page_content)
        }
        
        # ÂàõÂª∫Â∏¶metadataÁöÑDocument
        enhanced_doc = Document(
            text=doc.page_content,
            metadata=chunk_metadata
        )
        all_docs.append(enhanced_doc)

    # 2. Ë°®Ê†ºÊèêÂèñÔºàÂ¢ûÂº∫ÁâàÔºöÂåÖÂê´ÂÆåÊï¥metadataÔºâ
    with pdfplumber.open(pdf) as pdf_doc:
        for page_num, page in enumerate(pdf_doc.pages):
            tables = page.extract_tables()
            for table_idx, table in enumerate(tables):
                # ËΩ¨‰∏∫ÁªìÊûÑÂåñÊñáÊú¨
                table_text = "\n".join([", ".join([str(cell) if cell is not None else "" for cell in row]) for row in table])
                
                # Ë°®Ê†ºÊ†áËØÜ‰ø°ÊÅØ
                table_header = f"„ÄêË°®Ê†º„ÄëÊñá‰ª∂:{file_name}, È°µÁ†Å:{page_num+1}, Ë°®Ê†ºÂ∫èÂè∑:{table_idx+1}"
                full_table_text = f"{table_header}\n{table_text}"
                
                # Ë°®Ê†ºmetadata
                table_metadata = {
                    # Ê∫êÊñá‰ª∂‰ø°ÊÅØ
                    'source_file': file_name,
                    'source_path': file_path,
                    'file_type': 'pdf',
                    
                    # PDFÊñáÊ°£‰ø°ÊÅØ
                    'pdf_title': pdf_metadata.get('title', ''),
                    'pdf_author': pdf_metadata.get('author', ''),
                    'pdf_subject': pdf_metadata.get('subject', ''),
                    
                    # Ë°®Ê†ºÁâπÂÆö‰ø°ÊÅØ
                    'chunk_type': 'table',
                    'page_number': page_num + 1,
                    'table_index': table_idx + 1,
                    'table_rows': len(table),
                    'table_cols': len(table[0]) if table else 0,
                    
                    # Â§ÑÁêÜ‰ø°ÊÅØ
                    'processing_timestamp': datetime.now().isoformat(),
                    'content_hash': hashlib.md5(full_table_text.encode()).hexdigest(),
                    'word_count': len(full_table_text.split()),
                    'char_count': len(full_table_text)
                }
                
                table_doc = Document(
                    text=full_table_text,
                    metadata=table_metadata
                )
                all_docs.append(table_doc)

    # 3. ÂõæÂÉè/ÂõæË°®ÊèêÂèñÔºàÂ¢ûÂº∫ÁâàÔºöÂåÖÂê´ÂÆåÊï¥metadataÔºâ
    doc = fitz.open(pdf)
    for page_index in range(len(doc)):
        page = doc[page_index]
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            img_ext = base_image["ext"]
            img_name = f"{os.path.splitext(file_name)[0]}_page{page_index+1}_img{img_index+1}.{img_ext}"
            img_save_path = os.path.join("./extracted_images", img_name)
            os.makedirs("./extracted_images", exist_ok=True)
            
            with open(img_save_path, "wb") as f:
                f.write(image_bytes)
            
            # ÂõæÂÉèÊèèËø∞ÊñáÊú¨
            image_description = f"„ÄêÂõæË°®„ÄëÊñá‰ª∂:{file_name}, È°µÁ†Å:{page_index+1}, ÂõæÁâáÂ∫èÂè∑:{img_index+1}, Ë∑ØÂæÑ:{img_save_path}"
            
            # ÂõæÂÉèmetadata
            image_metadata = {
                # Ê∫êÊñá‰ª∂‰ø°ÊÅØ
                'source_file': file_name,
                'source_path': file_path,
                'file_type': 'pdf',
                
                # PDFÊñáÊ°£‰ø°ÊÅØ
                'pdf_title': pdf_metadata.get('title', ''),
                'pdf_author': pdf_metadata.get('author', ''),
                'pdf_subject': pdf_metadata.get('subject', ''),
                
                # ÂõæÂÉèÁâπÂÆö‰ø°ÊÅØ
                'chunk_type': 'image',
                'page_number': page_index + 1,
                'image_index': img_index + 1,
                'image_format': img_ext,
                'image_path': img_save_path,
                'image_size_bytes': len(image_bytes),
                
                # ÂõæÂÉèÊäÄÊúØ‰ø°ÊÅØ
                'image_width': base_image.get('width', 0),
                'image_height': base_image.get('height', 0),
                'image_colorspace': base_image.get('colorspace', 'unknown'),
                
                # Â§ÑÁêÜ‰ø°ÊÅØ
                'processing_timestamp': datetime.now().isoformat(),
                'content_hash': hashlib.md5(image_description.encode()).hexdigest(),
                'word_count': len(image_description.split()),
                'char_count': len(image_description)
            }
            
            image_doc = Document(
                text=image_description,
                metadata=image_metadata
            )
            all_docs.append(image_doc)
    
    doc.close()

print(f"ÂÖ±Âä†ËΩΩÂàÜÂâ≤ {len(all_docs)} ‰∏™ÊñáÊ°£ÂùóÔºàÂê´ÊñáÊú¨„ÄÅË°®Ê†º„ÄÅÂõæË°®‰ø°ÊÅØÔºâ")
print(f"ÊñáÊ°£Á±ªÂûãÂàÜÂ∏É:")
chunk_types = {}
for doc in all_docs:
    chunk_type = doc.metadata.get('chunk_type', 'unknown')
    chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1

for chunk_type, count in chunk_types.items():
    print(f"  {chunk_type}: {count} ‰∏™")

print(f"\nÁ§∫‰æãÊñáÊ°£Âùómetadata:")
if all_docs:
    example_doc = all_docs[0]
    print(f"ÊñáÊú¨È¢ÑËßà: {example_doc.text[:200]}...")
    print(f"Metadata keys: {list(example_doc.metadata.keys())}")
    print(f"Ê∫êÊñá‰ª∂: {example_doc.metadata.get('source_file', 'N/A')}")
    print(f"ÂùóÁ±ªÂûã: {example_doc.metadata.get('chunk_type', 'N/A')}")
    print(f"È°µÁ†Å: {example_doc.metadata.get('page_number', 'N/A')}")

# ÂêéÁª≠‰∏éÂéü pipeline ‰øùÊåÅ‰∏ÄËá¥
documents = all_docs

Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa3' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color becau

ÂÖ±Âä†ËΩΩÂàÜÂâ≤ 18727 ‰∏™ÊñáÊ°£ÂùóÔºàÂê´ÊñáÊú¨„ÄÅË°®Ê†º„ÄÅÂõæË°®‰ø°ÊÅØÔºâ
ÊñáÊ°£Á±ªÂûãÂàÜÂ∏É:
  text: 13528 ‰∏™
  table: 879 ‰∏™
  image: 4320 ‰∏™

Á§∫‰æãÊñáÊ°£Âùómetadata:
ÊñáÊú¨È¢ÑËßà: Handbook of 
Photovoltaic 
Silicon
Deren Yang
Editor...
Metadata keys: ['source_file', 'source_path', 'file_type', 'pdf_title', 'pdf_author', 'pdf_subject', 'pdf_creator', 'pdf_creation_date', 'file_size_bytes', 'file_creation_time', 'file_modification_time', 'chunk_type', 'chunk_index', 'page_number', 'chunk_size', 'original_metadata', 'processing_timestamp', 'content_hash', 'word_count', 'char_count']
Ê∫êÊñá‰ª∂: 978-3-662-56472-1.pdf
ÂùóÁ±ªÂûã: text
È°µÁ†Å: 0


## Pipeline 2: Creating vector store

In [3]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

# Path for vector store and dataset
vector_dataset = "./dataset/vector_storage_with_metadata" # local storage
vector_store_path = vector_dataset
dataset_path = vector_dataset

  import pkg_resources  # type: ignore


In [11]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

# Path for vector store and dataset
vector_dataset = "./dataset/vector_storage_with_metadata" # local storage
vector_store_path = vector_dataset
dataset_path = vector_dataset

# Create an index over the documents
# Overwrites the existing dataset if True
ow = True

if ow==True:
    try:
        vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=ow)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(documents, storage_context, show_progress=True)
    except Exception as e:
        print(f"An error occurred: {e}")
        print(f"Error type: {type(e)}")
        print(f"Error traceback: {e.__traceback__}")

Parsing nodes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18727/18727 [00:21<00:00, 874.97it/s] 
Generating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2048/2048 [3:25:09<00:00,  6.01s/it]  


Uploading data to deeplake dataset.


  0%|          | 0/2048 [00:00<?, ?it/s]

An error occurred: Error while attempting to rollback appends
Error type: <class 'Exception'>
Error traceback: <traceback object at 0x0000018467C793C0>





In [5]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. ÈÖçÁΩÆË∑ØÂæÑ ---
# --- 1. Configuration ---

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöËøôÈáåÁöÑÊ®°ÂûãË∑ØÂæÑÂøÖÈ°ªÂíå‰Ω†Âêå‰∫ãÁîüÊàêÂêëÈáèÊó∂‰ΩøÁî®ÁöÑÊ®°ÂûãÂÆåÂÖ®‰∏ÄËá¥ÔºÅ
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# ËØ∑Ê†πÊçÆ‰Ω†Êú¨Âú∞Â≠òÊîæ embedding Ê®°ÂûãÁöÑ‰ΩçÁΩÆ‰øÆÊîπÊ≠§Ë∑ØÂæÑ„ÄÇ
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöÊåáÂêë‰Ω†‰ªéÂêå‰∫ãÈÇ£ÈáåÊã∑Ë¥ùËøáÊù•ÁöÑÂêëÈáèÊï∞ÊçÆÂ∫ìÊñá‰ª∂Â§π„ÄÇ
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. ‰∏ªÊâßË°åÂáΩÊï∞ ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # Ê≠•È™§ 1: ËÆæÁΩÆÂÖ®Â±ÄÁöÑ Embedding Ê®°Âûã
    # Step 1: Set up the global embedding model

    # Ê≠•È™§ 2: Âä†ËΩΩ DeepLake ÂêëÈáèÂ∫ì
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ‚ùå CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # ‰ª•Âè™ËØªÊ®°ÂºèÂä†ËΩΩÔºåËøôÊòØ‰∏Ä‰∏™ÂÆâÂÖ®ÁöÑÂ•Ω‰π†ÊÉØ
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # ‰ªéÂ∑≤Â≠òÂú®ÁöÑÂêëÈáèÂ∫ì‰∏≠ÈáçÂª∫Á¥¢Âºï
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ‚ùå ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # Ê≠•È™§ 3: ÂàõÂª∫Êü•ËØ¢ÂºïÊìéÂπ∂ÊèêÈóÆ
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # Á§∫‰æãÔºöËøîÂõûÊúÄÁõ∏‰ººÁöÑ5‰∏™ÁªìÊûú
    print("-> Query engine is ready.")

    # --- Âú®ËøôÈáåËæìÂÖ•‰Ω†ÁöÑÈóÆÈ¢ò ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # ÊâìÂç∞Êñá‰ª∂ÂêçÂÖÉÊï∞ÊçÆÔºåÂ¶ÇÊûúÂ≠òÂú®ÁöÑËØù
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # ÊâìÂç∞ÈÉ®ÂàÜÊñáÊú¨ÂÜÖÂÆπ
        print("-" * 20)


if __name__ == "__main__":
    main()


  import pkg_resources  # type: ignore


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
Deep Lake Dataset in ./dataset/vector_storage_for_colleague already exists, loading from the storage




-> Vector store loaded successfully.

Step 3: Creating query engine...
-> Query engine is ready.

--- Ready to Query! ---

Querying with: 'What are the applications of perovskite in solar cells?'

--- Response ---
Perovskite materials have several applications in solar cells, including enhancing the efficiency of photovoltaic devices, providing a cheaper alternative to traditional solar cell materials, and enabling the development of flexible and lightweight solar panels. They can be used to create solar cells with improved stability and higher light absorption capabilities, contributing to advancements in photovoltaic technology.

--- Source Nodes ---
  Source 1 (Score: 0.5236):
    File: 978-3-662-56472-1.pdf
    Text: WCPEC-3 Organizing Committee, (Osaka, 2003), p. 1112‚Äì1115
D. Karg, H. ChariÔ¨Å, G. Pensl, M. Schulz, G. Hahn, in19th Europ. Photovoltaic Solar Energy Conf.:
Proc. of the Int. Conf. held in Paris, France...
--------------------
  Source 2 (Score: 0.5194):
    File: Za

In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. ÈÖçÁΩÆË∑ØÂæÑ ---
# --- 1. Configuration ---

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöËøôÈáåÁöÑÊ®°ÂûãË∑ØÂæÑÂøÖÈ°ªÂíå‰Ω†Âêå‰∫ãÁîüÊàêÂêëÈáèÊó∂‰ΩøÁî®ÁöÑÊ®°ÂûãÂÆåÂÖ®‰∏ÄËá¥ÔºÅ
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# ËØ∑Ê†πÊçÆ‰Ω†Êú¨Âú∞Â≠òÊîæ embedding Ê®°ÂûãÁöÑ‰ΩçÁΩÆ‰øÆÊîπÊ≠§Ë∑ØÂæÑ„ÄÇ
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöÊåáÂêë‰Ω†‰ªéÂêå‰∫ãÈÇ£ÈáåÊã∑Ë¥ùËøáÊù•ÁöÑÂêëÈáèÊï∞ÊçÆÂ∫ìÊñá‰ª∂Â§π„ÄÇ
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. ‰∏ªÊâßË°åÂáΩÊï∞ ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # Ê≠•È™§ 1: ËÆæÁΩÆÂÖ®Â±ÄÁöÑ Embedding Ê®°Âûã
    # Step 1: Set up the global embedding model

    # Ê≠•È™§ 2: Âä†ËΩΩ DeepLake ÂêëÈáèÂ∫ì
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ‚ùå CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # ‰ª•Âè™ËØªÊ®°ÂºèÂä†ËΩΩÔºåËøôÊòØ‰∏Ä‰∏™ÂÆâÂÖ®ÁöÑÂ•Ω‰π†ÊÉØ
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # ‰ªéÂ∑≤Â≠òÂú®ÁöÑÂêëÈáèÂ∫ì‰∏≠ÈáçÂª∫Á¥¢Âºï
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ‚ùå ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # Ê≠•È™§ 3: ÂàõÂª∫Êü•ËØ¢ÂºïÊìéÂπ∂ÊèêÈóÆ
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # Á§∫‰æãÔºöËøîÂõûÊúÄÁõ∏‰ººÁöÑ5‰∏™ÁªìÊûú
    print("-> Query engine is ready.")

    # --- Âú®ËøôÈáåËæìÂÖ•‰Ω†ÁöÑÈóÆÈ¢ò ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # ÊâìÂç∞Êñá‰ª∂ÂêçÂÖÉÊï∞ÊçÆÔºåÂ¶ÇÊûúÂ≠òÂú®ÁöÑËØù
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # ÊâìÂç∞ÈÉ®ÂàÜÊñáÊú¨ÂÜÖÂÆπ
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ‚ùå ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. ÈÖçÁΩÆË∑ØÂæÑ ---
# --- 1. Configuration ---

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöËøôÈáåÁöÑÊ®°ÂûãË∑ØÂæÑÂøÖÈ°ªÂíå‰Ω†Âêå‰∫ãÁîüÊàêÂêëÈáèÊó∂‰ΩøÁî®ÁöÑÊ®°ÂûãÂÆåÂÖ®‰∏ÄËá¥ÔºÅ
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# ËØ∑Ê†πÊçÆ‰Ω†Êú¨Âú∞Â≠òÊîæ embedding Ê®°ÂûãÁöÑ‰ΩçÁΩÆ‰øÆÊîπÊ≠§Ë∑ØÂæÑ„ÄÇ
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöÊåáÂêë‰Ω†‰ªéÂêå‰∫ãÈÇ£ÈáåÊã∑Ë¥ùËøáÊù•ÁöÑÂêëÈáèÊï∞ÊçÆÂ∫ìÊñá‰ª∂Â§π„ÄÇ
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. ‰∏ªÊâßË°åÂáΩÊï∞ ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # Ê≠•È™§ 1: ËÆæÁΩÆÂÖ®Â±ÄÁöÑ Embedding Ê®°Âûã
    # Step 1: Set up the global embedding model

    # Ê≠•È™§ 2: Âä†ËΩΩ DeepLake ÂêëÈáèÂ∫ì
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ‚ùå CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # ‰ª•Âè™ËØªÊ®°ÂºèÂä†ËΩΩÔºåËøôÊòØ‰∏Ä‰∏™ÂÆâÂÖ®ÁöÑÂ•Ω‰π†ÊÉØ
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # ‰ªéÂ∑≤Â≠òÂú®ÁöÑÂêëÈáèÂ∫ì‰∏≠ÈáçÂª∫Á¥¢Âºï
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ‚ùå ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # Ê≠•È™§ 3: ÂàõÂª∫Êü•ËØ¢ÂºïÊìéÂπ∂ÊèêÈóÆ
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # Á§∫‰æãÔºöËøîÂõûÊúÄÁõ∏‰ººÁöÑ5‰∏™ÁªìÊûú
    print("-> Query engine is ready.")

    # --- Âú®ËøôÈáåËæìÂÖ•‰Ω†ÁöÑÈóÆÈ¢ò ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # ÊâìÂç∞Êñá‰ª∂ÂêçÂÖÉÊï∞ÊçÆÔºåÂ¶ÇÊûúÂ≠òÂú®ÁöÑËØù
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # ÊâìÂç∞ÈÉ®ÂàÜÊñáÊú¨ÂÜÖÂÆπ
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ‚ùå ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. ÈÖçÁΩÆË∑ØÂæÑ ---
# --- 1. Configuration ---

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöËøôÈáåÁöÑÊ®°ÂûãË∑ØÂæÑÂøÖÈ°ªÂíå‰Ω†Âêå‰∫ãÁîüÊàêÂêëÈáèÊó∂‰ΩøÁî®ÁöÑÊ®°ÂûãÂÆåÂÖ®‰∏ÄËá¥ÔºÅ
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# ËØ∑Ê†πÊçÆ‰Ω†Êú¨Âú∞Â≠òÊîæ embedding Ê®°ÂûãÁöÑ‰ΩçÁΩÆ‰øÆÊîπÊ≠§Ë∑ØÂæÑ„ÄÇ
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöÊåáÂêë‰Ω†‰ªéÂêå‰∫ãÈÇ£ÈáåÊã∑Ë¥ùËøáÊù•ÁöÑÂêëÈáèÊï∞ÊçÆÂ∫ìÊñá‰ª∂Â§π„ÄÇ
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. ‰∏ªÊâßË°åÂáΩÊï∞ ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # Ê≠•È™§ 1: ËÆæÁΩÆÂÖ®Â±ÄÁöÑ Embedding Ê®°Âûã
    # Step 1: Set up the global embedding model

    # Ê≠•È™§ 2: Âä†ËΩΩ DeepLake ÂêëÈáèÂ∫ì
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ‚ùå CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # ‰ª•Âè™ËØªÊ®°ÂºèÂä†ËΩΩÔºåËøôÊòØ‰∏Ä‰∏™ÂÆâÂÖ®ÁöÑÂ•Ω‰π†ÊÉØ
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # ‰ªéÂ∑≤Â≠òÂú®ÁöÑÂêëÈáèÂ∫ì‰∏≠ÈáçÂª∫Á¥¢Âºï
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ‚ùå ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # Ê≠•È™§ 3: ÂàõÂª∫Êü•ËØ¢ÂºïÊìéÂπ∂ÊèêÈóÆ
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # Á§∫‰æãÔºöËøîÂõûÊúÄÁõ∏‰ººÁöÑ5‰∏™ÁªìÊûú
    print("-> Query engine is ready.")

    # --- Âú®ËøôÈáåËæìÂÖ•‰Ω†ÁöÑÈóÆÈ¢ò ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # ÊâìÂç∞Êñá‰ª∂ÂêçÂÖÉÊï∞ÊçÆÔºåÂ¶ÇÊûúÂ≠òÂú®ÁöÑËØù
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # ÊâìÂç∞ÈÉ®ÂàÜÊñáÊú¨ÂÜÖÂÆπ
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ‚ùå ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. ÈÖçÁΩÆË∑ØÂæÑ ---
# --- 1. Configuration ---

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöËøôÈáåÁöÑÊ®°ÂûãË∑ØÂæÑÂøÖÈ°ªÂíå‰Ω†Âêå‰∫ãÁîüÊàêÂêëÈáèÊó∂‰ΩøÁî®ÁöÑÊ®°ÂûãÂÆåÂÖ®‰∏ÄËá¥ÔºÅ
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# ËØ∑Ê†πÊçÆ‰Ω†Êú¨Âú∞Â≠òÊîæ embedding Ê®°ÂûãÁöÑ‰ΩçÁΩÆ‰øÆÊîπÊ≠§Ë∑ØÂæÑ„ÄÇ
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöÊåáÂêë‰Ω†‰ªéÂêå‰∫ãÈÇ£ÈáåÊã∑Ë¥ùËøáÊù•ÁöÑÂêëÈáèÊï∞ÊçÆÂ∫ìÊñá‰ª∂Â§π„ÄÇ
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. ‰∏ªÊâßË°åÂáΩÊï∞ ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # Ê≠•È™§ 1: ËÆæÁΩÆÂÖ®Â±ÄÁöÑ Embedding Ê®°Âûã
    # Step 1: Set up the global embedding model

    # Ê≠•È™§ 2: Âä†ËΩΩ DeepLake ÂêëÈáèÂ∫ì
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ‚ùå CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # ‰ª•Âè™ËØªÊ®°ÂºèÂä†ËΩΩÔºåËøôÊòØ‰∏Ä‰∏™ÂÆâÂÖ®ÁöÑÂ•Ω‰π†ÊÉØ
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # ‰ªéÂ∑≤Â≠òÂú®ÁöÑÂêëÈáèÂ∫ì‰∏≠ÈáçÂª∫Á¥¢Âºï
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ‚ùå ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # Ê≠•È™§ 3: ÂàõÂª∫Êü•ËØ¢ÂºïÊìéÂπ∂ÊèêÈóÆ
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # Á§∫‰æãÔºöËøîÂõûÊúÄÁõ∏‰ººÁöÑ5‰∏™ÁªìÊûú
    print("-> Query engine is ready.")

    # --- Âú®ËøôÈáåËæìÂÖ•‰Ω†ÁöÑÈóÆÈ¢ò ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # ÊâìÂç∞Êñá‰ª∂ÂêçÂÖÉÊï∞ÊçÆÔºåÂ¶ÇÊûúÂ≠òÂú®ÁöÑËØù
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # ÊâìÂç∞ÈÉ®ÂàÜÊñáÊú¨ÂÜÖÂÆπ
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ‚ùå ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. ÈÖçÁΩÆË∑ØÂæÑ ---
# --- 1. Configuration ---

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöËøôÈáåÁöÑÊ®°ÂûãË∑ØÂæÑÂøÖÈ°ªÂíå‰Ω†Âêå‰∫ãÁîüÊàêÂêëÈáèÊó∂‰ΩøÁî®ÁöÑÊ®°ÂûãÂÆåÂÖ®‰∏ÄËá¥ÔºÅ
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# ËØ∑Ê†πÊçÆ‰Ω†Êú¨Âú∞Â≠òÊîæ embedding Ê®°ÂûãÁöÑ‰ΩçÁΩÆ‰øÆÊîπÊ≠§Ë∑ØÂæÑ„ÄÇ
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöÊåáÂêë‰Ω†‰ªéÂêå‰∫ãÈÇ£ÈáåÊã∑Ë¥ùËøáÊù•ÁöÑÂêëÈáèÊï∞ÊçÆÂ∫ìÊñá‰ª∂Â§π„ÄÇ
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. ‰∏ªÊâßË°åÂáΩÊï∞ ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # Ê≠•È™§ 1: ËÆæÁΩÆÂÖ®Â±ÄÁöÑ Embedding Ê®°Âûã
    # Step 1: Set up the global embedding model

    # Ê≠•È™§ 2: Âä†ËΩΩ DeepLake ÂêëÈáèÂ∫ì
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ‚ùå CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # ‰ª•Âè™ËØªÊ®°ÂºèÂä†ËΩΩÔºåËøôÊòØ‰∏Ä‰∏™ÂÆâÂÖ®ÁöÑÂ•Ω‰π†ÊÉØ
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # ‰ªéÂ∑≤Â≠òÂú®ÁöÑÂêëÈáèÂ∫ì‰∏≠ÈáçÂª∫Á¥¢Âºï
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ‚ùå ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # Ê≠•È™§ 3: ÂàõÂª∫Êü•ËØ¢ÂºïÊìéÂπ∂ÊèêÈóÆ
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # Á§∫‰æãÔºöËøîÂõûÊúÄÁõ∏‰ººÁöÑ5‰∏™ÁªìÊûú
    print("-> Query engine is ready.")

    # --- Âú®ËøôÈáåËæìÂÖ•‰Ω†ÁöÑÈóÆÈ¢ò ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # ÊâìÂç∞Êñá‰ª∂ÂêçÂÖÉÊï∞ÊçÆÔºåÂ¶ÇÊûúÂ≠òÂú®ÁöÑËØù
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # ÊâìÂç∞ÈÉ®ÂàÜÊñáÊú¨ÂÜÖÂÆπ
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ‚ùå ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. ÈÖçÁΩÆË∑ØÂæÑ ---
# --- 1. Configuration ---

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöËøôÈáåÁöÑÊ®°ÂûãË∑ØÂæÑÂøÖÈ°ªÂíå‰Ω†Âêå‰∫ãÁîüÊàêÂêëÈáèÊó∂‰ΩøÁî®ÁöÑÊ®°ÂûãÂÆåÂÖ®‰∏ÄËá¥ÔºÅ
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# ËØ∑Ê†πÊçÆ‰Ω†Êú¨Âú∞Â≠òÊîæ embedding Ê®°ÂûãÁöÑ‰ΩçÁΩÆ‰øÆÊîπÊ≠§Ë∑ØÂæÑ„ÄÇ
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# ÂÖ≥ÈîÆÊ≠•È™§ÔºöÊåáÂêë‰Ω†‰ªéÂêå‰∫ãÈÇ£ÈáåÊã∑Ë¥ùËøáÊù•ÁöÑÂêëÈáèÊï∞ÊçÆÂ∫ìÊñá‰ª∂Â§π„ÄÇ
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. ‰∏ªÊâßË°åÂáΩÊï∞ ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # Ê≠•È™§ 1: ËÆæÁΩÆÂÖ®Â±ÄÁöÑ Embedding Ê®°Âûã
    # Step 1: Set up the global embedding model

    # Ê≠•È™§ 2: Âä†ËΩΩ DeepLake ÂêëÈáèÂ∫ì
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ‚ùå CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # ‰ª•Âè™ËØªÊ®°ÂºèÂä†ËΩΩÔºåËøôÊòØ‰∏Ä‰∏™ÂÆâÂÖ®ÁöÑÂ•Ω‰π†ÊÉØ
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # ‰ªéÂ∑≤Â≠òÂú®ÁöÑÂêëÈáèÂ∫ì‰∏≠ÈáçÂª∫Á¥¢Âºï
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ‚ùå ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # Ê≠•È™§ 3: ÂàõÂª∫Êü•ËØ¢ÂºïÊìéÂπ∂ÊèêÈóÆ
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # Á§∫‰æãÔºöËøîÂõûÊúÄÁõ∏‰ººÁöÑ5‰∏™ÁªìÊûú
    print("-> Query engine is ready.")

    # --- Âú®ËøôÈáåËæìÂÖ•‰Ω†ÁöÑÈóÆÈ¢ò ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # ÊâìÂç∞Êñá‰ª∂ÂêçÂÖÉÊï∞ÊçÆÔºåÂ¶ÇÊûúÂ≠òÂú®ÁöÑËØù
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # ÊâìÂç∞ÈÉ®ÂàÜÊñáÊú¨ÂÜÖÂÆπ
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ‚ùå ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [12]:
import deeplake
import pandas as pd
import numpy as np

# Create a dictionary to hold the data
data = {}

# Load vector store data
ds = deeplake.load(dataset_path)
ds.summary()

# Iterate through the tensors in the dataset
for tensor_name in ds.tensors:
    tensor_data = ds[tensor_name].numpy()

    # Check if the tensor is multi-dimensional
    if tensor_data.ndim > 1:
        # Flatten multi-dimensional tensors
        data[tensor_name] = [np.array(e).flatten().tolist() for e in tensor_data]
    else:
        # Convert 1D tensors directly to lists and decode text
        if tensor_name == "text":
            data[tensor_name] = [t.tobytes().decode('utf-8') if t else "" for t in tensor_data]
        else:
            data[tensor_name] = tensor_data.tolist()

# Create a Pandas DataFrame from the dictionary
df = pd.DataFrame(data)

./dataset/vector_storage_new loaded successfully.





Dataset(path='./dataset/vector_storage_new', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype        shape       dtype  compression
  -------    -------      -------     -------  ------- 
 embedding  embedding  (18734, 1024)  float32   None   
    id        text      (18734, 1)      str     None   
 metadata     json      (18734, 1)      str     None   
   text       text      (18734, 1)      str     None   


In [13]:
# Function to display a selected record
def display_record(record_number):
    record = df.iloc[record_number]
    display_data = {
        "ID": record.get("id", "N/A"),
        "Metadata": record.get("metadata", "N/A"),
        "Text": record.get("text", "N/A"),
        "Embedding": record.get("embedding", "N/A")
    }

    # Print the ID
    print("ID:")
    print(display_data["ID"])
    print()

    # Print the metadata in a structured format
    print("Metadata:")
    metadata = display_data["Metadata"]
    if isinstance(metadata, list):
        for item in metadata:
            for key, value in item.items():
                print(f"{key}: {value}")
            print()
    else:
        print(metadata)
    print()

    # Print the text
    print("Text:")
    print(display_data["Text"])
    print()

    # Print the embedding
    print("Embedding:")
    print(display_data["Embedding"])
    print()

# Example usage
rec = 7  # Replace with the desired record number
display_record(rec)

ID:
['ade4e164-2d97-484f-80f1-b2bb916e2f18']

Metadata:
_node_content: {"id_": "ade4e164-2d97-484f-80f1-b2bb916e2f18", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "b8ec8bfa-a4c3-4f6b-a005-02a743532be5", "node_type": "4", "metadata": {}, "hash": "ba1ef0fa43ec605c97db616ab531061cb885f1e3d5e4392a6a47a5e3b5b8ff85", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Chapin et al., at the Bell Labs in the USA, invented the\ufb01rst solar cell with an\nef\ufb01ciency of about 6%. Since then, research and application of modern photovoltaic\nsolar cells have been booming. Solar cells have been mounted on satellites, space\nstations, remote prairies, mountains, and islands to offer off-grid electricity, and on\nthe roofs of houses, apartments, and public buildings to generate in-grid electricity.\nIn most cases, solar cells have been installe

# Pipeline 3: Traditional RAG
### LlamaIndex (QueryEngine)

In [2]:
import os
os.chdir(r"F:\Intern\EDF\RAG-pR-main")

In [3]:
from domain_aware_rag import DomainAwareRAG

# ÂàùÂßãÂåñÂÆåÊï¥Á≥ªÁªü
rag_lama = DomainAwareRAG(vector_store_path="RAG-Wikipedia/dataset/vector_storage_new")

# Ëá™ÂÆö‰πâÊü•ËØ¢ÂèÇÊï∞
result = rag_lama.query(
    user_query="How to improve the efficiency of silicon solar cells?",
    use_query_expansion=True,    # ÂêØÁî®Êü•ËØ¢Êâ©Â±ï
    use_query_rewriting=True,    # ÂêØÁî®Êü•ËØ¢ÈáçÂÜô
    top_k=5                      # Ê£ÄÁ¥¢ÊñáÊ°£Êï∞Èáè
)

# Êü•ÁúãËØ¶ÁªÜÁªìÊûú
print(f"ÂéüÂßãÊü•ËØ¢: {result['original_query']}")
print(f"Êâ©Â±ïÊü•ËØ¢: {result['expanded_query']}")
print(f"ÊúÄÁªàÊü•ËØ¢: {result['final_query']}")
print(f"Á≠îÊ°à: {result['answer']}")

  import pkg_resources  # type: ignore


‚úì Llama-index embeddings available


  from .autonotebook import tqdm as notebook_tqdm


‚úì Transformers version: 4.51.3
‚úó Sentence-transformers not available
Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
‚úì Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
‚úì Ê£ÄÁ¥¢Âô®ÂàùÂßãÂåñÂÆåÊàê
‚úì È¢ÜÂüüËØçÂÖ∏Âä†ËΩΩÂÆåÊàêÔºåÂÖ± 1211 ‰∏™ÊúØËØ≠
‚úì LLMÂàùÂßãÂåñÂÆåÊàê: glm-4-flash
‚úì ÂêëÈáèÂ≠òÂÇ®Ë∑ØÂæÑ: RAG-Wikipedia/dataset/vector_storage_new

üîç Â§ÑÁêÜÊü•ËØ¢: How to improve the efficiency of silicon solar cells?
üåê ÁøªËØëÂêéËã±ÊñáÊü•ËØ¢: How to improve the efficiency of silicon solar cells?
üìö ÊèêÂèñÁöÑÈ¢ÜÂüüÊúØËØ≠: ['Solar Cell', 'Efficiency', 'Silicon Solar Cell']
üîç Êü•ËØ¢Êâ©Â±ï: How to improve the efficiency of silicon solar cells? -> How to improve the efficiency of silicon solar cells? Component silicon solar cell solar cells efÔ¨Åciencies Silicon Solar Cell Solar Cell cell solar cells silicon Technology Efficiency
üîÑ Êü•ËØ¢ÈáçÂÜô: How 

### LangChain (RetrievalQA)

In [4]:
from langchain_retrieval_qa import LangChainDomainRAG

rag_langchain = LangChainDomainRAG(vector_store_path="RAG-Wikipedia/dataset/vector_storage_new")
result = rag_langchain.query(
"How to improve the efficiency of solar cells?",
use_query_expansion=True,
use_query_rewriting=True,
top_k=5 
)

# Êü•ÁúãËØ¶ÁªÜÁªìÊûú
print(f"ÂéüÂßãÊü•ËØ¢: {result['original_query']}")
print(f"Êâ©Â±ïÊü•ËØ¢: {result['expanded_query']}")
print(f"ÊúÄÁªàÊü•ËØ¢: {result['final_query']}")
print(f"Á≠îÊ°à: {result['answer']}")

Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
‚úì Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
‚úì LlamaIndexÁ¥¢ÂºïÂàùÂßãÂåñÂÆåÊàê
‚úì LangChainÁªÑ‰ª∂ÂàùÂßãÂåñÂÆåÊàê
‚úì È¢ÜÂüüËØçÂÖ∏Âä†ËΩΩÂÆåÊàêÔºåÂÖ± 1211 ‰∏™ÊúØËØ≠
‚úì LangChain RetrievalQAÂàùÂßãÂåñÂÆåÊàê
‚úì ÂêëÈáèÂ≠òÂÇ®Ë∑ØÂæÑ: RAG-Wikipedia/dataset/vector_storage_new

üîç Â§ÑÁêÜÊü•ËØ¢: How to improve the efficiency of solar cells?
üåê ÁøªËØëÂêéËã±ÊñáÊü•ËØ¢: How to improve the efficiency of solar cells?
üìö ÊèêÂèñÁöÑÈ¢ÜÂüüÊúØËØ≠: ['Solar Cell', 'Efficiency']
üîç Êü•ËØ¢Êâ©Â±ï: How to improve the efficiency of solar cells? -> How to improve the efficiency of solar cells? solar cells efÔ¨Åciencies Efficiency cells sol cells solar cell Solar Cell solar cell solar cell
üîÑ Êü•ËØ¢ÈáçÂÜô: How to improve the efficiency of solar cells? solar cells efÔ¨Åciencies Efficiency cells sol cells solar cell Solar Cel

### Evaluation

In [4]:
from evaluation_tools import RAGEvaluator
from domain_aware_rag import DomainAwareRAG
from langchain_retrieval_qa import LangChainDomainRAG

evaluator = RAGEvaluator(llm_api_key="41b29e65745d4110a018c5d616b0012f.A6CEwmornnYXSVLC")
rag_lama = DomainAwareRAG(vector_store_path="RAG-Wikipedia/dataset/vector_storage_new")
results_lama = evaluator.evaluate_rag_system(rag_lama)

rag_langchain = LangChainDomainRAG(vector_store_path="RAG-Wikipedia/dataset/vector_storage_new")
results_langchain = evaluator.evaluate_rag_system(rag_langchain)

print("=== LlamaIndex QueryEngine ËØÑ‰º∞Êä•Âëä ===")
print(evaluator.generate_evaluation_report(results_lama))

print("=== LangChain RetrievalQA ËØÑ‰º∞Êä•Âëä ===")
print(evaluator.generate_evaluation_report(results_langchain))

  import pkg_resources  # type: ignore


‚úì Llama-index embeddings available


  from .autonotebook import tqdm as notebook_tqdm


‚úì Transformers version: 4.51.3
‚úó Sentence-transformers not available
Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
‚úì Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
‚úì Ê£ÄÁ¥¢Âô®ÂàùÂßãÂåñÂÆåÊàê
‚úì È¢ÜÂüüËØçÂÖ∏Âä†ËΩΩÂÆåÊàêÔºåÂÖ± 1211 ‰∏™ÊúØËØ≠
‚úì LLMÂàùÂßãÂåñÂÆåÊàê: glm-4-flash
‚úì ÂêëÈáèÂ≠òÂÇ®Ë∑ØÂæÑ: RAG-Wikipedia/dataset/vector_storage_new
üîß ÁîüÊàêÊµãËØïÁî®‰æã...


  response = self.qa_chain.run({


LLMÂéüÂßãËæìÂá∫Ôºö 1. ÊïàÁéáÁõ∏ÂÖ≥ÈóÆÈ¢òÔºö
   - **ÈóÆÈ¢ò**ÔºöÂú®2019Âπ¥ÔºåÈ´òÊïàÁ°ÖÁîµÊ±†ÁöÑÂÆûÈ™åÂÆ§ËΩ¨Êç¢ÊïàÁéáËææÂà∞‰∫Ü22%ÔºåËØ∑ÈóÆËøô‰∏ÄÊïàÁéáÂú®ÂΩìÊó∂ÂÖ®ÁêÉËåÉÂõ¥ÂÜÖÂ±û‰∫é‰ªÄ‰πàÊ∞¥Âπ≥Ôºå‰∏é‰πãÁõ∏ÊØîÔºåÁõÆÂâçÁöÑÁ°ÖÁîµÊ±†ËΩ¨Êç¢ÊïàÁéáÊúâÊÄéÊ†∑ÁöÑÊèêÂçáÔºü
   - **Á≠îÊ°à**ÔºöÂú®2019Âπ¥ÔºåÂÆûÈ™åÂÆ§ËΩ¨Êç¢ÊïàÁéáËææÂà∞22%ÁöÑÁ°ÖÁîµÊ±†Â±û‰∫éÂΩìÊó∂ÂÖ®ÁêÉÈ¢ÜÂÖàÊ∞¥Âπ≥ÔºåËøúÈ´ò‰∫éÂΩìÊó∂ÁöÑË°å‰∏öÂπ≥ÂùáÊïàÁéá„ÄÇÁõÆÂâçÔºåÈöèÁùÄÊäÄÊúØÁöÑËøõÊ≠•ÔºåÂÆûÈ™åÂÆ§ËΩ¨Êç¢ÊïàÁéáÂ∑≤ÁªèË∂ÖËøá‰∫Ü25%ÔºåËÄåÂú®ÂÆûÈôÖÂ∫îÁî®ÁöÑÁ°ÖÁîµÊ±†‰∏≠ÔºåÊïàÁéá‰πüÊôÆÈÅçË∂ÖËøá‰∫Ü20%ÔºåÁõ∏ÊØî2019Âπ¥Êúâ‰∫ÜÊòæËëóÁöÑÊèêÂçá„ÄÇ

2. Âà∂ÈÄ†Â∑•Ëâ∫ÈóÆÈ¢òÔºö
   - **ÈóÆÈ¢ò**ÔºöÊô∂‰ΩìÁ°ÖÁîµÊ±†ÁöÑÂà∂ÈÄ†Â∑•Ëâ∫‰∏≠Ôºå‰º†ÁªüÁöÑ‰∏ùÁΩëÂç∞Âà∑Â∑•Ëâ∫‰∏éÊúÄÊñ∞ÁöÑÊøÄÂÖâÊâìÂç∞ÊäÄÊúØÁõ∏ÊØîÔºåÂú®ÁîµÊ±†ÊïàÁéá‰∏äÊúâÂì™‰∫õÂ∑ÆÂºÇÔºüÂÖ∑‰ΩìÁöÑÊï∞ÊçÆËÉΩËØ¥ÊòéËøô‰∏ÄÁÇπÂêóÔºü
   - **Á≠îÊ°à**ÔºöÊøÄÂÖâÊâìÂç∞ÊäÄÊúØÂú®ÁîµÊ±†ÊïàÁéá‰∏äÈÄöÂ∏∏‰ºò‰∫é‰∏ùÁΩëÂç∞Âà∑Â∑•Ëâ∫„ÄÇ‰æãÂ¶ÇÔºå‰ΩøÁî®ÊøÄÂÖâÊâìÂç∞ÊäÄÊúØÂèØ‰ª•Âú®Á°ÖÁîµÊ±†‰∏äÂÆûÁé∞Êõ¥Á≤æÁªÜÁöÑÂõæÊ°àÂíåÊõ¥‰ΩéÁöÑÁîµÈòªÔºå‰ªéËÄåÊèê

In [5]:
from evaluation_tools import RAGEvaluator
evaluator = RAGEvaluator(llm_api_key="41b29e65745d4110a018c5d616b0012f.A6CEwmornnYXSVLC")

# ÂØπÊØî LlamaIndex RAG vs LLMÁõ¥Á≠î
results_vs_llm_lama = evaluator.evaluate_rag_vs_llm(rag_lama)
evaluator.print_rag_vs_llm_report(results_vs_llm_lama)

results_vs_llm_langchain = evaluator.evaluate_rag_vs_llm(rag_langchain)
evaluator.print_rag_vs_llm_report(results_vs_llm_langchain)

üîß ÁîüÊàêÊµãËØïÁî®‰æã...
LLMÂéüÂßãËæìÂá∫Ôºö 1. **ÊïàÁéáÁõ∏ÂÖ≥ÈóÆÈ¢ò**
   - **ÈóÆÈ¢ò**ÔºöÁ°ÖÁîµÊ±†ÁöÑÊïàÁéáÂú®ËøáÂéªÁöÑÂçÅÂπ¥‰∏≠Âπ≥ÂùáÊèêÈ´ò‰∫ÜÂ§öÂ∞ëÔºüËøôÁßçÊèêÈ´ò‰∏ªË¶ÅÂΩíÂõ†‰∫éÂì™‰∫õÊäÄÊúØÊîπËøõÔºü
   - **Á≠îÊ°à**ÔºöÂú®ËøáÂéªÁöÑÂçÅÂπ¥‰∏≠ÔºåÁ°ÖÁîµÊ±†ÁöÑÊïàÁéáÂπ≥ÂùáÊèêÈ´ò‰∫ÜÁ∫¶20%„ÄÇËøôÁßçÊèêÈ´ò‰∏ªË¶ÅÂΩíÂõ†‰∫é‰ª•‰∏ãÂá†ÁÇπÊäÄÊúØÊîπËøõÔºöÈ¶ñÂÖàÔºåÂçïÊô∂Á°ÖÂíåÂ§öÊô∂Á°ÖÁîµÊ±†ÁöÑÂà∂Á®ã‰ºòÂåñÔºå‰æãÂ¶Ç‰ΩøÁî®Êõ¥ÂÖàËøõÁöÑËöÄÂàªÂíåÊ≤âÁßØÊäÄÊúØÔºõÂÖ∂Ê¨°ÔºåÁîµÊ±†ÁªìÊûÑÁöÑÂàõÊñ∞ÔºåÂ¶Ç‰ΩøÁî®Á∫≥Á±≥ÁªìÊûÑ„ÄÅ textured surfaces Âíå anti-reflective coatings Êù•ÂáèÂ∞ëÂèçÂ∞ÑÂπ∂Â¢ûÂä†Âê∏Êî∂ÔºõÊúÄÂêéÔºåÈÄöËøáÊé∫ÊùÇÂíåÁîµÊ±†ËÆæËÆ°‰ºòÂåñÔºåÊèêÈ´ò‰∫ÜÁîµÂ≠êÁöÑ‰º†ËæìÊïàÁéáÂíåÂáèÂ∞ëËÉΩÈáèÊçüÂ§±„ÄÇ

2. **Âà∂ÈÄ†Â∑•Ëâ∫ÈóÆÈ¢ò**
   - **ÈóÆÈ¢ò**ÔºöÂú®Á°ÖÁîµÊ±†Âà∂ÈÄ†‰∏≠ÔºåÁ¶ªÂ≠êÊ≥®ÂÖ•ÊäÄÊúØÂ¶Ç‰ΩïÊèêÈ´òÁîµÊ±†ÁöÑÊïàÁéáÔºü
   - **Á≠îÊ°à**ÔºöÁ¶ªÂ≠êÊ≥®ÂÖ•ÊäÄÊúØÈÄöËøáÂêëÁ°ÖÊô∂‰Ωì‰∏≠Ê≥®ÂÖ•Êé∫ÊùÇÂéüÂ≠êÔºåÂ¶ÇÁ°ºÊàñÁ£∑ÔºåÂèØ‰ª•Á≤æÁ°ÆÊéßÂà∂Êé∫ÊùÇÊµìÂ∫¶ÂíåÂàÜÂ∏ÉÔºå‰ªéËÄå‰ºòÂåñÁîµÊ±†ÁöÑËÉΩÂ∏¶ÁªìÊûÑÂíåËΩΩÊµÅÂ≠ê‰º†Ëæì„ÄÇËøôÁßçÊäÄÊúØÂ

## Pipeline 4: Knowledge graph RAG

In [5]:
from enhanced_integrated_rag_system import EnhancedIntegratedRAGSystem

# ÂàùÂßãÂåñÂ¢ûÂº∫Á≥ªÁªü
rag_system = EnhancedIntegratedRAGSystem()

# ÊâßË°åÊ∑∑ÂêàÊ£ÄÁ¥¢
result = rag_system.query("Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü")

# ÊØîËæÉ‰∏çÂêåÊñπÊ≥ï
comparison = rag_system.compare_methods("Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü")

# # ÁîüÊàêÂèØËßÜÂåñ
# viz_path = rag_system.visualize_graph("material_optimizations")

  import pkg_resources  # type: ignore


Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
‚úì Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
‚úì LlamaIndexÁ¥¢ÂºïÂàùÂßãÂåñÂÆåÊàê
‚úì LangChainÁªÑ‰ª∂ÂàùÂßãÂåñÂÆåÊàê
‚úì È¢ÜÂüüËØçÂÖ∏Âä†ËΩΩÂÆåÊàêÔºåÂÖ± 1211 ‰∏™ÊúØËØ≠
‚úì LangChain RetrievalQAÂàùÂßãÂåñÂÆåÊàê
‚úì ÂêëÈáèÂ≠òÂÇ®Ë∑ØÂæÑ: RAG-Wikipedia/dataset/vector_storage_new
‚úì Â¢ûÂº∫Ê∑∑ÂêàÊ£ÄÁ¥¢Âô®ÂàùÂßãÂåñÂÆåÊàê
‚úì Â¢ûÂº∫ÈõÜÊàêRAGÁ≥ªÁªüÂàùÂßãÂåñÂÆåÊàê

üîç Â¢ûÂº∫RAGÊü•ËØ¢: Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü

üîç Ê∑∑ÂêàÊ£ÄÁ¥¢Êü•ËØ¢: Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü
üìö ÊâßË°åLangChainÂêëÈáèÊ£ÄÁ¥¢...

üîç Â§ÑÁêÜÊü•ËØ¢: Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü
üåê ÁøªËØëÂêéËã±ÊñáÊü•ËØ¢: How to improve the efficiency of solar cells?
üìö ÊèêÂèñÁöÑÈ¢ÜÂüüÊúØËØ≠: ['Efficiency', 'Solar Cell']
üîç Êü•ËØ¢Êâ©Â±ï: How to improve the efficiency of solar cells? -> How to improve the efficiency of solar c

In [10]:
from enhanced_integrated_rag_system import EnhancedIntegratedRAGSystem

# ÂàùÂßãÂåñÁ≥ªÁªü
rag_system = EnhancedIntegratedRAGSystem()

# ÊâßË°åÊü•ËØ¢ÔºàÂåÖÂê´ÂõæË∞±ÂèØËßÜÂåñÔºâ
result = rag_system.query(
    "Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü",
    vector_weight=0.6,
    graph_weight=0.4,

    graph_limit=50,
    use_query_expansion=True,
    use_query_rewriting=True,
    top_k=100,
    prompt_type="qa",
    use_structured_query=False,

    include_graph_visualization=True,
    include_explanation=True,
)

# Êü•ÁúãÁªìÊûú
print(result['hybrid_results']['final_answer'])  # ÊúÄÁªàÁ≠îÊ°à
print(result['graph_entities'])  # ÂõæË∞±ÂÆû‰Ωì
# print(result['graph_visualization_path'])  # ÂèØËßÜÂåñÂõæÁâáË∑ØÂæÑ

Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
‚úì Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
‚úì LlamaIndexÁ¥¢ÂºïÂàùÂßãÂåñÂÆåÊàê
‚úì LangChainÁªÑ‰ª∂ÂàùÂßãÂåñÂÆåÊàê
‚úì È¢ÜÂüüËØçÂÖ∏Âä†ËΩΩÂÆåÊàêÔºåÂÖ± 1211 ‰∏™ÊúØËØ≠
‚úì LangChain RetrievalQAÂàùÂßãÂåñÂÆåÊàê
‚úì ÂêëÈáèÂ≠òÂÇ®Ë∑ØÂæÑ: RAG-Wikipedia/dataset/vector_storage_new
‚úì Â¢ûÂº∫Ê∑∑ÂêàÊ£ÄÁ¥¢Âô®ÂàùÂßãÂåñÂÆåÊàê
‚úì Â¢ûÂº∫ÈõÜÊàêRAGÁ≥ªÁªüÂàùÂßãÂåñÂÆåÊàê

üîç Â¢ûÂº∫RAGÊü•ËØ¢: Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü

üîç Ê∑∑ÂêàÊ£ÄÁ¥¢Êü•ËØ¢: Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü
üìö ÊâßË°åLangChainÂêëÈáèÊ£ÄÁ¥¢...

üîç Â§ÑÁêÜÊü•ËØ¢: Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü
üåê ÁøªËØëÂêéËã±ÊñáÊü•ËØ¢: How to improve the efficiency of solar cells?
üìö ÊèêÂèñÁöÑÈ¢ÜÂüüÊúØËØ≠: ['Solar Cell', 'Efficiency']
üîç Êü•ËØ¢Êâ©Â±ï: How to improve the efficiency of solar cells? -> How to improve the efficiency of solar c

  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=

üé® ÂõæË∞±ÂèØËßÜÂåñÂ∑≤‰øùÂ≠ò: graph_visualization_1754561018.png
üï∏Ô∏è ÂõæË∞±Êü•ËØ¢ÂèëÁé∞ 50 ‰∏™ÂÆû‰Ωì
‰∏∫‰∫ÜÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºåÂèØ‰ª•Ê†πÊçÆÊñáÊ°£‰∏≠Êèê‰æõÁöÑ‰ø°ÊÅØÂíåËØ•È¢ÜÂüüÁöÑÂΩìÂâçÁ†îÁ©∂Áä∂ÂÜµÈááÂèñÂá†ÁßçÁ≠ñÁï•„ÄÇ‰ª•‰∏ãÊòØÊèê‰æõÊñáÊú¨‰∏≠ÊèêÂà∞ÊàñÊöóÁ§∫ÁöÑ‰∏Ä‰∫õÊñπÊ≥ïÔºö

1. **ÊùêÊñôÂºÄÂèë**ÔºöÂºïÂÖ•Êñ∞ÂûãÊùêÊñôÔºåÂ¶ÇÊúâÊú∫ÂÖâ‰ºèÔºàOPVÔºâ‰∏≠ÁöÑÈùûÂØåÂãíÁÉØÂèó‰ΩìÔºå‰ª•ÂèäÊé¢Á¥¢Êñ∞ÁöÑÂê∏Êî∂ÂâÇ„ÄÅ‰º†ËæìÂíåÊé•Ëß¶ÊùêÊñôÔºåÂèØ‰ª•ÊòæËëóÊèêÈ´òÂÖâ‰ºèÂô®‰ª∂ÁöÑÊïàÁéá„ÄÇÂºÄÂèëÂÖ∑ÊúâÊîπËøõÁöÑÂÖâÂ≠¶ÂíåÁîµÂ≠¶ÊÄßË¥®ÁöÑÊñ∞ÊùêÊñôÂèØ‰ª•ÊèêÈ´òÊïàÁéáÂíåÁ®≥ÂÆöÊÄß[1]„ÄÇ

2. **ÂÖàËøõÁöÑÁ∫≥Á±≥ÂÖâÂ≠êËÆæËÆ°**ÔºöÂà©Áî®ÂÖàËøõÁöÑÁ∫≥Á±≥ÂÖâÂ≠êËÆæËÆ°ÂèØ‰ª•ÊîπÂèòÂ§™Èò≥ËÉΩË∞±Ôºå‰ΩøÂÖ∂Êõ¥ÊúâÊïàÂú∞Ë¢´Â§™Èò≥ËÉΩÁîµÊ±†ËΩ¨Âåñ‰∏∫ÁîµËÉΩ„ÄÇËøôÂèØËÉΩÊ∂âÂèä‰ΩøÁî®Á≠âÁ¶ªÂ≠ê‰ΩìÂíåË°çÂ∞ÑÁ∫≥Á±≥ÁªìÊûÑÊù•Â¢ûÂº∫ÂÖâÂê∏Êî∂ÂíåËΩ¨Êç¢[2]„ÄÇ

3. **Â§öÁªìÂ§™Èò≥ËÉΩÁîµÊ±†**ÔºöÈÄöËøáÂÆûÊñΩÂ§öÁªìÂ§™Èò≥ËÉΩÁîµÊ±†Êàñ‰∏≤Âπ∂ËÅîÂ§™Èò≥ËÉΩÁîµÊ±†ÔºåÂèØ‰ª•Âà©Áî®ÂÖ∑Êúâ‰∏çÂêåÂçäÂØº‰ΩìÁªìÁöÑÂ§öÂ±ÇÁªìÊûÑÔºå‰ºòÂåñ‰∏çÂêåÈÉ®ÂàÜÁöÑÂ§™Èò≥ËÉΩË∞±Ôºå

In [3]:
from enhanced_integrated_rag_system_v2 import EnhancedIntegratedRAGSystemV2

# ÂàùÂßãÂåñÁ≥ªÁªü
rag_system = EnhancedIntegratedRAGSystemV2()

# ÊâßË°åÊü•ËØ¢ÔºàÁ°Æ‰øùÊâÄÊúâÂäüËÉΩÈÉΩÂêØÁî®Ôºâ
result = rag_system.query(
    "Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü",
    # Á°Æ‰øùÊâÄÊúâÂäüËÉΩÈÉΩÂêØÁî®
    include_citations=True,
    include_figures=True,
    include_graph_insights=True,
    include_graph_visualization=True,
    include_explanation=True,
    include_source_tracking=True,
    # Ê£ÄÁ¥¢ÂèÇÊï∞
    top_k=100,
    graph_limit=100,
    # ÊùÉÈáçËÆæÁΩÆ
    vector_weight=0.6,
    graph_weight=0.4,
    # Êü•ËØ¢‰ºòÂåñ
    use_query_expansion=True,
    use_query_rewriting=True,
    use_structured_query=False,
    prompt_type="qa"
)

# Êü•ÁúãÁªìÊûú
print(f"ÊâßË°åÊó∂Èó¥: {result['total_execution_time']:.2f}Áßí")
print(f"Â¢ûÂº∫Á≠îÊ°àÈïøÂ∫¶: {len(result['enhanced_answer'])} Â≠óÁ¨¶")

# Êü•ÁúãÊù•Ê∫êËøΩË∏™‰ø°ÊÅØ
if result.get('source_tracking_info'):
    tracking_info = result['source_tracking_info']
    print(f"Ê£ÄÁ¥¢Âà∞ÁöÑchunksÊï∞Èáè: {tracking_info['total_chunks_retrieved']}")
    print(f"ÂîØ‰∏ÄÊù•Ê∫êÊï∞Èáè: {len(tracking_info['unique_sources'])}")

print("üìÑ Â¢ûÂº∫Á≠îÊ°à:")
print(result['enhanced_answer'])

print("\nÔøΩÔøΩ ËØ¶ÁªÜ‰ø°ÊÅØ:")
print(f"ÊñáÁåÆÂºïÁî®: {len(result.get('citations', []))} ‰∏™")
print(f"ÂõæË°®‰ø°ÊÅØ: {len(result.get('figures', []))} ‰∏™")
print(f"ÂõæË∞±ÂÆû‰Ωì: {len(result.get('graph_entities', []))} ‰∏™")

  import pkg_resources  # type: ignore


‚úì Llama-index embeddings available


  from .autonotebook import tqdm as notebook_tqdm


‚úì Transformers version: 4.51.3
‚úó Sentence-transformers not available
Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
‚úì Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
‚úì LlamaIndexÁ¥¢ÂºïÂàùÂßãÂåñÂÆåÊàê
‚úì LangChainÁªÑ‰ª∂ÂàùÂßãÂåñÂÆåÊàê
‚úì È¢ÜÂüüËØçÂÖ∏Âä†ËΩΩÂÆåÊàêÔºåÂÖ± 1211 ‰∏™ÊúØËØ≠
‚úì LangChain RetrievalQAÂàùÂßãÂåñÂÆåÊàê
‚úì ÂêëÈáèÂ≠òÂÇ®Ë∑ØÂæÑ: RAG-Wikipedia/dataset/vector_storage_new
‚úì Â¢ûÂº∫Ê∑∑ÂêàÊ£ÄÁ¥¢Âô®ÂàùÂßãÂåñÂÆåÊàê
‚úì Â¢ûÂº∫ÈõÜÊàêRAGÁ≥ªÁªü V2 ÂàùÂßãÂåñÂÆåÊàê

üîç Â¢ûÂº∫RAGÊü•ËØ¢ V2: Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü

üîç Ê∑∑ÂêàÊ£ÄÁ¥¢Êü•ËØ¢: Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü
üìö ÊâßË°åLangChainÂêëÈáèÊ£ÄÁ¥¢...

üîç Â§ÑÁêÜÊü•ËØ¢: Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÔºü
üåê ÁøªËØëÂêéËã±ÊñáÊü•ËØ¢: How to improve the efficiency of solar cells?
üìö ÊèêÂèñÁöÑÈ¢ÜÂüüÊúØËØ≠: ['Solar Cell', 'Efficiency']
üîç Êü•ËØ¢Êâ©Â±ï: How to im

  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')


‚úÖ ÂõæË∞±ÂèØËßÜÂåñÂ∑≤‰øùÂ≠ò: graph_visualization_1754639968.png
üï∏Ô∏è ÂõæË∞±Êü•ËØ¢ÂèëÁé∞ 100 ‰∏™ÂÆû‰Ωì
ÊâßË°åÊó∂Èó¥: 80.11Áßí
Â¢ûÂº∫Á≠îÊ°àÈïøÂ∫¶: 4908 Â≠óÁ¨¶
Ê£ÄÁ¥¢Âà∞ÁöÑchunksÊï∞Èáè: 100
ÂîØ‰∏ÄÊù•Ê∫êÊï∞Èáè: 0
üìÑ Â¢ûÂº∫Á≠îÊ°à:
ÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÁöÑÊïàÁéáÊ∂âÂèäÂêÑÁßçÁ≠ñÁï•ÔºåÂÖ∂‰∏≠‰∏Ä‰∫õÂú®Êèê‰æõÁöÑÊñáÊ°£‰∏≠ÊúâËØ¶ÁªÜËØ¥ÊòéÔºö

1. **ÂÖàËøõÊùêÊñôÁ†îÁ©∂**ÔºöÊ≠£Â¶ÇÊñáÊ°£‰∏≠Âº∫Ë∞ÉÁöÑÔºåÂú®ÊùêÊñôÁ†îÁ©∂ÊñπÈù¢ÂèñÂæó‰∫ÜÊúÄÊñ∞ËøõÂ±ïÔºåËøô‰∫õËøõÂ±ïÈõÜ‰∏≠Âú®Âê∏Êî∂ÂâÇ„ÄÅ‰º†ËæìÊùêÊñôÂíåÊé•Ëß¶ÊùêÊñô‰∏äÔºåËøô‰∫õÊùêÊñôÂèØ‰ª•ÊîπÂñÑÂÖâ‰ºèÔºàPVÔºâËÆæÂ§áÁöÑÈ´òÊïàÊÄß„ÄÅÁ®≥ÂÆöÊÄßÂíåÊàêÊú¨„ÄÇ[1] ËøôÂåÖÊã¨Âü∫‰∫éHaPÁöÑÂÖâ‰ºèÁîµÊ±†ÁöÑËøõÊ≠•‰ª•ÂèäÊúâÊú∫ÂÖâ‰ºèÔºàOPVÔºâ‰∏≠ÈùûÂØåÂãíÁÉØÂèó‰ΩìÁöÑÂ∫îÁî®„ÄÇ[1]

2. **Â§™Èò≥ËÉΩÁîµÊ±†ÁªìÊûÑÁöÑ‰ºòÂåñ**ÔºöÊñáÊ°£ÊèêÂà∞ÔºåÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÊÄßËÉΩÁöÑ‰∏Ä‰∏™ÈáçÂ§ßÈóÆÈ¢òÊòØ‰ºòÂåñÂÖ∂ÁªìÊûÑ‰ª•ÊçïËé∑Êõ¥Â§öÁöÑÂÖ•Â∞ÑÂÖâ„ÄÇÁ†îÁ©∂‰∫ÜËÆ∏Â§öÊäÄÊúØÔºåÂ¶ÇÁ≠âÁ¶ªÂ≠ê‰ΩìÂíåË°çÂ∞ÑÁ∫≥Á±≥ÁªìÊûÑ„ÄÅÂêë‰∏ãËΩ¨Êç¢Á≤íÂ≠ê„ÄÅË°®Èù¢Á∫πÁêÜÂåñÂíåÁ∫≥Á±≥Â≠îÂõæÊ°àÂåñÔºå‰ª•ÂÆûÁé∞Ëøô‰∏ÄÁõÆÊ†á„ÄÇ[1] ‰æãÂ¶

## Pipeline 4: Knowledge graph index-based RAG

In [9]:
from llama_index.core import Document

# Ensure 'text' column is of type string
df['text'] = df['text'].astype(str)
# Create documents with IDs
documents = [Document(text=row['text'], doc_id=str(row['id'])) for _, row in df.iterrows()]

### Generating the graph index

In [12]:
from llama_index.core import KnowledgeGraphIndex
import time
from typing import List

# Start the timer
start_time = time.time()

class RobustKnowledgeGraphIndex(KnowledgeGraphIndex):
    def _extract_triplets(self, text: str) -> List[tuple]:
        """Extract triplets with error handling to ignore failures."""
        try:
            # Call the parent class's triplet extraction method
            return super()._extract_triplets(text)
        except Exception as e:
            # Log the error (optional) and return an empty list to continue processing
            print(f"Error extracting triplets for text chunk: {e}")
            return []

# Graph index with embeddings
# test_docs = documents[:100]
graph_index = RobustKnowledgeGraphIndex.from_documents(
    documents,
    max_triplets_per_chunk=5,
    include_embeddings=True,
    show_progress=False,
)

# Stop the timer
end_time = time.time()

# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Index creation time: {elapsed_time:.4f} seconds")

print(type(graph_index))

# Save the graph index to a file
graph_index_saving_path = "./dataset/graph_storage"
graph_index.storage_context.persist(persist_dir=graph_index_saving_path)

Error extracting triplets for text chunk: Error code: 400, with error text {"contentFilter":[{"level":1,"role":"assistant"}],"error":{"code":"1301","message":"Á≥ªÁªüÊ£ÄÊµãÂà∞ËæìÂÖ•ÊàñÁîüÊàêÂÜÖÂÆπÂèØËÉΩÂåÖÂê´‰∏çÂÆâÂÖ®ÊàñÊïèÊÑüÂÜÖÂÆπÔºåËØ∑ÊÇ®ÈÅøÂÖçËæìÂÖ•Êòì‰∫ßÁîüÊïèÊÑüÂÜÖÂÆπÁöÑÊèêÁ§∫ËØ≠ÔºåÊÑüË∞¢ÊÇ®ÁöÑÈÖçÂêà„ÄÇ"}}
Error extracting triplets for text chunk: Error code: 400, with error text {"contentFilter":[{"level":1,"role":"user"}],"error":{"code":"1301","message":"Á≥ªÁªüÊ£ÄÊµãÂà∞ËæìÂÖ•ÊàñÁîüÊàêÂÜÖÂÆπÂèØËÉΩÂåÖÂê´‰∏çÂÆâÂÖ®ÊàñÊïèÊÑüÂÜÖÂÆπÔºåËØ∑ÊÇ®ÈÅøÂÖçËæìÂÖ•Êòì‰∫ßÁîüÊïèÊÑüÂÜÖÂÆπÁöÑÊèêÁ§∫ËØ≠ÔºåÊÑüË∞¢ÊÇ®ÁöÑÈÖçÂêà„ÄÇ"}}
Error extracting triplets for text chunk: Error code: 400, with error text {"contentFilter":[{"level":1,"role":"user"}],"error":{"code":"1301","message":"Á≥ªÁªüÊ£ÄÊµãÂà∞ËæìÂÖ•ÊàñÁîüÊàêÂÜÖÂÆπÂèØËÉΩÂåÖÂê´‰∏çÂÆâÂÖ®ÊàñÊïèÊÑüÂÜÖÂÆπÔºåËØ∑ÊÇ®ÈÅøÂÖçËæìÂÖ•Êòì‰∫ßÁîüÊïèÊÑüÂÜÖÂÆπÁöÑÊèêÁ§∫ËØ≠ÔºåÊÑüË∞¢ÊÇ®ÁöÑÈÖçÂêà„ÄÇ"}}
Error extracting triplets for text chunk: Error code: 400, with error t

### Displaying the graph in HTML file

In [18]:
# Load the Graph data
from llama_index.core import StorageContext, load_index_from_storage

graph_index_saving_path = "./dataset/graph_storage"
storage_context = StorageContext.from_defaults(persist_dir=graph_index_saving_path)

graph_index = load_index_from_storage(storage_context)

# Create graph
from pyvis.network import Network

g = graph_index.get_networkx_graph()
net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(g)

# Set node and edge properties: colors and sizes
for node in net.nodes:
    node['color'] = 'lightgray'
    node['size'] = 10

for edge in net.edges:
    edge['color'] = 'black'
    edge['width'] = 1

fgraph="Knowledge_graph_visual.html"
net.write_html(fgraph)
print(fgraph)

Loading llama_index.core.storage.kvstore.simple_kvstore from ./dataset/graph_storage\docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./dataset/graph_storage\index_store.json.


ModuleNotFoundError: No module named 'pyvis'

## Interacting with the Knowledge graph index

In [14]:
import time
import textwrap



#similarity_top_k
k=3
#temperature
temp=0.1
#num_output
mt=1024
graph_query_engine = graph_index.as_query_engine(similarity_top_k=k, temperature=temp, num_output=mt)

def execute_query(user_input, k=3, temp=0.1, mt=1024):

    # Start the timer
    start_time = time.time()

    # Execute the query with additional parameters
    response = graph_query_engine.query(user_input)

    # Stop the timer
    end_time = time.time()

    # Calculate and print the execution time
    elapsed_time = end_time - start_time
    print(f"Query execution time: {elapsed_time:.4f} seconds")

    # Print the response, wrapped to 100 characters per line
    print(textwrap.fill(str(response), 100))
    return response

In [15]:
user_query="Summarise the methods to improve efficiency of solar cells, in ordered list."

In [16]:
import time
import textwrap
import sys
import io
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(textwrap.fill(str(response), 120))

Query execution time: 10.6188 seconds
1. Introduce internal polymer/nanotube junctions within the polymer matrix to enhance charge separation and collection.
2. Use a bulk donor-acceptor heterojunction with a bicontinuous network to allow electrons and holes to travel toward
their respective contacts. 3. Implement charge separation at polymer-SWCNT connections for more efficient electron
transport. 4. Blend functionalized MWCNTs into P3HT polymer to create a P3HT-MWCNT with fullerene C60 double-layered
device. 5. Use C60-modified SWCNTs and P3HT to fabricate polymer-SWCNT composites for improved short circuit current
density and electron transport. 6. Heat the blend to the glass transition temperature of the polymer to manipulate phase
separation and improve charge transfer, transport, and collection. 7. Employ tetraoctylammonium bromide in
tetrahydrofuran to assist in suspension and expose SWCNTs to an electrophoretic field for deposition.


In [None]:
# Â¢ûÂº∫ÁâàÊñáÊ°£ÂàÜÂâ≤‰ª£Á†Å - ÂåÖÂê´ÂÆåÊï¥metadata‰ø°ÊÅØ
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
from llama_index.core import Document

import pdfplumber
import fitz  # PyMuPDF
import os
from datetime import datetime
import hashlib

import warnings
warnings.filterwarnings("ignore")

def extract_pdf_basic_info(pdf_path):
    """ÊèêÂèñPDFÂü∫Êú¨‰ø°ÊÅØ‰Ωú‰∏∫metadata"""
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata
        doc.close()
        
        # Ëé∑ÂèñÊñá‰ª∂‰ø°ÊÅØ
        file_stats = os.stat(pdf_path)
        creation_time = datetime.fromtimestamp(file_stats.st_ctime).isoformat()
        modification_time = datetime.fromtimestamp(file_stats.st_mtime).isoformat()
        file_size = file_stats.st_size
        
        return {
            'title': metadata.get('title', ''),
            'author': metadata.get('author', ''),
            'subject': metadata.get('subject', ''),
            'creator': metadata.get('creator', ''),
            'producer': metadata.get('producer', ''),
            'creation_date': metadata.get('creationDate', ''),
            'modification_date': metadata.get('modDate', ''),
            'file_creation_time': creation_time,
            'file_modification_time': modification_time,
            'file_size_bytes': file_size
        }
    except Exception as e:
        print(f"Warning: Could not extract metadata from {pdf_path}: {e}")
        return {}

def create_enhanced_documents_with_metadata():
    """ÂàõÂª∫ÂåÖÂê´ÂÆåÊï¥metadataÁöÑÊñáÊ°£Âùó"""
    
    pdf_files = glob.glob("../zotero/*.pdf")
    all_docs = []
    
    # ÂàÜÂâ≤Âô®
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", "„ÄÇ", "ÔºÅ", "Ôºü", "!", "?"]
    )
    
    for pdf in pdf_files:
        file_name = os.path.basename(pdf)
        file_path = os.path.abspath(pdf)
        
        # ÊèêÂèñPDFÂü∫Êú¨‰ø°ÊÅØ
        pdf_metadata = extract_pdf_basic_info(pdf)
        
        # 1. ÊñáÊú¨ÂùóÊèêÂèñÔºàÂ¢ûÂº∫ÁâàÔºöÂåÖÂê´ÂÆåÊï¥metadataÔºâ
        loader = PyPDFLoader(pdf)
        documents = loader.load()
        docs = splitter.split_documents(documents)
        
        for chunk_idx, doc in enumerate(docs):
            # ÂàõÂª∫ËØ¶ÁªÜÁöÑmetadata
            chunk_metadata = {
                # Ê∫êÊñá‰ª∂‰ø°ÊÅØ
                'source_file': file_name,
                'source_path': file_path,
                'file_type': 'pdf',
                
                # PDFÊñáÊ°£‰ø°ÊÅØ
                'pdf_title': pdf_metadata.get('title', ''),
                'pdf_author': pdf_metadata.get('author', ''),
                'pdf_subject': pdf_metadata.get('subject', ''),
                'pdf_creator': pdf_metadata.get('creator', ''),
                'pdf_creation_date': pdf_metadata.get('creation_date', ''),
                
                # Êñá‰ª∂Á≥ªÁªü‰ø°ÊÅØ
                'file_size_bytes': pdf_metadata.get('file_size_bytes', 0),
                'file_creation_time': pdf_metadata.get('file_creation_time', ''),
                'file_modification_time': pdf_metadata.get('file_modification_time', ''),
                
                # Âùó‰ø°ÊÅØ
                'chunk_type': 'text',
                'chunk_index': chunk_idx,
                'page_number': doc.metadata.get('page', 'unknown'),
                'chunk_size': len(doc.page_content),
                
                # ÂéüÂßãPyPDFLoader metadata
                'original_metadata': doc.metadata,
                
                # Â§ÑÁêÜÊó∂Èó¥Êà≥
                'processing_timestamp': datetime.now().isoformat(),
                
                # ÂÜÖÂÆπÂìàÂ∏åÔºàÁî®‰∫éÂéªÈáçÂíåÁâàÊú¨ÊéßÂà∂Ôºâ
                'content_hash': hashlib.md5(doc.page_content.encode()).hexdigest(),
                
                # ÁªüËÆ°‰ø°ÊÅØ
                'word_count': len(doc.page_content.split()),
                'char_count': len(doc.page_content)
            }
            
            # ÂàõÂª∫Â∏¶metadataÁöÑDocument
            enhanced_doc = Document(
                text=doc.page_content,
                metadata=chunk_metadata
            )
            all_docs.append(enhanced_doc)

        # 2. Ë°®Ê†ºÊèêÂèñÔºàÂ¢ûÂº∫ÁâàÔºöÂåÖÂê´ÂÆåÊï¥metadataÔºâ
        with pdfplumber.open(pdf) as pdf_doc:
            for page_num, page in enumerate(pdf_doc.pages):
                tables = page.extract_tables()
                for table_idx, table in enumerate(tables):
                    # ËΩ¨‰∏∫ÁªìÊûÑÂåñÊñáÊú¨
                    table_text = "\n".join([", ".join([str(cell) if cell is not None else "" for cell in row]) for row in table])
                    
                    # Ë°®Ê†ºÊ†áËØÜ‰ø°ÊÅØ
                    table_header = f"„ÄêË°®Ê†º„ÄëÊñá‰ª∂:{file_name}, È°µÁ†Å:{page_num+1}, Ë°®Ê†ºÂ∫èÂè∑:{table_idx+1}"
                    full_table_text = f"{table_header}\n{table_text}"
                    
                    # Ë°®Ê†ºmetadata
                    table_metadata = {
                        # Ê∫êÊñá‰ª∂‰ø°ÊÅØ
                        'source_file': file_name,
                        'source_path': file_path,
                        'file_type': 'pdf',
                        
                        # PDFÊñáÊ°£‰ø°ÊÅØ
                        'pdf_title': pdf_metadata.get('title', ''),
                        'pdf_author': pdf_metadata.get('author', ''),
                        'pdf_subject': pdf_metadata.get('subject', ''),
                        
                        # Ë°®Ê†ºÁâπÂÆö‰ø°ÊÅØ
                        'chunk_type': 'table',
                        'page_number': page_num + 1,
                        'table_index': table_idx + 1,
                        'table_rows': len(table),
                        'table_cols': len(table[0]) if table else 0,
                        
                        # Â§ÑÁêÜ‰ø°ÊÅØ
                        'processing_timestamp': datetime.now().isoformat(),
                        'content_hash': hashlib.md5(full_table_text.encode()).hexdigest(),
                        'word_count': len(full_table_text.split()),
                        'char_count': len(full_table_text)
                    }
                    
                    table_doc = Document(
                        text=full_table_text,
                        metadata=table_metadata
                    )
                    all_docs.append(table_doc)

        # 3. ÂõæÂÉè/ÂõæË°®ÊèêÂèñÔºàÂ¢ûÂº∫ÁâàÔºöÂåÖÂê´ÂÆåÊï¥metadataÔºâ
        doc = fitz.open(pdf)
        for page_index in range(len(doc)):
            page = doc[page_index]
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                img_ext = base_image["ext"]
                img_name = f"{os.path.splitext(file_name)[0]}_page{page_index+1}_img{img_index+1}.{img_ext}"
                img_save_path = os.path.join("./extracted_images", img_name)
                os.makedirs("./extracted_images", exist_ok=True)
                
                with open(img_save_path, "wb") as f:
                    f.write(image_bytes)
                
                # ÂõæÂÉèÊèèËø∞ÊñáÊú¨
                image_description = f"„ÄêÂõæË°®„ÄëÊñá‰ª∂:{file_name}, È°µÁ†Å:{page_index+1}, ÂõæÁâáÂ∫èÂè∑:{img_index+1}, Ë∑ØÂæÑ:{img_save_path}"
                
                # ÂõæÂÉèmetadata
                image_metadata = {
                    # Ê∫êÊñá‰ª∂‰ø°ÊÅØ
                    'source_file': file_name,
                    'source_path': file_path,
                    'file_type': 'pdf',
                    
                    # PDFÊñáÊ°£‰ø°ÊÅØ
                    'pdf_title': pdf_metadata.get('title', ''),
                    'pdf_author': pdf_metadata.get('author', ''),
                    'pdf_subject': pdf_metadata.get('subject', ''),
                    
                    # ÂõæÂÉèÁâπÂÆö‰ø°ÊÅØ
                    'chunk_type': 'image',
                    'page_number': page_index + 1,
                    'image_index': img_index + 1,
                    'image_format': img_ext,
                    'image_path': img_save_path,
                    'image_size_bytes': len(image_bytes),
                    
                    # ÂõæÂÉèÊäÄÊúØ‰ø°ÊÅØ
                    'image_width': base_image.get('width', 0),
                    'image_height': base_image.get('height', 0),
                    'image_colorspace': base_image.get('colorspace', 'unknown'),
                    
                    # Â§ÑÁêÜ‰ø°ÊÅØ
                    'processing_timestamp': datetime.now().isoformat(),
                    'content_hash': hashlib.md5(image_description.encode()).hexdigest(),
                    'word_count': len(image_description.split()),
                    'char_count': len(image_description)
                }
                
                image_doc = Document(
                    text=image_description,
                    metadata=image_metadata
                )
                all_docs.append(image_doc)
        
        doc.close()
    
    return all_docs

# ÊâßË°åÊñáÊ°£Â§ÑÁêÜ
print("ÂºÄÂßãÂ§ÑÁêÜÊñáÊ°£ÔºåÊ∑ªÂä†ÂÆåÊï¥metadata‰ø°ÊÅØ...")
enhanced_documents = create_enhanced_documents_with_metadata()

print(f"ÂÖ±Âä†ËΩΩÂàÜÂâ≤ {len(enhanced_documents)} ‰∏™ÊñáÊ°£ÂùóÔºàÂê´ÊñáÊú¨„ÄÅË°®Ê†º„ÄÅÂõæË°®‰ø°ÊÅØÔºâ")
print(f"ÊñáÊ°£Á±ªÂûãÂàÜÂ∏É:")
chunk_types = {}
for doc in enhanced_documents:
    chunk_type = doc.metadata.get('chunk_type', 'unknown')
    chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1

for chunk_type, count in chunk_types.items():
    print(f"  {chunk_type}: {count} ‰∏™")

print(f"\nÁ§∫‰æãÊñáÊ°£Âùómetadata:")
if enhanced_documents:
    example_doc = enhanced_documents[0]
    print(f"ÊñáÊú¨È¢ÑËßà: {example_doc.text[:200]}...")
    print(f"Metadata keys: {list(example_doc.metadata.keys())}")
    print(f"Ê∫êÊñá‰ª∂: {example_doc.metadata.get('source_file', 'N/A')}")
    print(f"ÂùóÁ±ªÂûã: {example_doc.metadata.get('chunk_type', 'N/A')}")
    print(f"È°µÁ†Å: {example_doc.metadata.get('page_number', 'N/A')}")
    print(f"PDFÊ†áÈ¢ò: {example_doc.metadata.get('pdf_title', 'N/A')}")
    print(f"PDF‰ΩúËÄÖ: {example_doc.metadata.get('pdf_author', 'N/A')}")

# Êõ¥Êñ∞documentsÂèòÈáèÁî®‰∫éÂêéÁª≠Â§ÑÁêÜ
documents = enhanced_documents


In [8]:
# È™åËØÅmetadata‰ø°ÊÅØ
def verify_metadata_completeness(documents):
    """È™åËØÅmetadata‰ø°ÊÅØÊòØÂê¶ÂÆåÊï¥"""
    print("=== MetadataÂÆåÊï¥ÊÄßÈ™åËØÅ ===")
    
    required_fields = ['source_file', 'chunk_type', 'page_number', 'processing_timestamp']
    
    for i, doc in enumerate(documents[:5]):  # Ê£ÄÊü•Ââç5‰∏™ÊñáÊ°£
        print(f"\nÊñáÊ°£ {i+1}:")
        print(f"ÊñáÊú¨ÈïøÂ∫¶: {len(doc.text)} Â≠óÁ¨¶")
        
        missing_fields = []
        for field in required_fields:
            if field not in doc.metadata:
                missing_fields.append(field)
            else:
                print(f"  {field}: {doc.metadata[field]}")
        
        if missing_fields:
            print(f"  ‚ùå Áº∫Â§±Â≠óÊÆµ: {missing_fields}")
        else:
            print(f"  ‚úÖ ÊâÄÊúâÂøÖÈúÄÂ≠óÊÆµÂÆåÊï¥")
            
        # ÊòæÁ§∫ÊñáÁåÆ‰ø°ÊÅØ
        if doc.metadata.get('pdf_title'):
            print(f"  üìñ ÊñáÁåÆÊ†áÈ¢ò: {doc.metadata['pdf_title']}")
        if doc.metadata.get('pdf_author'):
            print(f"  üë§ ÊñáÁåÆ‰ΩúËÄÖ: {doc.metadata['pdf_author']}")

# ÊâßË°åÈ™åËØÅ
verify_metadata_completeness(documents)

print("\n=== Ê∫êÊñáÁåÆËøΩË∏™ÂäüËÉΩÊºîÁ§∫ ===")
def get_source_citation(doc_metadata):
    """Ê†πÊçÆmetadataÁîüÊàêÂºïÁî®‰ø°ÊÅØ"""
    citation_parts = []
    
    # ‰ΩúËÄÖ
    if doc_metadata.get('pdf_author'):
        citation_parts.append(doc_metadata['pdf_author'])
    
    # Ê†áÈ¢ò
    if doc_metadata.get('pdf_title'):
        citation_parts.append(f'"{doc_metadata["pdf_title"]}"')
    
    # Êñá‰ª∂ÂêçÔºàÂ¶ÇÊûúÊ≤°ÊúâÊ†áÈ¢òÔºâ
    if not doc_metadata.get('pdf_title') and doc_metadata.get('source_file'):
        citation_parts.append(doc_metadata['source_file'])
    
    # È°µÁ†Å
    if doc_metadata.get('page_number') and doc_metadata['page_number'] != 'unknown':
        citation_parts.append(f"p. {doc_metadata['page_number']}")
    
    # ÂùóÁ±ªÂûã
    if doc_metadata.get('chunk_type'):
        if doc_metadata['chunk_type'] == 'table':
            citation_parts.append(f"(Ë°®Ê†º{doc_metadata.get('table_index', '')})")
        elif doc_metadata['chunk_type'] == 'image':
            citation_parts.append(f"(Âõæ{doc_metadata.get('image_index', '')})")
    
    return ", ".join(citation_parts)

# Á§∫‰æãÔºö‰∏∫Ââç3‰∏™ÊñáÊ°£ÁîüÊàêÂºïÁî®
for i, doc in enumerate(documents[:3]):
    citation = get_source_citation(doc.metadata)
    print(f"ÊñáÊ°£{i+1}ÂºïÁî®: {citation}")
    print(f"ÂÜÖÂÆπÈ¢ÑËßà: {doc.text[:100]}...\n")


=== MetadataÂÆåÊï¥ÊÄßÈ™åËØÅ ===

ÊñáÊ°£ 1:
ÊñáÊú¨ÈïøÂ∫¶: 52 Â≠óÁ¨¶
  source_file: 978-3-662-56472-1.pdf
  chunk_type: text
  page_number: 0
  processing_timestamp: 2025-08-16
  ‚úÖ ÊâÄÊúâÂøÖÈúÄÂ≠óÊÆµÂÆåÊï¥

ÊñáÊ°£ 2:
ÊñáÊú¨ÈïøÂ∫¶: 32 Â≠óÁ¨¶
  source_file: 978-3-662-56472-1.pdf
  chunk_type: text
  page_number: 1
  processing_timestamp: 2025-08-16
  ‚úÖ ÊâÄÊúâÂøÖÈúÄÂ≠óÊÆµÂÆåÊï¥

ÊñáÊ°£ 3:
ÊñáÊú¨ÈïøÂ∫¶: 81 Â≠óÁ¨¶
  source_file: 978-3-662-56472-1.pdf
  chunk_type: text
  page_number: 2
  processing_timestamp: 2025-08-16
  ‚úÖ ÊâÄÊúâÂøÖÈúÄÂ≠óÊÆµÂÆåÊï¥

ÊñáÊ°£ 4:
ÊñáÊú¨ÈïøÂ∫¶: 780 Â≠óÁ¨¶
  source_file: 978-3-662-56472-1.pdf
  chunk_type: text
  page_number: 3
  processing_timestamp: 2025-08-16
  ‚úÖ ÊâÄÊúâÂøÖÈúÄÂ≠óÊÆµÂÆåÊï¥

ÊñáÊ°£ 5:
ÊñáÊú¨ÈïøÂ∫¶: 795 Â≠óÁ¨¶
  source_file: 978-3-662-56472-1.pdf
  chunk_type: text
  page_number: 3
  processing_timestamp: 2025-08-16
  ‚úÖ ÊâÄÊúâÂøÖÈúÄÂ≠óÊÆµÂÆåÊï¥

=== Ê∫êÊñáÁåÆËøΩË∏™ÂäüËÉΩÊºîÁ§∫ ===
ÊñáÊ°£1ÂºïÁî®: 978-3-662-56472-1.pdf
ÂÜÖÂÆπÈ¢ÑËßà: Handbook

In [5]:
# ‰øÆÂ§çmetadataËøáÈïøÈóÆÈ¢ò - ÂàõÂª∫Á≤æÁÆÄÁâàmetadata

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
from llama_index.core import Document

import pdfplumber
import fitz  # PyMuPDF
import os
from datetime import datetime
import hashlib

import warnings
warnings.filterwarnings("ignore")

def extract_pdf_basic_info(pdf_path):
    """ÊèêÂèñPDFÂü∫Êú¨‰ø°ÊÅØ‰Ωú‰∏∫metadataÔºà‰ºòÂåñÁâàÔºöÂáèÂ∞ëÂÜó‰Ωô‰ø°ÊÅØÔºâ"""
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata
        doc.close()
        
        # Ëé∑ÂèñÊñá‰ª∂‰ø°ÊÅØ
        file_stats = os.stat(pdf_path)
        file_size = file_stats.st_size
        
        # Âè™ËøîÂõûÊ†∏ÂøÉÂ≠óÊÆµÔºåÂπ∂ÈôêÂà∂ÈïøÂ∫¶ÈÅøÂÖçmetadataËøáÈïø
        return {
            'title': (metadata.get('title', '') or '')[:100],  # ÈôêÂà∂ÈïøÂ∫¶
            'author': (metadata.get('author', '') or '')[:50],   # ÈôêÂà∂ÈïøÂ∫¶
            'subject': (metadata.get('subject', '') or '')[:50], # ÈôêÂà∂ÈïøÂ∫¶
            'file_size_bytes': file_size
        }
    except Exception as e:
        print(f"Warning: Could not extract metadata from {pdf_path}: {e}")
        return {}

def create_optimized_documents_with_metadata():
    """ÂàõÂª∫ÂåÖÂê´‰ºòÂåñmetadataÁöÑÊñáÊ°£ÂùóÔºåÈÅøÂÖçmetadataËøáÈïøÈóÆÈ¢ò"""
    
    pdf_files = glob.glob("../zotero/*.pdf")
    all_docs = []
    
    # ÂàÜÂâ≤Âô®
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", "„ÄÇ", "ÔºÅ", "Ôºü", "!", "?"]
    )
    
    for pdf in pdf_files:
        file_name = os.path.basename(pdf)
        file_path = os.path.abspath(pdf)
        
        # ÊèêÂèñPDFÂü∫Êú¨‰ø°ÊÅØ
        pdf_metadata = extract_pdf_basic_info(pdf)
        
        # 1. ÊñáÊú¨ÂùóÊèêÂèñÔºà‰ºòÂåñÁâàÔºöÁ≤æÁÆÄmetadataÔºâ
        loader = PyPDFLoader(pdf)
        documents = loader.load()
        docs = splitter.split_documents(documents)
        
        for chunk_idx, doc in enumerate(docs):
            # ÂàõÂª∫Á≤æÁÆÄÁöÑmetadataÔºàÂè™‰øùÁïôÊ†∏ÂøÉÂ≠óÊÆµÔºâ
            chunk_metadata = {
                # Ê†∏ÂøÉÊ∫êÊñá‰ª∂‰ø°ÊÅØ
                'source_file': file_name,
                'file_type': 'pdf',
                
                # Ê†∏ÂøÉPDFÊñáÊ°£‰ø°ÊÅØÔºà‰ªÖ‰øùÁïôÈùûÁ©∫Â≠óÊÆµÔºâ
                'pdf_title': pdf_metadata.get('title', '')[:100] if pdf_metadata.get('title') else '',  # ÈôêÂà∂ÈïøÂ∫¶
                'pdf_author': pdf_metadata.get('author', '')[:50] if pdf_metadata.get('author') else '',  # ÈôêÂà∂ÈïøÂ∫¶
                
                # Ê†∏ÂøÉÂùó‰ø°ÊÅØ
                'chunk_type': 'text',
                'chunk_index': chunk_idx,
                'page_number': doc.metadata.get('page', 'unknown'),
                
                # Â§ÑÁêÜÊó∂Èó¥Êà≥ÔºàÁÆÄÂåñÊ†ºÂºèÔºâ
                'processing_timestamp': datetime.now().strftime('%Y-%m-%d'),
                
                # ÂÜÖÂÆπÁªüËÆ°ÔºàÁ≤æÁÆÄÔºâ
                'word_count': len(doc.page_content.split()),
                'char_count': len(doc.page_content)
            }
            
            # ÂàõÂª∫Â∏¶Á≤æÁÆÄmetadataÁöÑDocument
            enhanced_doc = Document(
                text=doc.page_content,
                metadata=chunk_metadata
            )
            all_docs.append(enhanced_doc)

        # 2. Ë°®Ê†ºÊèêÂèñÔºà‰ºòÂåñÁâàÔºöÁ≤æÁÆÄmetadataÔºâ
        with pdfplumber.open(pdf) as pdf_doc:
            for page_num, page in enumerate(pdf_doc.pages):
                tables = page.extract_tables()
                for table_idx, table in enumerate(tables):
                    # ËΩ¨‰∏∫ÁªìÊûÑÂåñÊñáÊú¨
                    table_text = "\n".join([", ".join([str(cell) if cell is not None else "" for cell in row]) for row in table])
                    
                    # Ë°®Ê†ºÊ†áËØÜ‰ø°ÊÅØ
                    table_header = f"„ÄêË°®Ê†º„ÄëÊñá‰ª∂:{file_name}, È°µÁ†Å:{page_num+1}, Ë°®Ê†ºÂ∫èÂè∑:{table_idx+1}"
                    full_table_text = f"{table_header}\n{table_text}"
                    
                    # Á≤æÁÆÄË°®Ê†ºmetadata
                    table_metadata = {
                        'source_file': file_name,
                        'file_type': 'pdf',
                        'pdf_title': pdf_metadata.get('title', '')[:100] if pdf_metadata.get('title') else '',
                        'pdf_author': pdf_metadata.get('author', '')[:50] if pdf_metadata.get('author') else '',
                        'chunk_type': 'table',
                        'page_number': page_num + 1,
                        'table_index': table_idx + 1,
                        'table_rows': len(table),
                        'table_cols': len(table[0]) if table else 0,
                        'processing_timestamp': datetime.now().strftime('%Y-%m-%d'),
                        'word_count': len(full_table_text.split()),
                        'char_count': len(full_table_text)
                    }
                    
                    table_doc = Document(
                        text=full_table_text,
                        metadata=table_metadata
                    )
                    all_docs.append(table_doc)

        # 3. ÂõæÂÉè/ÂõæË°®ÊèêÂèñÔºà‰ºòÂåñÁâàÔºöÁ≤æÁÆÄmetadataÔºâ
        doc = fitz.open(pdf)
        for page_index in range(len(doc)):
            page = doc[page_index]
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                img_ext = base_image["ext"]
                img_name = f"{os.path.splitext(file_name)[0]}_page{page_index+1}_img{img_index+1}.{img_ext}"
                img_save_path = os.path.join("./extracted_images", img_name)
                os.makedirs("./extracted_images", exist_ok=True)
                
                with open(img_save_path, "wb") as f:
                    f.write(image_bytes)
                
                # ÂõæÂÉèÊèèËø∞ÊñáÊú¨
                image_description = f"„ÄêÂõæË°®„ÄëÊñá‰ª∂:{file_name}, È°µÁ†Å:{page_index+1}, ÂõæÁâáÂ∫èÂè∑:{img_index+1}, Ë∑ØÂæÑ:{img_save_path}"
                
                # Á≤æÁÆÄÂõæÂÉèmetadata
                image_metadata = {
                    'source_file': file_name,
                    'file_type': 'pdf',
                    'pdf_title': pdf_metadata.get('title', '')[:100] if pdf_metadata.get('title') else '',
                    'pdf_author': pdf_metadata.get('author', '')[:50] if pdf_metadata.get('author') else '',
                    'chunk_type': 'image',
                    'page_number': page_index + 1,
                    'image_index': img_index + 1,
                    'image_format': img_ext,
                    'image_path': img_save_path,
                    'processing_timestamp': datetime.now().strftime('%Y-%m-%d'),
                    'word_count': len(image_description.split()),
                    'char_count': len(image_description)
                }
                
                image_doc = Document(
                    text=image_description,
                    metadata=image_metadata
                )
                all_docs.append(image_doc)
        
        doc.close()
    
    return all_docs

# ËÆ°ÁÆómetadataÈïøÂ∫¶ÁöÑÂáΩÊï∞
def calculate_metadata_length(doc):
    """ËÆ°ÁÆóÂçï‰∏™ÊñáÊ°£ÁöÑmetadataÈïøÂ∫¶"""
    import json
    metadata_str = json.dumps(doc.metadata, ensure_ascii=False)
    return len(metadata_str)

print("=== ÂàõÂª∫‰ºòÂåñÁâàÊñáÊ°£ÔºàÁ≤æÁÆÄmetadataÔºâ===")
optimized_documents = create_optimized_documents_with_metadata()

print(f"ÂÖ±Âä†ËΩΩÂàÜÂâ≤ {len(optimized_documents)} ‰∏™ÊñáÊ°£ÂùóÔºàÂê´ÊñáÊú¨„ÄÅË°®Ê†º„ÄÅÂõæË°®‰ø°ÊÅØÔºâ")

# Ê£ÄÊü•metadataÈïøÂ∫¶
metadata_lengths = [calculate_metadata_length(doc) for doc in optimized_documents[:10]]
max_metadata_length = max(metadata_lengths)
avg_metadata_length = sum(metadata_lengths) / len(metadata_lengths)

print(f"\nüìä MetadataÈïøÂ∫¶ÁªüËÆ°ÔºàÂâç10‰∏™ÊñáÊ°£Ôºâ:")
print(f"ÊúÄÂ§ßmetadataÈïøÂ∫¶: {max_metadata_length} Â≠óÁ¨¶")
print(f"Âπ≥ÂùámetadataÈïøÂ∫¶: {avg_metadata_length:.1f} Â≠óÁ¨¶")

# ÊòæÁ§∫‰ºòÂåñÂêéÁöÑmetadataÁ§∫‰æã
print(f"\nÁ§∫‰æã‰ºòÂåñÂêémetadata:")
if optimized_documents:
    example_doc = optimized_documents[0]
    print(f"ÊñáÊú¨È¢ÑËßà: {example_doc.text[:200]}...")
    print(f"Metadata keys: {list(example_doc.metadata.keys())}")
    print(f"MetadataÈïøÂ∫¶: {calculate_metadata_length(example_doc)} Â≠óÁ¨¶")
    
    import json
    print(f"MetadataÂÜÖÂÆπ:")
    print(json.dumps(example_doc.metadata, ensure_ascii=False, indent=2))

# Êõ¥Êñ∞documentsÂèòÈáè
documents = optimized_documents
print(f"\n‚úÖ Â∑≤Êõ¥Êñ∞documentsÂèòÈáèÔºåÂÖ± {len(documents)} ‰∏™ÊñáÊ°£Âùó")


=== ÂàõÂª∫‰ºòÂåñÁâàÊñáÊ°£ÔºàÁ≤æÁÆÄmetadataÔºâ===


Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa3' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color becau

ÂÖ±Âä†ËΩΩÂàÜÂâ≤ 18727 ‰∏™ÊñáÊ°£ÂùóÔºàÂê´ÊñáÊú¨„ÄÅË°®Ê†º„ÄÅÂõæË°®‰ø°ÊÅØÔºâ

üìä MetadataÈïøÂ∫¶ÁªüËÆ°ÔºàÂâç10‰∏™ÊñáÊ°£Ôºâ:
ÊúÄÂ§ßmetadataÈïøÂ∫¶: 229 Â≠óÁ¨¶
Âπ≥ÂùámetadataÈïøÂ∫¶: 228.0 Â≠óÁ¨¶

Á§∫‰æã‰ºòÂåñÂêémetadata:
ÊñáÊú¨È¢ÑËßà: Handbook of 
Photovoltaic 
Silicon
Deren Yang
Editor...
Metadata keys: ['source_file', 'file_type', 'pdf_title', 'pdf_author', 'chunk_type', 'chunk_index', 'page_number', 'processing_timestamp', 'word_count', 'char_count']
MetadataÈïøÂ∫¶: 226 Â≠óÁ¨¶
MetadataÂÜÖÂÆπ:
{
  "source_file": "978-3-662-56472-1.pdf",
  "file_type": "pdf",
  "pdf_title": "",
  "pdf_author": "",
  "chunk_type": "text",
  "chunk_index": 0,
  "page_number": 0,
  "processing_timestamp": "2025-08-17",
  "word_count": 7,
  "char_count": 52
}

‚úÖ Â∑≤Êõ¥Êñ∞documentsÂèòÈáèÔºåÂÖ± 18727 ‰∏™ÊñáÊ°£Âùó


In [7]:
# ‰ΩøÁî®Â¢ûÂº∫ÁâàÊñáÊ°£ËøõË°åÂêëÈáèÂåñ
print("=== ÂáÜÂ§áËøõË°åÂêëÈáèÂåñÂ≠òÂÇ® ===")
print(f"ÊÄªÊñáÊ°£Êï∞Èáè: {len(documents)}")

# ÊòæÁ§∫metadataÁªüËÆ°‰ø°ÊÅØ
metadata_stats = {}
for doc in documents:
    for key in doc.metadata.keys():
        if key not in metadata_stats:
            metadata_stats[key] = 0
        metadata_stats[key] += 1

print(f"MetadataÂ≠óÊÆµË¶ÜÁõñÁéá:")
for key, count in sorted(metadata_stats.items()):
    coverage = (count / len(documents)) * 100
    print(f"  {key}: {count}/{len(documents)} ({coverage:.1f}%)")

# ÂàõÂª∫ÂêëÈáèÂ≠òÂÇ®Ôºà‰ΩøÁî®Êñ∞ÁöÑË∑ØÂæÑ‰ª•Âå∫Âà´‰∫éÊóßÁâàÊú¨Ôºâ
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

vector_dataset_new = "./dataset/vector_storage_with_metadata"
print(f"\nÂáÜÂ§áÂàõÂª∫ÂêëÈáèÂ≠òÂÇ®: {vector_dataset_new}")

# ÂàõÂª∫ÂåÖÂê´ÂÆåÊï¥metadataÁöÑÂêëÈáèÁ¥¢Âºï
try:
    vector_store = DeepLakeVectorStore(dataset_path=vector_dataset_new, overwrite=True)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    
    print("ÂºÄÂßãÂêëÈáèÂåñÂ§ÑÁêÜ...")
    # ‰ΩøÁî®Â∏¶ÊúâÂÆåÊï¥metadataÁöÑÊñáÊ°£ÂàõÂª∫Á¥¢Âºï
    index_with_metadata = VectorStoreIndex.from_documents(
        documents, 
        storage_context=storage_context, 
        show_progress=True
    )
    
    print(f"‚úÖ ÂêëÈáèÂåñÂÆåÊàêÔºÅÁ¥¢ÂºïÂ∑≤‰øùÂ≠òÂà∞: {vector_dataset_new}")
    
except Exception as e:
    print(f"‚ùå ÂêëÈáèÂåñËøáÁ®ã‰∏≠Âá∫Áé∞ÈîôËØØ: {e}")

print("\n=== RAGÁ≥ªÁªü‰∏≠ÁöÑÊ∫êÊñáÁåÆËøΩË∏™ÊºîÁ§∫ ===")
# ÊºîÁ§∫Â¶Ç‰ΩïÂú®Êü•ËØ¢Êó∂ËøîÂõûÊ∫êÊñáÁåÆ‰ø°ÊÅØ
def enhanced_query_with_sources(query, top_k=3):
    """ÊâßË°åÊü•ËØ¢Âπ∂ËøîÂõûÂ∏¶Ê∫êÊñáÁåÆ‰ø°ÊÅØÁöÑÁªìÊûú"""
    try:
        # ÂàõÂª∫Êü•ËØ¢ÂºïÊìé
        query_engine = index_with_metadata.as_query_engine(
            similarity_top_k=top_k,
            response_mode="tree_summarize"
        )
        
        # ÊâßË°åÊü•ËØ¢
        response = query_engine.query(query)
        
        print(f"Êü•ËØ¢: {query}")
        print(f"Á≠îÊ°à: {response}")
        
        # Ëé∑ÂèñÊ∫êËäÇÁÇπ‰ø°ÊÅØ
        if hasattr(response, 'source_nodes'):
            print(f"\nüìö ÂºïÁî®Êù•Ê∫ê ({len(response.source_nodes)} ‰∏™):")
            for i, node in enumerate(response.source_nodes, 1):
                metadata = node.node.metadata
                citation = get_source_citation(metadata)
                score = getattr(node, 'score', 'N/A')
                
                print(f"{i}. {citation}")
                print(f"   Áõ∏‰ººÂ∫¶ÂàÜÊï∞: {score}")
                print(f"   ÂÜÖÂÆπ: {node.node.text[:150]}...")
                print()
        
        return response
        
    except Exception as e:
        print(f"Êü•ËØ¢ÊâßË°åÂ§±Ë¥•: {e}")
        return None

# Â¶ÇÊûúÂêëÈáèÂåñÊàêÂäüÔºåÊâßË°åÊºîÁ§∫Êü•ËØ¢
if 'index_with_metadata' in locals():
    print("ÊâßË°åÊºîÁ§∫Êü•ËØ¢...")
    enhanced_query_with_sources("Â¶Ç‰ΩïÊèêÈ´òÂ§™Èò≥ËÉΩÁîµÊ±†ÊïàÁéáÔºü", top_k=3)
else:
    print("ÂêëÈáèÂåñÊú™ÊàêÂäüÔºåË∑≥ËøáÊü•ËØ¢ÊºîÁ§∫")


=== ÂáÜÂ§áËøõË°åÂêëÈáèÂåñÂ≠òÂÇ® ===
ÊÄªÊñáÊ°£Êï∞Èáè: 18727
MetadataÂ≠óÊÆµË¶ÜÁõñÁéá:
  char_count: 18727/18727 (100.0%)
  chunk_index: 13528/18727 (72.2%)
  chunk_type: 18727/18727 (100.0%)
  file_type: 18727/18727 (100.0%)
  image_format: 4320/18727 (23.1%)
  image_index: 4320/18727 (23.1%)
  image_path: 4320/18727 (23.1%)
  page_number: 18727/18727 (100.0%)
  pdf_author: 18727/18727 (100.0%)
  pdf_title: 18727/18727 (100.0%)
  processing_timestamp: 18727/18727 (100.0%)
  source_file: 18727/18727 (100.0%)
  table_cols: 879/18727 (4.7%)
  table_index: 879/18727 (4.7%)
  table_rows: 879/18727 (4.7%)
  word_count: 18727/18727 (100.0%)

ÂáÜÂ§áÂàõÂª∫ÂêëÈáèÂ≠òÂÇ®: ./dataset/vector_storage_with_metadata




ÂºÄÂßãÂêëÈáèÂåñÂ§ÑÁêÜ...


Parsing nodes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18727/18727 [00:25<00:00, 733.64it/s] 
Generating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2048/2048 [3:24:25<00:00,  5.99s/it]  


Uploading data to deeplake dataset.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2048/2048 [00:14<00:00, 137.93it/s]


Dataset(path='./dataset/vector_storage_with_metadata', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
   text       text      (2048, 1)      str     None   
 metadata     json      (2048, 1)      str     None   
 embedding  embedding  (2048, 1024)  float32   None   
    id        text      (2048, 1)      str     None   


Generating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2048/2048 [3:25:53<00:00,  6.03s/it]  


Uploading data to deeplake dataset.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2048/2048 [00:16<00:00, 127.25it/s]


Dataset(path='./dataset/vector_storage_with_metadata', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
   text       text      (4096, 1)      str     None   
 metadata     json      (4096, 1)      str     None   
 embedding  embedding  (4096, 1024)  float32   None   
    id        text      (4096, 1)      str     None   


Generating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2048/2048 [3:45:47<00:00,  6.62s/it]  


Uploading data to deeplake dataset.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2048/2048 [00:27<00:00, 75.66it/s] 


Dataset(path='./dataset/vector_storage_with_metadata', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
   text       text      (6144, 1)      str     None   
 metadata     json      (6144, 1)      str     None   
 embedding  embedding  (6144, 1024)  float32   None   
    id        text      (6144, 1)      str     None   


Generating embeddings:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 1460/2048 [3:08:05<1:17:02,  7.86s/it]

: 

In [6]:
# Âø´ÈÄüËØäÊñ≠ÊµãËØï - ‰ΩøÁî®ÊûÅÂ∞èÊâπÈáèÈ™åËØÅDeepLakeÂ≠òÂÇ®
import os
import shutil
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

print("üöÄ ÂºÄÂßãÂø´ÈÄüËØäÊñ≠ÊµãËØï: DeepLakeÊú¨Âú∞Â≠òÂÇ®")

try:
    # ÂÆö‰πâ‰∏Ä‰∏™ÊûÅÂ∞èÁöÑÊâπÈáèËøõË°åÊµãËØï
    batch_size = 5 
    if 'documents' not in locals() or not documents:
        print("‚ö†Ô∏è 'documents' ÂèòÈáèÊú™ÂÆö‰πâÊàñ‰∏∫Á©∫„ÄÇËØ∑Á°Æ‰øùÂú®ËøêË°åÊ≠§ÂçïÂÖÉÊ†º‰πãÂâçÔºåÂÖàËøêË°åÊñáÊ°£Âä†ËΩΩÂíåÂ§ÑÁêÜÁöÑÂçïÂÖÉÊ†º„ÄÇ")
    elif len(documents) < batch_size:
        print(f"‚ö†Ô∏è ÊñáÊ°£ÊÄªÊï∞ ({len(documents)}) Â∞è‰∫éÊµãËØïÊâπÈáè ({batch_size}), Â∞Ü‰ΩøÁî®ÊâÄÊúâÂèØÁî®ÊñáÊ°£„ÄÇ")
        test_docs = documents
    else:
        test_docs = documents[:batch_size]
        print(f"üìã ÊµãËØïÊñáÊ°£Êï∞Èáè: {len(test_docs)}")

        # ÂÆö‰πâÊµãËØïÁî®ÁöÑÊï∞ÊçÆÂ∫ìË∑ØÂæÑ
        test_path = "./dataset/vector_storage_quick_test"
        
        # Â¶ÇÊûúÊóßÁöÑÊµãËØïË∑ØÂæÑÂ≠òÂú®ÔºåÂÖàÊ∏ÖÁêÜ
        if os.path.exists(test_path):
            try:
                shutil.rmtree(test_path)
                print(f"‚úÖ Â∑≤Ê∏ÖÁêÜÊóßÁöÑÊµãËØïÊï∞ÊçÆÂ∫ì: {test_path}")
            except Exception as e:
                print(f"‚ö†Ô∏è Ê∏ÖÁêÜÊóßÁöÑÊµãËØïÊï∞ÊçÆÂ∫ìÂ§±Ë¥• {test_path}: {e}")

        # ÂàùÂßãÂåñDeepLakeÂêëÈáèÂ≠òÂÇ® (Â∑≤ÁßªÈô§‰∏çÂÖºÂÆπÁöÑruntimeÂèÇÊï∞)
        vector_store_test = DeepLakeVectorStore(
            dataset_path=test_path, 
            overwrite=True  # Á°Æ‰øùÊØèÊ¨°ÊµãËØïÈÉΩÊòØÂÖ®Êñ∞ÁöÑ
        )
        storage_context_test = StorageContext.from_defaults(vector_store=vector_store_test)
        
        print("‚è≥ Ê≠£Âú®ÂêëÈáèÂåñÂπ∂Â≠òÂÇ®ÊûÅÂ∞èÊâπÈáè...")
        
        # ‰ªéÊñáÊ°£ÂàõÂª∫Á¥¢Âºï
        test_index = VectorStoreIndex.from_documents(
            test_docs, 
            storage_context_test, 
            show_progress=True
        )
        
        print(f"\n‚úÖ Âø´ÈÄüËØäÊñ≠ÊµãËØïÊàêÂäüÔºÅ")
        print(f"üéâ DeepLakeÊàêÂäüÂú® '{test_path}' ÂàõÂª∫Âπ∂Â≠òÂÇ®‰∫Ü {len(test_docs)} ‰∏™ÊñáÊ°£ÁöÑÂêëÈáè„ÄÇ")
        print("üí° ÊÇ®Áé∞Âú®ÂèØ‰ª•ÊÅ¢Â§çËøô‰∏™ÂçïÂÖÉÊ†ºÁöÑ‰ª£Á†ÅÔºåÂ¢ûÂä† 'batch_size' ÊàñÂ§ÑÁêÜÊâÄÊúâÊñáÊ°£ÔºåÁÑ∂ÂêéËøêË°åÂÆåÊï¥ÁöÑÂêëÈáèÂåñÊµÅÁ®ã„ÄÇ")

except Exception as e:
    import traceback
    print(f"\n‚ùå Âø´ÈÄüËØäÊñ≠ÊµãËØïÂ§±Ë¥•: {e}")
    print(f"   ÈîôËØØÁ±ªÂûã: {type(e)}")
    print("   Traceback:")
    traceback.print_exc()
    print("\nüí° Âª∫ËÆÆÔºö")
    print("   1. Ê£ÄÊü•ÈîôËØØ‰ø°ÊÅØÔºåÁâπÂà´ÊòØÂÖ≥‰∫éÁΩëÁªú„ÄÅÊùÉÈôêÊàñÁ£ÅÁõòÁ©∫Èó¥ÁöÑÈÉ®ÂàÜ„ÄÇ")
    print("   2. Â∞ùËØïÈáçÂêØJupyter NotebookÂÜÖÊ†∏ÔºàKernel -> Restart KernelÔºâ„ÄÇ")
    print("   3. Á°ÆËÆ§ 'documents' ÂèòÈáèÂ∑≤Ê≠£Á°ÆÂä†ËΩΩ„ÄÇ")


üöÄ ÂºÄÂßãÂø´ÈÄüËØäÊñ≠ÊµãËØï: DeepLakeÊú¨Âú∞Â≠òÂÇ®
üìã ÊµãËØïÊñáÊ°£Êï∞Èáè: 5




‚è≥ Ê≠£Âú®ÂêëÈáèÂåñÂπ∂Â≠òÂÇ®ÊûÅÂ∞èÊâπÈáè...


Parsing nodes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 952.04it/s]
Generating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:28<00:00,  5.80s/it]

Uploading data to deeplake dataset.



100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 45.44it/s]


Dataset(path='./dataset/vector_storage_quick_test', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (5, 1)      str     None   
 metadata     json      (5, 1)      str     None   
 embedding  embedding  (5, 1024)  float32   None   
    id        text      (5, 1)      str     None   

‚úÖ Âø´ÈÄüËØäÊñ≠ÊµãËØïÊàêÂäüÔºÅ
üéâ DeepLakeÊàêÂäüÂú® './dataset/vector_storage_quick_test' ÂàõÂª∫Âπ∂Â≠òÂÇ®‰∫Ü 5 ‰∏™ÊñáÊ°£ÁöÑÂêëÈáè„ÄÇ
üí° ÊÇ®Áé∞Âú®ÂèØ‰ª•ÊÅ¢Â§çËøô‰∏™ÂçïÂÖÉÊ†ºÁöÑ‰ª£Á†ÅÔºåÂ¢ûÂä† 'batch_size' ÊàñÂ§ÑÁêÜÊâÄÊúâÊñáÊ°£ÔºåÁÑ∂ÂêéËøêË°åÂÆåÊï¥ÁöÑÂêëÈáèÂåñÊµÅÁ®ã„ÄÇ


## ÊñáÊ°£ÂàÜÂâ≤‰ª£Á†ÅÊîπËøõÊÄªÁªì

### üîç **ÈóÆÈ¢òÂàÜÊûê**
ÂéüÂßã‰ª£Á†ÅÁº∫Â∞ëÂéüÂßãÊñáÁåÆÁöÑmetadata‰ø°ÊÅØÔºåÂØºËá¥RAGÁ≥ªÁªüÊó†Ê≥ïËøΩË∏™ÂíåËøîÂõûÊ∫êÊñáÁåÆ‰ø°ÊÅØ„ÄÇ

### ‚úÖ **ÊîπËøõÂÜÖÂÆπ**

#### 1. **ÂÆåÊï¥ÁöÑPDFÂÖÉÊï∞ÊçÆÊèêÂèñ**
- PDFÊ†áÈ¢ò„ÄÅ‰ΩúËÄÖ„ÄÅ‰∏ªÈ¢ò„ÄÅÂàõÂª∫ËÄÖÁ≠â
- Êñá‰ª∂ÂàõÂª∫/‰øÆÊîπÊó∂Èó¥
- Êñá‰ª∂Â§ßÂ∞èÁ≠âÊäÄÊúØ‰ø°ÊÅØ

#### 2. **ËØ¶ÁªÜÁöÑÊñáÊ°£Âùómetadata**
ÊØè‰∏™ÊñáÊ°£ÂùóÁé∞Âú®ÂåÖÂê´Ôºö
- **Ê∫êÊñá‰ª∂‰ø°ÊÅØ**: `source_file`, `source_path`, `file_type`
- **PDFÊñáÊ°£‰ø°ÊÅØ**: `pdf_title`, `pdf_author`, `pdf_subject`
- **ÂùóÁâπÂÆö‰ø°ÊÅØ**: `chunk_type`, `chunk_index`, `page_number`
- **ÂÜÖÂÆπÁªüËÆ°**: `word_count`, `char_count`, `chunk_size`
- **Â§ÑÁêÜ‰ø°ÊÅØ**: `processing_timestamp`, `content_hash`
- **ÂéüÂßãmetadata**: ‰øùÁïôPyPDFLoaderÁöÑÂéüÂßã‰ø°ÊÅØ

#### 3. **ÂàÜÁ±ªÂûãmetadata**
- **ÊñáÊú¨Âùó**: È°µÁ†Å„ÄÅÂùóÁ¥¢Âºï„ÄÅÂÜÖÂÆπÁªüËÆ°
- **Ë°®Ê†º**: Ë°®Ê†ºË°åÂàóÊï∞„ÄÅË°®Ê†ºÁ¥¢Âºï
- **ÂõæÂÉè**: ÂõæÂÉèÂ∞∫ÂØ∏„ÄÅÊ†ºÂºè„ÄÅ‰øùÂ≠òË∑ØÂæÑ

#### 4. **Ê∫êÊñáÁåÆËøΩË∏™ÂäüËÉΩ**
- `get_source_citation()`: Ê†πÊçÆmetadataÁîüÊàêÊ†áÂáÜÂºïÁî®Ê†ºÂºè
- `enhanced_query_with_sources()`: Êü•ËØ¢Êó∂ËøîÂõûÂÆåÊï¥Êù•Ê∫ê‰ø°ÊÅØ

### üéØ **‰ΩøÁî®ÊïàÊûú**

Áé∞Âú®RAGÁ≥ªÁªüÂèØ‰ª•Ôºö
1. **ÂáÜÁ°ÆËøΩË∏™Êù•Ê∫ê**: ËøîÂõûÂÖ∑‰ΩìÁöÑÊñá‰ª∂Âêç„ÄÅÈ°µÁ†Å„ÄÅÊÆµËêΩ‰ΩçÁΩÆ
2. **ÁîüÊàêÊ†áÂáÜÂºïÁî®**: Ëá™Âä®Ê†ºÂºèÂåñ‰ΩúËÄÖ„ÄÅÊ†áÈ¢ò„ÄÅÈ°µÁ†Å‰ø°ÊÅØ
3. **Âå∫ÂàÜÂÜÖÂÆπÁ±ªÂûã**: Ê†áËØÜÊñáÊú¨„ÄÅË°®Ê†º„ÄÅÂõæÂÉèÊù•Ê∫ê
4. **Êèê‰æõÂèØ‰ø°Â∫¶**: ÈÄöËøáÁõ∏‰ººÂ∫¶ÂàÜÊï∞ËØÑ‰º∞Êù•Ê∫êÂèØÈù†ÊÄß

### üìÅ **Êñ∞ÁöÑÂêëÈáèÂ≠òÂÇ®Ë∑ØÂæÑ**
‰ΩøÁî® `./dataset/vector_storage_with_metadata` Â≠òÂÇ®ÂåÖÂê´ÂÆåÊï¥metadataÁöÑÂêëÈáèÊï∞ÊçÆÂ∫ìÔºåÁ°Æ‰øùRAGÁ≥ªÁªüËÉΩÂ§üËøîÂõûËØ¶ÁªÜÁöÑÊ∫êÊñáÁåÆ‰ø°ÊÅØ„ÄÇ

### üîÑ **‰∏ã‰∏ÄÊ≠•**
ËøêË°åÊñ∞ÁöÑÊñáÊ°£ÂàÜÂâ≤‰ª£Á†ÅÔºåÈáçÊñ∞ÂàõÂª∫ÂêëÈáèÊï∞ÊçÆÂ∫ìÔºåÂç≥ÂèØÂú®RAGÊü•ËØ¢‰∏≠Ëé∑ÂæóÂÆåÊï¥ÁöÑÊ∫êÊñáÁåÆËøΩË∏™ÂäüËÉΩ„ÄÇ


In [17]:
print(llm_extraction.complete("Summarise the methods to improve efficiency of solar cells."))

Improving the efficiency of solar cells involves several methods aimed at enhancing the conversion of sunlight into electricity. Here's a summary of key strategies:

1. **Material Improvements**:
   - **High-efficiency semiconductors**: Use of materials with a high absorption coefficient and direct bandgap, such as gallium arsenide (GaAs) or perovskites.
   - **Multiple-junction cells**: Stack multiple cells with different bandgaps to absorb a broader range of the solar spectrum.

2. **Texturing and Antireflection Coatings**:
   - **Surface texturing**: Roughen the surface of the solar cell to increase light scattering and reduce reflection.
   - **Antireflection coatings**: Apply coatings that minimize light reflection, allowing more sunlight to be absorbed.

3. **Enhanced Light Absorption**:
   - **Light-trapping structures**: Use microlenses or 3D structures to focus light and trap it within the cell for better absorption.
   - **Dye-sensitized solar cells (DSCs)**: Incorporate dye 