## Setting up the environment

In [1]:
%load_ext autoreload
%autoreload 2

import os
from dotenv import load_dotenv
load_dotenv()

# Setup embedding model
from llama_index.core import Settings

# Use local embedding models served by LM Studio
# Use fake API key (LM Studio doesn't validate it)
from llama_index.embeddings.openai import OpenAIEmbedding
# embed_model = OpenAIEmbedding(
#     api_base = os.getenv("LM_STUDIO_API_BASE"),
#     api_key = "whatever-it-is",
#     model_name = os.getenv("LM_STUDIO_EMBED_MODEL"),
#     embed_batch_size = 2
# )

# # Embedding model verification
# Settings.embed_model = embed_model
# embed = embed_model.get_text_embedding("The quick brown fox jumps over the lazy dog.")
# print(embed[:5])  # Should print a list of floats

In [2]:
# Import the embedding solution
from embedding_solution import create_embedding_model

# Use raw string to avoid escape sequence warnings
model_path = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

print(f"Loading model from: {model_path}")

# Create the embedding model with fallback
try:
    embed_model = create_embedding_model(model_path, fallback=True)
    print(f"✓ Successfully created embedding model: {type(embed_model).__name__}")
    
    # Test the embedding
    test_text = "Silicon battery technology is promising."
    embedding = embed_model.get_text_embedding(test_text)
    print(f"✓ Test embedding generated successfully")
    print(f"Embedding dimension: {len(embedding)}")
    
    # Set as default for llama-index
    try:
        from llama_index.core import Settings
        Settings.embed_model = embed_model
        print("✓ Successfully set as default embedding model for llama-index")
        
        # Test llama-index integration
        test_embedding = Settings.embed_model.get_text_embedding("Test query")
        print(f"✓ Llama-index integration working")
        
    except ImportError:
        print("llama-index not available, but embedding model works")
    except Exception as e:
        print(f"Llama-index integration failed: {e}")
    
except Exception as e:
    print(f"✗ Failed to create embedding model: {e}")
    print("Trying fallback models...")
    
    try:
        embed_model = create_embedding_model(fallback=True)
        print(f"✓ Created fallback embedding model: {type(embed_model).__name__}")
        
        # Set as default for llama-index
        try:
            from llama_index.core import Settings
            Settings.embed_model = embed_model
            print("✓ Successfully set fallback model as default for llama-index")
        except ImportError:
            print("llama-index not available, but fallback embedding model works")
            
    except Exception as e2:
        print(f"✗ Even fallback failed: {e2}")
        embed_model = None

print("\n" + "="*50)
if embed_model:
    print("SUCCESS: Embedding model is ready to use!")
    print("You can now use llama-index with this embedding model")
else:
    print("FAILED: No working embedding model found")
    print("Please check your dependencies and model path")
print("="*50) 

✓ Llama-index embeddings available


  from .autonotebook import tqdm as notebook_tqdm


✓ Transformers version: 4.51.3
✗ Sentence-transformers not available
Loading model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
✓ Qwen3 model loaded successfully (Method 1)
✓ Successfully created embedding model: Qwen3Embedding
✓ Test embedding generated successfully
Embedding dimension: 1024
✓ Successfully set as default embedding model for llama-index
✓ Llama-index integration working

SUCCESS: Embedding model is ready to use!
You can now use llama-index with this embedding model


In [3]:
# Embedding model verification
Settings.embed_model = embed_model
embed = embed_model.get_text_embedding("The quick brown fox jumps over the lazy dog.")
print(embed[:5])  # Should print a list of floats

[2.673593044281006, -6.904200553894043, -0.5414437055587769, 1.5437066555023193, 0.8829537630081177]


In [4]:
# 使用智谱免费模型，提取节点关系
from llama_index.llms.zhipuai import ZhipuAI
ZHIPU_API_KEY = "41b29e65745d4110a018c5d616b0012f.A6CEwmornnYXSVLC"
ZHIPU_LLM_MODEL_NAME = "glm-4-flash"
llm_extraction = ZhipuAI(
    api_key=ZHIPU_API_KEY,
    model=ZHIPU_LLM_MODEL_NAME
)
Settings.llm = llm_extraction

# Verify the LLM
print(llm_extraction.complete("\nBriefly introduce yourself in 50 Chinese characters."))

我是人工智能助手，为您服务。


## Pipeline 1: Collecting & preparing the documents

In [None]:
# from llama_index.core import SimpleDirectoryReader

# # Load documents
# documents = SimpleDirectoryReader("../papers").load_data()
# print(documents[2])

Doc ID: 1447df38-7d1d-491b-9f4a-8cbd6a401164
Text: Deren Yang Editor Handbook of Photovoltaic Silicon With 578
Figures and 71 Tables


In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
from llama_index.core import Document

pdf_files = glob.glob("../zotero/*.pdf")
all_docs = []

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", "。", "！", "？", "!", "?"]
)

for pdf in pdf_files:
    loader = PyPDFLoader(pdf)
    documents = loader.load()
    docs = splitter.split_documents(documents)
    all_docs.extend(docs)

print(f"共加载分割 {len(all_docs)} 个文档块")
print(all_docs[0].page_content[:500])

# 转为 llama_index Document
documents = [Document(text=doc.page_content) for doc in all_docs]

共加载分割 13528 个文档块
Handbook of 
Photovoltaic 
Silicon
Deren Yang
Editor


In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
from llama_index.core import Document

import pdfplumber
import fitz  # PyMuPDF
import os
from datetime import datetime
import hashlib

import warnings
warnings.filterwarnings("ignore")

pdf_files = glob.glob("../zotero/*.pdf")
all_docs = []

# 分割器
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", "。", "！", "？", "!", "?"]
)

def extract_pdf_basic_info(pdf_path):
    """提取PDF基本信息作为metadata"""
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata
        doc.close()
        
        # 获取文件信息
        file_stats = os.stat(pdf_path)
        creation_time = datetime.fromtimestamp(file_stats.st_ctime).isoformat()
        modification_time = datetime.fromtimestamp(file_stats.st_mtime).isoformat()
        file_size = file_stats.st_size
        
        return {
            'title': metadata.get('title', ''),
            'author': metadata.get('author', ''),
            'subject': metadata.get('subject', ''),
            'creator': metadata.get('creator', ''),
            'producer': metadata.get('producer', ''),
            'creation_date': metadata.get('creationDate', ''),
            'modification_date': metadata.get('modDate', ''),
            'file_creation_time': creation_time,
            'file_modification_time': modification_time,
            'file_size_bytes': file_size
        }
    except Exception as e:
        print(f"Warning: Could not extract metadata from {pdf_path}: {e}")
        return {}

for pdf in pdf_files:
    file_name = os.path.basename(pdf)
    file_path = os.path.abspath(pdf)
    
    # 提取PDF基本信息
    pdf_metadata = extract_pdf_basic_info(pdf)
    
    # 1. 文本块提取（增强版：包含完整metadata）
    loader = PyPDFLoader(pdf)
    documents = loader.load()
    docs = splitter.split_documents(documents)
    
    for chunk_idx, doc in enumerate(docs):
        # 创建详细的metadata
        chunk_metadata = {
            # 源文件信息
            'source_file': file_name,
            'source_path': file_path,
            'file_type': 'pdf',
            
            # PDF文档信息
            'pdf_title': pdf_metadata.get('title', ''),
            'pdf_author': pdf_metadata.get('author', ''),
            'pdf_subject': pdf_metadata.get('subject', ''),
            'pdf_creator': pdf_metadata.get('creator', ''),
            'pdf_creation_date': pdf_metadata.get('creation_date', ''),
            
            # 文件系统信息
            'file_size_bytes': pdf_metadata.get('file_size_bytes', 0),
            'file_creation_time': pdf_metadata.get('file_creation_time', ''),
            'file_modification_time': pdf_metadata.get('file_modification_time', ''),
            
            # 块信息
            'chunk_type': 'text',
            'chunk_index': chunk_idx,
            'page_number': doc.metadata.get('page', 'unknown'),
            'chunk_size': len(doc.page_content),
            
            # 原始PyPDFLoader metadata
            'original_metadata': doc.metadata,
            
            # 处理时间戳
            'processing_timestamp': datetime.now().isoformat(),
            
            # 内容哈希（用于去重和版本控制）
            'content_hash': hashlib.md5(doc.page_content.encode()).hexdigest(),
            
            # 统计信息
            'word_count': len(doc.page_content.split()),
            'char_count': len(doc.page_content)
        }
        
        # 创建带metadata的Document
        enhanced_doc = Document(
            text=doc.page_content,
            metadata=chunk_metadata
        )
        all_docs.append(enhanced_doc)

    # 2. 表格提取（增强版：包含完整metadata）
    with pdfplumber.open(pdf) as pdf_doc:
        for page_num, page in enumerate(pdf_doc.pages):
            tables = page.extract_tables()
            for table_idx, table in enumerate(tables):
                # 转为结构化文本
                table_text = "\n".join([", ".join([str(cell) if cell is not None else "" for cell in row]) for row in table])
                
                # 表格标识信息
                table_header = f"【表格】文件:{file_name}, 页码:{page_num+1}, 表格序号:{table_idx+1}"
                full_table_text = f"{table_header}\n{table_text}"
                
                # 表格metadata
                table_metadata = {
                    # 源文件信息
                    'source_file': file_name,
                    'source_path': file_path,
                    'file_type': 'pdf',
                    
                    # PDF文档信息
                    'pdf_title': pdf_metadata.get('title', ''),
                    'pdf_author': pdf_metadata.get('author', ''),
                    'pdf_subject': pdf_metadata.get('subject', ''),
                    
                    # 表格特定信息
                    'chunk_type': 'table',
                    'page_number': page_num + 1,
                    'table_index': table_idx + 1,
                    'table_rows': len(table),
                    'table_cols': len(table[0]) if table else 0,
                    
                    # 处理信息
                    'processing_timestamp': datetime.now().isoformat(),
                    'content_hash': hashlib.md5(full_table_text.encode()).hexdigest(),
                    'word_count': len(full_table_text.split()),
                    'char_count': len(full_table_text)
                }
                
                table_doc = Document(
                    text=full_table_text,
                    metadata=table_metadata
                )
                all_docs.append(table_doc)

    # 3. 图像/图表提取（增强版：包含完整metadata）
    doc = fitz.open(pdf)
    for page_index in range(len(doc)):
        page = doc[page_index]
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            img_ext = base_image["ext"]
            img_name = f"{os.path.splitext(file_name)[0]}_page{page_index+1}_img{img_index+1}.{img_ext}"
            img_save_path = os.path.join("./extracted_images", img_name)
            os.makedirs("./extracted_images", exist_ok=True)
            
            with open(img_save_path, "wb") as f:
                f.write(image_bytes)
            
            # 图像描述文本
            image_description = f"【图表】文件:{file_name}, 页码:{page_index+1}, 图片序号:{img_index+1}, 路径:{img_save_path}"
            
            # 图像metadata
            image_metadata = {
                # 源文件信息
                'source_file': file_name,
                'source_path': file_path,
                'file_type': 'pdf',
                
                # PDF文档信息
                'pdf_title': pdf_metadata.get('title', ''),
                'pdf_author': pdf_metadata.get('author', ''),
                'pdf_subject': pdf_metadata.get('subject', ''),
                
                # 图像特定信息
                'chunk_type': 'image',
                'page_number': page_index + 1,
                'image_index': img_index + 1,
                'image_format': img_ext,
                'image_path': img_save_path,
                'image_size_bytes': len(image_bytes),
                
                # 图像技术信息
                'image_width': base_image.get('width', 0),
                'image_height': base_image.get('height', 0),
                'image_colorspace': base_image.get('colorspace', 'unknown'),
                
                # 处理信息
                'processing_timestamp': datetime.now().isoformat(),
                'content_hash': hashlib.md5(image_description.encode()).hexdigest(),
                'word_count': len(image_description.split()),
                'char_count': len(image_description)
            }
            
            image_doc = Document(
                text=image_description,
                metadata=image_metadata
            )
            all_docs.append(image_doc)
    
    doc.close()

print(f"共加载分割 {len(all_docs)} 个文档块（含文本、表格、图表信息）")
print(f"文档类型分布:")
chunk_types = {}
for doc in all_docs:
    chunk_type = doc.metadata.get('chunk_type', 'unknown')
    chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1

for chunk_type, count in chunk_types.items():
    print(f"  {chunk_type}: {count} 个")

print(f"\n示例文档块metadata:")
if all_docs:
    example_doc = all_docs[0]
    print(f"文本预览: {example_doc.text[:200]}...")
    print(f"Metadata keys: {list(example_doc.metadata.keys())}")
    print(f"源文件: {example_doc.metadata.get('source_file', 'N/A')}")
    print(f"块类型: {example_doc.metadata.get('chunk_type', 'N/A')}")
    print(f"页码: {example_doc.metadata.get('page_number', 'N/A')}")

# 后续与原 pipeline 保持一致
documents = all_docs

Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa3' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color becau

共加载分割 18727 个文档块（含文本、表格、图表信息）
文档类型分布:
  text: 13528 个
  table: 879 个
  image: 4320 个

示例文档块metadata:
文本预览: Handbook of 
Photovoltaic 
Silicon
Deren Yang
Editor...
Metadata keys: ['source_file', 'source_path', 'file_type', 'pdf_title', 'pdf_author', 'pdf_subject', 'pdf_creator', 'pdf_creation_date', 'file_size_bytes', 'file_creation_time', 'file_modification_time', 'chunk_type', 'chunk_index', 'page_number', 'chunk_size', 'original_metadata', 'processing_timestamp', 'content_hash', 'word_count', 'char_count']
源文件: 978-3-662-56472-1.pdf
块类型: text
页码: 0


## Pipeline 2: Creating vector store

In [3]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

# Path for vector store and dataset
vector_dataset = "./dataset/vector_storage_with_metadata" # local storage
vector_store_path = vector_dataset
dataset_path = vector_dataset

  import pkg_resources  # type: ignore


In [11]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

# Path for vector store and dataset
vector_dataset = "./dataset/vector_storage_with_metadata" # local storage
vector_store_path = vector_dataset
dataset_path = vector_dataset

# Create an index over the documents
# Overwrites the existing dataset if True
ow = True

if ow==True:
    try:
        vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=ow)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(documents, storage_context, show_progress=True)
    except Exception as e:
        print(f"An error occurred: {e}")
        print(f"Error type: {type(e)}")
        print(f"Error traceback: {e.__traceback__}")

Parsing nodes: 100%|██████████| 18727/18727 [00:21<00:00, 874.97it/s] 
Generating embeddings: 100%|██████████| 2048/2048 [3:25:09<00:00,  6.01s/it]  


Uploading data to deeplake dataset.


  0%|          | 0/2048 [00:00<?, ?it/s]

An error occurred: Error while attempting to rollback appends
Error type: <class 'Exception'>
Error traceback: <traceback object at 0x0000018467C793C0>





In [5]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. 配置路径 ---
# --- 1. Configuration ---

# 关键步骤：这里的模型路径必须和你同事生成向量时使用的模型完全一致！
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# 请根据你本地存放 embedding 模型的位置修改此路径。
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# 关键步骤：指向你从同事那里拷贝过来的向量数据库文件夹。
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. 主执行函数 ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # 步骤 1: 设置全局的 Embedding 模型
    # Step 1: Set up the global embedding model

    # 步骤 2: 加载 DeepLake 向量库
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ❌ CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # 以只读模式加载，这是一个安全的好习惯
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # 从已存在的向量库中重建索引
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ❌ ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # 步骤 3: 创建查询引擎并提问
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # 示例：返回最相似的5个结果
    print("-> Query engine is ready.")

    # --- 在这里输入你的问题 ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # 打印文件名元数据，如果存在的话
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # 打印部分文本内容
        print("-" * 20)


if __name__ == "__main__":
    main()


  import pkg_resources  # type: ignore


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
Deep Lake Dataset in ./dataset/vector_storage_for_colleague already exists, loading from the storage




-> Vector store loaded successfully.

Step 3: Creating query engine...
-> Query engine is ready.

--- Ready to Query! ---

Querying with: 'What are the applications of perovskite in solar cells?'

--- Response ---
Perovskite materials have several applications in solar cells, including enhancing the efficiency of photovoltaic devices, providing a cheaper alternative to traditional solar cell materials, and enabling the development of flexible and lightweight solar panels. They can be used to create solar cells with improved stability and higher light absorption capabilities, contributing to advancements in photovoltaic technology.

--- Source Nodes ---
  Source 1 (Score: 0.5236):
    File: 978-3-662-56472-1.pdf
    Text: WCPEC-3 Organizing Committee, (Osaka, 2003), p. 1112–1115
D. Karg, H. Chariﬁ, G. Pensl, M. Schulz, G. Hahn, in19th Europ. Photovoltaic Solar Energy Conf.:
Proc. of the Int. Conf. held in Paris, France...
--------------------
  Source 2 (Score: 0.5194):
    File: Zaidi 

In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. 配置路径 ---
# --- 1. Configuration ---

# 关键步骤：这里的模型路径必须和你同事生成向量时使用的模型完全一致！
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# 请根据你本地存放 embedding 模型的位置修改此路径。
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# 关键步骤：指向你从同事那里拷贝过来的向量数据库文件夹。
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. 主执行函数 ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # 步骤 1: 设置全局的 Embedding 模型
    # Step 1: Set up the global embedding model

    # 步骤 2: 加载 DeepLake 向量库
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ❌ CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # 以只读模式加载，这是一个安全的好习惯
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # 从已存在的向量库中重建索引
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ❌ ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # 步骤 3: 创建查询引擎并提问
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # 示例：返回最相似的5个结果
    print("-> Query engine is ready.")

    # --- 在这里输入你的问题 ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # 打印文件名元数据，如果存在的话
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # 打印部分文本内容
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ❌ ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. 配置路径 ---
# --- 1. Configuration ---

# 关键步骤：这里的模型路径必须和你同事生成向量时使用的模型完全一致！
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# 请根据你本地存放 embedding 模型的位置修改此路径。
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# 关键步骤：指向你从同事那里拷贝过来的向量数据库文件夹。
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. 主执行函数 ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # 步骤 1: 设置全局的 Embedding 模型
    # Step 1: Set up the global embedding model

    # 步骤 2: 加载 DeepLake 向量库
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ❌ CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # 以只读模式加载，这是一个安全的好习惯
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # 从已存在的向量库中重建索引
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ❌ ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # 步骤 3: 创建查询引擎并提问
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # 示例：返回最相似的5个结果
    print("-> Query engine is ready.")

    # --- 在这里输入你的问题 ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # 打印文件名元数据，如果存在的话
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # 打印部分文本内容
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ❌ ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. 配置路径 ---
# --- 1. Configuration ---

# 关键步骤：这里的模型路径必须和你同事生成向量时使用的模型完全一致！
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# 请根据你本地存放 embedding 模型的位置修改此路径。
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# 关键步骤：指向你从同事那里拷贝过来的向量数据库文件夹。
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. 主执行函数 ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # 步骤 1: 设置全局的 Embedding 模型
    # Step 1: Set up the global embedding model

    # 步骤 2: 加载 DeepLake 向量库
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ❌ CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # 以只读模式加载，这是一个安全的好习惯
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # 从已存在的向量库中重建索引
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ❌ ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # 步骤 3: 创建查询引擎并提问
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # 示例：返回最相似的5个结果
    print("-> Query engine is ready.")

    # --- 在这里输入你的问题 ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # 打印文件名元数据，如果存在的话
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # 打印部分文本内容
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ❌ ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. 配置路径 ---
# --- 1. Configuration ---

# 关键步骤：这里的模型路径必须和你同事生成向量时使用的模型完全一致！
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# 请根据你本地存放 embedding 模型的位置修改此路径。
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# 关键步骤：指向你从同事那里拷贝过来的向量数据库文件夹。
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. 主执行函数 ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # 步骤 1: 设置全局的 Embedding 模型
    # Step 1: Set up the global embedding model

    # 步骤 2: 加载 DeepLake 向量库
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ❌ CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # 以只读模式加载，这是一个安全的好习惯
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # 从已存在的向量库中重建索引
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ❌ ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # 步骤 3: 创建查询引擎并提问
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # 示例：返回最相似的5个结果
    print("-> Query engine is ready.")

    # --- 在这里输入你的问题 ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # 打印文件名元数据，如果存在的话
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # 打印部分文本内容
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ❌ ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. 配置路径 ---
# --- 1. Configuration ---

# 关键步骤：这里的模型路径必须和你同事生成向量时使用的模型完全一致！
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# 请根据你本地存放 embedding 模型的位置修改此路径。
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# 关键步骤：指向你从同事那里拷贝过来的向量数据库文件夹。
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. 主执行函数 ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # 步骤 1: 设置全局的 Embedding 模型
    # Step 1: Set up the global embedding model

    # 步骤 2: 加载 DeepLake 向量库
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ❌ CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # 以只读模式加载，这是一个安全的好习惯
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # 从已存在的向量库中重建索引
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ❌ ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # 步骤 3: 创建查询引擎并提问
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # 示例：返回最相似的5个结果
    print("-> Query engine is ready.")

    # --- 在这里输入你的问题 ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # 打印文件名元数据，如果存在的话
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # 打印部分文本内容
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ❌ ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [None]:
# -*- coding: utf-8 -*-
"""
This script loads a pre-existing DeepLake vector store and prepares it for querying.
It assumes you have already run the 'run_vectorization.py' script and have the
vector store folder available.
"""

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
import warnings

warnings.filterwarnings("ignore")

# --- 1. 配置路径 ---
# --- 1. Configuration ---

# 关键步骤：这里的模型路径必须和你同事生成向量时使用的模型完全一致！
# CRITICAL STEP: The model path here MUST be identical to the one your colleague used for vectorization!
# 请根据你本地存放 embedding 模型的位置修改此路径。
# Please update this path to where you have stored the embedding model locally.
EMBEDDING_MODEL_PATH = r"F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B"

# 关键步骤：指向你从同事那里拷贝过来的向量数据库文件夹。
# CRITICAL STEP: Point this to the vector store folder you copied from your colleague.
VECTOR_STORE_PATH = "./dataset/vector_storage_for_colleague"


# --- 2. 主执行函数 ---
# --- 2. Main Execution Function ---

def main():
    """
    Loads the DeepLake index and runs a sample query.
    """
    print("--- Starting Query Process ---")

    # 步骤 1: 设置全局的 Embedding 模型
    # Step 1: Set up the global embedding model

    # 步骤 2: 加载 DeepLake 向量库
    # Step 2: Load the DeepLake vector store
    print(f"\nStep 2: Loading vector store from: {VECTOR_STORE_PATH}")
    if not os.path.exists(VECTOR_STORE_PATH):
        print(f"--- ❌ CRITICAL ERROR ---")
        print(f"Vector store not found at: {VECTOR_STORE_PATH}")
        print("Please ensure you have copied the folder correctly.")
        return

    try:
        # 以只读模式加载，这是一个安全的好习惯
        # Loading in read-only mode is a safe and good practice
        vector_store = DeepLakeVectorStore(
            dataset_path=VECTOR_STORE_PATH,
            read_only=True
        )

        # 从已存在的向量库中重建索引
        # Reconstruct the index from the existing vector store
        index = VectorStoreIndex.from_vector_store(vector_store)
        print("-> Vector store loaded successfully.")

    except Exception as e:
        print(f"--- ❌ ERROR ---")
        print(f"An error occurred while loading the vector store: {e}")
        return

    # 步骤 3: 创建查询引擎并提问
    # Step 3: Create a query engine and ask a question
    print("\nStep 3: Creating query engine...")
    query_engine = index.as_query_engine(similarity_top_k=5) # 示例：返回最相似的5个结果
    print("-> Query engine is ready.")

    # --- 在这里输入你的问题 ---
    # --- Enter your query here ---
    print("\n--- Ready to Query! ---")
    query_text = "What are the applications of perovskite in solar cells?"

    print(f"\nQuerying with: '{query_text}'")
    response = query_engine.query(query_text)

    print("\n--- Response ---")
    print(str(response))

    print("\n--- Source Nodes ---")
    for i, node in enumerate(response.source_nodes):
        print(f"  Source {i+1} (Score: {node.score:.4f}):")
        # 打印文件名元数据，如果存在的话
        # Print the source file metadata if it exists
        if 'source_file' in node.metadata:
            print(f"    File: {node.metadata['source_file']}")
        print(f"    Text: {node.get_content()[:200]}...") # 打印部分文本内容
        print("-" * 20)


if __name__ == "__main__":
    main()


--- Starting Query Process ---

Step 2: Loading vector store from: ./dataset/vector_storage_for_colleague
--- ❌ ERROR ---
An error occurred while loading the vector store: Please use a url that points to an existing Deep Lake Dataset or an empty folder. If you wish to delete the folder and its contents, you may run deeplake.delete(dataset_path, force=True).


In [12]:
import deeplake
import pandas as pd
import numpy as np

# Create a dictionary to hold the data
data = {}

# Load vector store data
ds = deeplake.load(dataset_path)
ds.summary()

# Iterate through the tensors in the dataset
for tensor_name in ds.tensors:
    tensor_data = ds[tensor_name].numpy()

    # Check if the tensor is multi-dimensional
    if tensor_data.ndim > 1:
        # Flatten multi-dimensional tensors
        data[tensor_name] = [np.array(e).flatten().tolist() for e in tensor_data]
    else:
        # Convert 1D tensors directly to lists and decode text
        if tensor_name == "text":
            data[tensor_name] = [t.tobytes().decode('utf-8') if t else "" for t in tensor_data]
        else:
            data[tensor_name] = tensor_data.tolist()

# Create a Pandas DataFrame from the dictionary
df = pd.DataFrame(data)

./dataset/vector_storage_new loaded successfully.





Dataset(path='./dataset/vector_storage_new', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype        shape       dtype  compression
  -------    -------      -------     -------  ------- 
 embedding  embedding  (18734, 1024)  float32   None   
    id        text      (18734, 1)      str     None   
 metadata     json      (18734, 1)      str     None   
   text       text      (18734, 1)      str     None   


In [13]:
# Function to display a selected record
def display_record(record_number):
    record = df.iloc[record_number]
    display_data = {
        "ID": record.get("id", "N/A"),
        "Metadata": record.get("metadata", "N/A"),
        "Text": record.get("text", "N/A"),
        "Embedding": record.get("embedding", "N/A")
    }

    # Print the ID
    print("ID:")
    print(display_data["ID"])
    print()

    # Print the metadata in a structured format
    print("Metadata:")
    metadata = display_data["Metadata"]
    if isinstance(metadata, list):
        for item in metadata:
            for key, value in item.items():
                print(f"{key}: {value}")
            print()
    else:
        print(metadata)
    print()

    # Print the text
    print("Text:")
    print(display_data["Text"])
    print()

    # Print the embedding
    print("Embedding:")
    print(display_data["Embedding"])
    print()

# Example usage
rec = 7  # Replace with the desired record number
display_record(rec)

ID:
['ade4e164-2d97-484f-80f1-b2bb916e2f18']

Metadata:
_node_content: {"id_": "ade4e164-2d97-484f-80f1-b2bb916e2f18", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "b8ec8bfa-a4c3-4f6b-a005-02a743532be5", "node_type": "4", "metadata": {}, "hash": "ba1ef0fa43ec605c97db616ab531061cb885f1e3d5e4392a6a47a5e3b5b8ff85", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Chapin et al., at the Bell Labs in the USA, invented the\ufb01rst solar cell with an\nef\ufb01ciency of about 6%. Since then, research and application of modern photovoltaic\nsolar cells have been booming. Solar cells have been mounted on satellites, space\nstations, remote prairies, mountains, and islands to offer off-grid electricity, and on\nthe roofs of houses, apartments, and public buildings to generate in-grid electricity.\nIn most cases, solar cells have been installe

# Pipeline 3: Traditional RAG
### LlamaIndex (QueryEngine)

In [2]:
import os
os.chdir(r"F:\Intern\EDF\RAG-pR-main")

In [3]:
from domain_aware_rag import DomainAwareRAG

# 初始化完整系统
rag_lama = DomainAwareRAG(vector_store_path="RAG-Wikipedia/dataset/vector_storage_new")

# 自定义查询参数
result = rag_lama.query(
    user_query="How to improve the efficiency of silicon solar cells?",
    use_query_expansion=True,    # 启用查询扩展
    use_query_rewriting=True,    # 启用查询重写
    top_k=5                      # 检索文档数量
)

# 查看详细结果
print(f"原始查询: {result['original_query']}")
print(f"扩展查询: {result['expanded_query']}")
print(f"最终查询: {result['final_query']}")
print(f"答案: {result['answer']}")

  import pkg_resources  # type: ignore


✓ Llama-index embeddings available


  from .autonotebook import tqdm as notebook_tqdm


✓ Transformers version: 4.51.3
✗ Sentence-transformers not available
Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
✓ Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
✓ 检索器初始化完成
✓ 领域词典加载完成，共 1211 个术语
✓ LLM初始化完成: glm-4-flash
✓ 向量存储路径: RAG-Wikipedia/dataset/vector_storage_new

🔍 处理查询: How to improve the efficiency of silicon solar cells?
🌐 翻译后英文查询: How to improve the efficiency of silicon solar cells?
📚 提取的领域术语: ['Solar Cell', 'Efficiency', 'Silicon Solar Cell']
🔍 查询扩展: How to improve the efficiency of silicon solar cells? -> How to improve the efficiency of silicon solar cells? Component silicon solar cell solar cells efﬁciencies Silicon Solar Cell Solar Cell cell solar cells silicon Technology Efficiency
🔄 查询重写: How to improve the efficiency of silicon solar cells? Component silicon solar cell solar cells efﬁciencies Silicon Solar Cell Solar Cell cell solar cel

### LangChain (RetrievalQA)

In [4]:
from langchain_retrieval_qa import LangChainDomainRAG

rag_langchain = LangChainDomainRAG(vector_store_path="RAG-Wikipedia/dataset/vector_storage_new")
result = rag_langchain.query(
"How to improve the efficiency of solar cells?",
use_query_expansion=True,
use_query_rewriting=True,
top_k=5 
)

# 查看详细结果
print(f"原始查询: {result['original_query']}")
print(f"扩展查询: {result['expanded_query']}")
print(f"最终查询: {result['final_query']}")
print(f"答案: {result['answer']}")

Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
✓ Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
✓ LlamaIndex索引初始化完成
✓ LangChain组件初始化完成
✓ 领域词典加载完成，共 1211 个术语
✓ LangChain RetrievalQA初始化完成
✓ 向量存储路径: RAG-Wikipedia/dataset/vector_storage_new

🔍 处理查询: How to improve the efficiency of solar cells?
🌐 翻译后英文查询: How to improve the efficiency of solar cells?
📚 提取的领域术语: ['Solar Cell', 'Efficiency']
🔍 查询扩展: How to improve the efficiency of solar cells? -> How to improve the efficiency of solar cells? solar cells efﬁciencies Efficiency cells sol cells solar cell Solar Cell solar cell solar cell
🔄 查询重写: How to improve the efficiency of solar cells? solar cells efﬁciencies Efficiency cells sol cells solar cell Solar Cell solar cell solar cell -> How to enhance the efficiency of photovoltaic solar cells?
🔍 使用LangChain RetrievalQA查询: How to enhance the efficiency of photovoltai

### Evaluation

In [4]:
from evaluation_tools import RAGEvaluator
from domain_aware_rag import DomainAwareRAG
from langchain_retrieval_qa import LangChainDomainRAG

evaluator = RAGEvaluator(llm_api_key="41b29e65745d4110a018c5d616b0012f.A6CEwmornnYXSVLC")
rag_lama = DomainAwareRAG(vector_store_path="RAG-Wikipedia/dataset/vector_storage_new")
results_lama = evaluator.evaluate_rag_system(rag_lama)

rag_langchain = LangChainDomainRAG(vector_store_path="RAG-Wikipedia/dataset/vector_storage_new")
results_langchain = evaluator.evaluate_rag_system(rag_langchain)

print("=== LlamaIndex QueryEngine 评估报告 ===")
print(evaluator.generate_evaluation_report(results_lama))

print("=== LangChain RetrievalQA 评估报告 ===")
print(evaluator.generate_evaluation_report(results_langchain))

  import pkg_resources  # type: ignore


✓ Llama-index embeddings available


  from .autonotebook import tqdm as notebook_tqdm


✓ Transformers version: 4.51.3
✗ Sentence-transformers not available
Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
✓ Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
✓ 检索器初始化完成
✓ 领域词典加载完成，共 1211 个术语
✓ LLM初始化完成: glm-4-flash
✓ 向量存储路径: RAG-Wikipedia/dataset/vector_storage_new
🔧 生成测试用例...


  response = self.qa_chain.run({


LLM原始输出： 1. 效率相关问题：
   - **问题**：在2019年，高效硅电池的实验室转换效率达到了22%，请问这一效率在当时全球范围内属于什么水平，与之相比，目前的硅电池转换效率有怎样的提升？
   - **答案**：在2019年，实验室转换效率达到22%的硅电池属于当时全球领先水平，远高于当时的行业平均效率。目前，随着技术的进步，实验室转换效率已经超过了25%，而在实际应用的硅电池中，效率也普遍超过了20%，相比2019年有了显著的提升。

2. 制造工艺问题：
   - **问题**：晶体硅电池的制造工艺中，传统的丝网印刷工艺与最新的激光打印技术相比，在电池效率上有哪些差异？具体的数据能说明这一点吗？
   - **答案**：激光打印技术在电池效率上通常优于丝网印刷工艺。例如，使用激光打印技术可以在硅电池上实现更精细的图案和更低的电阻，从而提高电池的填充因子。实验数据显示，激光打印技术可以使电池效率提高约1-2%，达到22%以上，而传统的丝网印刷工艺的电池效率通常在20%左右。

3. 材料科学问题：
   - **问题**：在硅电池中引入掺杂剂如硼或磷，这些掺杂剂对电池的转换效率有何影响？能否给出具体影响程度的数据？
   - **答案**：掺杂剂如硼和磷能够调节硅电池中的电荷载流子浓度，从而影响电池的转换效率。硼掺杂可以增加电子浓度，而磷掺杂可以增加空穴浓度。研究表明，适当掺杂可以提升电池效率约0.5-1%。例如，硼掺杂的硅电池效率可以提升至20.5%，而磷掺杂的电池效率可以提升至20.2%。

4. 技术发展趋势问题：
   - **问题**：近年来，多晶硅电池与单晶硅电池在效率提升方面有何不同的发展趋势？能否预测未来几年这两种电池的效率提升空间？
   - **答案**：多晶硅电池和单晶硅电池在效率提升方面有着不同的趋势。多晶硅电池由于制造工艺相对简单，其效率提升速度较慢，但成本较低。单晶硅电池则由于更高的纯度和更好的晶体结构，效率提升速度更快。预计未来几年，多晶硅电池的转换效率有望提升至22-23%，而单晶硅电池的转换效率有望提升至25%以上。

5. 应用场景问题：
   - **问题**：在光伏发电领域，硅电池主要用于地面电站和分布式光伏系统。考虑到这些应用场景的特定需求，硅电池在效率、成本和可靠性方面应如何平衡？
   - **答案*

In [5]:
from evaluation_tools import RAGEvaluator
evaluator = RAGEvaluator(llm_api_key="41b29e65745d4110a018c5d616b0012f.A6CEwmornnYXSVLC")

# 对比 LlamaIndex RAG vs LLM直答
results_vs_llm_lama = evaluator.evaluate_rag_vs_llm(rag_lama)
evaluator.print_rag_vs_llm_report(results_vs_llm_lama)

results_vs_llm_langchain = evaluator.evaluate_rag_vs_llm(rag_langchain)
evaluator.print_rag_vs_llm_report(results_vs_llm_langchain)

🔧 生成测试用例...
LLM原始输出： 1. **效率相关问题**
   - **问题**：硅电池的效率在过去的十年中平均提高了多少？这种提高主要归因于哪些技术改进？
   - **答案**：在过去的十年中，硅电池的效率平均提高了约20%。这种提高主要归因于以下几点技术改进：首先，单晶硅和多晶硅电池的制程优化，例如使用更先进的蚀刻和沉积技术；其次，电池结构的创新，如使用纳米结构、 textured surfaces 和 anti-reflective coatings 来减少反射并增加吸收；最后，通过掺杂和电池设计优化，提高了电子的传输效率和减少能量损失。

2. **制造工艺问题**
   - **问题**：在硅电池制造中，离子注入技术如何提高电池的效率？
   - **答案**：离子注入技术通过向硅晶体中注入掺杂原子，如硼或磷，可以精确控制掺杂浓度和分布，从而优化电池的能带结构和载流子传输。这种技术在N型硅电池中特别有效，可以减少串联电阻，提高电池的短路电流，从而在25摄氏度时将电池效率提高约0.3%。

3. **材料科学问题**
   - **问题**：为什么使用钝化层对硅电池的性能至关重要，以及它如何影响开路电压？
   - **答案**：钝化层对硅电池性能至关重要，因为它可以防止硅表面与周围介质发生反应，减少表面复合，提高开路电压。通过使用高介电常数材料如SiO2或SiNx作为钝化层，可以显著提高电池的开路电压，通常在0.1-0.2V的范围内，从而提高整体电池效率。

4. **技术发展趋势问题**
   - **问题**：目前硅电池领域有哪些新兴技术可能会在未来五年内对效率提升产生显著影响？
   - **答案**：在未来五年内，以下新兴技术可能会对硅电池效率提升产生显著影响：
     - **钙钛矿/硅异质结**：结合钙钛矿太阳能电池的高效率和硅电池的稳定性，有望实现超过20%的转换效率。
     - **叠层电池**：通过将不同光谱响应的电池层叠，可以更有效地利用太阳光，提高整体效率。
     - **量子点太阳能电池**：利用量子点材料的光吸收特性，可以实现更宽的光谱响应范围，从而提高电池效率。

5. **应用场景问题**
   - **问题**：硅电池在大型太阳能发电场和便携式电子设备中的应用各有何特点？
   - **答案**：硅电

## Pipeline 4: Knowledge graph RAG

In [5]:
from enhanced_integrated_rag_system import EnhancedIntegratedRAGSystem

# 初始化增强系统
rag_system = EnhancedIntegratedRAGSystem()

# 执行混合检索
result = rag_system.query("如何提高太阳能电池的效率？")

# 比较不同方法
comparison = rag_system.compare_methods("如何提高太阳能电池的效率？")

# # 生成可视化
# viz_path = rag_system.visualize_graph("material_optimizations")

  import pkg_resources  # type: ignore


Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
✓ Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
✓ LlamaIndex索引初始化完成
✓ LangChain组件初始化完成
✓ 领域词典加载完成，共 1211 个术语
✓ LangChain RetrievalQA初始化完成
✓ 向量存储路径: RAG-Wikipedia/dataset/vector_storage_new
✓ 增强混合检索器初始化完成
✓ 增强集成RAG系统初始化完成

🔍 增强RAG查询: 如何提高太阳能电池的效率？

🔍 混合检索查询: 如何提高太阳能电池的效率？
📚 执行LangChain向量检索...

🔍 处理查询: 如何提高太阳能电池的效率？
🌐 翻译后英文查询: How to improve the efficiency of solar cells?
📚 提取的领域术语: ['Efficiency', 'Solar Cell']
🔍 查询扩展: How to improve the efficiency of solar cells? -> How to improve the efficiency of solar cells? solar cell solar cell solar efﬁciencies Solar Cell solar cells Efficiency cells cell sol cells
🔄 查询重写: How to improve the efficiency of solar cells? solar cell solar cell solar efﬁciencies Solar Cell solar cells Efficiency cells cell sol cells -> How to enhance Efficiency (Metric) of Solar Cells (Component

In [10]:
from enhanced_integrated_rag_system import EnhancedIntegratedRAGSystem

# 初始化系统
rag_system = EnhancedIntegratedRAGSystem()

# 执行查询（包含图谱可视化）
result = rag_system.query(
    "如何提高太阳能电池的效率？",
    vector_weight=0.6,
    graph_weight=0.4,

    graph_limit=50,
    use_query_expansion=True,
    use_query_rewriting=True,
    top_k=100,
    prompt_type="qa",
    use_structured_query=False,

    include_graph_visualization=True,
    include_explanation=True,
)

# 查看结果
print(result['hybrid_results']['final_answer'])  # 最终答案
print(result['graph_entities'])  # 图谱实体
# print(result['graph_visualization_path'])  # 可视化图片路径

Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
✓ Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
✓ LlamaIndex索引初始化完成
✓ LangChain组件初始化完成
✓ 领域词典加载完成，共 1211 个术语
✓ LangChain RetrievalQA初始化完成
✓ 向量存储路径: RAG-Wikipedia/dataset/vector_storage_new
✓ 增强混合检索器初始化完成
✓ 增强集成RAG系统初始化完成

🔍 增强RAG查询: 如何提高太阳能电池的效率？

🔍 混合检索查询: 如何提高太阳能电池的效率？
📚 执行LangChain向量检索...

🔍 处理查询: 如何提高太阳能电池的效率？
🌐 翻译后英文查询: How to improve the efficiency of solar cells?
📚 提取的领域术语: ['Solar Cell', 'Efficiency']
🔍 查询扩展: How to improve the efficiency of solar cells? -> How to improve the efficiency of solar cells? solar cells cells Efficiency cell Solar Cell solar cell solar solar cell sol cells efﬁciencies
🔄 查询重写: How to improve the efficiency of solar cells? solar cells cells Efficiency cell Solar Cell solar cell solar solar cell sol cells efﬁciencies -> How to enhance the efficiency of solar cells?
🔍 使用LangChain 

  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=300, bbox_inches='tight')
  plt.savefig(output_path, dpi=

🎨 图谱可视化已保存: graph_visualization_1754561018.png
🕸️ 图谱查询发现 50 个实体
为了提高太阳能电池的效率，可以根据文档中提供的信息和该领域的当前研究状况采取几种策略。以下是提供文本中提到或暗示的一些方法：

1. **材料开发**：引入新型材料，如有机光伏（OPV）中的非富勒烯受体，以及探索新的吸收剂、传输和接触材料，可以显著提高光伏器件的效率。开发具有改进的光学和电学性质的新材料可以提高效率和稳定性[1]。

2. **先进的纳米光子设计**：利用先进的纳米光子设计可以改变太阳能谱，使其更有效地被太阳能电池转化为电能。这可能涉及使用等离子体和衍射纳米结构来增强光吸收和转换[2]。

3. **多结太阳能电池**：通过实施多结太阳能电池或串并联太阳能电池，可以利用具有不同半导体结的多层结构，优化不同部分的太阳能谱，从而增加太阳能电池可以转化为电能的波长范围[2]。

4. **中间带太阳能电池**：这些电池可以吸收更广泛的太阳能谱，通过利用更多阳光的能量来提高效率[2]。

5. **高效的表面钝化**：使用如高效表面钝化等技术可以减少太阳能电池的损失并提高阳光转化为电能的转换效率[2]。

6. **纳米结构径向结硅太阳能电池**：这些具有多带隙径向结的电池的开发具有提高效率和可扩展性的潜力。这种纳米结构径向结的大规模未开发潜力表明，有进一步性能提升的空间，并有可能在未来成为革命性技术[2]。

7. **结构优化**：提高太阳能电池结构以捕获更多入射光至关重要。可以采用如颗粒活性层、表面纹理、纳米孔图案化和下转换粒子等技术来优化太阳能电池结构[1]。

这些策略旨在通过捕获更广泛的太阳能谱、减少能量损失和改善材料特性来最大化太阳能电池的转换效率。然而，需要注意的是，这些方法的详细情况，特别是涉及新材料和技术的方法，可能没有在提供的文本中得到充分体现，并且在这些领域可能正在进行进一步的研究。

📚 知识来源信息：
- 检索到 100 个相关文档片段
- 来源 1: efficiency, and because they can be prepared and processed under 
mild conditions. These incentives have triggered academic inte

In [3]:
from enhanced_integrated_rag_system_v2 import EnhancedIntegratedRAGSystemV2

# 初始化系统
rag_system = EnhancedIntegratedRAGSystemV2()

# 执行查询（确保所有功能都启用）
result = rag_system.query(
    "如何提高太阳能电池的效率？",
    # 确保所有功能都启用
    include_citations=True,
    include_figures=True,
    include_graph_insights=True,
    include_graph_visualization=True,
    include_explanation=True,
    include_source_tracking=True,
    # 检索参数
    top_k=100,
    graph_limit=100,
    # 权重设置
    vector_weight=0.6,
    graph_weight=0.4,
    # 查询优化
    use_query_expansion=True,
    use_query_rewriting=True,
    use_structured_query=False,
    prompt_type="qa"
)

# 查看结果
print(f"执行时间: {result['total_execution_time']:.2f}秒")
print(f"增强答案长度: {len(result['enhanced_answer'])} 字符")

# 查看来源追踪信息
if result.get('source_tracking_info'):
    tracking_info = result['source_tracking_info']
    print(f"检索到的chunks数量: {tracking_info['total_chunks_retrieved']}")
    print(f"唯一来源数量: {len(tracking_info['unique_sources'])}")

print("📄 增强答案:")
print(result['enhanced_answer'])

print("\n�� 详细信息:")
print(f"文献引用: {len(result.get('citations', []))} 个")
print(f"图表信息: {len(result.get('figures', []))} 个")
print(f"图谱实体: {len(result.get('graph_entities', []))} 个")

  import pkg_resources  # type: ignore


✓ Llama-index embeddings available


  from .autonotebook import tqdm as notebook_tqdm


✓ Transformers version: 4.51.3
✗ Sentence-transformers not available
Loading Qwen3 model from: F:\Intern\EDF\EmbeddingModels\Qwen3-Embedding-0.6B
✓ Qwen3 model loaded successfully (Method 1)
Deep Lake Dataset in RAG-Wikipedia/dataset/vector_storage_new already exists, loading from the storage
✓ LlamaIndex索引初始化完成
✓ LangChain组件初始化完成
✓ 领域词典加载完成，共 1211 个术语
✓ LangChain RetrievalQA初始化完成
✓ 向量存储路径: RAG-Wikipedia/dataset/vector_storage_new
✓ 增强混合检索器初始化完成
✓ 增强集成RAG系统 V2 初始化完成

🔍 增强RAG查询 V2: 如何提高太阳能电池的效率？

🔍 混合检索查询: 如何提高太阳能电池的效率？
📚 执行LangChain向量检索...

🔍 处理查询: 如何提高太阳能电池的效率？
🌐 翻译后英文查询: How to improve the efficiency of solar cells?
📚 提取的领域术语: ['Solar Cell', 'Efficiency']
🔍 查询扩展: How to improve the efficiency of solar cells? -> How to improve the efficiency of solar cells? efﬁciencies cell Solar Cell cells solar cell solar Efficiency solar cells sol cells solar cell
🔄 查询重写: How to improve the efficiency of solar cells? efﬁciencies cell Solar Cell cells solar cell solar Efficiency solar cells sol cell

  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')
  plt.savefig(filename, dpi=300, bbox_inches='tight')


✅ 图谱可视化已保存: graph_visualization_1754639968.png
🕸️ 图谱查询发现 100 个实体
执行时间: 80.11秒
增强答案长度: 4908 字符
检索到的chunks数量: 100
唯一来源数量: 0
📄 增强答案:
提高太阳能电池的效率涉及各种策略，其中一些在提供的文档中有详细说明：

1. **先进材料研究**：正如文档中强调的，在材料研究方面取得了最新进展，这些进展集中在吸收剂、传输材料和接触材料上，这些材料可以改善光伏（PV）设备的高效性、稳定性和成本。[1] 这包括基于HaP的光伏电池的进步以及有机光伏（OPV）中非富勒烯受体的应用。[1]

2. **太阳能电池结构的优化**：文档提到，提高太阳能电池性能的一个重大问题是优化其结构以捕获更多的入射光。研究了许多技术，如等离子体和衍射纳米结构、向下转换粒子、表面纹理化和纳米孔图案化，以实现这一目标。[1] 例如，引入颗粒活性层可能有助于提高光吸收。[1]

3. **多结太阳能电池**：文档讨论了多结太阳能电池（也称为串联太阳能电池）的概念，作为增加用于转换成电能的波长范围的方法。[1] 这些电池由多层结构组成，每一层都是具有不同能级的半导体结，从而允许更广泛的光谱被转换成电能。[1]

4. **中间带太阳能电池**：文档中提到的另一种方法是中间带太阳能电池，它重新塑造太阳能谱，以便太阳能电池可以更有效地将其转换成电能。[1]

5. **纳米结构径向结硅太阳能电池**：文档强调，纳米结构径向结硅太阳能电池具有很大的未开发潜力。这些电池可以显著提高性能，并最终可能成为革命性的主流光伏技术。[1]

6. **技术创新生态系统**：建立技术创新生态系统对于进一步发展至关重要。这包括实施新的理解和技术，包括先进的纳米光子设计、高效的表面钝化和多带隙径向结。[1]

总之，提高太阳能电池的效率涉及结合先进的材料研究、优化太阳能电池结构、开发新的电池配置，如多结和中间带太阳能电池，以及利用新型技术，如纳米结构径向结的潜力。这些方法共同旨在提高太阳能转换系统的效率、稳定性和成本效益。

📚 知识来源信息：
- 检索到 100 个相关文档片段
- 来源 1: efficiency, and because they can be prepared and processed 

## Pipeline 4: Knowledge graph index-based RAG

In [9]:
from llama_index.core import Document

# Ensure 'text' column is of type string
df['text'] = df['text'].astype(str)
# Create documents with IDs
documents = [Document(text=row['text'], doc_id=str(row['id'])) for _, row in df.iterrows()]

### Generating the graph index

In [12]:
from llama_index.core import KnowledgeGraphIndex
import time
from typing import List

# Start the timer
start_time = time.time()

class RobustKnowledgeGraphIndex(KnowledgeGraphIndex):
    def _extract_triplets(self, text: str) -> List[tuple]:
        """Extract triplets with error handling to ignore failures."""
        try:
            # Call the parent class's triplet extraction method
            return super()._extract_triplets(text)
        except Exception as e:
            # Log the error (optional) and return an empty list to continue processing
            print(f"Error extracting triplets for text chunk: {e}")
            return []

# Graph index with embeddings
# test_docs = documents[:100]
graph_index = RobustKnowledgeGraphIndex.from_documents(
    documents,
    max_triplets_per_chunk=5,
    include_embeddings=True,
    show_progress=False,
)

# Stop the timer
end_time = time.time()

# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Index creation time: {elapsed_time:.4f} seconds")

print(type(graph_index))

# Save the graph index to a file
graph_index_saving_path = "./dataset/graph_storage"
graph_index.storage_context.persist(persist_dir=graph_index_saving_path)

Error extracting triplets for text chunk: Error code: 400, with error text {"contentFilter":[{"level":1,"role":"assistant"}],"error":{"code":"1301","message":"系统检测到输入或生成内容可能包含不安全或敏感内容，请您避免输入易产生敏感内容的提示语，感谢您的配合。"}}
Error extracting triplets for text chunk: Error code: 400, with error text {"contentFilter":[{"level":1,"role":"user"}],"error":{"code":"1301","message":"系统检测到输入或生成内容可能包含不安全或敏感内容，请您避免输入易产生敏感内容的提示语，感谢您的配合。"}}
Error extracting triplets for text chunk: Error code: 400, with error text {"contentFilter":[{"level":1,"role":"user"}],"error":{"code":"1301","message":"系统检测到输入或生成内容可能包含不安全或敏感内容，请您避免输入易产生敏感内容的提示语，感谢您的配合。"}}
Error extracting triplets for text chunk: Error code: 400, with error text {"contentFilter":[{"level":1,"role":"user"}],"error":{"code":"1301","message":"系统检测到输入或生成内容可能包含不安全或敏感内容，请您避免输入易产生敏感内容的提示语，感谢您的配合。"}}
Error extracting triplets for text chunk: Error code: 400, with error text {"contentFilter":[{"level":1,"role":"user"}],"error":{"code":"1301","message":"系统检测到输入或生

### Displaying the graph in HTML file

In [18]:
# Load the Graph data
from llama_index.core import StorageContext, load_index_from_storage

graph_index_saving_path = "./dataset/graph_storage"
storage_context = StorageContext.from_defaults(persist_dir=graph_index_saving_path)

graph_index = load_index_from_storage(storage_context)

# Create graph
from pyvis.network import Network

g = graph_index.get_networkx_graph()
net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(g)

# Set node and edge properties: colors and sizes
for node in net.nodes:
    node['color'] = 'lightgray'
    node['size'] = 10

for edge in net.edges:
    edge['color'] = 'black'
    edge['width'] = 1

fgraph="Knowledge_graph_visual.html"
net.write_html(fgraph)
print(fgraph)

Loading llama_index.core.storage.kvstore.simple_kvstore from ./dataset/graph_storage\docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./dataset/graph_storage\index_store.json.


ModuleNotFoundError: No module named 'pyvis'

## Interacting with the Knowledge graph index

In [14]:
import time
import textwrap



#similarity_top_k
k=3
#temperature
temp=0.1
#num_output
mt=1024
graph_query_engine = graph_index.as_query_engine(similarity_top_k=k, temperature=temp, num_output=mt)

def execute_query(user_input, k=3, temp=0.1, mt=1024):

    # Start the timer
    start_time = time.time()

    # Execute the query with additional parameters
    response = graph_query_engine.query(user_input)

    # Stop the timer
    end_time = time.time()

    # Calculate and print the execution time
    elapsed_time = end_time - start_time
    print(f"Query execution time: {elapsed_time:.4f} seconds")

    # Print the response, wrapped to 100 characters per line
    print(textwrap.fill(str(response), 100))
    return response

In [15]:
user_query="Summarise the methods to improve efficiency of solar cells, in ordered list."

In [16]:
import time
import textwrap
import sys
import io
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(textwrap.fill(str(response), 120))

Query execution time: 10.6188 seconds
1. Introduce internal polymer/nanotube junctions within the polymer matrix to enhance charge separation and collection.
2. Use a bulk donor-acceptor heterojunction with a bicontinuous network to allow electrons and holes to travel toward
their respective contacts. 3. Implement charge separation at polymer-SWCNT connections for more efficient electron
transport. 4. Blend functionalized MWCNTs into P3HT polymer to create a P3HT-MWCNT with fullerene C60 double-layered
device. 5. Use C60-modified SWCNTs and P3HT to fabricate polymer-SWCNT composites for improved short circuit current
density and electron transport. 6. Heat the blend to the glass transition temperature of the polymer to manipulate phase
separation and improve charge transfer, transport, and collection. 7. Employ tetraoctylammonium bromide in
tetrahydrofuran to assist in suspension and expose SWCNTs to an electrophoretic field for deposition.


In [None]:
# 增强版文档分割代码 - 包含完整metadata信息
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
from llama_index.core import Document

import pdfplumber
import fitz  # PyMuPDF
import os
from datetime import datetime
import hashlib

import warnings
warnings.filterwarnings("ignore")

def extract_pdf_basic_info(pdf_path):
    """提取PDF基本信息作为metadata"""
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata
        doc.close()
        
        # 获取文件信息
        file_stats = os.stat(pdf_path)
        creation_time = datetime.fromtimestamp(file_stats.st_ctime).isoformat()
        modification_time = datetime.fromtimestamp(file_stats.st_mtime).isoformat()
        file_size = file_stats.st_size
        
        return {
            'title': metadata.get('title', ''),
            'author': metadata.get('author', ''),
            'subject': metadata.get('subject', ''),
            'creator': metadata.get('creator', ''),
            'producer': metadata.get('producer', ''),
            'creation_date': metadata.get('creationDate', ''),
            'modification_date': metadata.get('modDate', ''),
            'file_creation_time': creation_time,
            'file_modification_time': modification_time,
            'file_size_bytes': file_size
        }
    except Exception as e:
        print(f"Warning: Could not extract metadata from {pdf_path}: {e}")
        return {}

def create_enhanced_documents_with_metadata():
    """创建包含完整metadata的文档块"""
    
    pdf_files = glob.glob("../zotero/*.pdf")
    all_docs = []
    
    # 分割器
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", "。", "！", "？", "!", "?"]
    )
    
    for pdf in pdf_files:
        file_name = os.path.basename(pdf)
        file_path = os.path.abspath(pdf)
        
        # 提取PDF基本信息
        pdf_metadata = extract_pdf_basic_info(pdf)
        
        # 1. 文本块提取（增强版：包含完整metadata）
        loader = PyPDFLoader(pdf)
        documents = loader.load()
        docs = splitter.split_documents(documents)
        
        for chunk_idx, doc in enumerate(docs):
            # 创建详细的metadata
            chunk_metadata = {
                # 源文件信息
                'source_file': file_name,
                'source_path': file_path,
                'file_type': 'pdf',
                
                # PDF文档信息
                'pdf_title': pdf_metadata.get('title', ''),
                'pdf_author': pdf_metadata.get('author', ''),
                'pdf_subject': pdf_metadata.get('subject', ''),
                'pdf_creator': pdf_metadata.get('creator', ''),
                'pdf_creation_date': pdf_metadata.get('creation_date', ''),
                
                # 文件系统信息
                'file_size_bytes': pdf_metadata.get('file_size_bytes', 0),
                'file_creation_time': pdf_metadata.get('file_creation_time', ''),
                'file_modification_time': pdf_metadata.get('file_modification_time', ''),
                
                # 块信息
                'chunk_type': 'text',
                'chunk_index': chunk_idx,
                'page_number': doc.metadata.get('page', 'unknown'),
                'chunk_size': len(doc.page_content),
                
                # 原始PyPDFLoader metadata
                'original_metadata': doc.metadata,
                
                # 处理时间戳
                'processing_timestamp': datetime.now().isoformat(),
                
                # 内容哈希（用于去重和版本控制）
                'content_hash': hashlib.md5(doc.page_content.encode()).hexdigest(),
                
                # 统计信息
                'word_count': len(doc.page_content.split()),
                'char_count': len(doc.page_content)
            }
            
            # 创建带metadata的Document
            enhanced_doc = Document(
                text=doc.page_content,
                metadata=chunk_metadata
            )
            all_docs.append(enhanced_doc)

        # 2. 表格提取（增强版：包含完整metadata）
        with pdfplumber.open(pdf) as pdf_doc:
            for page_num, page in enumerate(pdf_doc.pages):
                tables = page.extract_tables()
                for table_idx, table in enumerate(tables):
                    # 转为结构化文本
                    table_text = "\n".join([", ".join([str(cell) if cell is not None else "" for cell in row]) for row in table])
                    
                    # 表格标识信息
                    table_header = f"【表格】文件:{file_name}, 页码:{page_num+1}, 表格序号:{table_idx+1}"
                    full_table_text = f"{table_header}\n{table_text}"
                    
                    # 表格metadata
                    table_metadata = {
                        # 源文件信息
                        'source_file': file_name,
                        'source_path': file_path,
                        'file_type': 'pdf',
                        
                        # PDF文档信息
                        'pdf_title': pdf_metadata.get('title', ''),
                        'pdf_author': pdf_metadata.get('author', ''),
                        'pdf_subject': pdf_metadata.get('subject', ''),
                        
                        # 表格特定信息
                        'chunk_type': 'table',
                        'page_number': page_num + 1,
                        'table_index': table_idx + 1,
                        'table_rows': len(table),
                        'table_cols': len(table[0]) if table else 0,
                        
                        # 处理信息
                        'processing_timestamp': datetime.now().isoformat(),
                        'content_hash': hashlib.md5(full_table_text.encode()).hexdigest(),
                        'word_count': len(full_table_text.split()),
                        'char_count': len(full_table_text)
                    }
                    
                    table_doc = Document(
                        text=full_table_text,
                        metadata=table_metadata
                    )
                    all_docs.append(table_doc)

        # 3. 图像/图表提取（增强版：包含完整metadata）
        doc = fitz.open(pdf)
        for page_index in range(len(doc)):
            page = doc[page_index]
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                img_ext = base_image["ext"]
                img_name = f"{os.path.splitext(file_name)[0]}_page{page_index+1}_img{img_index+1}.{img_ext}"
                img_save_path = os.path.join("./extracted_images", img_name)
                os.makedirs("./extracted_images", exist_ok=True)
                
                with open(img_save_path, "wb") as f:
                    f.write(image_bytes)
                
                # 图像描述文本
                image_description = f"【图表】文件:{file_name}, 页码:{page_index+1}, 图片序号:{img_index+1}, 路径:{img_save_path}"
                
                # 图像metadata
                image_metadata = {
                    # 源文件信息
                    'source_file': file_name,
                    'source_path': file_path,
                    'file_type': 'pdf',
                    
                    # PDF文档信息
                    'pdf_title': pdf_metadata.get('title', ''),
                    'pdf_author': pdf_metadata.get('author', ''),
                    'pdf_subject': pdf_metadata.get('subject', ''),
                    
                    # 图像特定信息
                    'chunk_type': 'image',
                    'page_number': page_index + 1,
                    'image_index': img_index + 1,
                    'image_format': img_ext,
                    'image_path': img_save_path,
                    'image_size_bytes': len(image_bytes),
                    
                    # 图像技术信息
                    'image_width': base_image.get('width', 0),
                    'image_height': base_image.get('height', 0),
                    'image_colorspace': base_image.get('colorspace', 'unknown'),
                    
                    # 处理信息
                    'processing_timestamp': datetime.now().isoformat(),
                    'content_hash': hashlib.md5(image_description.encode()).hexdigest(),
                    'word_count': len(image_description.split()),
                    'char_count': len(image_description)
                }
                
                image_doc = Document(
                    text=image_description,
                    metadata=image_metadata
                )
                all_docs.append(image_doc)
        
        doc.close()
    
    return all_docs

# 执行文档处理
print("开始处理文档，添加完整metadata信息...")
enhanced_documents = create_enhanced_documents_with_metadata()

print(f"共加载分割 {len(enhanced_documents)} 个文档块（含文本、表格、图表信息）")
print(f"文档类型分布:")
chunk_types = {}
for doc in enhanced_documents:
    chunk_type = doc.metadata.get('chunk_type', 'unknown')
    chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1

for chunk_type, count in chunk_types.items():
    print(f"  {chunk_type}: {count} 个")

print(f"\n示例文档块metadata:")
if enhanced_documents:
    example_doc = enhanced_documents[0]
    print(f"文本预览: {example_doc.text[:200]}...")
    print(f"Metadata keys: {list(example_doc.metadata.keys())}")
    print(f"源文件: {example_doc.metadata.get('source_file', 'N/A')}")
    print(f"块类型: {example_doc.metadata.get('chunk_type', 'N/A')}")
    print(f"页码: {example_doc.metadata.get('page_number', 'N/A')}")
    print(f"PDF标题: {example_doc.metadata.get('pdf_title', 'N/A')}")
    print(f"PDF作者: {example_doc.metadata.get('pdf_author', 'N/A')}")

# 更新documents变量用于后续处理
documents = enhanced_documents


In [8]:
# 验证metadata信息
def verify_metadata_completeness(documents):
    """验证metadata信息是否完整"""
    print("=== Metadata完整性验证 ===")
    
    required_fields = ['source_file', 'chunk_type', 'page_number', 'processing_timestamp']
    
    for i, doc in enumerate(documents[:5]):  # 检查前5个文档
        print(f"\n文档 {i+1}:")
        print(f"文本长度: {len(doc.text)} 字符")
        
        missing_fields = []
        for field in required_fields:
            if field not in doc.metadata:
                missing_fields.append(field)
            else:
                print(f"  {field}: {doc.metadata[field]}")
        
        if missing_fields:
            print(f"  ❌ 缺失字段: {missing_fields}")
        else:
            print(f"  ✅ 所有必需字段完整")
            
        # 显示文献信息
        if doc.metadata.get('pdf_title'):
            print(f"  📖 文献标题: {doc.metadata['pdf_title']}")
        if doc.metadata.get('pdf_author'):
            print(f"  👤 文献作者: {doc.metadata['pdf_author']}")

# 执行验证
verify_metadata_completeness(documents)

print("\n=== 源文献追踪功能演示 ===")
def get_source_citation(doc_metadata):
    """根据metadata生成引用信息"""
    citation_parts = []
    
    # 作者
    if doc_metadata.get('pdf_author'):
        citation_parts.append(doc_metadata['pdf_author'])
    
    # 标题
    if doc_metadata.get('pdf_title'):
        citation_parts.append(f'"{doc_metadata["pdf_title"]}"')
    
    # 文件名（如果没有标题）
    if not doc_metadata.get('pdf_title') and doc_metadata.get('source_file'):
        citation_parts.append(doc_metadata['source_file'])
    
    # 页码
    if doc_metadata.get('page_number') and doc_metadata['page_number'] != 'unknown':
        citation_parts.append(f"p. {doc_metadata['page_number']}")
    
    # 块类型
    if doc_metadata.get('chunk_type'):
        if doc_metadata['chunk_type'] == 'table':
            citation_parts.append(f"(表格{doc_metadata.get('table_index', '')})")
        elif doc_metadata['chunk_type'] == 'image':
            citation_parts.append(f"(图{doc_metadata.get('image_index', '')})")
    
    return ", ".join(citation_parts)

# 示例：为前3个文档生成引用
for i, doc in enumerate(documents[:3]):
    citation = get_source_citation(doc.metadata)
    print(f"文档{i+1}引用: {citation}")
    print(f"内容预览: {doc.text[:100]}...\n")


=== Metadata完整性验证 ===

文档 1:
文本长度: 52 字符
  source_file: 978-3-662-56472-1.pdf
  chunk_type: text
  page_number: 0
  processing_timestamp: 2025-08-16
  ✅ 所有必需字段完整

文档 2:
文本长度: 32 字符
  source_file: 978-3-662-56472-1.pdf
  chunk_type: text
  page_number: 1
  processing_timestamp: 2025-08-16
  ✅ 所有必需字段完整

文档 3:
文本长度: 81 字符
  source_file: 978-3-662-56472-1.pdf
  chunk_type: text
  page_number: 2
  processing_timestamp: 2025-08-16
  ✅ 所有必需字段完整

文档 4:
文本长度: 780 字符
  source_file: 978-3-662-56472-1.pdf
  chunk_type: text
  page_number: 3
  processing_timestamp: 2025-08-16
  ✅ 所有必需字段完整

文档 5:
文本长度: 795 字符
  source_file: 978-3-662-56472-1.pdf
  chunk_type: text
  page_number: 3
  processing_timestamp: 2025-08-16
  ✅ 所有必需字段完整

=== 源文献追踪功能演示 ===
文档1引用: 978-3-662-56472-1.pdf
内容预览: Handbook of 
Photovoltaic 
Silicon
Deren Yang
Editor...

文档2引用: 978-3-662-56472-1.pdf, p. 1
内容预览: Handbook of Photovoltaic Silicon...

文档3引用: 978-3-662-56472-1.pdf, p. 2
内容预览: Deren Yang
Editor
Handbook of Photovoltaic
Sil

In [5]:
# 修复metadata过长问题 - 创建精简版metadata

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
from llama_index.core import Document

import pdfplumber
import fitz  # PyMuPDF
import os
from datetime import datetime
import hashlib

import warnings
warnings.filterwarnings("ignore")

def extract_pdf_basic_info(pdf_path):
    """提取PDF基本信息作为metadata（优化版：减少冗余信息）"""
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata
        doc.close()
        
        # 获取文件信息
        file_stats = os.stat(pdf_path)
        file_size = file_stats.st_size
        
        # 只返回核心字段，并限制长度避免metadata过长
        return {
            'title': (metadata.get('title', '') or '')[:100],  # 限制长度
            'author': (metadata.get('author', '') or '')[:50],   # 限制长度
            'subject': (metadata.get('subject', '') or '')[:50], # 限制长度
            'file_size_bytes': file_size
        }
    except Exception as e:
        print(f"Warning: Could not extract metadata from {pdf_path}: {e}")
        return {}

def create_optimized_documents_with_metadata():
    """创建包含优化metadata的文档块，避免metadata过长问题"""
    
    pdf_files = glob.glob("../zotero/*.pdf")
    all_docs = []
    
    # 分割器
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", "。", "！", "？", "!", "?"]
    )
    
    for pdf in pdf_files:
        file_name = os.path.basename(pdf)
        file_path = os.path.abspath(pdf)
        
        # 提取PDF基本信息
        pdf_metadata = extract_pdf_basic_info(pdf)
        
        # 1. 文本块提取（优化版：精简metadata）
        loader = PyPDFLoader(pdf)
        documents = loader.load()
        docs = splitter.split_documents(documents)
        
        for chunk_idx, doc in enumerate(docs):
            # 创建精简的metadata（只保留核心字段）
            chunk_metadata = {
                # 核心源文件信息
                'source_file': file_name,
                'file_type': 'pdf',
                
                # 核心PDF文档信息（仅保留非空字段）
                'pdf_title': pdf_metadata.get('title', '')[:100] if pdf_metadata.get('title') else '',  # 限制长度
                'pdf_author': pdf_metadata.get('author', '')[:50] if pdf_metadata.get('author') else '',  # 限制长度
                
                # 核心块信息
                'chunk_type': 'text',
                'chunk_index': chunk_idx,
                'page_number': doc.metadata.get('page', 'unknown'),
                
                # 处理时间戳（简化格式）
                'processing_timestamp': datetime.now().strftime('%Y-%m-%d'),
                
                # 内容统计（精简）
                'word_count': len(doc.page_content.split()),
                'char_count': len(doc.page_content)
            }
            
            # 创建带精简metadata的Document
            enhanced_doc = Document(
                text=doc.page_content,
                metadata=chunk_metadata
            )
            all_docs.append(enhanced_doc)

        # 2. 表格提取（优化版：精简metadata）
        with pdfplumber.open(pdf) as pdf_doc:
            for page_num, page in enumerate(pdf_doc.pages):
                tables = page.extract_tables()
                for table_idx, table in enumerate(tables):
                    # 转为结构化文本
                    table_text = "\n".join([", ".join([str(cell) if cell is not None else "" for cell in row]) for row in table])
                    
                    # 表格标识信息
                    table_header = f"【表格】文件:{file_name}, 页码:{page_num+1}, 表格序号:{table_idx+1}"
                    full_table_text = f"{table_header}\n{table_text}"
                    
                    # 精简表格metadata
                    table_metadata = {
                        'source_file': file_name,
                        'file_type': 'pdf',
                        'pdf_title': pdf_metadata.get('title', '')[:100] if pdf_metadata.get('title') else '',
                        'pdf_author': pdf_metadata.get('author', '')[:50] if pdf_metadata.get('author') else '',
                        'chunk_type': 'table',
                        'page_number': page_num + 1,
                        'table_index': table_idx + 1,
                        'table_rows': len(table),
                        'table_cols': len(table[0]) if table else 0,
                        'processing_timestamp': datetime.now().strftime('%Y-%m-%d'),
                        'word_count': len(full_table_text.split()),
                        'char_count': len(full_table_text)
                    }
                    
                    table_doc = Document(
                        text=full_table_text,
                        metadata=table_metadata
                    )
                    all_docs.append(table_doc)

        # 3. 图像/图表提取（优化版：精简metadata）
        doc = fitz.open(pdf)
        for page_index in range(len(doc)):
            page = doc[page_index]
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                img_ext = base_image["ext"]
                img_name = f"{os.path.splitext(file_name)[0]}_page{page_index+1}_img{img_index+1}.{img_ext}"
                img_save_path = os.path.join("./extracted_images", img_name)
                os.makedirs("./extracted_images", exist_ok=True)
                
                with open(img_save_path, "wb") as f:
                    f.write(image_bytes)
                
                # 图像描述文本
                image_description = f"【图表】文件:{file_name}, 页码:{page_index+1}, 图片序号:{img_index+1}, 路径:{img_save_path}"
                
                # 精简图像metadata
                image_metadata = {
                    'source_file': file_name,
                    'file_type': 'pdf',
                    'pdf_title': pdf_metadata.get('title', '')[:100] if pdf_metadata.get('title') else '',
                    'pdf_author': pdf_metadata.get('author', '')[:50] if pdf_metadata.get('author') else '',
                    'chunk_type': 'image',
                    'page_number': page_index + 1,
                    'image_index': img_index + 1,
                    'image_format': img_ext,
                    'image_path': img_save_path,
                    'processing_timestamp': datetime.now().strftime('%Y-%m-%d'),
                    'word_count': len(image_description.split()),
                    'char_count': len(image_description)
                }
                
                image_doc = Document(
                    text=image_description,
                    metadata=image_metadata
                )
                all_docs.append(image_doc)
        
        doc.close()
    
    return all_docs

# 计算metadata长度的函数
def calculate_metadata_length(doc):
    """计算单个文档的metadata长度"""
    import json
    metadata_str = json.dumps(doc.metadata, ensure_ascii=False)
    return len(metadata_str)

print("=== 创建优化版文档（精简metadata）===")
optimized_documents = create_optimized_documents_with_metadata()

print(f"共加载分割 {len(optimized_documents)} 个文档块（含文本、表格、图表信息）")

# 检查metadata长度
metadata_lengths = [calculate_metadata_length(doc) for doc in optimized_documents[:10]]
max_metadata_length = max(metadata_lengths)
avg_metadata_length = sum(metadata_lengths) / len(metadata_lengths)

print(f"\n📊 Metadata长度统计（前10个文档）:")
print(f"最大metadata长度: {max_metadata_length} 字符")
print(f"平均metadata长度: {avg_metadata_length:.1f} 字符")

# 显示优化后的metadata示例
print(f"\n示例优化后metadata:")
if optimized_documents:
    example_doc = optimized_documents[0]
    print(f"文本预览: {example_doc.text[:200]}...")
    print(f"Metadata keys: {list(example_doc.metadata.keys())}")
    print(f"Metadata长度: {calculate_metadata_length(example_doc)} 字符")
    
    import json
    print(f"Metadata内容:")
    print(json.dumps(example_doc.metadata, ensure_ascii=False, indent=2))

# 更新documents变量
documents = optimized_documents
print(f"\n✅ 已更新documents变量，共 {len(documents)} 个文档块")


=== 创建优化版文档（精简metadata）===


Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa3' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color becau

共加载分割 18727 个文档块（含文本、表格、图表信息）

📊 Metadata长度统计（前10个文档）:
最大metadata长度: 229 字符
平均metadata长度: 228.0 字符

示例优化后metadata:
文本预览: Handbook of 
Photovoltaic 
Silicon
Deren Yang
Editor...
Metadata keys: ['source_file', 'file_type', 'pdf_title', 'pdf_author', 'chunk_type', 'chunk_index', 'page_number', 'processing_timestamp', 'word_count', 'char_count']
Metadata长度: 226 字符
Metadata内容:
{
  "source_file": "978-3-662-56472-1.pdf",
  "file_type": "pdf",
  "pdf_title": "",
  "pdf_author": "",
  "chunk_type": "text",
  "chunk_index": 0,
  "page_number": 0,
  "processing_timestamp": "2025-08-17",
  "word_count": 7,
  "char_count": 52
}

✅ 已更新documents变量，共 18727 个文档块


In [7]:
# 使用增强版文档进行向量化
print("=== 准备进行向量化存储 ===")
print(f"总文档数量: {len(documents)}")

# 显示metadata统计信息
metadata_stats = {}
for doc in documents:
    for key in doc.metadata.keys():
        if key not in metadata_stats:
            metadata_stats[key] = 0
        metadata_stats[key] += 1

print(f"Metadata字段覆盖率:")
for key, count in sorted(metadata_stats.items()):
    coverage = (count / len(documents)) * 100
    print(f"  {key}: {count}/{len(documents)} ({coverage:.1f}%)")

# 创建向量存储（使用新的路径以区别于旧版本）
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

vector_dataset_new = "./dataset/vector_storage_with_metadata"
print(f"\n准备创建向量存储: {vector_dataset_new}")

# 创建包含完整metadata的向量索引
try:
    vector_store = DeepLakeVectorStore(dataset_path=vector_dataset_new, overwrite=True)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    
    print("开始向量化处理...")
    # 使用带有完整metadata的文档创建索引
    index_with_metadata = VectorStoreIndex.from_documents(
        documents, 
        storage_context=storage_context, 
        show_progress=True
    )
    
    print(f"✅ 向量化完成！索引已保存到: {vector_dataset_new}")
    
except Exception as e:
    print(f"❌ 向量化过程中出现错误: {e}")

print("\n=== RAG系统中的源文献追踪演示 ===")
# 演示如何在查询时返回源文献信息
def enhanced_query_with_sources(query, top_k=3):
    """执行查询并返回带源文献信息的结果"""
    try:
        # 创建查询引擎
        query_engine = index_with_metadata.as_query_engine(
            similarity_top_k=top_k,
            response_mode="tree_summarize"
        )
        
        # 执行查询
        response = query_engine.query(query)
        
        print(f"查询: {query}")
        print(f"答案: {response}")
        
        # 获取源节点信息
        if hasattr(response, 'source_nodes'):
            print(f"\n📚 引用来源 ({len(response.source_nodes)} 个):")
            for i, node in enumerate(response.source_nodes, 1):
                metadata = node.node.metadata
                citation = get_source_citation(metadata)
                score = getattr(node, 'score', 'N/A')
                
                print(f"{i}. {citation}")
                print(f"   相似度分数: {score}")
                print(f"   内容: {node.node.text[:150]}...")
                print()
        
        return response
        
    except Exception as e:
        print(f"查询执行失败: {e}")
        return None

# 如果向量化成功，执行演示查询
if 'index_with_metadata' in locals():
    print("执行演示查询...")
    enhanced_query_with_sources("如何提高太阳能电池效率？", top_k=3)
else:
    print("向量化未成功，跳过查询演示")


=== 准备进行向量化存储 ===
总文档数量: 18727
Metadata字段覆盖率:
  char_count: 18727/18727 (100.0%)
  chunk_index: 13528/18727 (72.2%)
  chunk_type: 18727/18727 (100.0%)
  file_type: 18727/18727 (100.0%)
  image_format: 4320/18727 (23.1%)
  image_index: 4320/18727 (23.1%)
  image_path: 4320/18727 (23.1%)
  page_number: 18727/18727 (100.0%)
  pdf_author: 18727/18727 (100.0%)
  pdf_title: 18727/18727 (100.0%)
  processing_timestamp: 18727/18727 (100.0%)
  source_file: 18727/18727 (100.0%)
  table_cols: 879/18727 (4.7%)
  table_index: 879/18727 (4.7%)
  table_rows: 879/18727 (4.7%)
  word_count: 18727/18727 (100.0%)

准备创建向量存储: ./dataset/vector_storage_with_metadata




开始向量化处理...


Parsing nodes: 100%|██████████| 18727/18727 [00:25<00:00, 733.64it/s] 
Generating embeddings: 100%|██████████| 2048/2048 [3:24:25<00:00,  5.99s/it]  


Uploading data to deeplake dataset.


100%|██████████| 2048/2048 [00:14<00:00, 137.93it/s]


Dataset(path='./dataset/vector_storage_with_metadata', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
   text       text      (2048, 1)      str     None   
 metadata     json      (2048, 1)      str     None   
 embedding  embedding  (2048, 1024)  float32   None   
    id        text      (2048, 1)      str     None   


Generating embeddings: 100%|██████████| 2048/2048 [3:25:53<00:00,  6.03s/it]  


Uploading data to deeplake dataset.


100%|██████████| 2048/2048 [00:16<00:00, 127.25it/s]


Dataset(path='./dataset/vector_storage_with_metadata', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
   text       text      (4096, 1)      str     None   
 metadata     json      (4096, 1)      str     None   
 embedding  embedding  (4096, 1024)  float32   None   
    id        text      (4096, 1)      str     None   


Generating embeddings: 100%|██████████| 2048/2048 [3:45:47<00:00,  6.62s/it]  


Uploading data to deeplake dataset.


100%|██████████| 2048/2048 [00:27<00:00, 75.66it/s] 


Dataset(path='./dataset/vector_storage_with_metadata', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
   text       text      (6144, 1)      str     None   
 metadata     json      (6144, 1)      str     None   
 embedding  embedding  (6144, 1024)  float32   None   
    id        text      (6144, 1)      str     None   


Generating embeddings:  71%|███████▏  | 1460/2048 [3:08:05<1:17:02,  7.86s/it]

: 

In [6]:
# 快速诊断测试 - 使用极小批量验证DeepLake存储
import os
import shutil
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

print("🚀 开始快速诊断测试: DeepLake本地存储")

try:
    # 定义一个极小的批量进行测试
    batch_size = 5 
    if 'documents' not in locals() or not documents:
        print("⚠️ 'documents' 变量未定义或为空。请确保在运行此单元格之前，先运行文档加载和处理的单元格。")
    elif len(documents) < batch_size:
        print(f"⚠️ 文档总数 ({len(documents)}) 小于测试批量 ({batch_size}), 将使用所有可用文档。")
        test_docs = documents
    else:
        test_docs = documents[:batch_size]
        print(f"📋 测试文档数量: {len(test_docs)}")

        # 定义测试用的数据库路径
        test_path = "./dataset/vector_storage_quick_test"
        
        # 如果旧的测试路径存在，先清理
        if os.path.exists(test_path):
            try:
                shutil.rmtree(test_path)
                print(f"✅ 已清理旧的测试数据库: {test_path}")
            except Exception as e:
                print(f"⚠️ 清理旧的测试数据库失败 {test_path}: {e}")

        # 初始化DeepLake向量存储 (已移除不兼容的runtime参数)
        vector_store_test = DeepLakeVectorStore(
            dataset_path=test_path, 
            overwrite=True  # 确保每次测试都是全新的
        )
        storage_context_test = StorageContext.from_defaults(vector_store=vector_store_test)
        
        print("⏳ 正在向量化并存储极小批量...")
        
        # 从文档创建索引
        test_index = VectorStoreIndex.from_documents(
            test_docs, 
            storage_context_test, 
            show_progress=True
        )
        
        print(f"\n✅ 快速诊断测试成功！")
        print(f"🎉 DeepLake成功在 '{test_path}' 创建并存储了 {len(test_docs)} 个文档的向量。")
        print("💡 您现在可以恢复这个单元格的代码，增加 'batch_size' 或处理所有文档，然后运行完整的向量化流程。")

except Exception as e:
    import traceback
    print(f"\n❌ 快速诊断测试失败: {e}")
    print(f"   错误类型: {type(e)}")
    print("   Traceback:")
    traceback.print_exc()
    print("\n💡 建议：")
    print("   1. 检查错误信息，特别是关于网络、权限或磁盘空间的部分。")
    print("   2. 尝试重启Jupyter Notebook内核（Kernel -> Restart Kernel）。")
    print("   3. 确认 'documents' 变量已正确加载。")


🚀 开始快速诊断测试: DeepLake本地存储
📋 测试文档数量: 5




⏳ 正在向量化并存储极小批量...


Parsing nodes: 100%|██████████| 5/5 [00:00<00:00, 952.04it/s]
Generating embeddings: 100%|██████████| 5/5 [00:28<00:00,  5.80s/it]

Uploading data to deeplake dataset.



100%|██████████| 5/5 [00:00<00:00, 45.44it/s]


Dataset(path='./dataset/vector_storage_quick_test', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (5, 1)      str     None   
 metadata     json      (5, 1)      str     None   
 embedding  embedding  (5, 1024)  float32   None   
    id        text      (5, 1)      str     None   

✅ 快速诊断测试成功！
🎉 DeepLake成功在 './dataset/vector_storage_quick_test' 创建并存储了 5 个文档的向量。
💡 您现在可以恢复这个单元格的代码，增加 'batch_size' 或处理所有文档，然后运行完整的向量化流程。


## 文档分割代码改进总结

### 🔍 **问题分析**
原始代码缺少原始文献的metadata信息，导致RAG系统无法追踪和返回源文献信息。

### ✅ **改进内容**

#### 1. **完整的PDF元数据提取**
- PDF标题、作者、主题、创建者等
- 文件创建/修改时间
- 文件大小等技术信息

#### 2. **详细的文档块metadata**
每个文档块现在包含：
- **源文件信息**: `source_file`, `source_path`, `file_type`
- **PDF文档信息**: `pdf_title`, `pdf_author`, `pdf_subject`
- **块特定信息**: `chunk_type`, `chunk_index`, `page_number`
- **内容统计**: `word_count`, `char_count`, `chunk_size`
- **处理信息**: `processing_timestamp`, `content_hash`
- **原始metadata**: 保留PyPDFLoader的原始信息

#### 3. **分类型metadata**
- **文本块**: 页码、块索引、内容统计
- **表格**: 表格行列数、表格索引
- **图像**: 图像尺寸、格式、保存路径

#### 4. **源文献追踪功能**
- `get_source_citation()`: 根据metadata生成标准引用格式
- `enhanced_query_with_sources()`: 查询时返回完整来源信息

### 🎯 **使用效果**

现在RAG系统可以：
1. **准确追踪来源**: 返回具体的文件名、页码、段落位置
2. **生成标准引用**: 自动格式化作者、标题、页码信息
3. **区分内容类型**: 标识文本、表格、图像来源
4. **提供可信度**: 通过相似度分数评估来源可靠性

### 📁 **新的向量存储路径**
使用 `./dataset/vector_storage_with_metadata` 存储包含完整metadata的向量数据库，确保RAG系统能够返回详细的源文献信息。

### 🔄 **下一步**
运行新的文档分割代码，重新创建向量数据库，即可在RAG查询中获得完整的源文献追踪功能。


In [17]:
print(llm_extraction.complete("Summarise the methods to improve efficiency of solar cells."))

Improving the efficiency of solar cells involves several methods aimed at enhancing the conversion of sunlight into electricity. Here's a summary of key strategies:

1. **Material Improvements**:
   - **High-efficiency semiconductors**: Use of materials with a high absorption coefficient and direct bandgap, such as gallium arsenide (GaAs) or perovskites.
   - **Multiple-junction cells**: Stack multiple cells with different bandgaps to absorb a broader range of the solar spectrum.

2. **Texturing and Antireflection Coatings**:
   - **Surface texturing**: Roughen the surface of the solar cell to increase light scattering and reduce reflection.
   - **Antireflection coatings**: Apply coatings that minimize light reflection, allowing more sunlight to be absorbed.

3. **Enhanced Light Absorption**:
   - **Light-trapping structures**: Use microlenses or 3D structures to focus light and trap it within the cell for better absorption.
   - **Dye-sensitized solar cells (DSCs)**: Incorporate dye 