In [None]:
# Construct code based Document object -- method1
# load source codes and split them into chunks
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)
from langchain_chroma import Chroma
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
import re
from langchain_core.documents import Document
from call_llms import load_embedding_model
# Root directory of the pymatgen package sources to index.
base_path = "tool_source_code/pymatgen/src/pymatgen/"

# Pick up every non-hidden ``.py`` file below ``base_path`` and parse it with
# LanguageParser configured for Python.
loader = GenericLoader.from_filesystem(
    base_path,
    glob="**/[!.]*",
    suffixes=[".py"],
    parser=LanguageParser(language="python"),
)
pymatgen_source_docs = loader.load()
def update_metadata_with_first_function_or_class(document: "Document"):
    """
    Record the first class or function defined in a document in its metadata.

    The page content is searched for the first line that starts (after optional
    indentation) with ``def``, ``async def`` or ``class``. When one is found,
    ``metadata["first_function_or_class"]`` is set to ``"<def|class> <name>"``
    (an ``async def`` is recorded with the ``def`` prefix). When none is found,
    a message naming the document's source is printed and the metadata is left
    untouched.

    Args:
        document: A document object exposing ``page_content`` (str) and
            ``metadata`` (dict).

    Returns:
        The same document object, updated in place.
    """
    # ``^`` with re.MULTILINE anchors at every line start, so decorators or
    # comments above the definition do not prevent a match.
    pattern = r"^\s*(?:async\s+)?(def|class)\s+([a-zA-Z_][a-zA-Z0-9_]*)"
    match = re.search(pattern, document.page_content, re.MULTILINE)

    if match:
        # group 1 is the keyword ("def" or "class"), group 2 the identifier
        type_name, first_name = match.groups()
        document.metadata["first_function_or_class"] = f"{type_name} {first_name}"
    else:
        # .get() so a document without a "source" entry still gets reported
        failed_path = document.metadata.get("source", "<unknown source>")
        print(f"No function or class found in the document {failed_path}.")
    return document
for doc in pymatgen_source_docs:
    # LanguageParser can emit documents without a "content_type" key (for
    # example empty files or files it cannot parse), so look it up with .get()
    # instead of indexing to avoid a KeyError.
    if doc.metadata.get("content_type") == "functions_classes":
        update_metadata_with_first_function_or_class(doc)
    # Find the second 'pymatgen' part in the path and keep everything after it, including 'pymatgen'
    doc.metadata["source"] = "pymatgen" + doc.metadata["source"].split("pymatgen", 2)[-1]
print(f"Loaded {len(pymatgen_source_docs)} source codes.")

# Split the parsed sources into Python-aware chunks of at most 1000 characters
# with a 50-character overlap between neighbouring chunks.
python_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.PYTHON, chunk_size=1000, chunk_overlap=50)
pymatgen_splitted_docs = python_splitter.split_documents(pymatgen_source_docs)
print(f"Codes are split into {len(pymatgen_splitted_docs)} chunks.")

embedding_model = load_embedding_model("text-embedding-3-large")
vector_store = Chroma(collection_name='pymatgen', embedding_function=embedding_model, persist_directory="vector_store/vs_method1/")

# Add documents to the vector store in fixed-size batches: one call with every
# chunk can exceed the maximum batch size Chroma accepts per request.
# NOTE(review): re-running this cell appears to append the same chunks again to
# the persisted collection, since no explicit ids are passed - confirm.
batch_size = 100
for start in range(0, len(pymatgen_splitted_docs), batch_size):
    vector_store.add_documents(documents=pymatgen_splitted_docs[start:start + batch_size])


In [None]:
# documents from official documentation -- method2
from langchain_community.document_loaders import BSHTMLLoader
import os
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain_core.documents import Document
from call_llms import load_embedding_model
from langchain_text_splitters import RecursiveCharacterTextSplitter
import tiktoken
from langchain_chroma import Chroma

def calculate_token_statistics(documents: list, encoding_name: str = "cl100k_base") -> dict:
    """
    Count the tokens of every document and summarise the counts.

    Args:
        documents: List of objects exposing a ``page_content`` string
            (e.g. ``Document`` instances). May be empty.
        encoding_name: Name of the tiktoken encoding to use; defaults to
            "cl100k_base".

    Returns:
        A dict with ``token_counts`` (one count per document, in input order),
        ``max_tokens`` and ``min_tokens``. For an empty ``documents`` list the
        counts list is empty and both extremes are reported as 0.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    token_counts = [len(encoding.encode(doc.page_content)) for doc in documents]

    return {
        "token_counts": token_counts,
        # default=0 keeps max()/min() from raising ValueError on an empty list
        "max_tokens": max(token_counts, default=0),
        "min_tokens": min(token_counts, default=0),
    }

def get_pymatgen_html_files(directory: str) -> list:
    """
    Collect the paths of the HTML documentation pages below ``directory``.

    A file is selected when its name starts with 'pymatgen' and ends with
    '.html', or when it is named exactly 'modules.html'. Anything located under
    a ``_modules`` directory is excluded.

    Args:
        directory (str): Directory to search, e.g. 'tool_source_code/pymatgen/docs'.

    Returns:
        list: Matching file paths. Directories and file names are visited in
        sorted order, so the result is reproducible across runs and machines.
        An empty list is returned (after printing a message) when ``directory``
        does not exist.
    """
    html_files = []

    if not os.path.exists(directory):
        print(f"Directory {directory} does not exist.")
        return html_files

    for root, dirs, files in os.walk(directory):
        # A root that already lies under a _modules component is excluded
        # entirely; clearing ``dirs`` stops os.walk from descending further.
        if "_modules" in root.split(os.sep):
            dirs[:] = []
            continue

        # Prune _modules subtrees in place and fix the traversal order.
        dirs[:] = sorted(d for d in dirs if d != "_modules")

        for file in sorted(files):
            if (file.startswith("pymatgen") and file.endswith(".html")) or file == "modules.html":
                html_files.append(os.path.join(root, file))

    return html_files

html_files_list = get_pymatgen_html_files("tool_source_code/pymatgen/docs")
print(len(html_files_list))

# Every BSHTMLLoader.load() call returns a list of documents; flatten them
# into a single list while keeping the file order.
documents = [doc for file in html_files_list for doc in BSHTMLLoader(file).load()]

print(f"Loaded {len(documents)} documents from local html files.")

token_stats = calculate_token_statistics(documents)
print("Token counts:", token_stats["token_counts"])
print("Max tokens:", token_stats["max_tokens"])
print("Min tokens:", token_stats["min_tokens"])

# Folder holding the supplementary plain-text documents
folder_path = "./defects_doc/"  # replace with the actual path

# Collects every Document built from the txt files
documents_defect = []

# Iterate over the txt files directly inside the folder (sub-folders are not
# searched); sorted so the resulting document order is reproducible.
for file in sorted(Path(folder_path).glob("*.txt")):
    # Load the file content
    loader = TextLoader(str(file))
    docs = loader.load()

    # File name without its extension
    file_name = file.stem

    # Derive the page title from the file name
    if file_name == "api_overview":
        title = "api overview — 2024.7.19 documentation"
    else:
        title = file_name.replace("_", ".") + " package — 2024.7.19 documentation"

    # Build a new Document with normalised metadata and add it to the list.
    # ``file.name`` is the bare file name; unlike splitting the path string on
    # "defects_doc/", it does not depend on the folder name or path separator.
    for doc in docs:
        documents_defect.append(Document(
            page_content=doc.page_content,
            metadata={"source": file.name, "title": title}
        ))

print(f"Loaded {len(documents_defect)} documents from local txt files.")
token_stats = calculate_token_statistics(documents_defect)
print("Token counts:", token_stats["token_counts"])
print("Max tokens:", token_stats["max_tokens"])
print("Min tokens:", token_stats["min_tokens"])

documents.extend(documents_defect)
print(f"Total {len(documents)} documents.")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
split_docs = text_splitter.split_documents(documents)
print(f"Documents are split into {len(split_docs)} chunks.")
embedding_model = load_embedding_model("text-embedding-3-large")
vector_store = Chroma(collection_name='pymatgen-doc', embedding_function=embedding_model, persist_directory="vector_store/vs_method2/")

# Insert in fixed-size batches: one call with every chunk can exceed the
# maximum batch size Chroma accepts per request.
batch_size = 100
for start in range(0, len(split_docs), batch_size):
    vector_store.add_documents(documents=split_docs[start:start + batch_size])

In [None]:
# Construct llm-generated document vector store with SemanticChunker -- method3
from json_handler import JsonFileProcessor
from langchain_chroma import Chroma
from call_llms import load_embedding_model
from langchain_experimental.text_splitter import SemanticChunker
import os

# Location of the project hierarchy JSON. The PROJECT_HIERARCHY_JSON
# environment variable overrides the original machine-specific default so the
# notebook can run on other machines without editing this cell.
db_path = os.environ.get(
    "PROJECT_HIERARCHY_JSON",
    "/Users/siyuliu/Desktop/pymatgen_source_code_gemini_2_0_flash/.project_doc_record/project_hierarchy.json",
)

md_contents, meta_data = JsonFileProcessor(db_path).extract_data()

embeddings = load_embedding_model(model_name="text-embedding-3-large")

# Stop here when there is nothing to index; the reason travels with the exception.
if not md_contents or not meta_data:
    raise ValueError("No content or metadata provided. Skipping vector store creation.")

# Ensure lengths match. A mismatch is reported rather than hidden, because
# truncating silently could mask contents paired with the wrong metadata.
if len(md_contents) != len(meta_data):
    print(
        f"Warning: {len(md_contents)} contents but {len(meta_data)} metadata entries; "
        "truncating both to the shorter length."
    )
min_length = min(len(md_contents), len(meta_data))
md_contents = md_contents[:min_length]
meta_data = meta_data[:min_length]

# Split every text with SemanticChunker, using the embedding model loaded above.
semantic_chunker = SemanticChunker(
    embeddings=embeddings,
    buffer_size=1,
    breakpoint_threshold_amount=95,
)
split_docs = semantic_chunker.create_documents(
    texts=md_contents,
    metadatas=meta_data
)

In [3]:
import json
from langchain.schema import Document

# Convert every chunk into a plain dict so it can be serialised as JSON
documents_json = [
    dict(page_content=chunk.page_content, metadata=chunk.metadata)
    for chunk in split_docs
]

# Write the records to a JSON file (UTF-8, non-ASCII characters kept verbatim)
with open("documents_llm_doc.json", mode="w", encoding="utf-8") as out_file:
    json.dump(documents_json, out_file, ensure_ascii=False, indent=4)


In [2]:
from langchain_chroma import Chroma
import os
from filelock import FileLock
from tqdm import tqdm

persist_directory = "vector_store/vs_method3/"
os.makedirs(persist_directory, exist_ok=True)

batch_size = 100
lock_path = os.path.join(persist_directory, "vector_store.lock")

# The file lock is held for the whole write (waits up to 120 s to acquire it).
with FileLock(lock_path, timeout=120):
    # Create the Chroma store, or open the one already persisted in this directory
    vector_store = Chroma(
        collection_name="pymatgen_llm_doc",
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )

    # Feed the chunks to the store ``batch_size`` documents at a time.
    batch_starts = range(0, len(split_docs), batch_size)
    for start in tqdm(batch_starts, desc="Adding documents"):
        vector_store.add_documents(split_docs[start:start + batch_size])

Adding documents: 100%|██████████| 181/181 [08:18<00:00,  2.75s/it]


## Method4

In [1]:
# llm generated document by full document loading -- method4
from json_handler import JsonFileProcessor
from langchain_chroma import Chroma
from langchain.schema import Document
from call_llms import load_embedding_model
from filelock import FileLock
import json
import os
from tqdm import tqdm
# Load data from JSON. The PROJECT_HIERARCHY_JSON environment variable
# overrides the original machine-specific default so the notebook can run on
# other machines without editing this cell.
db_path = os.environ.get(
    "PROJECT_HIERARCHY_JSON",
    "/Users/siyuliu/Desktop/pymatgen_source_code_gemini_2_0_flash/.project_doc_record/project_hierarchy.json",
)
md_contents, meta_data = JsonFileProcessor(db_path).extract_data()

# Load embedding model
embeddings = load_embedding_model(model_name="text-embedding-3-large")

# Validate data; the reason travels with the exception instead of a separate print.
if not md_contents or not meta_data:
    raise ValueError("No content or metadata provided. Skipping vector store creation.")

# Ensure lengths match and create Document objects. A mismatch is reported
# rather than hidden, because truncating silently could mask contents paired
# with the wrong metadata.
if len(md_contents) != len(meta_data):
    print(
        f"Warning: {len(md_contents)} contents but {len(meta_data)} metadata entries; "
        "truncating both to the shorter length."
    )
min_length = min(len(md_contents), len(meta_data))
documents = [
    Document(page_content=md_contents[i], metadata=meta_data[i])
    for i in range(min_length)
]

# Write the documents to a JSON file (UTF-8, non-ASCII characters kept verbatim)
documents_json = [
    {"page_content": doc.page_content, "metadata": doc.metadata} for doc in documents
]
with open("documents_llm_doc_gemini_20_flash_full.json", "w", encoding="utf-8") as f:
    json.dump(documents_json, f, ensure_ascii=False, indent=4)

# Create Chroma vector store with documents
persist_directory = "vector_store/vs_method4/"
os.makedirs(persist_directory, exist_ok=True)
lock_path = os.path.join(persist_directory, "vector_store.lock")

batch_size = 100

# The file lock is held for the whole write (waits up to 120 s to acquire it).
with FileLock(lock_path, timeout=120):
    vector_store = Chroma(
        embedding_function=embeddings,
        collection_name="pymatgen_llm_doc_full",
        persist_directory=persist_directory
    )

    # Whole documents (no chunking) are added ``batch_size`` at a time.
    for i in tqdm(range(0, len(documents), batch_size), desc="Adding documents"):
        batch = documents[i:i + batch_size]
        vector_store.add_documents(batch)

Adding documents: 100%|██████████| 72/72 [03:25<00:00,  2.86s/it]
