In [7]:
from dotenv import load_dotenv
import os


# Load variables from the local .env file into the process environment.
load_dotenv(dotenv_path=".env")

# Secrets that must be present (from .env or the surrounding environment).
# The previous `os.environ[k] = os.getenv(k)` round-trip did nothing when the
# key existed and raised an opaque "TypeError: str expected, not NoneType" when
# it did not, so check explicitly and name the missing keys instead.
_required_env_keys = (
    "OPENAI_API_KEY",
    "LANGCHAIN_API_KEY",
    "TAVILY_API_KEY",
    "COHERE_API_KEY",
)
_missing_env_keys = [key for key in _required_env_keys if os.getenv(key) is None]
if _missing_env_keys:
    raise RuntimeError(
        "Missing required environment variables (set them in .env): "
        + ", ".join(_missing_env_keys)
    )

# Fixed LangChain tracing settings for this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "agent-book"

In [2]:
from langchain_community.document_loaders import GitLoader

def file_filter(file_path: str) -> bool:
    """Return True when ``file_path`` ends with the ``.md`` suffix.

    The comparison is case-sensitive and looks only at the last three
    characters of the path.
    """
    suffix = file_path[-3:]
    return suffix == ".md"

# Build a GitLoader for the `master` branch of the LangChain repository, using
# ./langchain as the local repo path and keeping only files that file_filter
# accepts.
loader = GitLoader(
    repo_path="./langchain",
    clone_url="https://github.com/langchain-ai/langchain",
    file_filter=file_filter,
    branch="master",
)

# Load the documents and report how many were loaded.
documents = loader.load()
print(f"Loaded {len(documents)} documents.")

Loaded 43 documents.


In [5]:
# Copy each document's "source" metadata value into a "filename" entry.
for document in documents:
    document.metadata.update(filename=document.metadata["source"])

In [None]:
# Apply the nest_asyncio patch before anything below runs.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the ragas calls below drive asyncio themselves — confirm.
import nest_asyncio
nest_asyncio.apply()

from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import default_query_distribution
from ragas.testset.transforms import Transforms
from ragas.testset.transforms.extractors import SummaryExtractor, EmbeddingExtractor
from ragas.testset.transforms.extractors.llm_based import ThemesExtractor, NERExtractor
from ragas.testset.transforms.relationship_builders import CosineSimilarityBuilder, OverlapScoreBuilder
from ragas.testset.transforms.filters import CustomNodeFilter
from ragas.testset.transforms.engine import Parallel
from ragas.testset.graph import NodeType
from ragas.utils import num_tokens_from_string
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Chat model (gpt-4o-mini) and embedding model handed to the generator.
generator_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

# Build the TestsetGenerator from the LangChain model objects above.
generator = TestsetGenerator.from_langchain(llm=generator_llm, embedding_model=embeddings)

# Node predicates for the custom transforms (HeadlinesExtractor and
# HeadlineSplitter are left out of the pipeline).
def filter_doc_with_num_tokens(node, min_num_tokens=500):
    """Tell whether ``node`` is a DOCUMENT node with more than ``min_num_tokens`` tokens.

    Args:
        node: Graph node exposing ``type`` and a ``properties`` mapping that
            holds a ``"page_content"`` entry.
        min_num_tokens: Exclusive lower bound on the token count of the page
            content (default 500).

    Returns:
        True only when the node type equals ``NodeType.DOCUMENT`` and the token
        count of its page content is strictly greater than ``min_num_tokens``.
    """
    # Non-document nodes are rejected without touching their properties.
    if not (node.type == NodeType.DOCUMENT):
        return False
    token_count = num_tokens_from_string(node.properties["page_content"])
    return token_count > min_num_tokens

def filter_chunks(node):
    """Tell whether ``node`` has type ``NodeType.CHUNK``.

    Not referenced by the transforms visible in this notebook section.
    """
    is_chunk = node.type == NodeType.CHUNK
    return is_chunk

# Transform pipeline that leaves out the problematic HeadlinesExtractor and
# HeadlineSplitter.
#
# `Transforms` is a typing alias (list of transformations / Parallel / single
# transformation), not a class, so calling `Transforms([...])` raises
# TypeError. The pipeline is therefore a plain list, annotated with the alias.
def _summarizable_doc(node):
    """Select DOCUMENT nodes whose page content is longer than 100 tokens."""
    return filter_doc_with_num_tokens(node, 100)


custom_transforms: Transforms = [
    # Summarize each selected document.
    SummaryExtractor(
        llm=generator.llm,
        filter_nodes=_summarizable_doc,
    ),
    # Embed the "summary" property into "summary_embedding".
    EmbeddingExtractor(
        embedding_model=generator.embedding_model,
        property_name="summary_embedding",
        embed_property_name="summary",
        filter_nodes=_summarizable_doc,
    ),
    # Build "summary_similarity" relationships from the summary embeddings
    # (threshold 0.5).
    CosineSimilarityBuilder(
        property_name="summary_embedding",
        new_property_name="summary_similarity",
        threshold=0.5,
        filter_nodes=_summarizable_doc,
    ),
    # Entity and theme extraction use no node filter here.
    NERExtractor(llm=generator.llm),
    ThemesExtractor(llm=generator.llm),
    # Overlap-based relationships (threshold 0.01).
    OverlapScoreBuilder(threshold=0.01),
]

# Build the query distribution with the generator's own LLM (`generator.llm`),
# the same object every transform above uses, rather than the raw LangChain
# ChatOpenAI instance.
query_dist = default_query_distribution(generator.llm)

# Generate the test set from the loaded documents using the custom transforms.
testset = generator.generate_with_langchain_docs(
    documents=documents,
    testset_size=4,
    query_distribution=query_dist,
    transforms=custom_transforms,
)

# Inspect the result: convert the test set to a DataFrame, then print the
# number of rows followed by the first few rows.
df = testset.to_pandas()
print(f"生成されたテストケース数: {len(df)}")
print("\n=== テストセットのサンプル ===")
sample_rows = df.head()
print(sample_rows)