In [None]:
import os
import yaml
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Root directory that is walked recursively to find Markdown files.
base_dir = "./docs"

def load_and_parse_file(file_path):
    """Return the full text of the file at ``file_path``, decoded as UTF-8.

    Despite the name, no parsing happens here; the raw text is returned
    unchanged.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def extract_front_matter(content):
    """Extract YAML front matter from the start of a Markdown document.

    Front matter is the text between an opening ``---`` at the very start of
    ``content`` and the next ``---`` that begins a line.

    Args:
        content: Full text of the document.

    Returns:
        The parsed front matter as a dict. An empty dict is returned when the
        document has no front matter, the block is unterminated, the YAML is
        invalid (the error is printed), or the YAML is not a mapping (for
        example an empty block, which parses to ``None``).
    """
    if not content.startswith('---'):
        return {}

    # Only a ``---`` at the start of a line closes the block; a ``---`` inside
    # a value (e.g. ``title: a --- b``) must not truncate the metadata.
    end = content.find('\n---', 3)
    if end == -1:
        return {}

    front_matter = content[3:end].strip()
    try:
        metadata = yaml.safe_load(front_matter)
    except yaml.YAMLError as e:
        print(f"Error parsing YAML: {e}")
        return {}

    # The result is merged into document metadata with ``dict.update``, so
    # never hand back ``None`` or a scalar/list.
    return metadata if isinstance(metadata, dict) else {}

# Shared text splitter, built through the tiktoken-based constructor.
# NOTE(review): chunk_size / chunk_overlap are presumably token counts rather
# than character counts -- confirm against the LangChain documentation.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_overlap=200, chunk_size=800)

def process_markdown_file(file_path):
    """Load one Markdown file and split it into metadata-tagged chunks.

    The file's YAML front matter (if any) is merged into the metadata of the
    document produced by ``UnstructuredFileLoader``, and the document text is
    then split with the module-level ``splitter``.

    Args:
        file_path: Path of the Markdown file to process.

    Returns:
        A list of split documents created with the merged metadata, or an
        empty list when the loader yields no documents.
    """
    content = load_and_parse_file(file_path)
    metadata = extract_front_matter(content)

    loader = UnstructuredFileLoader(file_path)
    documents = loader.load()
    if not documents:
        return []

    # Only the first loaded document is used.
    document = documents[0]

    # Front matter that is empty or not a YAML mapping can come back as None,
    # a scalar, or a list; dict.update() would raise on those, so merge
    # mappings only.
    if isinstance(metadata, dict):
        document.metadata.update(metadata)

    # Split the text and attach the merged metadata to the result.
    return splitter.create_documents([document.page_content], [document.metadata])

# Process every Markdown file under base_dir and collect the chunks in docs.
docs = []

for root, _, files in os.walk(base_dir):
    for file in files:
        # Skip anything that is not a Markdown file.
        if not file.endswith(".md"):
            continue
        file_path = os.path.join(root, file)
        documents = process_markdown_file(file_path)
        if not documents:
            continue
        # Add each split document individually (flat list, not nested).
        docs.extend(documents)

# Dump each chunk's text and metadata, followed by a separator line.
for doc in docs:
    print(doc.page_content, doc.metadata, "------------------------------", sep="\n")
