In [5]:
from pathlib import Path
from langchain.memory import VectorStoreRetrieverMemory
from typing import List,Tuple
from langchain.schema import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter

def markdown_chunking(base_path:Path, file: str)-> Tuple[Document, List[Document]]:
    file = str(base_path.joinpath(file).absolute())
    file_id = str(Path(file).relative_to(base_path))
    
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    
    document_content = get_file_content(base_path.joinpath(file).absolute())
    document_metadata = {"id":file_id}
    document =  Document(page_content=document_content,metadata=document_metadata) 
    md_header_splits = markdown_splitter.split_text(document_content)
    for idx,doc in enumerate(md_header_splits):
        doc.metadata = {**document_metadata,"doc_id": f'{file_id}#{idx}'}
    
    return (document,md_header_splits)


def get_file_content(file_path):
    with open(file_path, 'r', encoding = 'utf-8') as file:
        return file.read()


In [8]:
doc,chunks = markdown_chunking(Path("./resources/cookiecutter/").absolute(),"AUTHORS.md")

In [9]:
print(chunks)

[Document(page_content='- Audrey Roy Greenfeld ([@audreyfeldroy](https://github.com/audreyfeldroy))\n- Daniel Roy Greenfeld ([@pydanny](https://github.com/pydanny))\n- Raphael Pierzina ([@hackebrot](https://github.com/hackebrot))', metadata={'id': 'AUTHORS.md', 'doc_id': 'AUTHORS.md#0'}), Document(page_content='- Michael Joseph ([@michaeljoseph](https://github.com/michaeljoseph))\n- Paul Moore ([@pfmoore](https://github.com/pfmoore))\n- Andrey Shpak ([@insspb](https://github.com/insspb))\n- Sorin Sbarnea ([@ssbarnea](https://github.com/ssbarnea))\n- Fábio C. Barrionuevo da Luz ([@luzfcb](https://github.com/luzfcb))\n- Simone Basso ([@simobasso](https://github.com/simobasso))\n- Jens Klein ([@jensens](https://github.com/jensens))\n- Érico Andrei ([@ericof](https://github.com/ericof))', metadata={'id': 'AUTHORS.md', 'doc_id': 'AUTHORS.md#1'}), Document(page_content="- Steven Loria ([@sloria](https://github.com/sloria))\n- Goran Peretin ([@gperetin](https://github.com/gperetin))\n- Hamish