In [10]:
from pathlib import Path
from langchain.memory import VectorStoreRetrieverMemory
from typing import List,Tuple
from langchain.schema import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter

def markdown_chunking(base_path:Path, file: str)-> Tuple[Document, List[Document]]:
    """Read a Markdown file and split it into one chunk per header section.

    Args:
        base_path: Directory that ``file`` is resolved against. It may be
            relative; it is made absolute before any path arithmetic.
        file: Path of the Markdown file, relative to ``base_path``.

    Returns:
        A ``(document, chunks)`` tuple. ``document`` wraps the whole file with
        metadata ``{"id": <path relative to base_path>}``. ``chunks`` is the
        list returned by ``MarkdownHeaderTextSplitter.split_text``; each chunk's
        metadata keeps whatever the splitter attached and additionally carries
        ``"id"`` and ``"doc_id"`` (``"<id>#<chunk index>"``).

    Raises:
        ValueError: If ``file`` is an absolute path that is not under
            ``base_path``.
        OSError: If the file cannot be opened for reading.
    """
    # Make the base absolute first: relative_to() refuses to compare an
    # absolute file path against a relative base directory.
    base_path = base_path.absolute()
    file_path = base_path.joinpath(file).absolute()
    file_id = str(file_path.relative_to(base_path))

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    document_content = get_file_content(file_path)
    document_metadata = {"id": file_id}
    document = Document(page_content=document_content, metadata=document_metadata)

    md_header_splits = markdown_splitter.split_text(document_content)
    for idx, doc in enumerate(md_header_splits):
        # Merge instead of overwrite so the splitter's header metadata is not
        # thrown away; "id" and "doc_id" are written last and therefore win.
        doc.metadata = {**doc.metadata, **document_metadata, "doc_id": f'{file_id}#{idx}'}

    return (document, md_header_splits)


def get_file_content(file_path):
    """Return the full text of ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text


In [None]:
# Smoke-test the Markdown chunker on the cookiecutter AUTHORS file.
doc, chunks = markdown_chunking(
    base_path=Path("./resources/cookiecutter/").absolute(),
    file="AUTHORS.md",
)
print(chunks)

In [22]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def rst_chunking(base_path:Path, file: str)-> Tuple[Document, List[Document]]:
    """Read a reStructuredText file and split it into size-bounded chunks.

    Args:
        base_path: Directory that ``file`` is resolved against. It may be
            relative; it is made absolute before any path arithmetic.
        file: Path of the file, relative to ``base_path``.

    Returns:
        A ``(document, chunks)`` tuple. ``document`` wraps the whole file with
        metadata ``{"id": <path relative to base_path>}``. ``chunks`` is the
        list returned by ``RecursiveCharacterTextSplitter.split_documents``;
        each chunk's metadata is ``{"id": ..., "doc_id": "<id>#<chunk index>"}``.

    Raises:
        ValueError: If ``file`` is an absolute path that is not under
            ``base_path``.
        OSError: If the file cannot be opened for reading.
    """
    # Make the base absolute first: relative_to() refuses to compare an
    # absolute file path against a relative base directory.
    base_path = base_path.absolute()
    file_path = base_path.joinpath(file).absolute()
    file_id = str(file_path.relative_to(base_path))

    document_content = get_file_content(file_path)
    document_metadata = {"id": file_id}
    document = Document(page_content=document_content, metadata=document_metadata)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
        # Tried in order; the empty string is the last-resort separator.
        separators=[
            "\n\n",
            "\n",
            " ",
            ".",
            ",",
            "\u200b",  # Zero-width space
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            "\u3002",  # Ideographic full stop
            "",
        ],
    )

    # split_documents() takes an iterable of Documents, so the single
    # document has to be wrapped in a list.
    rst_splits = text_splitter.split_documents([document])
    for idx, doc in enumerate(rst_splits):
        doc.metadata = {**document_metadata, "doc_id": f'{file_id}#{idx}'}

    return (document, rst_splits)

In [23]:
# NOTE(review): this feeds an .rst file to markdown_chunking even though
# rst_chunking is defined in the cell above — confirm which one was intended.
doc, chunks = markdown_chunking(
    base_path=Path("./resources/cookiecutter/").absolute(),
    file="docs/usage.rst",
)
print(chunks)

[Document(page_content="=====\nUsage\n=====  \nGrab a Cookiecutter template\n----------------------------  \nFirst, clone a Cookiecutter project template::  \n$ git clone https://github.com/audreyfeldroy/cookiecutter-pypackage.git  \nMake your changes\n-----------------  \nModify the variables defined in `cookiecutter.json`.  \nOpen up the skeleton project. If you need to change it around a bit, do so.  \nYou probably also want to create a repo, name it differently, and push it as\nyour own new Cookiecutter project template, for handy future use.  \nGenerate your project\n---------------------  \nThen generate your project from the project template::  \n$ cookiecutter cookiecutter-pypackage/  \nThe only argument is the input directory. (The output directory is generated\nby rendering that, and it can't be the same as the input directory.)  \n.. note:: see :ref:`command_line_options` for extra command line arguments  \nTry it out!  \nWorks directly with git and hg (mercurial) repos too\