### Python Code

In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language

In [2]:
# Sample Python module held as a plain string. In this notebook it is only
# printed and passed to the splitter as input text; the exact characters
# determine the chunk boundaries and lengths shown in the outputs.
python_code = """
import numpy as np
from typing import List, Optional

def calculate_mean(numbers: List[float]) -> float:
    '''Calculate the arithmetic mean of a list of numbers.
    
    Args:
        numbers: List of numerical values
        
    Returns:
        The mean value
    '''
    return sum(numbers) / len(numbers)

def calculate_median(numbers: List[float]) -> float:
    '''Calculate the median of a list of numbers.'''
    sorted_nums = sorted(numbers)
    n = len(sorted_nums)
    mid = n // 2
    
    if n % 2 == 0:
        return (sorted_nums[mid - 1] + sorted_nums[mid]) / 2
    return sorted_nums[mid]

class StatisticalAnalyzer:
    '''A class for performing statistical analysis on datasets.'''
    
    def __init__(self, data: List[float]):
        self.data = data
        self.mean = None
        self.median = None
    
    def analyze(self) -> dict:
        '''Perform complete statistical analysis.'''
        self.mean = calculate_mean(self.data)
        self.median = calculate_median(self.data)
        
        return {
            'mean': self.mean,
            'median': self.median,
            'count': len(self.data)
        }
    
    def get_summary(self) -> str:
        '''Return a formatted summary of the analysis.'''
        if self.mean is None:
            self.analyze()
        
        return f"Mean: {self.mean:.2f}, Median: {self.median:.2f}"

def main():
    '''Main execution function.'''
    data = [1.5, 2.3, 3.7, 4.2, 5.1]
    analyzer = StatisticalAnalyzer(data)
    results = analyzer.analyze()
    print(analyzer.get_summary())

if __name__ == "__main__":
    main()
"""


In [3]:
# Print the sample source so the input text can be read before it is split.
print(python_code)


import numpy as np
from typing import List, Optional

def calculate_mean(numbers: List[float]) -> float:
    '''Calculate the arithmetic mean of a list of numbers.

    Args:
        numbers: List of numerical values

    Returns:
        The mean value
    '''
    return sum(numbers) / len(numbers)

def calculate_median(numbers: List[float]) -> float:
    '''Calculate the median of a list of numbers.'''
    sorted_nums = sorted(numbers)
    n = len(sorted_nums)
    mid = n // 2

    if n % 2 == 0:
        return (sorted_nums[mid - 1] + sorted_nums[mid]) / 2
    return sorted_nums[mid]

class StatisticalAnalyzer:
    '''A class for performing statistical analysis on datasets.'''

    def __init__(self, data: List[float]):
        self.data = data
        self.mean = None
        self.median = None

    def analyze(self) -> dict:
        '''Perform complete statistical analysis.'''
        self.mean = calculate_mean(self.data)
        self.median = calculate_median(self.data)

        

In [None]:
# Create the splitter for Python source via the from_language constructor.
# The separator list for Language.PYTHON is printed by the
# get_separators_for_language cell later in this notebook.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the recorded chunk lengths, 592 and 675, stay under 700) - confirm.

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=700,
    chunk_overlap=100
)

In [None]:
# Split the sample source into a list of string chunks and print the raw list
# (escape sequences such as \n are visible in this form).

code_chunks = python_splitter.split_text(python_code)

print(code_chunks)

["import numpy as np\nfrom typing import List, Optional\n\ndef calculate_mean(numbers: List[float]) -> float:\n    '''Calculate the arithmetic mean of a list of numbers.\n\n    Args:\n        numbers: List of numerical values\n\n    Returns:\n        The mean value\n    '''\n    return sum(numbers) / len(numbers)\n\ndef calculate_median(numbers: List[float]) -> float:\n    '''Calculate the median of a list of numbers.'''\n    sorted_nums = sorted(numbers)\n    n = len(sorted_nums)\n    mid = n // 2\n\n    if n % 2 == 0:\n        return (sorted_nums[mid - 1] + sorted_nums[mid]) / 2\n    return sorted_nums[mid]", "class StatisticalAnalyzer:\n    '''A class for performing statistical analysis on datasets.'''\n\n    def __init__(self, data: List[float]):\n        self.data = data\n        self.mean = None\n        self.median = None\n\n    def analyze(self) -> dict:\n        '''Perform complete statistical analysis.'''\n        self.mean = calculate_mean(self.data)\n        self.median = c

In [16]:
from termcolor import colored, COLORS
from random import choice

In [17]:
def display_chunks(chunks):
    """Print every chunk in color, preceded by its position and length.

    The output starts with the total chunk count. Each chunk then gets a
    ``Chunk N: Length L chars`` header line, followed by the chunk text
    wrapped in a termcolor color and a blank line.

    Colors are assigned round-robin instead of at random, so consecutive
    chunks never share a color (as long as the palette holds more than one
    entry) and re-running the cell reproduces the same output.

    Args:
        chunks: Sequence of strings; it must support ``len()`` and iteration,
            e.g. the list returned by a splitter's ``split_text``.

    Returns:
        None. Everything is written to stdout.
    """
    # Palette: entries 2..7 of termcolor's COLORS mapping. Which color names
    # that selects depends on the key order of the installed termcolor version.
    colors_list = list(COLORS.keys())[2:8]
    print(f"Total Number of Chunks: {len(chunks)}")
    for num, chunk in enumerate(chunks, 1):
        print(f"Chunk {num}: Length {len(chunk)} chars")
        # Walk the palette in order so neighbouring chunks stay distinguishable.
        color = colors_list[(num - 1) % len(colors_list)]
        print(colored(text=chunk, color=color), end="\n\n")

In [18]:
# Show the Python chunks one by one, each with its index and character count.
display_chunks(code_chunks)

Total Number of Chunks: 4
Chunk 1: Length 592 chars
[35mimport numpy as np
from typing import List, Optional

def calculate_mean(numbers: List[float]) -> float:
    '''Calculate the arithmetic mean of a list of numbers.

    Args:
        numbers: List of numerical values

    Returns:
        The mean value
    '''
    return sum(numbers) / len(numbers)

def calculate_median(numbers: List[float]) -> float:
    '''Calculate the median of a list of numbers.'''
    sorted_nums = sorted(numbers)
    n = len(sorted_nums)
    mid = n // 2

    if n % 2 == 0:
        return (sorted_nums[mid - 1] + sorted_nums[mid]) / 2
    return sorted_nums[mid][0m

Chunk 2: Length 675 chars
[35mclass StatisticalAnalyzer:
    '''A class for performing statistical analysis on datasets.'''

    def __init__(self, data: List[float]):
        self.data = data
        self.mean = None
        self.median = None

    def analyze(self) -> dict:
        '''Perform complete statistical analysis.'''
        self.m

In [None]:
# get_separators_for_language is a static lookup, so call it on the class;
# it does not depend on any configured splitter instance.
RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)

['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']

In [None]:
# Call the static lookup on the class: the Markdown separators have nothing to
# do with the Python-configured splitter instance.
RecursiveCharacterTextSplitter.get_separators_for_language(Language.MARKDOWN)

['\n#{1,6} ',
 '```\n',
 '\n\\*\\*\\*+\n',
 '\n---+\n',
 '\n___+\n',
 '\n\n',
 '\n',
 ' ',
 '']

### JSON

In [21]:
# Nested sample document for the JSON splitter demo: a top-level dict whose
# "departments" value is a list of dicts (each with its own "projects" list),
# next to two plain nested dicts ("technologies" and "metadata").
JSON_DATA = {
    "company": "AI Research Corp",
    "departments": [
        {
            "name": "Machine Learning",
            "team_size": 25,
            "projects": [
                {
                    "id": "ML001",
                    "title": "Computer Vision System",
                    "description": "Developing advanced image recognition using CNNs",
                    "status": "active",
                    "team_members": ["Alice", "Bob", "Charlie"]
                },
                {
                    "id": "ML002",
                    "title": "NLP Platform",
                    "description": "Building transformer-based language models",
                    "status": "active",
                    "team_members": ["David", "Eve"]
                }
            ]
        },
        {
            "name": "Data Engineering",
            "team_size": 15,
            "projects": [
                {
                    "id": "DE001",
                    "title": "Data Pipeline",
                    "description": "ETL pipeline for real-time data processing",
                    "status": "active"
                }
            ]
        }
    ],
    "technologies": {
        "frameworks": ["TensorFlow", "PyTorch", "scikit-learn"],
        "languages": ["Python", "R", "Julia"],
        "cloud": ["AWS", "Google Cloud", "Azure"]
    },
    "metadata": {
        "founded": 2020,
        "headquarters": "San Francisco",
        "employees": 150
    }
}

In [22]:
from langchain_text_splitters import RecursiveJsonSplitter

In [27]:
# Create the JSON splitter with a requested maximum chunk size of 200.
# NOTE(review): the unit is presumably characters of the serialized JSON - confirm.

json_splitter = RecursiveJsonSplitter(
    max_chunk_size=200
)

In [28]:
# split_json returns the chunks as Python dictionaries (the printed result is
# a list of dicts).

chunks_dict = json_splitter.split_json(json_data=JSON_DATA)

print(chunks_dict)

[{'company': 'AI Research Corp', 'departments': [{'name': 'Machine Learning', 'team_size': 25, 'projects': [{'id': 'ML001', 'title': 'Computer Vision System', 'description': 'Developing advanced image recognition using CNNs', 'status': 'active', 'team_members': ['Alice', 'Bob', 'Charlie']}, {'id': 'ML002', 'title': 'NLP Platform', 'description': 'Building transformer-based language models', 'status': 'active', 'team_members': ['David', 'Eve']}]}, {'name': 'Data Engineering', 'team_size': 15, 'projects': [{'id': 'DE001', 'title': 'Data Pipeline', 'description': 'ETL pipeline for real-time data processing', 'status': 'active'}]}]}, {'technologies': {'frameworks': ['TensorFlow', 'PyTorch', 'scikit-learn'], 'languages': ['Python', 'R', 'Julia'], 'cloud': ['AWS', 'Google Cloud', 'Azure']}}, {'metadata': {'founded': 2020, 'headquarters': 'San Francisco', 'employees': 150}}]


In [29]:
# split_text returns each chunk as a JSON-formatted string instead of a dict.
# NOTE(review): in the recorded output the first chunk is 635 chars although
# max_chunk_size=200 - the whole "departments" list stays in one chunk. The
# LangChain docs describe a convert_lists=True option for splitting lists;
# confirm whether it should be used here.

chunks = json_splitter.split_text(JSON_DATA)

print(chunks)

['{"company": "AI Research Corp", "departments": [{"name": "Machine Learning", "team_size": 25, "projects": [{"id": "ML001", "title": "Computer Vision System", "description": "Developing advanced image recognition using CNNs", "status": "active", "team_members": ["Alice", "Bob", "Charlie"]}, {"id": "ML002", "title": "NLP Platform", "description": "Building transformer-based language models", "status": "active", "team_members": ["David", "Eve"]}]}, {"name": "Data Engineering", "team_size": 15, "projects": [{"id": "DE001", "title": "Data Pipeline", "description": "ETL pipeline for real-time data processing", "status": "active"}]}]}', '{"technologies": {"frameworks": ["TensorFlow", "PyTorch", "scikit-learn"], "languages": ["Python", "R", "Julia"], "cloud": ["AWS", "Google Cloud", "Azure"]}}', '{"metadata": {"founded": 2020, "headquarters": "San Francisco", "employees": 150}}']


In [30]:
# Reuse the display helper on the JSON string chunks produced by split_text.
display_chunks(chunks)

Total Number of Chunks: 3
Chunk 1: Length 635 chars
[35m{"company": "AI Research Corp", "departments": [{"name": "Machine Learning", "team_size": 25, "projects": [{"id": "ML001", "title": "Computer Vision System", "description": "Developing advanced image recognition using CNNs", "status": "active", "team_members": ["Alice", "Bob", "Charlie"]}, {"id": "ML002", "title": "NLP Platform", "description": "Building transformer-based language models", "status": "active", "team_members": ["David", "Eve"]}]}, {"name": "Data Engineering", "team_size": 15, "projects": [{"id": "DE001", "title": "Data Pipeline", "description": "ETL pipeline for real-time data processing", "status": "active"}]}]}[0m

Chunk 2: Length 157 chars
[32m{"technologies": {"frameworks": ["TensorFlow", "PyTorch", "scikit-learn"], "languages": ["Python", "R", "Julia"], "cloud": ["AWS", "Google Cloud", "Azure"]}}[0m

Chunk 3: Length 82 chars
[34m{"metadata": {"founded": 2020, "headquarters": "San Francisco", "employees": 1

### Markdown

In [31]:
# Sample Markdown document with three heading levels (#, ##, ###); it is the
# input text for the MarkdownHeaderTextSplitter demo that follows.
MARKDOWN_TEXT = """# Artificial Intelligence Overview

Artificial intelligence is transforming technology and shaping the future of computing.

## Machine Learning

Machine learning is a subset of AI that focuses on pattern recognition.

### Supervised Learning

Supervised learning algorithms learn from labeled training data.
They make predictions based on input-output pairs.

Common algorithms include:
- Linear regression
- Decision trees
- Support vector machines

### Unsupervised Learning

Unsupervised learning finds patterns in unlabeled data.
It's useful for clustering and dimensionality reduction.

Common techniques:
- K-means clustering
- Principal component analysis
- Hierarchical clustering

## Deep Learning

Deep learning uses neural networks with multiple layers.

### Neural Networks

Neural networks are inspired by biological neurons.
They consist of interconnected nodes organized in layers.

### Convolutional Neural Networks

CNNs excel at image recognition tasks.
They use convolutional layers to detect features hierarchically.

## Applications

AI has applications across multiple domains:

### Healthcare

- Disease diagnosis
- Drug discovery
- Medical imaging analysis

### Finance

- Fraud detection
- Algorithmic trading
- Risk assessment
"""

In [32]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [37]:
# Each pair is (Markdown heading prefix, metadata key). The key is the name
# under which the matching heading text appears in each chunk's metadata,
# e.g. {'Header_1': 'Artificial Intelligence Overview'} in the output below.
headers_to_split_on = [
    ("#", "Header_1"),
    ("##", "Header_2"),
    ("###", "Header_3")
]

In [38]:
# Create the Markdown splitter.
# With strip_headers=False the heading line stays in each chunk's page_content
# (the recorded chunks begin with "# ...", "## ..." or "### ...").
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False
)

In [39]:
# Split the Markdown text. The printed result is a list of Document objects;
# each holds the chunk text in page_content and the headings it belongs to
# in metadata.

markdown_chunks = markdown_splitter.split_text(MARKDOWN_TEXT)

print(markdown_chunks)

[Document(metadata={'Header_1': 'Artificial Intelligence Overview'}, page_content='# Artificial Intelligence Overview  \nArtificial intelligence is transforming technology and shaping the future of computing.'), Document(metadata={'Header_1': 'Artificial Intelligence Overview', 'Header_2': 'Machine Learning'}, page_content='## Machine Learning  \nMachine learning is a subset of AI that focuses on pattern recognition.'), Document(metadata={'Header_1': 'Artificial Intelligence Overview', 'Header_2': 'Machine Learning', 'Header_3': 'Supervised Learning'}, page_content='### Supervised Learning  \nSupervised learning algorithms learn from labeled training data.\nThey make predictions based on input-output pairs.  \nCommon algorithms include:\n- Linear regression\n- Decision trees\n- Support vector machines'), Document(metadata={'Header_1': 'Artificial Intelligence Overview', 'Header_2': 'Machine Learning', 'Header_3': 'Unsupervised Learning'}, page_content="### Unsupervised Learning  \nUnsu

In [40]:
# Print only the text of every chunk; end="\n\n" leaves a blank line between
# consecutive chunks.
for doc in markdown_chunks:
    print(doc.page_content, end="\n\n")

# Artificial Intelligence Overview  
Artificial intelligence is transforming technology and shaping the future of computing.

## Machine Learning  
Machine learning is a subset of AI that focuses on pattern recognition.

### Supervised Learning  
Supervised learning algorithms learn from labeled training data.
They make predictions based on input-output pairs.  
Common algorithms include:
- Linear regression
- Decision trees
- Support vector machines

### Unsupervised Learning  
Unsupervised learning finds patterns in unlabeled data.
It's useful for clustering and dimensionality reduction.  
Common techniques:
- K-means clustering
- Principal component analysis
- Hierarchical clustering

## Deep Learning  
Deep learning uses neural networks with multiple layers.

### Neural Networks  
Neural networks are inspired by biological neurons.
They consist of interconnected nodes organized in layers.

### Convolutional Neural Networks  
CNNs excel at image recognition tasks.
They use convoluti