# Chunking and Document Task Tests (English & Chinese)
This notebook contains unit tests for chunking and document tasks in `cognee/tasks`, covering both English and Chinese text.

In [47]:
import pytest
import importlib
import sys
import asyncio
import types

async def collect_async_generator(async_gen):
    """Drain an async iterable and return its items as a list.

    Args:
        async_gen: Any object usable in ``async for`` (e.g. an async generator).

    Returns:
        A list holding every yielded item, in iteration order.
    """
    return [element async for element in async_gen]

def import_task_module(module_path):
    """Import a module by its dotted path and hand back the module object.

    Args:
        module_path: Absolute dotted module name, e.g. ``'package.sub.module'``.

    Returns:
        The imported module, as returned by ``importlib.import_module``.
    """
    task_module = importlib.import_module(module_path)
    return task_module

def run_async(coro):
    """Drive ``coro`` from synchronous code and return its result.

    Two situations are handled:

    * No event loop is running in this thread (plain script, pytest): the
      coroutine is run to completion on a private, short-lived loop.
    * A loop is already running (e.g. inside a Jupyter kernel): if the
      optional ``nest_asyncio`` package is importable it is applied so the
      running loop can be re-entered, and the result is returned directly.

    Args:
        coro: The coroutine to execute.

    Returns:
        The value produced by ``coro``. Only when a loop is already running
        and ``nest_asyncio`` is not installed, the scheduled task is returned
        instead and the caller has to ``await`` it.

    Raises:
        Any exception raised by ``coro`` is propagated unchanged.
    """
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # get_running_loop() raises RuntimeError when no loop is running.
        running_loop = None

    if running_loop is None:
        # A private loop avoids the deprecated implicit-loop behaviour of
        # asyncio.get_event_loop() and leaves the thread's loop settings alone.
        private_loop = asyncio.new_event_loop()
        try:
            return private_loop.run_until_complete(coro)
        finally:
            private_loop.close()

    # Only the import is guarded: errors raised by the coroutine itself must
    # surface instead of being mistaken for a "loop already running" failure.
    try:
        import nest_asyncio
    except ImportError:
        # The running loop cannot be re-entered without nest_asyncio, so
        # schedule the coroutine and let the caller await the task.
        return asyncio.ensure_future(coro)

    nest_asyncio.apply()
    return running_loop.run_until_complete(coro)

In [48]:
import os 

# Point HF_ENDPOINT at the hf-mirror.com mirror.
# NOTE(review): presumably read by Hugging Face downloads triggered in later cells — confirm.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

## Test 1: Sentence Chunking (English & Chinese)

In [49]:
def test_chunk_by_sentence_english_and_chinese():
    """Check that ``chunk_by_sentence`` splits English and Chinese text losslessly.

    For each sample the chunks are printed, then verified: at least one chunk
    must be produced and concatenating the chunk texts must give back the
    original input.

    Raises:
        AssertionError: If a sample yields no chunks or the chunks do not
            reassemble into the input text.
    """
    chunk_by_sentence = import_task_module('cognee.tasks.chunks.chunk_by_sentence')
    samples = (
        ("English", "This is a sentence. Here is another one! And a third?"),
        ("Chinese", "这是一个句子。这里还有另一个！再来一个？"),
    )

    for language, text in samples:
        result = chunk_by_sentence.chunk_by_sentence(text)
        if asyncio.iscoroutine(result):
            # Use the notebook's helper rather than calling
            # run_until_complete directly on a loop that may already be running.
            chunks = run_async(result)
        else:
            chunks = list(result)
        print(f"{language} chunks: {chunks}")

        assert chunks, f"{language}: expected at least one sentence chunk"
        # Each chunk is a tuple whose second item is the sentence text.
        rebuilt = "".join(chunk[1] for chunk in chunks)
        assert rebuilt == text, f"{language}: chunks do not reassemble into the input"


# Execute the test right away so its printed chunks show up as this cell's output.
test_chunk_by_sentence_english_and_chinese()

English chunks: [(UUID('55c1d716-f043-4576-92b6-35752fd9cef5'), 'This is a sentence. ', 9, 'sentence_end'), (UUID('55c1d716-f043-4576-92b6-35752fd9cef5'), 'Here is another one! ', 9, 'sentence_end'), (UUID('55c1d716-f043-4576-92b6-35752fd9cef5'), 'And a third?', 6, 'sentence_end')]
Chinese chunks: [(UUID('0267e60b-823e-4797-89d4-5ef21a48e99e'), '这是一个句子。', 7, 'sentence_end'), (UUID('0267e60b-823e-4797-89d4-5ef21a48e99e'), '这里还有另一个！', 8, 'sentence_end'), (UUID('0267e60b-823e-4797-89d4-5ef21a48e99e'), '再来一个？', 5, 'sentence_end')]


## Test 2: Word and Paragraph Chunking (English & Chinese)

In [50]:
def test_chunk_by_word_english_and_chinese():
    """Check that ``chunk_by_word`` splits English and Chinese text losslessly.

    The chunks for both samples are printed (each with a language label) and
    then verified: every sample must yield at least one chunk, and joining the
    chunk texts must give back the original input.

    Raises:
        AssertionError: If a sample yields no chunks or the chunks do not
            reassemble into the input text.
    """
    chunk_by_word = import_task_module('cognee.tasks.chunks.chunk_by_word')
    english_text = "Hello world! This is a test."
    chinese_text = "你好世界！这是一个测试。"

    # Materialise the results once so they can be both printed and checked.
    english_chunks = list(chunk_by_word.chunk_by_word(english_text))
    chinese_chunks = list(chunk_by_word.chunk_by_word(chinese_text))
    print(f"English chunks: {english_chunks} ")
    print(f"Chinese chunks: {chinese_chunks} ")

    for language, text, chunks in (
        ("English", english_text, english_chunks),
        ("Chinese", chinese_text, chinese_chunks),
    ):
        assert chunks, f"{language}: expected at least one word chunk"
        # Each chunk is a tuple whose first item is the chunk text.
        rebuilt = "".join(chunk[0] for chunk in chunks)
        assert rebuilt == text, f"{language}: chunks do not reassemble into the input"

def test_chunk_by_paragraph_english_and_chinese():
    """Check that ``chunk_by_paragraph`` splits English and Chinese text losslessly.

    Both samples contain two paragraphs separated by a blank line and are
    chunked with ``max_chunk_size=10``. The chunks are printed and then
    verified: every sample must yield at least one chunk, and joining the
    ``'text'`` fields must give back the original input.

    Raises:
        AssertionError: If a sample yields no chunks or the chunk texts do not
            reassemble into the input text.
    """
    chunk_by_paragraph = import_task_module('cognee.tasks.chunks.chunk_by_paragraph')
    english_text = "Paragraph one.\n\nParagraph two."
    chinese_text = "第一段。\n\n第二段。"

    # Materialise the results once so they can be both printed and checked.
    english_chunks = list(
        chunk_by_paragraph.chunk_by_paragraph(english_text, max_chunk_size=10)
    )
    chinese_chunks = list(
        chunk_by_paragraph.chunk_by_paragraph(chinese_text, max_chunk_size=10)
    )
    print(f"English chunks: {english_chunks} ")
    print(f"Chinese chunks: {chinese_chunks} ")

    for language, text, chunks in (
        ("English", english_text, english_chunks),
        ("Chinese", chinese_text, chinese_chunks),
    ):
        assert chunks, f"{language}: expected at least one paragraph chunk"
        # Each chunk is a dict; its 'text' entry holds the chunk's text.
        rebuilt = "".join(chunk["text"] for chunk in chunks)
        assert rebuilt == text, f"{language}: chunks do not reassemble into the input"

# Execute both tests right away so their printed chunks show up as this cell's output.
test_chunk_by_word_english_and_chinese()
test_chunk_by_paragraph_english_and_chinese()

English chunks: [('Hello ', 'word'), ('world! ', 'sentence_end'), ('This ', 'word'), ('is ', 'word'), ('a ', 'word'), ('test.', 'sentence_end')] 
[('你好世界！', 'sentence_end'), ('这是一个测试。', 'sentence_end')]
English chunks: [{'text': 'Paragraph one.', 'chunk_size': 5, 'chunk_id': UUID('8c1cda85-52f4-508b-a2f4-ce77ebaa528a'), 'paragraph_ids': [UUID('b796e2a4-c26e-4ee2-b54a-be716a1e0eac')], 'chunk_index': 0, 'cut_type': 'paragraph_end'}, {'text': '\n\nParagraph two.', 'chunk_size': 8, 'chunk_id': UUID('16539612-e6fc-5eb2-bd2b-779d034c397a'), 'paragraph_ids': [UUID('b796e2a4-c26e-4ee2-b54a-be716a1e0eac')], 'chunk_index': 1, 'cut_type': 'sentence_end'}] 
Chinese chunks: [{'text': '第一段。', 'chunk_size': 5, 'chunk_id': UUID('e573efab-6951-5670-88e1-ba632e8d2eea'), 'paragraph_ids': [UUID('d0448a55-23c7-42cc-bd1d-d824193c9221')], 'chunk_index': 0, 'cut_type': 'paragraph_end'}, {'text': '\n\n第二段。', 'chunk_size': 7, 'chunk_id': UUID('6696819a-592e-5694-abff-c84b9c415fe9'), 'paragraph_ids': [UUID('d044