## 建立索引（Indexing）

In [3]:
import sys
# Put the parent directory on the module search path before the nanoidx imports below
# (presumably that is where the nanoidx package lives -- confirm against the repo layout).
sys.path.append('..')
from nanoidx.schema import BaseNode, TextNode
from nanoidx.semantic_splitter_node_parser import SemanticSplitterNodeParser

In [2]:
import pypdf

# Load the paper and extract the text of every page exactly once.
pdfreader = pypdf.PdfReader('../docs/deepseek-r1.pdf')
page_texts = [pdf_page.extract_text() for pdf_page in pdfreader.pages]

# `page` / `text` still refer to the first page, as before, in case later cells read them.
page = pdfreader.pages[0]
text = page_texts[0]

# Join pages with a newline: joining on "" glues the last word of one page to the
# first word of the next, which corrupts the text handed to the sentence splitter.
all_pages_text = "\n".join(page_texts)
print(f'All pages text:\n{all_pages_text[:1000]}...')

All pages text:
DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via
Reinforcement Learning
DeepSeek-AI
research@deepseek.com
Abstract
We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1.
DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without super-
vised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities.
Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing
reasoning behaviors. However, it encounters challenges such as poor readability, and language
mixing. To address these issues and further enhance reasoning performance, we introduce
DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-
R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the
research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models
(1.5B, 7B, 8B, 14B, 32B, 70B) distilled from Dee

### 1. 定义节点（Node）

In [19]:
# Inspect the fields declared on TextNode (field name -> FieldInfo) before building any nodes.
TextNode.model_fields

{'id_': FieldInfo(annotation=str, required=False, default_factory=<lambda>, description='Unique ID of the node.'),
 'embedding': FieldInfo(annotation=Union[List[float], NoneType], required=False, default=None, description='Embedding of the node.'),
 'metadata': FieldInfo(annotation=Dict[str, Any], required=False, default_factory=dict, alias='extra_info', alias_priority=2, description='A flat dictionary of metadata fields'),
 'excluded_embed_metadata_keys': FieldInfo(annotation=List[str], required=False, default_factory=list, description='Metadata keys that are excluded from text for the embed model.'),
 'excluded_llm_metadata_keys': FieldInfo(annotation=List[str], required=False, default_factory=list, description='Metadata keys that are excluded from text for the LLM.'),
 'hash': FieldInfo(annotation=str, required=False, default=<property object at 0x000001D871935B70>),
 'text': FieldInfo(annotation=str, required=False, default='', description='Text content of the node.'),
 'mimetype':

In [20]:
# Construct a TextNode with no arguments and display it to see the default field values.
node = TextNode()
node

TextNode(id_='8a2487bc-3fef-4ef4-8ae3-b258bf8a2475', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], hash=<property object at 0x000001D871935B70>, text='', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

### 2. 载入节点

In [21]:
# Build the splitter with its default settings and split the full document text into chunks.
semantic_parser = SemanticSplitterNodeParser()
chunks = semantic_parser.split_text(all_pages_text)

Text Embedding...: 100%|██████████| 912/912 [00:44<00:00, 20.32it/s]
Calculating distances between pairs of embeddings...: 100%|██████████| 911/911 [00:00<00:00, 12272.99it/s]
Building Text Chunks...: 100%|██████████| 46/46 [00:00<?, ?it/s]


In [22]:
# Embed all chunks in one batch call; expected to yield one embedding per chunk
# (the next cell asserts the lengths match).
chunk_embs = semantic_parser.get_text_embedding_batch(chunks)

Text Embedding...: 100%|██████████| 47/47 [00:19<00:00,  2.40it/s]


In [23]:
# Sanity check: there must be exactly as many embeddings as chunks before pairing them up.
assert len(chunks) == len(chunk_embs)

In [24]:
# One record per chunk: a positional id, the chunk text, and its embedding vector.
chunk_embs_dict_list = [
    {'id': idx, 'chunk': chunk_text, 'embedding': vector}
    for idx, chunk_text, vector in zip(range(len(chunks)), chunks, chunk_embs)
]

In [25]:
from tqdm import tqdm

# Wrap every chunk record in a TextNode that carries both the text and its embedding.
nodes = []
for chunk_emb_dict in tqdm(chunk_embs_dict_list):
    chunk_text, chunk_vector = chunk_emb_dict['chunk'], chunk_emb_dict['embedding']
    node = TextNode(text=chunk_text, embedding=chunk_vector)
    nodes += [node]

100%|██████████| 47/47 [00:00<00:00, 46669.58it/s]


In [26]:
# Preview only the first 10 nodes rather than the whole list.
# NOTE(review): each node's repr includes its full embedding vector, so this output is
# still very long -- consider showing only ids / text previews instead.
display_nodes = nodes[:10]
display_nodes

[TextNode(id_='ad76b0fd-ca73-4d0e-bd66-4a7bf16b815e', embedding=[0.011614857241511345, -0.010581101290881634, -0.004364491440355778, -0.025349019095301628, -0.015055932104587555, 0.011455350555479527, -0.056878428906202316, 0.024565795436501503, -0.030975110828876495, 0.03103703074157238, 0.027133354917168617, -0.05629664286971092, 0.07557607442140579, -0.0046440353617072105, -0.03513990342617035, 0.06726975739002228, -0.10865139216184616, -0.0069596124812960625, -0.05989440903067589, -0.02169753983616829, -0.0007745671318843961, 0.029219338670372963, -0.008198360912501812, 0.07747015357017517, -0.04365783929824829, -0.06448332220315933, -0.04036571457982063, -0.0038192416541278362, -0.0232645645737648, -0.013977359049022198, -0.10980138927698135, 0.027731705456972122, 0.021697890013456345, -0.02816244401037693, 0.029036521911621094, -0.00820094719529152, -0.061360668390989304, 0.004054820165038109, 0.001198234735056758, 0.0324377603828907, -0.0913594514131546, -0.003961572423577309, 0

### 3. 推理引擎（Query Engine）