/
semantic_chunking.py
77 lines (67 loc) · 3.58 KB
/
semantic_chunking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import openai
from neumai.Shared.NeumDocument import NeumDocument
from typing import (
List,
)
def semantic_chunking_strategy(text:str) -> "openai.types.chat.ChatCompletionMessage":
    """Ask the chat model to describe how ``text`` should be split into chunks.

    ``text`` is sent as the user message, alongside a system prompt asking
    for a concise, implementable chunking strategy that keeps questions and
    answers, paragraphs and sentences together. The request goes to the
    ``gpt-4-0613`` model with temperature 0.9, so replies vary between calls.

    Args:
        text: Sample of the document the strategy should be designed for.

    Returns:
        The message object of the first completion choice (not a plain
        string). The strategy itself is the message's text content.
    """
    # One instruction per line. Joining with a newline keeps the sentences
    # apart; plain concatenation fuses them ("...assistantBased on...").
    system_prompt = "\n".join((
        'You are helpful assistant',
        'Based on a given piece of text provided, describe the correct strategy to split the text.',
        'Describe the schema you would follow to chunk the text',
        'Be concise, but specific the process to allow a developer to implement it',
        'The goal is for the chunks to mantain the semantic meaning of the text',
        'For example, if I have a text that have several questions and answers, they should be kept together inside the same chunk',
        'If I have a text that has many parapraphs, ideally try to keep pargraphs within the same chunk',
        'Same applies for sentences, try to keep them together within the same chunk and not cut in the middle.',
    ))
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    # The client is built with no arguments, so its configuration comes from
    # the openai library's defaults.
    client = openai.OpenAI()
    response = client.chat.completions.create(
        model="gpt-4-0613",
        messages=messages,
        temperature=0.9,
    )
    return response.choices[0].message
def semantic_chunking_strategy_code(text:str, chunking_strategy:str) -> "openai.types.chat.ChatCompletionMessage":
    """Ask the chat model to turn a chunking strategy into Python code.

    ``chunking_strategy`` is sent as the user message. The system prompt asks
    for a fenced ```python block defining ``split_text_into_chunks(text)``,
    which should return an array of text chunks. The request goes to the
    ``gpt-4-0613`` model with temperature 0.9, so replies vary between calls.

    Args:
        text: Not used by this function; kept so existing callers that pass
            it keep working.
        chunking_strategy: Natural-language description of how to split text.

    Returns:
        The message object of the first completion choice (not a plain
        string). The generated code is inside the message's text content.
    """
    # One instruction per line. Joining with a newline keeps the sentences
    # apart; plain concatenation fuses them ("...python code.Output the...").
    system_prompt = "\n".join((
        'You are helpful developer that writes python code.',
        'Output the code in this format: ```python def split_text_into_chunks(text): <Insert Code>```',
        'The function `split_text_into_chunks` should output an array of text chunks.',
        'Implement the strategy provided by the user to help split text.',
    ))
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": chunking_strategy},
    ]
    # The client is built with no arguments, so its configuration comes from
    # the openai library's defaults.
    client = openai.OpenAI()
    response = client.chat.completions.create(
        model="gpt-4-0613",
        messages=messages,
        temperature=0.9,
    )
    return response.choices[0].message
def _reply_text(message) -> str:
    """Return the text of a chat reply given as a str, a dict or an object with ``.content``."""
    if isinstance(message, str):
        content = message
    elif isinstance(message, dict):
        content = message.get("content")
    else:
        # Message objects from the openai>=1 client expose the text as an
        # attribute and cannot be subscripted with ['content'].
        content = getattr(message, "content", None)
    if not content:
        raise ValueError("The model reply has no text content.")
    return content


def _extract_python_block(reply: str) -> str:
    """Return the code between the first "```python" fence and the next "```"."""
    _, fence, remainder = reply.partition("```python")
    if not fence:
        raise ValueError(
            "The model reply does not contain a ```python code block: "
            + repr(reply[:200])
        )
    # If the closing fence is missing, everything after the opening fence is
    # returned.
    return remainder.partition("```")[0]


def semantic_chunking_code(text:str) -> str:
    """Generate the source code of a chunking function tailored to ``text``.

    Steps:
      1. Reduce ``text`` to a sample with ``cut_text``.
      2. Ask the model for a chunking strategy for that sample.
      3. Ask the model to implement the strategy as Python code.
      4. Extract the code from the fenced ```python block of the reply.

    Args:
        text: Document text the chunking code should be designed for.

    Returns:
        The Python source found inside the reply's ```python fence. The
        prompt asks for it to define ``split_text_into_chunks(text)``.

    Raises:
        ValueError: If a model reply has no text content, or the code reply
            contains no ```python fence.
    """
    fixed_text = cut_text(text)
    chunking_strategy = _reply_text(semantic_chunking_strategy(text=fixed_text))
    chunking_code = _reply_text(
        semantic_chunking_strategy_code(text=fixed_text, chunking_strategy=chunking_strategy)
    )
    return _extract_python_block(chunking_code)
def semantic_chunking(documents:List[NeumDocument], chunking_code_exec: str) -> List[NeumDocument]:
    """Split every document into chunks using dynamically supplied Python code.

    ``chunking_code_exec`` is executed and must define a callable named
    ``split_text_into_chunks`` that takes a text and returns an iterable of
    chunk strings. Each chunk becomes a new ``NeumDocument`` that reuses the
    source document's ``id`` and ``metadata``.

    Args:
        documents: Documents to split.
        chunking_code_exec: Python source defining ``split_text_into_chunks``.

    Returns:
        One ``NeumDocument`` per produced chunk, in document order.

    Raises:
        NameError: If the executed code does not define a callable
            ``split_text_into_chunks``.
    """
    # SECURITY: this runs arbitrary Python. Only pass code from a trusted
    # source, or run this in a sandbox; model-generated code is untrusted.
    #
    # The code runs in a copy of the module namespace: it can still see the
    # module's names, but it cannot overwrite them.
    namespace = dict(globals())
    # Drop any earlier definition so a stale function is never reused when
    # this code fails to define its own.
    namespace.pop("split_text_into_chunks", None)
    exec(chunking_code_exec, namespace)

    splitter = namespace.get("split_text_into_chunks")
    if not callable(splitter):
        raise NameError(
            "chunking_code_exec did not define a callable 'split_text_into_chunks'"
        )

    result_doc = []
    for doc in documents:
        # NOTE(review): NeumDocument is constructed with `content=` below, so
        # the text attribute is presumably `content`. `page_content` is tried
        # first so existing callers keep working - confirm against
        # NeumDocument.
        text = doc.page_content if hasattr(doc, "page_content") else doc.content
        result_doc.extend(
            NeumDocument(id=doc.id, content=chunk, metadata=doc.metadata)
            for chunk in splitter(text)
        )
    return result_doc
def cut_text(s: str, max_words: int = 750, skip_words: int = 500) -> str:
    """Return a sample of at most ``max_words`` words taken from ``s``.

    Words are maximal runs of non-whitespace characters. The excerpt is
    sliced out of the original string, so the whitespace between the kept
    words (newlines, paragraph breaks) is preserved exactly.

    Selection rules:
      * ``s`` has at most ``max_words`` words: ``s`` is returned unchanged.
      * ``s`` has at least ``skip_words + max_words`` words: the first
        ``skip_words`` words are skipped and the next ``max_words`` returned.
      * otherwise: the last ``max_words`` words are returned.

    Args:
        s: Text to sample.
        max_words: Maximum number of words in the sample (default 750).
        skip_words: Number of leading words to skip when the text is long
            enough (default 500).

    Returns:
        ``s`` itself or a contiguous excerpt of it.

    Raises:
        ValueError: If ``max_words`` is not positive or ``skip_words`` is
            negative.
    """
    import re  # imported here so the block needs no file-level import change

    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    if skip_words < 0:
        raise ValueError("skip_words must not be negative")

    # (start, end) offsets of every word in the original string.
    spans = [match.span() for match in re.finditer(r"\S+", s)]
    word_count = len(spans)
    if word_count <= max_words:
        return s

    if word_count >= skip_words + max_words:
        first = skip_words
    else:
        # Not enough words to skip the prefix: fall back to the tail.
        first = word_count - max_words
    last = first + max_words - 1
    return s[spans[first][0]:spans[last][1]]