-
Notifications
You must be signed in to change notification settings - Fork 6
/
04_06_solution_import_wiki_chunks
67 lines (61 loc) · 1.98 KB
/
04_06_solution_import_wiki_chunks
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import utils
import weaviate.classes as wvc
from weaviate.util import generate_uuid5
from pathlib import Path
import json
client = utils.connect_to_my_db()  # Connect to our own database

# Load the chunked pages from a file.
# Decode explicitly as UTF-8 (the standard JSON encoding) so the result does
# not depend on the platform's default text encoding.
chunks_path = Path("chunks.json")
chunked_pages = json.loads(chunks_path.read_text(encoding="utf-8"))

# Report how many chunks each page has.
# NOTE(review): assumes the JSON maps page name -> list of chunks - confirm
# against the script that wrote chunks.json.
for k, v in chunked_pages.items():
    print(f"Page {k} has {len(v)} chunks")
# Create a collection called WikiChunk
# Use the text2vec_openai vectorizer and the openai generative module
# The properties should be:
#   title (text)
#   chunk (text)
#   chunk_number (int)
#
# Only create the collection when it does not exist yet, so that running the
# script a second time does not fail on the create call.
if not client.collections.exists("WikiChunk"):
    client.collections.create(
        name="WikiChunk",
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
        generative_config=wvc.config.Configure.Generative.openai(),
        properties=[
            wvc.config.Property(
                name="title",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="chunk",
                data_type=wvc.config.DataType.TEXT,
            ),
            wvc.config.Property(
                name="chunk_number",
                data_type=wvc.config.DataType.INT,
            ),
        ],
    )
# Import every chunk into the WikiChunk collection.
# The whole import runs inside try/finally so the client connection is closed
# even when fetching the collection or inserting a batch raises.
try:
    # Get the collection
    wiki_chunks = client.collections.get(name="WikiChunk")

    # Iterate over the chunked pages
    for page_name, page_chunks in chunked_pages.items():
        # Build one DataObject per chunk, using the enumerate() index as the
        # chunk number. The UUID is derived from the page name and the chunk
        # index, so the same chunk always gets the same UUID.
        chunk_objs = [
            wvc.data.DataObject(
                properties={
                    "title": page_name,
                    "chunk": chunk,
                    "chunk_number": i,
                },
                uuid=generate_uuid5(f"{page_name}_{i}"),
            )
            for i, chunk in enumerate(page_chunks)
        ]

        # Nothing to send for a page without chunks; avoid an empty batch.
        if not chunk_objs:
            print(f"Skipping {page_name}: no chunks to import")
            continue

        # Insert the chunks for that document and surface per-object
        # failures instead of discarding the batch response.
        response = wiki_chunks.data.insert_many(chunk_objs)
        if response.has_errors:
            for idx, err in response.errors.items():
                print(
                    f"Failed to import chunk {idx} of {page_name}: "
                    f"{err.message}"
                )
        print(f"Finished importing {page_name}")
finally:
    client.close()