In [1]:
import json

with open("data_pipeline/action.json") as f:
    all_data = json.load(f)

len(all_data)

39

In [2]:
all_data[0].keys()

dict_keys(['name', 'description', 'genre', 'director', 'stars'])

In [3]:
def to_string(sample):
    information = ""
    information += f"The name of film is {sample['name']}. "
    information += sample['description']
    genres = ','.join(sample['genre'])
    information += f"The genres of film are {genres}. "
    information += f"The director of film is {sample['director']}. "
    information += f"Some stars of the film are {','.join(sample['stars'])}"
    return information

In [4]:
from llama_index.core import Document

documents = []
for doc_data in all_data:
    documents.append(Document(text=to_string(doc_data),
                            metadata={  
                                "filmname": doc_data["name"],
                            },
                            text_template="{content}"))


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from pyvi import ViTokenizer
from transformers import AutoTokenizer
from llama_index.core.node_parser import TokenTextSplitter, SentenceSplitter



In [6]:
base_node_parser = TokenTextSplitter( 
                                chunk_overlap=0,
                                chunk_size=900,
                                separator=" ",
                                backup_separators=["__", "..", "--"],
                                include_prev_next_rel=False
                                )


base_nodes = base_node_parser.get_nodes_from_documents(documents,
                                                       show_progress=True)

Parsing nodes: 100%|██████████| 39/39 [00:00<00:00, 3724.20it/s]


In [7]:
len(base_nodes)

39

In [8]:
child_node_parser= SentenceSplitter(
                    chunk_size=600,
                    chunk_overlap=90,
                    separator=" ",
                    include_prev_next_rel=False,
                    )

In [9]:
child_nodes = child_node_parser.get_nodes_from_documents(base_nodes,
                                                         show_progress=True)

Parsing nodes: 100%|██████████| 39/39 [00:00<00:00, 4070.62it/s]


In [10]:
len(child_nodes)

39

In [11]:
from tqdm import tqdm
from llama_index.core.schema import NodeRelationship

for child_node in tqdm(child_nodes):
    child_node.text = ViTokenizer.tokenize(child_node.text.lower())
    try:
        del child_node.relationships[NodeRelationship.SOURCE].metadata
    except AttributeError:
        continue

100%|██████████| 39/39 [00:00<00:00, 495.21it/s]


In [12]:
child_nodes[0].to_dict()

{'id_': '805cf048-1cb7-49dd-9617-996d81d31a7a',
 'embedding': None,
 'metadata': {'filmname': 'Die Hard'},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {<NodeRelationship.SOURCE: '1'>: {'node_id': 'ca948746-aa49-453c-9f0f-877f87515eef',
   'node_type': <ObjectType.TEXT: '1'>,
   'hash': 'ec423299550ca45785083cc4bbd2c78e18208eec91516cf043c4a934c07240a7',
   'class_name': 'RelatedNodeInfo'}},
 'text': 'the name of film is die hard . a new york city police officer tries to save his estranged wife and several others taken hostage by terrorists during a christmas party at the nakatomi plaza in los angeles . the genres of film are action , thriller . the director of film is john mctiernan . some stars of the film are bruce willes , alan rickman , bonnie bedelia',
 'start_char_idx': 0,
 'end_char_idx': 350,
 'text_template': '{content}',
 'metadata_template': '{key}: {value}',
 'metadata_seperator': '\n',
 'class_name': 'TextNode'}

## Indexing

In [13]:
# Save to disk
import weaviate
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from src.components.embedding.custom.text_embeddings_inference.base import TextEmbeddingsInference


WEAVIATE_URL = "http://localhost:9090"
DATA_COLLECTION = "Film"
embed_model = TextEmbeddingsInference(base_url="http://127.0.0.1:8081", 
                                      timeout=60)


client = weaviate.Client(WEAVIATE_URL)

vector_store = WeaviateVectorStore(weaviate_client=client,
                                   index_name=DATA_COLLECTION)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(child_nodes, 
                         storage_context=storage_context, 
                         embed_model=embed_model,
                         insert_batch_size=32768,
                         show_progress=True)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.

Generating embeddings:   0%|          | 0/39 [00:00<?, ?it/s]13:19:05.006 [INFO    ]                     httpx - HTTP Request: POST http://127.0.0.1:8081/embed "HTTP/1.1 200 OK"
Generating embeddings:  26%|██▌       | 10/39 [00:15<00:46,  1.60s/it]13:19:19.028 [INFO    ]                     httpx - HTTP Request: POST http://127.0.0.1:8081/embed "HTTP/1.1 200 OK"
Generating embeddings:  51%|█████▏    | 20/39 [00:30<00:28,  1.49s/it]13:19:35.616 [INFO    ]                     httpx - HTTP Request: POST http://127.0.0.1:8081/embed "HTTP/1.1 200 OK"
Generating embeddings:  77%|███████▋  | 30/39 [00:46<00:14,  1.56s/it]13:19:49.524 [INFO    ]                     httpx - HTTP Request: POST http://127.0.0.1:8081/embed "HTTP/1.1 200 OK"
Generating embeddings: 100%|██████████| 39/39 [01:00<00:00,  1.55s/it]


In [25]:
data = child_nodes[0]
try:
    client.data_object.create(
        data_object=data,
        class_name=DATA_COLLECTION
    )
    print("Data inserted into Weaviate successfully:", json.dumps(data))
except Exception as e:
    print("Failed to insert data into Weaviate:", e)

Failed to insert data into Weaviate: Argument is not of the supported types. Supported types are url or file path as string or schema as dict.


## Retrieval

In [28]:
from src.components.vector_store.custom.weaviate.base import CustomWeaviateVectorStore
from llama_index.core.indices.vector_store import VectorIndexRetriever, VectorStoreIndex

from pyvi import ViTokenizer
from llama_index.core.response.notebook_utils import display_source_node

vector_store = CustomWeaviateVectorStore(
                    weaviate_client=client,
                    index_name="Test_Film"
                )
vector_store_index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model
)
index = vector_store_index.as_retriever(
            similarity_top_k=5, vector_store_query_mode="hybrid"
        )

# base_retriever = vector_store.(vector_store_query_mode="hybrid",
#                                     similarity_top_k=5, 
#                                     alpha=0.5)

In [29]:
TEST_QUESTION = "I want to watch film about hitman"
retrievals = index.retrieve(
    TEST_QUESTION.lower()
)

for n in retrievals[:5]:
    display_source_node(n, source_length=1000, show_source_metadata=True)

12:19:38.890 [INFO    ]                     httpx - HTTP Request: POST http://127.0.0.1:8081/embed "HTTP/1.1 200 OK"


**Node ID:** 750cdd08-eb55-4e3a-a501-c7a36e1cae37<br>**Similarity:** 0.98576015<br>**Text:** the name of film is john wick . with the untimely death of his beloved wife still bitter in his mouth , john wick , the expert former assassin , receives one final gift from her - - a precious keepsake to help john find a new meaning in life now that she is gone . but when the arrogant russian mob prince , iosef tarasov , and his men pay wick a rather unwelcome visit to rob him of his prized 1969 mustang and his wife ' s present , the legendary hitman will be forced to unearth his meticulously concealed identity . blind with revenge , john will immediately unleash a carefully orchestrated maelstrom of destruction against the sophisticated kingpin , viggo tarasov , and his family , who are fully aware of his lethal capacity . now , only blood can quench the boogeyman ' s thirst for retribution . the genres of film are action , crime , thriller . the director of film is chad stahelski . some stars of the film are keanu reeves , micheal nyqvist , alfie allen<br>**Metadata:** {'filmname': 'John Wick'}<br>

**Node ID:** e0f0cacf-a9ed-4ddc-9a4b-91999c02bfa5<br>**Similarity:** 0.985842986<br>**Text:** the name of film is speed . when a young los angeles police department , special weapons and tactics ( s . w . a . t . ) officer called jack traven angers retired atlanta police department bomb squad member howard payne , by foiling his attempt at taking hostages stuck in an elevator with a bomb , payne in retaliation arms a bus with a bomb that will explode if it drops below 50 miles per hour . with the help of spunky passenger annie , jack and his partner detective harry temple try to save the people on the bus before the bomb goes off , while also trying to figure out how payne is monitoring them . the genres of film are action , adventure , thriller . the director of film is jan de bont . some stars of the film are keanu reeves , dennis hopper , sandra bullock<br>**Metadata:** {'filmname': 'Speed'}<br>

**Node ID:** 7ecb8c0e-e324-49cd-a984-02d45775896d<br>**Similarity:** 0.985866683<br>**Text:** the name of film is die hard with a vengeance . john mcclane is now almost a full - blown alcoholic and is suspended from the nypd . but when a bomb goes off in the bonwit teller department store the police go insane trying to figure out what ' s going on . soon , a man named simon calls and asks for mcclane . simon tells inspector walter cobb that mcclane is going to play a game called simon says . he says that mcclane is going to do the tasks he assigns him . if not , he ' ll set off another bomb . with the help of a harlem electrician , john mcclane must race all over new york trying to figure out the frustrating puzzles that the crafty terrorist gives him . but when a bomb goes off in a subway station right by the federal reserve ( the biggest gold storage in the world ) things start to get heated . the genres of film are action , adventure , thriller . the director of film is john mctiernan . some stars of the film are bruce willes , jeremy irons , samuel l . jackson<br>**Metadata:** {'filmname': 'Die Hard with a Vengeance'}<br>

**Node ID:** 488b4d1d-d683-41c8-ba01-964203e63201<br>**Similarity:** 0.985961539<br>**Text:** the name of film is last action hero . young danny madigan is a huge fan of jack slater , a larger - than - life action hero played by arnold schwarzenegger . when his best friend , nick the projectionist , gives him a magic ticket to the newest jack slater movie , danny is transported into slater ' s world , his number one hero where the good guys always win . it ' s a dream come true for danny , but things take a turn for the worse when one of slater ' s enemies , benedict the hit man , gets ahold of the ticket and ends up in danny ' s world . slater and danny must join forces and travel back and stop him at all costs before it ' ll be the end of jack slater . the genres of film are action , adventure , comedy , fantasy . the director of film is john mctiernan . some stars of the film are arnold schwarzenegger , f . , paul freeman<br>**Metadata:** {'filmname': 'Last Action Hero'}<br>