In [1]:
#import library needed
from neo4j import GraphDatabase
import torch
from PIL import Image
import os
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from collections import defaultdict
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a .env file into the process environment so the
# os.getenv("NEO4J_URI") / os.getenv("NEO4J_PASSWORD") lookups can find them.
load_dotenv()

True

In [3]:
# Neo4j Connection Details (read from environment variables)
URI = os.getenv("NEO4J_URI")
if not URI:
    # Fail fast with an actionable message: os.getenv returns None when the
    # variable is unset, and handing that to the driver would only surface
    # as a much less helpful error further down.
    raise RuntimeError(
        "NEO4J_URI is not set; define it in the environment or in the .env file."
    )
# We use 'neo4j' as the default user
AUTH = ("neo4j", os.getenv("NEO4J_PASSWORD"))

# Create the shared driver and confirm the server is reachable before any work starts.
driver = GraphDatabase.driver(URI, auth=AUTH)
driver.verify_connectivity()
print("Connection established.")

Connection established.


In [3]:
# Pick the GPU when CUDA is usable, otherwise stay on the CPU, then load the
# BAAI/bge-m3 embedding model from Hugging Face and move it onto that device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model = SentenceTransformer("BAAI/bge-m3")
model = model.to(device)

Using device: cpu


In [4]:
# Text embedding helper
def embed_text(text):
    """Encode ``text`` with the module-level ``model`` and return the result of ``.tolist()``.

    Args:
        text: Input handed unchanged to ``model.encode``.

    Returns:
        The encoded vector converted from a NumPy array to plain Python
        lists/floats via ``tolist()``.
    """
    return model.encode(text, convert_to_numpy=True).tolist()


In [5]:
# Build the sub-graph for one place: the Place node plus its links to State, Type, and per-language Content nodes

def create_place_graph(tx, place):
    """Upsert one place and its related nodes through the transaction ``tx``.

    Args:
        tx: Object exposing ``run(query, **params)``; every Cypher statement
            is issued through it.
        place: Mapping read with ``.get``. The keys used are ``title``,
            ``image_url``, ``state``, ``type``, ``ms_content`` and
            ``en_content``.

    Returns:
        None. Nothing is written when ``title`` is missing or empty.
    """
    title = place.get("title")
    if not title:
        return

    # Place node, keyed by title; embedding and image URL are (re)set on every call.
    upsert_place = """
        MERGE (p:Place {title: $title})
        SET p.embedding = $embedding,
            p.image_url = $image_url
    """
    tx.run(
        upsert_place,
        title=title,
        embedding=embed_text(title),
        image_url=place.get("image_url", ""),
    )

    # State node, linked from the place with IN_STATE (skipped when absent/empty).
    state_name = place.get("state")
    if state_name:
        link_state = """
            MERGE (s:State {name: $state})
            SET s.embedding = $embedding
            WITH s
            MATCH (p:Place {title: $title})
            MERGE (p)-[:IN_STATE]->(s)
        """
        tx.run(link_state, state=state_name, embedding=embed_text(state_name), title=title)

    # Type node, linked from the place with HAS_TYPE (skipped when absent/empty).
    type_name = place.get("type")
    if type_name:
        link_type = """
            MERGE (t:Type {name: $type})
            SET t.embedding = $embedding
            WITH t
            MATCH (p:Place {title: $title})
            MERGE (p)-[:HAS_TYPE]->(t)
        """
        tx.run(link_type, type=type_name, embedding=embed_text(type_name), title=title)

    # Content nodes, Malay first and then English. The relationship type cannot
    # be a query parameter, so it is interpolated from this fixed pair list.
    for lang, rel in (("ms", "HAS_MS_CONTENT"), ("en", "HAS_EN_CONTENT")):
        text = place.get(f"{lang}_content")
        if text:
            link_content = f"""
                MERGE (c:Content {{lang: $lang, text: $text}})
                SET c.embedding = $embedding
                WITH c
                MATCH (p:Place {{title: $title}})
                MERGE (p)-[:{rel}]->(c)
            """
            tx.run(link_content, lang=lang, text=text, embedding=embed_text(text), title=title)


In [None]:
# Load tourism dataset

JSON_PATH = "app/tourism_data.json"

def process_dataset(json_path=JSON_PATH):
    """Load the tourism JSON file and write every place into Neo4j.

    Args:
        json_path: Path of the UTF-8 JSON file to read. Defaults to
            ``JSON_PATH``. The loaded value is iterated and each item is
            handed to ``create_place_graph``.

    Side effects:
        Runs one ``session.execute_write`` per item, prints a completion
        message, and closes the module-level ``driver`` once the database
        work ends, whether it succeeded or raised. The driver has to be
        re-created before this function can be called again.
    """
    # Read the file before touching the database so that a bad path or
    # malformed JSON leaves the driver open and usable.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    try:
        with driver.session() as session:
            for place in tqdm(data, desc="📍 Processing Places"):
                # One managed write transaction per place.
                session.execute_write(create_place_graph, place)

        print("✅ Knowledge graph created with proper vector embeddings.")
    finally:
        # Close the driver even when a write fails part-way through.
        driver.close()

In [7]:
# Start processing the dataset.
# NOTE(review): process_dataset() closes the shared `driver` when it finishes,
# so the connection cell must be re-run before this cell can be run again.
if __name__ == "__main__":
    process_dataset()

📍 Processing Places: 100%|██████████| 216/216 [07:35<00:00,  2.11s/it]

✅ Knowledge graph created with proper vector embeddings.



