In [None]:
# This experiment is conducted on Oracle OCI using the Data Flow service to understand the impact 
# of the 'num_partitions' parameter on the overall runtime of SERE.
# The 'num_partitions' parameter in the Word2Vec function controls the level of parallelism 
# during the computation. By varying this parameter, we aim to observe how it affects the time taken 
# to generate word embeddings in a distributed computing environment like Spark. 
# The experiment also involves measuring the time taken to generate motif walks and combining 
# it with the embedding time to get the total execution time. 
# The results will provide insights into optimizing 'num_partitions' for large-scale 
# knowledge graph processing tasks in a distributed setup.

# The code initializes a Spark session, loads an RDF dataset to create a knowledge graph,
# filters entities based on path numbers, and runs motif walks followed by Word2Vec embeddings. 
# The execution times for both steps are measured and printed for analysis.

# To repeat the experiment in the paper, re-run the final cell several times, each time increasing
# the value of the 'num_partitions' argument passed to word2Vec_embeddings.

In [None]:
import ads  # Importing the Oracle Cloud Infrastructure (OCI) Data Science API
ads.set_auth("resource_principal")  # Setting the authentication method (options: resource_principal or api_key)

# Loading the OCI Data Flow magic commands extension
%load_ext dataflow.magics

In [None]:
import json  # Importing the JSON module for handling JSON data

# Creating a command dictionary to configure the Data Flow session
command = {
    "compartmentId": "REMOVED_FOR_SECURITY",  # OCI Compartment ID
    "displayName": "Exec#1",  # Name for the Data Flow session
    "language": "PYTHON",  # Programming language for the session
    "sparkVersion": "3.2.1",  # Version of Apache Spark to be used
    "driverShape": "VM.Standard.E3.Flex",  # Compute shape for the driver node
    "executorShape": "VM.Standard.E3.Flex",  # Compute shape for the executor nodes
    "driverShapeConfig": {"ocpus": 4, "memoryInGBs": 32},  # Configuration for the driver node: 4 OCPUs and 32GB memory
    "executorShapeConfig": {"ocpus": 2, "memoryInGBs": 16},  # Configuration for executor nodes: 2 OCPUs and 16GB memory
    "numExecutors": 1,  # Number of executor nodes
    "type": "SESSION",  # Type of Data Flow execution (SESSION)
    "archiveUri": "REMOVED_FOR_SECURITY"  # URI to the archived job files
}

# Converting the command dictionary to a JSON string and escaping quotes for shell execution
command = f'\'{json.dumps(command)}\''
# Creating a new Data Flow session with the specified command
%create_session -l python -c $command

In [None]:
%%spark
import pyspark.sql.functions as f
import pandas as pd
from sparkkgml.kg import KG
from sparkkgml.motifWalks import MotifWalks
import time

In [None]:
%%spark
# Initialize the KG class with the provided RDF data and optional skip predicates
kg_instance = KG(location="/opt/dataflow/python/lib/user/AM/rdf_am-data.ttl", skip_predicates=[], sparkSession=spark)

# Create a new GraphFrame from the knowledge graph
graph = kg_instance.createKG()

In [None]:
%%spark
# Load the CSV file containing entity path numbers into a pandas DataFrame
df = pd.read_csv("/opt/dataflow/python/lib/user/AM/entity_path_numbers.csv")
# Filter entities with path numbers less than 1,000,000 and get the unique list of entities
entities = df[df['pathNum'] < 1000000]['entity'].unique().tolist()

In [None]:
%%spark
# Initialize lists to store path counts, walk times, embedding times, and total times
path_num = []
walk_times = []
embedding_times = []
total_times = []

In [None]:
%%spark
# Initialize the MotifWalks class with the filtered entities and the Spark session
motifWalks_instance = MotifWalks(kg_instance, entities=entities, sparkSession=spark)

# Measure the time taken to generate motif walks
start_time = time.time()
paths_df = motifWalks_instance.motif_walk(graph, 4)  # Generate motif walks with depth 4
path_num.append(paths_df.count())  # Count the number of generated paths and store it

end_time = time.time()
walk_time = end_time - start_time  # Calculate the time taken for motif walks
walk_times.append(round(walk_time, 3))  # Store the walk time

# Measure the time taken to generate Word2Vec embeddings
start_time = time.time()


# num_partitions should be changed to understand the affect of it for the experiment 
embeddings = motifWalks_instance.word2Vec_embeddings(
    paths_df, vector_size=100, window_size=5, min_count=5, max_iter=5, step_size=0.025, num_partitions=1, seed=42, input_col="paths", output_col="vectors"
)  # Generate Word2Vec embeddings

end_time = time.time()
embedding_time = end_time - start_time  # Calculate the time taken for embeddings
embedding_times.append(round(embedding_time, 3))  # Store the embedding time

# Calculate the total time for both motif walks and embeddings
total_time = round(walk_time + embedding_time, 3)
total_times.append(total_time)

# Print the results: path counts, walk times, embedding times, and total times
print('path nums:', path_num)
print('walk times:', walk_times)
print('embedding times:', embedding_times)
print('total times:', total_times)