In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pandas as pd
from sparkkgml.kg import KG
from sparkkgml.motifWalks import MotifWalks
import time

In [None]:
# Reuse the already-running SparkSession if there is one; otherwise start a
# new session with the default configuration.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Build the knowledge-graph wrapper around the RDF file ./AM.ttl.
# An empty skip_predicates list means no predicate is excluded here.
kg_instance = KG(
    location="./AM.ttl",
    skip_predicates=[],
    sparkSession=spark,
)

# Materialise the graph object that the walk extraction operates on
# (presumably a GraphFrame, per the sparkkgml API -- TODO confirm).
graph = kg_instance.createKG()

In [None]:
# Read the per-entity path counts into a pandas DataFrame.
# The file is expected to provide at least the 'entity' and 'pathNum' columns,
# since both are selected below.
df = pd.read_csv("pyRDF2Vec/entity_path_numbers.csv")

# Keep only entities whose path count is below one million, de-duplicated,
# as a plain Python list (order of first appearance is preserved by unique()).
entities = (
    df.loc[df['pathNum'] < 1_000_000, 'entity']
    .unique()
    .tolist()
)

In [None]:
# Benchmark: measure how the motif-walk runtime grows with the number of seed
# entities. Iteration i uses the first i entities; results are accumulated in
# two parallel lists (path_num[k] and run_times[k] belong to the same run).
path_num = []   # number of paths produced per run
run_times = []  # elapsed seconds per run, rounded to milliseconds

# Subset sizes run from 1 to len(entities) inclusive. Starting at 0 would
# benchmark an empty entity list on the first pass and never include the
# last entity, so the range is shifted by one.
for i in range(1, len(entities) + 1):

    # Walker restricted to the first i entities. Construction is deliberately
    # kept outside the timed region so only the walk itself is measured.
    motifWalks_instance = MotifWalks(kg_instance, entities=entities[:i], sparkSession=spark)

    # perf_counter() is monotonic and high-resolution, so the measurement is
    # not affected by system clock adjustments (unlike time.time()).
    start_time = time.perf_counter()

    # Extract motif walks of depth 4 for the current subset of entities.
    paths_df = motifWalks_instance.motif_walk(graph, 4)

    # count() must stay inside the timed region: assuming paths_df is a lazily
    # evaluated Spark DataFrame (TODO confirm), this is the call that actually
    # triggers the computation.
    path_num.append(paths_df.count())

    end_time = time.perf_counter()

    # Record the elapsed time for this subset size.
    run_times.append(round(end_time - start_time, 3))

    # Report cumulative results after every run so progress is visible.
    print('path nums:', path_num)
    print('run times:', run_times)