In [None]:
# this code is prepared in Databricks 
# pip install sparkkgml library
!pip install sparkkgml

In [None]:
# import the standard-library, sparkkgml and pyspark names used in this notebook
import time
from sparkkgml.data_acquisition import DataAcquisition
from sparkkgml.feature_engineering import FeatureEngineering
from sparkkgml.vectorization import Vectorization
from pyspark.sql.functions import regexp_replace

In [None]:
# Endpoints and SPARQL queries, one pair per sample dataset size.
# endpointN / queryN (N = 1..5) correspond to sampleMovieRDF10 ... sampleMovieRDF100000.

_DATASET_SIZES = (10, 100, 1000, 10000, 100000)

# The endpoints are addressed over https, while the predicate IRIs inside the
# queries are written with the http scheme.
_ENDPOINT_ROOT = "https://sparkkgml.arcc.albany.edu/sampleMovieRDF"
_PREDICATE_ROOT = "http://sparkkgml.arcc.albany.edu/sampleMovieRDF"


def _movie_query(size):
    """Build the SELECT query text for the sample dataset of the given size.

    The query selects ?movie ?producer ?date ?actor and matches the actedIn,
    publishedDate and producedBy predicates of that dataset's namespace.
    """
    namespace = _PREDICATE_ROOT + str(size) + "/movie_information.org/"
    return "\n".join([
        "SELECT  ?movie ?producer ?date ?actor",
        "        WHERE {",
        "        ?actor  <" + namespace + "actedIn> ?movie.",
        "        ?movie <" + namespace + "publishedDate> ?date.",
        "        ?movie <" + namespace + "producedBy> ?producer.",
        "        } ",
        "        ",
    ])


endpoint1, endpoint2, endpoint3, endpoint4, endpoint5 = (
    _ENDPOINT_ROOT + str(size) for size in _DATASET_SIZES
)

query1, query2, query3, query4, query5 = (
    _movie_query(size) for size in _DATASET_SIZES
)

In [None]:
# Dataset Size 1 (endpoint1 / query1, sampleMovieRDF10)

# Run the whole pipeline 10 times and record the wall-clock runtime (seconds,
# rounded to 2 decimals) of every stage for every run.
data_acq_times=[]
feature_eng_times=[]
vectorization_times=[]
total_times=[]

for i in range(10):
    total_time_start = time.time()
    # create an instance of DataAcquisition
    # and set its null-handling parameters (custom replacement values -1 and ' ')
    dataAcquisitionObject=DataAcquisition(spark)
    dataAcquisitionObject.set_amputationMethod('nullReplacement')
    dataAcquisitionObject.set_nullReplacementMethod('customValue')
    dataAcquisitionObject.set_customValueVariable(-1)
    dataAcquisitionObject.set_customStringValueVariable(' ')

    # retrieve the data as a Spark DataFrame
    # query1 is the query written for endpoint1; the bare name `query` is not
    # defined by the preceding cells and would raise a NameError here.
    start_time1 = time.time()
    df = dataAcquisitionObject.getDataFrame(endpoint=endpoint1, query=query1)
    end_time1 = time.time()
    data_acq_times.append(round(end_time1 - start_time1, 2))

    # apply preprocess: strip the dataset URI prefix from the entity columns
    df = df.withColumn("movie", regexp_replace('movie','http://sparkkgml.arcc.albany.edu/sampleMovieRDF10/movie_information.org/movie/',''))
    df = df.withColumn("producer", regexp_replace('producer','http://sparkkgml.arcc.albany.edu/sampleMovieRDF10/movie_information.org/producer/',''))
    df = df.withColumn("actor", regexp_replace('actor','http://sparkkgml.arcc.albany.edu/sampleMovieRDF10/movie_information.org/actor/',''))


    # create an instance of FeatureEngineering
    # call getFeatures function and get features for every column
    featureEngineeringObject=FeatureEngineering()
    start_time1 = time.time()
    df2,features=featureEngineeringObject.getFeatures(df)
    end_time1 = time.time()
    feature_eng_times.append(round(end_time1 - start_time1, 2))

    # create an instance of Vectorization module
    # call vectorize function and digitize all the features
    vectorizationObject=Vectorization()
    start_time1 = time.time()
    digitized_df=vectorizationObject.vectorize(df2,features)
    end_time1 = time.time()
    vectorization_times.append(round(end_time1 - start_time1, 2))

    # total covers object setup, acquisition, preprocessing and both later stages
    total_time_end = time.time()
    total_times.append(round(total_time_end - total_time_start, 2))

print('data_acq_times:',data_acq_times)
print('feature_eng_times:',feature_eng_times)
print('vectorization_times:',vectorization_times)
print('total_times:',total_times)

In [None]:
# Dataset Size 2 (endpoint2 / query2, sampleMovieRDF100)

# Run the whole pipeline 10 times and record the wall-clock runtime (seconds,
# rounded to 2 decimals) of every stage for every run.
# All four lists are reset here so this cell's results do not accumulate on top
# of timings left over from a previously executed cell.
data_acq_times=[]
feature_eng_times=[]
vectorization_times=[]
total_times=[]

for i in range(10):
    total_time_start = time.time()
    # create an instance of DataAcquisition
    # and set its null-handling parameters (custom replacement values -1 and ' ')
    dataAcquisitionObject=DataAcquisition(spark)
    dataAcquisitionObject.set_amputationMethod('nullReplacement')
    dataAcquisitionObject.set_nullReplacementMethod('customValue')
    dataAcquisitionObject.set_customValueVariable(-1)
    dataAcquisitionObject.set_customStringValueVariable(' ')

    # retrieve the data as a Spark DataFrame
    # (must happen before preprocessing, otherwise the preprocessing would act on
    # a stale `df` and then be overwritten by the raw query result)
    start_time1 = time.time()
    df = dataAcquisitionObject.getDataFrame(endpoint=endpoint2, query=query2)
    end_time1 = time.time()
    data_acq_times.append(round(end_time1 - start_time1, 2))

    # apply preprocess: strip the dataset URI prefix from the entity columns
    df = df.withColumn("movie", regexp_replace('movie','http://sparkkgml.arcc.albany.edu/sampleMovieRDF100/movie_information.org/movie/',''))
    df = df.withColumn("producer", regexp_replace('producer','http://sparkkgml.arcc.albany.edu/sampleMovieRDF100/movie_information.org/producer/',''))
    df = df.withColumn("actor", regexp_replace('actor','http://sparkkgml.arcc.albany.edu/sampleMovieRDF100/movie_information.org/actor/',''))

    # create an instance of FeatureEngineering
    # call getFeatures function and get features for every column
    featureEngineeringObject=FeatureEngineering()
    start_time1 = time.time()
    df2,features=featureEngineeringObject.getFeatures(df)
    end_time1 = time.time()
    feature_eng_times.append(round(end_time1 - start_time1, 2))

    # create an instance of Vectorization module
    # call vectorize function and digitize all the features
    vectorizationObject=Vectorization()
    start_time1 = time.time()
    digitized_df=vectorizationObject.vectorize(df2,features)
    end_time1 = time.time()
    vectorization_times.append(round(end_time1 - start_time1, 2))

    # total covers object setup, acquisition, preprocessing and both later stages
    total_time_end = time.time()
    total_times.append(round(total_time_end - total_time_start, 2))

print('data_acq_times:',data_acq_times)
print('feature_eng_times:',feature_eng_times)
print('vectorization_times:',vectorization_times)
print('total_times:',total_times)

In [None]:
# Dataset Size 3 (endpoint3 / query3, sampleMovieRDF1000)

# Run the whole pipeline 10 times and record the wall-clock runtime (seconds,
# rounded to 2 decimals) of every stage for every run.
data_acq_times=[]
feature_eng_times=[]
vectorization_times=[]
total_times=[]

for i in range(10):
    total_time_start = time.time()
    # create an instance of DataAcquisition
    # and set its null-handling parameters (custom replacement values -1 and ' ')
    dataAcquisitionObject=DataAcquisition(spark)
    dataAcquisitionObject.set_amputationMethod('nullReplacement')
    dataAcquisitionObject.set_nullReplacementMethod('customValue')
    dataAcquisitionObject.set_customValueVariable(-1)
    dataAcquisitionObject.set_customStringValueVariable(' ')

    # retrieve the data as a Spark DataFrame
    # Without this step the cell would reuse whatever `df` an earlier cell left
    # behind (a different dataset) and data_acq_times would stay empty.
    start_time1 = time.time()
    df = dataAcquisitionObject.getDataFrame(endpoint=endpoint3, query=query3)
    end_time1 = time.time()
    data_acq_times.append(round(end_time1 - start_time1, 2))

    # apply preprocess: strip the dataset URI prefix from the entity columns
    df = df.withColumn("movie", regexp_replace('movie','http://sparkkgml.arcc.albany.edu/sampleMovieRDF1000/movie_information.org/movie/',''))
    df = df.withColumn("producer", regexp_replace('producer','http://sparkkgml.arcc.albany.edu/sampleMovieRDF1000/movie_information.org/producer/',''))
    df = df.withColumn("actor", regexp_replace('actor','http://sparkkgml.arcc.albany.edu/sampleMovieRDF1000/movie_information.org/actor/',''))

    # create an instance of FeatureEngineering
    # call getFeatures function and get features for every column
    featureEngineeringObject=FeatureEngineering()
    start_time1 = time.time()
    df2,features=featureEngineeringObject.getFeatures(df)
    end_time1 = time.time()
    feature_eng_times.append(round(end_time1 - start_time1, 2))

    # create an instance of Vectorization module
    # call vectorize function and digitize all the features
    vectorizationObject=Vectorization()
    start_time1 = time.time()
    digitized_df=vectorizationObject.vectorize(df2,features)
    end_time1 = time.time()
    vectorization_times.append(round(end_time1 - start_time1, 2))

    # total covers object setup, acquisition, preprocessing and both later stages
    total_time_end = time.time()
    total_times.append(round(total_time_end - total_time_start, 2))

print('data_acq_times:',data_acq_times)
print('feature_eng_times:',feature_eng_times)
print('vectorization_times:',vectorization_times)
print('total_times:',total_times)

In [None]:
# Dataset Size 4 (endpoint4 / query4)

# Benchmark: repeat the pipeline ten times, logging how long each stage takes
# (seconds, rounded to 2 decimals) on every repetition.
data_acq_times, feature_eng_times, vectorization_times, total_times = [], [], [], []

# URI prefix stripped from each entity column during preprocessing
uri_prefix_by_column = {
    "movie": 'http://sparkkgml.arcc.albany.edu/sampleMovieRDF10000/movie_information.org/movie/',
    "producer": 'http://sparkkgml.arcc.albany.edu/sampleMovieRDF10000/movie_information.org/producer/',
    "actor": 'http://sparkkgml.arcc.albany.edu/sampleMovieRDF10000/movie_information.org/actor/',
}

for run_idx in range(10):
    run_started = time.time()

    # DataAcquisition instance together with its null-value parameters
    dataAcquisitionObject = DataAcquisition(spark)
    dataAcquisitionObject.set_amputationMethod('nullReplacement')
    dataAcquisitionObject.set_nullReplacementMethod('customValue')
    dataAcquisitionObject.set_customValueVariable(-1)
    dataAcquisitionObject.set_customStringValueVariable(' ')

    # stage 1: fetch the query result as a Spark DataFrame
    stage_started = time.time()
    df = dataAcquisitionObject.getDataFrame(endpoint=endpoint4, query=query4)
    data_acq_times.append(round(time.time() - stage_started, 2))

    # preprocessing (not timed separately): drop the URI prefix from movie, producer, actor
    for column_name, uri_prefix in uri_prefix_by_column.items():
        df = df.withColumn(column_name, regexp_replace(column_name, uri_prefix, ''))

    # stage 2: FeatureEngineering.getFeatures yields the frame plus per-column features
    featureEngineeringObject = FeatureEngineering()
    stage_started = time.time()
    df2, features = featureEngineeringObject.getFeatures(df)
    feature_eng_times.append(round(time.time() - stage_started, 2))

    # stage 3: Vectorization.vectorize digitizes all the features
    vectorizationObject = Vectorization()
    stage_started = time.time()
    digitized_df = vectorizationObject.vectorize(df2, features)
    vectorization_times.append(round(time.time() - stage_started, 2))

    # end-to-end time of this repetition, object setup included
    total_times.append(round(time.time() - run_started, 2))

for label, timings in (
    ('data_acq_times:', data_acq_times),
    ('feature_eng_times:', feature_eng_times),
    ('vectorization_times:', vectorization_times),
    ('total_times:', total_times),
):
    print(label, timings)

In [None]:
# Dataset Size 5 (endpoint5 / query5)

# Benchmark: repeat the pipeline ten times, logging how long each stage takes
# (seconds, rounded to 2 decimals) on every repetition.
data_acq_times, feature_eng_times, vectorization_times, total_times = [], [], [], []

# URI prefix stripped from each entity column during preprocessing
uri_prefix_by_column = {
    "movie": 'http://sparkkgml.arcc.albany.edu/sampleMovieRDF100000/movie_information.org/movie/',
    "producer": 'http://sparkkgml.arcc.albany.edu/sampleMovieRDF100000/movie_information.org/producer/',
    "actor": 'http://sparkkgml.arcc.albany.edu/sampleMovieRDF100000/movie_information.org/actor/',
}

for run_idx in range(10):
    run_started = time.time()

    # DataAcquisition instance together with its null-value parameters
    dataAcquisitionObject = DataAcquisition(spark)
    dataAcquisitionObject.set_amputationMethod('nullReplacement')
    dataAcquisitionObject.set_nullReplacementMethod('customValue')
    dataAcquisitionObject.set_customValueVariable(-1)
    dataAcquisitionObject.set_customStringValueVariable(' ')

    # stage 1: fetch the query result as a Spark DataFrame
    stage_started = time.time()
    df = dataAcquisitionObject.getDataFrame(endpoint=endpoint5, query=query5)
    data_acq_times.append(round(time.time() - stage_started, 2))

    # preprocessing (not timed separately): drop the URI prefix from movie, producer, actor
    for column_name, uri_prefix in uri_prefix_by_column.items():
        df = df.withColumn(column_name, regexp_replace(column_name, uri_prefix, ''))

    # stage 2: FeatureEngineering.getFeatures yields the frame plus per-column features
    featureEngineeringObject = FeatureEngineering()
    stage_started = time.time()
    df2, features = featureEngineeringObject.getFeatures(df)
    feature_eng_times.append(round(time.time() - stage_started, 2))

    # stage 3: Vectorization.vectorize digitizes all the features
    vectorizationObject = Vectorization()
    stage_started = time.time()
    digitized_df = vectorizationObject.vectorize(df2, features)
    vectorization_times.append(round(time.time() - stage_started, 2))

    # end-to-end time of this repetition, object setup included
    total_times.append(round(time.time() - run_started, 2))

for label, timings in (
    ('data_acq_times:', data_acq_times),
    ('feature_eng_times:', feature_eng_times),
    ('vectorization_times:', vectorization_times),
    ('total_times:', total_times),
):
    print(label, timings)