In [None]:
# this code is prepared in Databricks 
# pip install sparkkgml library
!pip install sparkkgml

In [None]:
# import the standard-library timer and the necessary modules from sparkkgml
import time
from sparkkgml.data_acquisition import DataAcquisition
from sparkkgml.feature_engineering import FeatureEngineering
from sparkkgml.vectorization import Vectorization

In [None]:
# Query1 (genres: Spy, Superhero)
#
# Benchmark of the sparkkgml pipeline (data acquisition -> feature engineering
# -> vectorization) for this query: the pipeline is repeated NUM_RUNS times and
# the duration of every stage, in seconds rounded to 2 decimals, is collected.

# prepare endpoint and query
endpoint = "https://sparkkgml.arcc.albany.edu/lmdb"
query ="""SELECT
        ?movie
        ?movie_title
        ?genre
        ?director
        ?actor
        (<http://www.w3.org/2001/XMLSchema#int>(?movie__down_runtime) as ?runtime)
        WHERE { 
        ?movie <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://data.linkedmdb.org/movie/film> . 
        ?movie <http://data.linkedmdb.org/movie/genre> ?movie__down_genre . ?movie__down_genre <http://data.linkedmdb.org/movie/film_genre_name> ?genre .
       
        
        OPTIONAL { ?movie <http://purl.org/dc/terms/title> ?movie_title . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/runtime> ?movie__down_runtime . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/actor> ?movie__down_actor . ?movie__down_actor <http://data.linkedmdb.org/movie/actor_name> ?actor . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/director> ?movie__down_director .  ?movie__down_director <http://data.linkedmdb.org/movie/director_name> ?director . } 
        
        FILTER (?genre = 'Spy' || ?genre = 'Superhero' )
        }
        """

# how many times the whole pipeline is repeated
NUM_RUNS = 10

# one entry per run is appended to each of these lists
data_acq_times=[]
feature_eng_times=[]
vectorization_times=[]
total_times=[]

# Durations are taken with time.perf_counter(): it is monotonic and uses the
# highest-resolution clock available, so an interval cannot be distorted by a
# system clock adjustment the way a difference of time.time() values can.
# NOTE(review): Spark evaluates DataFrames lazily; if a stage only builds a plan
# without running an action, its timing excludes the actual execution -- confirm
# against the sparkkgml implementation.
for i in range(NUM_RUNS):
    total_time_start = time.perf_counter()
    # create an instance of DataAcquisition
    # set parameters for null values: 'nullReplacement' with the custom
    # replacement values -1 and ' '
    dataAcquisitionObject=DataAcquisition(spark)
    dataAcquisitionObject.set_amputationMethod('nullReplacement')
    dataAcquisitionObject.set_nullReplacementMethod('customValue')
    dataAcquisitionObject.set_customValueVariable(-1)
    dataAcquisitionObject.set_customStringValueVariable(' ')

    # retrieve the data as a Spark DataFrame
    start_time1 = time.perf_counter()
    df = dataAcquisitionObject.getDataFrame(endpoint=endpoint, query=query)
    end_time1 = time.perf_counter()
    data_acq_times.append(round(end_time1 - start_time1, 2))

    # create an instance of FeatureEngineering
    # call getFeatures function and get features for every column
    featureEngineeringObject=FeatureEngineering()
    start_time1 = time.perf_counter()
    df2,features=featureEngineeringObject.getFeatures(df)
    end_time1 = time.perf_counter()
    feature_eng_times.append(round(end_time1 - start_time1, 2))

    # create an instance of Vectorization module
    # call vectorize function and digitize all the features
    vectorizationObject=Vectorization()
    start_time1 = time.perf_counter()
    digitized_df=vectorizationObject.vectorize(df2,features)
    end_time1 = time.perf_counter()
    vectorization_times.append(round(end_time1 - start_time1, 2))

    # the total covers object construction and configuration plus all three stages
    total_time_end = time.perf_counter()
    total_times.append(round(total_time_end - total_time_start, 2))

# report the per-run durations of every stage
print('data_acq_times:',data_acq_times)
print('feature_eng_times:',feature_eng_times)
print('vectorization_times:',vectorization_times)
print('total_times:',total_times)

In [None]:
# Query2 (genres: Mystery,Romantic comedy)
#
# Benchmark of the sparkkgml pipeline (data acquisition -> feature engineering
# -> vectorization) for this query: the pipeline is repeated NUM_RUNS times and
# the duration of every stage, in seconds rounded to 2 decimals, is collected.

# prepare endpoint and query
endpoint = "https://sparkkgml.arcc.albany.edu/lmdb"
query ="""SELECT
        ?movie
        ?movie_title
        ?genre
        ?director
        ?actor
        (<http://www.w3.org/2001/XMLSchema#int>(?movie__down_runtime) as ?runtime)
        WHERE { 
        ?movie <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://data.linkedmdb.org/movie/film> . 
        ?movie <http://data.linkedmdb.org/movie/genre> ?movie__down_genre . ?movie__down_genre <http://data.linkedmdb.org/movie/film_genre_name> ?genre .
       
        
        OPTIONAL { ?movie <http://purl.org/dc/terms/title> ?movie_title . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/runtime> ?movie__down_runtime . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/actor> ?movie__down_actor . ?movie__down_actor <http://data.linkedmdb.org/movie/actor_name> ?actor . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/director> ?movie__down_director .  ?movie__down_director <http://data.linkedmdb.org/movie/director_name> ?director . } 
        
        FILTER (?genre = 'Mystery' || ?genre = 'Romantic comedy' )
        }
        """


# how many times the whole pipeline is repeated
NUM_RUNS = 10

# one entry per run is appended to each of these lists
data_acq_times=[]
feature_eng_times=[]
vectorization_times=[]
total_times=[]

# Durations are taken with time.perf_counter(): it is monotonic and uses the
# highest-resolution clock available, so an interval cannot be distorted by a
# system clock adjustment the way a difference of time.time() values can.
# NOTE(review): Spark evaluates DataFrames lazily; if a stage only builds a plan
# without running an action, its timing excludes the actual execution -- confirm
# against the sparkkgml implementation.
for i in range(NUM_RUNS):
    total_time_start = time.perf_counter()
    # create an instance of DataAcquisition
    # set parameters for null values: 'nullReplacement' with the custom
    # replacement values -1 and ' '
    dataAcquisitionObject=DataAcquisition(spark)
    dataAcquisitionObject.set_amputationMethod('nullReplacement')
    dataAcquisitionObject.set_nullReplacementMethod('customValue')
    dataAcquisitionObject.set_customValueVariable(-1)
    dataAcquisitionObject.set_customStringValueVariable(' ')

    # retrieve the data as a Spark DataFrame
    start_time1 = time.perf_counter()
    df = dataAcquisitionObject.getDataFrame(endpoint=endpoint, query=query)
    end_time1 = time.perf_counter()
    data_acq_times.append(round(end_time1 - start_time1, 2))

    # create an instance of FeatureEngineering
    # call getFeatures function and get features for every column
    featureEngineeringObject=FeatureEngineering()
    start_time1 = time.perf_counter()
    df2,features=featureEngineeringObject.getFeatures(df)
    end_time1 = time.perf_counter()
    feature_eng_times.append(round(end_time1 - start_time1, 2))

    # create an instance of Vectorization module
    # call vectorize function and digitize all the features
    vectorizationObject=Vectorization()
    start_time1 = time.perf_counter()
    digitized_df=vectorizationObject.vectorize(df2,features)
    end_time1 = time.perf_counter()
    vectorization_times.append(round(end_time1 - start_time1, 2))

    # the total covers object construction and configuration plus all three stages
    total_time_end = time.perf_counter()
    total_times.append(round(total_time_end - total_time_start, 2))

# report the per-run durations of every stage
print('data_acq_times:',data_acq_times)
print('feature_eng_times:',feature_eng_times)
print('vectorization_times:',vectorization_times)
print('total_times:',total_times)

In [None]:
# Query3 (genres: Musical,Action)
#
# Benchmark of the sparkkgml pipeline (data acquisition -> feature engineering
# -> vectorization) for this query: the pipeline is repeated NUM_RUNS times and
# the duration of every stage, in seconds rounded to 2 decimals, is collected.

# prepare endpoint and query
endpoint = "https://sparkkgml.arcc.albany.edu/lmdb"
query ="""SELECT
        ?movie
        ?movie_title
        ?genre
        ?director
        ?actor
        (<http://www.w3.org/2001/XMLSchema#int>(?movie__down_runtime) as ?runtime)
        WHERE { 
        ?movie <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://data.linkedmdb.org/movie/film> . 
        ?movie <http://data.linkedmdb.org/movie/genre> ?movie__down_genre . ?movie__down_genre <http://data.linkedmdb.org/movie/film_genre_name> ?genre .
       
        
        OPTIONAL { ?movie <http://purl.org/dc/terms/title> ?movie_title . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/runtime> ?movie__down_runtime . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/actor> ?movie__down_actor . ?movie__down_actor <http://data.linkedmdb.org/movie/actor_name> ?actor . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/director> ?movie__down_director .  ?movie__down_director <http://data.linkedmdb.org/movie/director_name> ?director . } 
        
        FILTER (?genre = 'Musical' || ?genre = 'Action' )
        }
        """


# how many times the whole pipeline is repeated
NUM_RUNS = 10

# one entry per run is appended to each of these lists
data_acq_times=[]
feature_eng_times=[]
vectorization_times=[]
total_times=[]

# Durations are taken with time.perf_counter(): it is monotonic and uses the
# highest-resolution clock available, so an interval cannot be distorted by a
# system clock adjustment the way a difference of time.time() values can.
# NOTE(review): Spark evaluates DataFrames lazily; if a stage only builds a plan
# without running an action, its timing excludes the actual execution -- confirm
# against the sparkkgml implementation.
for i in range(NUM_RUNS):
    total_time_start = time.perf_counter()
    # create an instance of DataAcquisition
    # set parameters for null values: 'nullReplacement' with the custom
    # replacement values -1 and ' '
    dataAcquisitionObject=DataAcquisition(spark)
    dataAcquisitionObject.set_amputationMethod('nullReplacement')
    dataAcquisitionObject.set_nullReplacementMethod('customValue')
    dataAcquisitionObject.set_customValueVariable(-1)
    dataAcquisitionObject.set_customStringValueVariable(' ')

    # retrieve the data as a Spark DataFrame
    start_time1 = time.perf_counter()
    df = dataAcquisitionObject.getDataFrame(endpoint=endpoint, query=query)
    end_time1 = time.perf_counter()
    data_acq_times.append(round(end_time1 - start_time1, 2))

    # create an instance of FeatureEngineering
    # call getFeatures function and get features for every column
    featureEngineeringObject=FeatureEngineering()
    start_time1 = time.perf_counter()
    df2,features=featureEngineeringObject.getFeatures(df)
    end_time1 = time.perf_counter()
    feature_eng_times.append(round(end_time1 - start_time1, 2))

    # create an instance of Vectorization module
    # call vectorize function and digitize all the features
    vectorizationObject=Vectorization()
    start_time1 = time.perf_counter()
    digitized_df=vectorizationObject.vectorize(df2,features)
    end_time1 = time.perf_counter()
    vectorization_times.append(round(end_time1 - start_time1, 2))

    # the total covers object construction and configuration plus all three stages
    total_time_end = time.perf_counter()
    total_times.append(round(total_time_end - total_time_start, 2))

# report the per-run durations of every stage
print('data_acq_times:',data_acq_times)
print('feature_eng_times:',feature_eng_times)
print('vectorization_times:',vectorization_times)
print('total_times:',total_times)

In [None]:
# Query4 (genres: Horror Film, Silent film)
#
# Benchmark of the sparkkgml pipeline (data acquisition -> feature engineering
# -> vectorization) for this query: the pipeline is repeated NUM_RUNS times and
# the duration of every stage, in seconds rounded to 2 decimals, is collected.

# prepare endpoint and query
endpoint = "https://sparkkgml.arcc.albany.edu/lmdb"
query ="""SELECT
        ?movie
        ?movie_title
        ?genre
        ?director
        ?actor
        (<http://www.w3.org/2001/XMLSchema#int>(?movie__down_runtime) as ?runtime)
        WHERE { 
        ?movie <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://data.linkedmdb.org/movie/film> . 
        ?movie <http://data.linkedmdb.org/movie/genre> ?movie__down_genre . ?movie__down_genre <http://data.linkedmdb.org/movie/film_genre_name> ?genre .
       
        
        OPTIONAL { ?movie <http://purl.org/dc/terms/title> ?movie_title . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/runtime> ?movie__down_runtime . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/actor> ?movie__down_actor . ?movie__down_actor <http://data.linkedmdb.org/movie/actor_name> ?actor . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/director> ?movie__down_director .  ?movie__down_director <http://data.linkedmdb.org/movie/director_name> ?director . } 
        
        FILTER (?genre = 'Horror Film' || ?genre = 'Silent film' )
        }
        """


# how many times the whole pipeline is repeated
NUM_RUNS = 10

# one entry per run is appended to each of these lists
data_acq_times=[]
feature_eng_times=[]
vectorization_times=[]
total_times=[]

# Durations are taken with time.perf_counter(): it is monotonic and uses the
# highest-resolution clock available, so an interval cannot be distorted by a
# system clock adjustment the way a difference of time.time() values can.
# NOTE(review): Spark evaluates DataFrames lazily; if a stage only builds a plan
# without running an action, its timing excludes the actual execution -- confirm
# against the sparkkgml implementation.
for i in range(NUM_RUNS):
    total_time_start = time.perf_counter()
    # create an instance of DataAcquisition
    # set parameters for null values: 'nullReplacement' with the custom
    # replacement values -1 and ' '
    dataAcquisitionObject=DataAcquisition(spark)
    dataAcquisitionObject.set_amputationMethod('nullReplacement')
    dataAcquisitionObject.set_nullReplacementMethod('customValue')
    dataAcquisitionObject.set_customValueVariable(-1)
    dataAcquisitionObject.set_customStringValueVariable(' ')

    # retrieve the data as a Spark DataFrame
    start_time1 = time.perf_counter()
    df = dataAcquisitionObject.getDataFrame(endpoint=endpoint, query=query)
    end_time1 = time.perf_counter()
    data_acq_times.append(round(end_time1 - start_time1, 2))

    # create an instance of FeatureEngineering
    # call getFeatures function and get features for every column
    featureEngineeringObject=FeatureEngineering()
    start_time1 = time.perf_counter()
    df2,features=featureEngineeringObject.getFeatures(df)
    end_time1 = time.perf_counter()
    feature_eng_times.append(round(end_time1 - start_time1, 2))

    # create an instance of Vectorization module
    # call vectorize function and digitize all the features
    vectorizationObject=Vectorization()
    start_time1 = time.perf_counter()
    digitized_df=vectorizationObject.vectorize(df2,features)
    end_time1 = time.perf_counter()
    vectorization_times.append(round(end_time1 - start_time1, 2))

    # the total covers object construction and configuration plus all three stages
    total_time_end = time.perf_counter()
    total_times.append(round(total_time_end - total_time_start, 2))

# report the per-run durations of every stage
print('data_acq_times:',data_acq_times)
print('feature_eng_times:',feature_eng_times)
print('vectorization_times:',vectorization_times)
print('total_times:',total_times)

In [None]:
# Query5 (genres: Drama, Black-and-white)
#
# Benchmark of the sparkkgml pipeline (data acquisition -> feature engineering
# -> vectorization) for this query: the pipeline is repeated NUM_RUNS times and
# the duration of every stage, in seconds rounded to 2 decimals, is collected.

# prepare endpoint and query
endpoint = "https://sparkkgml.arcc.albany.edu/lmdb"
query ="""SELECT
        ?movie
        ?movie_title
        ?genre
        ?director
        ?actor
        (<http://www.w3.org/2001/XMLSchema#int>(?movie__down_runtime) as ?runtime)
        WHERE { 
        ?movie <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://data.linkedmdb.org/movie/film> . 
        ?movie <http://data.linkedmdb.org/movie/genre> ?movie__down_genre . ?movie__down_genre <http://data.linkedmdb.org/movie/film_genre_name> ?genre .
       
        
        OPTIONAL { ?movie <http://purl.org/dc/terms/title> ?movie_title . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/runtime> ?movie__down_runtime . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/actor> ?movie__down_actor . ?movie__down_actor <http://data.linkedmdb.org/movie/actor_name> ?actor . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/director> ?movie__down_director .  ?movie__down_director <http://data.linkedmdb.org/movie/director_name> ?director . } 
        
        FILTER (?genre = 'Drama' || ?genre = 'Black-and-white' )
        }
        """


# how many times the whole pipeline is repeated
NUM_RUNS = 10

# one entry per run is appended to each of these lists
data_acq_times=[]
feature_eng_times=[]
vectorization_times=[]
total_times=[]

# Durations are taken with time.perf_counter(): it is monotonic and uses the
# highest-resolution clock available, so an interval cannot be distorted by a
# system clock adjustment the way a difference of time.time() values can.
# NOTE(review): Spark evaluates DataFrames lazily; if a stage only builds a plan
# without running an action, its timing excludes the actual execution -- confirm
# against the sparkkgml implementation.
for i in range(NUM_RUNS):
    total_time_start = time.perf_counter()
    # create an instance of DataAcquisition
    # set parameters for null values: 'nullReplacement' with the custom
    # replacement values -1 and ' '
    dataAcquisitionObject=DataAcquisition(spark)
    dataAcquisitionObject.set_amputationMethod('nullReplacement')
    dataAcquisitionObject.set_nullReplacementMethod('customValue')
    dataAcquisitionObject.set_customValueVariable(-1)
    dataAcquisitionObject.set_customStringValueVariable(' ')

    # retrieve the data as a Spark DataFrame
    start_time1 = time.perf_counter()
    df = dataAcquisitionObject.getDataFrame(endpoint=endpoint, query=query)
    end_time1 = time.perf_counter()
    data_acq_times.append(round(end_time1 - start_time1, 2))

    # create an instance of FeatureEngineering
    # call getFeatures function and get features for every column
    featureEngineeringObject=FeatureEngineering()
    start_time1 = time.perf_counter()
    df2,features=featureEngineeringObject.getFeatures(df)
    end_time1 = time.perf_counter()
    feature_eng_times.append(round(end_time1 - start_time1, 2))

    # create an instance of Vectorization module
    # call vectorize function and digitize all the features
    vectorizationObject=Vectorization()
    start_time1 = time.perf_counter()
    digitized_df=vectorizationObject.vectorize(df2,features)
    end_time1 = time.perf_counter()
    vectorization_times.append(round(end_time1 - start_time1, 2))

    # the total covers object construction and configuration plus all three stages
    total_time_end = time.perf_counter()
    total_times.append(round(total_time_end - total_time_start, 2))

# report the per-run durations of every stage
print('data_acq_times:',data_acq_times)
print('feature_eng_times:',feature_eng_times)
print('vectorization_times:',vectorization_times)
print('total_times:',total_times)