In [None]:
# import necessary modules from sparkkgml and pyspark
import time
from pyspark.sql import SparkSession
from sparkkgml.data_acquisition import DataAcquisition
from sparkkgml.feature_engineering import FeatureEngineering
from sparkkgml.vectorization import Vectorization
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import size
from pyspark.sql.functions import concat_ws
from pyspark.sql.types import StringType
from sparkkgml.data_augmentation import spark_dbpedia_lookup_linker
from sparkkgml.data_augmentation import spark_specific_relation_generator

In [None]:
# prepare endpoint and query
# `endpoint` is the URL handed to DataAcquisition.getDataFrame together with `query`.
endpoint = "https://sparkkgml.arcc.albany.edu/lmdb"
# SPARQL query against the LinkedMDB vocabulary. It selects ?movie, ?movie_title,
# ?genre, ?director, ?actor and ?runtime, where:
#   - every result must be typed as a linkedmdb film and have a genre with a
#     film_genre_name (these two patterns are mandatory);
#   - title, runtime, actor name and director name are OPTIONAL, so those
#     variables may be unbound in a result row;
#   - ?runtime is the raw runtime value cast to xsd:int in the SELECT clause;
#   - the active FILTER keeps only the genres 'Spy' and 'Superhero'; a wider
#     four-genre variant is left commented out inside the query text.
query ="""SELECT
        ?movie
        ?movie_title
        ?genre
        ?director
        ?actor
        (<http://www.w3.org/2001/XMLSchema#int>(?movie__down_runtime) as ?runtime)
        WHERE { 
        ?movie <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://data.linkedmdb.org/movie/film> . 
        ?movie <http://data.linkedmdb.org/movie/genre> ?movie__down_genre . ?movie__down_genre <http://data.linkedmdb.org/movie/film_genre_name> ?genre .
       
        
        OPTIONAL { ?movie <http://purl.org/dc/terms/title> ?movie_title . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/runtime> ?movie__down_runtime . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/actor> ?movie__down_actor . ?movie__down_actor <http://data.linkedmdb.org/movie/actor_name> ?actor . } 
        OPTIONAL { ?movie <http://data.linkedmdb.org/movie/director> ?movie__down_director .  ?movie__down_director <http://data.linkedmdb.org/movie/director_name> ?director . } 
        
        FILTER (?genre = 'Spy' || ?genre = 'Superhero' )
        #FILTER (?genre = 'Spy' || ?genre = 'Superhero' || ?genre = 'Parody' || ?genre = 'Zombie' )
        }
        """

In [None]:
# Obtain the Spark session explicitly so this cell also works on a fresh kernel.
# getOrCreate() hands back the already-running session when one exists (for
# example in a pyspark shell), so environments that predefine `spark` behave
# exactly as before.
spark = SparkSession.builder.getOrCreate()

# create an instance of DataAcquisition bound to that session
dataAcquisitionObject = DataAcquisition(spark)

# set parameters for null values: replace nulls with custom values instead of
# leaving them in the result.
# NOTE(review): presumably -1 is used for numeric columns and ' ' for string
# columns -- confirm against the sparkkgml DataAcquisition documentation.
dataAcquisitionObject.set_amputationMethod('nullReplacement')
dataAcquisitionObject.set_nullReplacementMethod('customValue')
dataAcquisitionObject.set_customValueVariable(-1)
dataAcquisitionObject.set_customStringValueVariable(' ')

# Retrieve the data as a Spark DataFrame by running `query` against `endpoint`
df = dataAcquisitionObject.getDataFrame(endpoint=endpoint, query=query)

In [None]:
# Link every movie title to a DBpedia entity: ask the lookup service
# ("KeywordSearch" API) for at most one hit of class "Film" per title and
# return the matching URI in a newly added column called "new_link".
df_lookup_linked = spark_dbpedia_lookup_linker(
    df,
    column="movie_title",
    new_attribute_name="new_link",
    query_class="Film",
    max_hits=1,
    lookup_api="KeywordSearch",
)

In [None]:
# Starting from the URIs stored in "new_link", look up specific direct
# relations and add them to the frame as new feature columns.
df_specific_relation = spark_specific_relation_generator(
    df_lookup_linked,
    "new_link",
)

In [None]:
# Names of the first 10 columns of the augmented frame.
columns_to_keep = df_specific_relation.columns[:10]

# Project onto those columns, then discard the helper "new_link" URI column
# (drop is a no-op if it is not among the selected columns).
filtered_df = (
    df_specific_relation
    .select(*columns_to_keep)
    .drop("new_link")
)

In [None]:
# Feature engineering: getFeatures returns the processed frame together with a
# per-column feature description.
featureEngineeringObject = FeatureEngineering()
df2, features = featureEngineeringObject.getFeatures(filtered_df)

# Preprocess the label so each movie carries a single genre: keep rows whose
# genre list has at most one entry, then flatten that list into a plain string.
filtered_df = (
    df2
    .filter(size(df2["genre"]) <= 1)
    .withColumn("genre", concat_ws(",", "genre"))
)

# The genre column is no longer a list, so adjust its feature description.
features["genre"].update({
    "isListOfEntries": False,
    "featureType": "Single_Categorical_String",
})

# Vectorization: digitize all the features according to their descriptions.
vectorizationObject = Vectorization()
digitized_df = vectorizationObject.vectorize(filtered_df, features)

In [None]:
# Repeated hold-out evaluation of a random-forest genre classifier.
# Each run draws a fresh 70/30 split and trains a new forest; the test accuracy
# of every run is printed and collected in `accuracy_list`.
N_RUNS = 10       # number of independent train/test repetitions
BASE_SEED = 42    # run k uses seed BASE_SEED + k, so reruns reproduce the same numbers

accuracy_list = []

# Feature assembly does not depend on the run, so do it once up front.
# Every column except the label ("genre") and the entity id ("movie") is a feature.
selected_features = [
    column for column in digitized_df.columns if column not in ("genre", "movie")
]
assembler = VectorAssembler(inputCols=selected_features, outputCol="features")

# Cache the assembled frame: it is re-read by every split, fit and evaluation
# below, and a materialised input keeps the two halves produced by randomSplit
# consistent with each other even if the upstream lineage is not deterministic.
assembled_data = assembler.transform(digitized_df).cache()

# One evaluator serves both train and test predictions (same label/metric).
evaluator = MulticlassClassificationEvaluator(
    labelCol="genre", predictionCol="prediction", metricName="accuracy"
)

for run in range(N_RUNS):
    seed = BASE_SEED + run

    # split the data into training and testing sets (70% training, 30% testing)
    train_data, test_data = assembled_data.randomSplit([0.7, 0.3], seed=seed)

    # create and train a RandomForestClassifier for this run
    rf_classifier = RandomForestClassifier(
        labelCol="genre", featuresCol="features", numTrees=10, seed=seed
    )
    model = rf_classifier.fit(train_data)

    # make predictions on both train and test data
    train_predictions = model.transform(train_data)
    test_predictions = model.transform(test_data)

    # train accuracy is kept available for inspecting overfitting; only the
    # test accuracy is recorded and printed
    train_accuracy = evaluator.evaluate(train_predictions)
    test_accuracy = evaluator.evaluate(test_predictions)

    accuracy_list.append(round(test_accuracy, 4))
    print(test_accuracy)

print(accuracy_list)