In [None]:
# This code is run in Oracle Cloud Infrastructure with Data Science and Data Flow services
import ads
ads.set_auth("resource_principal") # Supported values: resource_principal, api_key
%load_ext dataflow.magics

In [None]:
# connecting to dataflow service to create Spark application
# 'numExecutors' and 'ocpus' in executorShapeConfig, should be adjusted to create desired experiment parameters
import json
command = {
    "compartmentId": "ocid1.compartment.oc1..aaaaaaaazpukv3ckmxt3lkjzvjf4kaiuwmsdz6su25zucjutcxuqqjmrgsqq",
    "displayName": "2exec-64core",
    "language": "PYTHON",
    "sparkVersion": "3.2.1",
    "driverShape": "VM.Standard.E3.Flex",
    "executorShape": "VM.Standard.E3.Flex",
    "driverShapeConfig":{"ocpus":1,"memoryInGBs":16},
    "executorShapeConfig":{"ocpus":64,"memoryInGBs":512},
    "numExecutors": 2,
    "type": "SESSION",
    "configuration": {"spark.archives":"oci://dataflow-logs@id79isy5uol3/conda_environments/cpu/PySpark 3.2 and Data Flow/3.0/pyspark32_p38_cpu_v3#conda"}
}
command = f'\'{json.dumps(command)}\''
 
%create_session -l python -c $command

In [None]:
%%spark
# import the sparkkgml modules under test, plus `time` for measuring runtimes
from sparkkgml.feature_engineering import FeatureEngineering
from sparkkgml.vectorization import Vectorization
import time

In [None]:
%%spark
# preprocess
from pyspark.sql.functions import when
from pyspark.sql.types import IntegerType
# read the data 
df= spark.read.csv("oci://example@id79isy5uol3/data.csv" ,sep = '\t', header=True)
# replace '\N' values with 0 in the specified column
df = df.withColumn('runtimeMinutes', when(df['runtimeMinutes'] == "\\N", -1).otherwise(df['runtimeMinutes']))
df = df.withColumn('startYear', when(df['startYear'] == "\\N", 0).otherwise(df['startYear']))
df = df.withColumn('endYear', when(df['endYear'] == "\\N", 0).otherwise(df['endYear']))
# change the datatype of the column to IntegerType
df = df.withColumn('runtimeMinutes', df['runtimeMinutes'].cast(IntegerType()))
df = df.withColumn('startYear', df['startYear'].cast(IntegerType()))
df = df.withColumn('endYear', df['endYear'].cast(IntegerType()))

In [None]:
%%spark
# run the code in a loop and append runtimes for every run 
feature_eng_times=[]
vectorization_times=[]
total_times=[]

for i in range(10):
    total_time_start = time.time()
    # create an instance of FeatureEngineering
    # call getFeatures function and get features for every column
    featureEngineeringObject=FeatureEngineering()
    start_time1 = time.time()
    df2,features=featureEngineeringObject.getFeatures(df)
    end_time1 = time.time()
    feature_eng_times.append(round(end_time1 - start_time1, 2))

    # create an instance of Vectorization module
    # call vectorize function and digitaze all the features
    vectorizationObject=Vectorization()
    start_time1 = time.time()
    digitized_df=vectorizationObject.vectorize(df2,features)
    end_time1 = time.time()
    vectorization_times.append(round(end_time1 - start_time1, 2))

    total_time_end = time.time()
    total_times.append(round(total_time_end - total_time_start, 2))

In [None]:
%%spark
print('feature_eng_times: ' + str(feature_eng_times))
print('vectorization_times: ' + str(vectorization_times) )
print('total_times: '  + str(total_times) )