In [1]:
# Train a linear Support Vector Machine (LinearSVC) on the preprocessed
# click log and report how long the fit takes.
#
# Steps: read the CSV into Spark -> index + one-hot encode the categorical
# columns -> index the "click" column into "label" -> assemble a single
# "features" vector -> 75/25 random split -> fit LinearSVC on the train split.
from time import time  # explicit import instead of `from time import *`

from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession

try:
    # Spark 2.3 / 2.4 name of the multi-column one-hot encoder.
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder; alias it so
    # the code below runs unchanged on either version.
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

# --- Configuration -----------------------------------------------------------
DATA_PATH = 'hdfs://lakhwinder/preprocessed_500000.csv'
TRAIN_TEST_WEIGHTS = [0.75, 0.25]
SPLIT_SEED = 2
# NOTE(review): 10 iterations is well below LinearSVC's default maxIter; the
# optimizer may stop before converging -- confirm this cap is intentional.
MAX_ITER = 10

# --- Load data ---------------------------------------------------------------
spark = SparkSession.builder.appName('ctr-prediction').getOrCreate()
spark_df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)
# '_c0' is presumably an unnamed index column written by the preprocessing
# step -- TODO confirm. DataFrame.drop is a no-op if the column is absent.
spark_df = spark_df.drop('_c0')

# --- Handling categorical data -----------------------------------------------
catergoricalFeature = ["banner_pos", "site_category", "app_category", "device_type", "device_conn_type"]
stages = []  # stages of the preprocessing Pipeline, in execution order
for catergoricalFeat in catergoricalFeature:
    # StringIndexer: category value -> numeric index. handleInvalid="skip"
    # filters out rows whose value cannot be indexed instead of raising.
    strIndexer = StringIndexer(inputCol=catergoricalFeat, outputCol=catergoricalFeat + "Index").setHandleInvalid("skip")
    # One-hot encode the index into a sparse vector column "<name>classVec".
    ohencoder = OneHotEncoderEstimator(inputCols=[strIndexer.getOutputCol()], outputCols=[catergoricalFeat + "classVec"])
    stages += [strIndexer, ohencoder]

# Index the target column "click" into the numeric "label" column.
# NOTE(review): StringIndexer assigns indices by its default ordering, so which
# click value ends up as label 1.0 depends on the data -- verify.
label_output = StringIndexer(inputCol="click", outputCol="label")
stages += [label_output]

# Combine the one-hot vectors and the numeric columns into one "features" vector.
numericCols = ["hour"]
assemblerInputs = [c + "classVec" for c in catergoricalFeature] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# --- Preprocess and split ----------------------------------------------------
# NOTE(review): the pipeline is fitted on the full dataset before the
# train/test split, so the indexers also see the test rows -- confirm that
# this is acceptable for the evaluation.
partialPipeline = Pipeline().setStages(stages)
pipelineModel = partialPipeline.fit(spark_df)
preppedDataDF = pipelineModel.transform(spark_df)

train, test = preppedDataDF.randomSplit(TRAIN_TEST_WEIGHTS, seed=SPLIT_SEED)

# --- Train the SVM and measure the training time -----------------------------
start_time = time()

lsvmModel = LinearSVC(featuresCol='features', labelCol='label', maxIter=MAX_ITER)
lsvmModel1 = lsvmModel.fit(train)

end_time = time()
elapsed_time = end_time - start_time
# The message previously said "LR" although the model trained here is an SVM.
print("Time to train the Model SVM : %.3f seconds" % elapsed_time)

Time to train the Model LR : 101.464 seconds


In [4]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics  # not used in this cell
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out split with the fitted LinearSVC model.
predictions = lsvmModel1.transform(test)

# Area under the ROC curve -- NOT accuracy. BinaryClassificationEvaluator's
# default metric is "areaUnderROC"; it is spelled out here (together with the
# default column names) so the reported number is unambiguous.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
area_under_roc = evaluator.evaluate(predictions)

# Backward-compatible alias: this name has always held the ROC AUC, not the
# fraction of correctly classified rows.
accuracy = area_under_roc

area_under_roc

0.5783887886247026