In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, ChiSqSelector, Normalizer, StringIndexer
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import getpass
from pyspark.ml.feature import VarianceThresholdSelector
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover,
    CountVectorizer, IDF, ChiSqSelector,
    Normalizer, StringIndexer
)
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

Set the random seed and initialize the Spark session.

In [4]:
# Single source of truth for every random operation in this notebook.
SEED = 42
# Lowercase alias kept for backward compatibility; derived from SEED so the
# two names can never drift apart.
seed = SEED
random.seed(SEED)
np.random.seed(SEED)

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("DIC EX 2 - group 36")
    .getOrCreate()
)

# Only show warnings and errors from Spark in the cell output.
spark.sparkContext.setLogLevel("WARN")

print("Spark session initialized")



SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/05/13 01:45:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/13 01:45:37 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/05/13 01:45:37 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/05/13 01:45:37 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
25/05/13 01:45:37 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
25/05/13 01:45:37 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
25/05/13 01:45:37 WARN Utils: Service 'SparkUI' could not bind on port 4046. Attempting port 4047.
25/05/13 01:45:37 WARN Utils: Service 'SparkUI' could not bind on port 4047. Attempting port 4048.
25/05/13 01:45:37 WARN Utils: Service 'SparkUI' could not bind on port 4048. Attempting port 4049.
25/05/13 01:45:37 WARN Utils: Service 'SparkUI' could not bind on port 4049. Attempting port 4050.
25/05/13 0

Here we reuse the feature pipeline that we created in ex2_part2.


In [5]:
# The fitted feature pipeline is stored under the current user's HDFS home.
USER = getpass.getuser()
PIPE_PATH = f"hdfs:///user/{USER}/models/feature_pipe_part2"

# Shared review dev set, stored as JSON.
data_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
df = spark.read.format("json").load(data_path)

# Load the already fitted feature pipeline from PIPE_PATH.
feat_model = PipelineModel.load(PIPE_PATH)

                                                                                

We split the data into train, validation and test sets. Before that, we sample the dataset down to 5% because of limited computational resources.

In [None]:
# Fraction of the dev set that is kept (5%) because of limited compute.
SAMPLE_FRACTION = 0.05

# Bind the sample to a new name so that re-running this cell does not shrink
# `df` again (the previous `df = df.sample(...)` took 5% of 5% on a second run).
# The sample is cached because it is reused by the three counts below and by
# every later fit.
df_sampled = df.sample(
    withReplacement=False, fraction=SAMPLE_FRACTION, seed=SEED
).cache()

# 70% train; the remaining 30% is split evenly into validation and test.
train_data, temp = df_sampled.randomSplit([0.7, 0.3], seed=SEED)
valid_data, test_data = temp.randomSplit([0.5, 0.5], seed=SEED)

print(f"train rows: {train_data.count()}")
print(f"valid rows: {valid_data.count()}")
print(f"test rows: {test_data.count()}")

                                                                                

train rows: 2889
valid rows: 587




test rows: 584


                                                                                

Since we are going to apply our own feature selection in the next steps, we remove the chi-squared selector from the loaded pipeline and use the TF-IDF output (`tf_idf`) as the input for the selection.

In [7]:
# Work on a copy so the stage list of the loaded model is never aliased.
stages_no_chi = list(feat_model.stages)

# Drop the fitted chi-squared selector if it is the final stage; this notebook
# appends its own feature selectors instead.
if stages_no_chi and type(stages_no_chi[-1]).__name__ == "ChiSqSelectorModel":
    stages_no_chi = stages_no_chi[:-1]

# Locate the IDF stage by type rather than by position, so that the lookup
# still works if the stage order of the saved pipeline changes. If no IDF
# stage is found, fall back to the original positional lookup.
idf_stages = [s for s in stages_no_chi if type(s).__name__ == "IDFModel"]
idf_stage = idf_stages[-1] if idf_stages else stages_no_chi[-2]

idf_out = idf_stage.getOutputCol()
print(f"idf_out: {idf_out}")

idf_out: tf_idf


Create the base linear SVM model.

In [1]:
# Base linear SVM; it reads the "norm" feature column and the "label" column
# and writes its output to "prediction".
base_svm = LinearSVC(
    featuresCol="norm",
    labelCol="label",
    predictionCol="prediction",
)

NameError: name 'LinearSVC' is not defined

Here we create two different feature-selection approaches:

- `ChiSqSelector` keeping the top 2000 features

- `VarianceThresholdSelector`

Each of them is followed by its own `Normalizer`.

We also wrap the SVM in a `OneVsRest` classifier.

In [9]:
# Chi-squared selection on the TF-IDF column: keep the 2000 top features.
chisq = ChiSqSelector(
    numTopFeatures=2000,
    featuresCol=idf_out,
    labelCol="label",
    outputCol="selected",
)

# L2-normalize the selected features into the column the SVM reads.
chi_normalizer = Normalizer(p=2.0, inputCol="selected", outputCol="norm")


In [10]:
# Variance-based selection on the TF-IDF column. No varianceThreshold is
# passed, so the selector uses the library default.
vts = VarianceThresholdSelector(featuresCol=idf_out, outputCol="selected")

# Same normalization as in the chi-squared branch (p=2.0 is the Normalizer default).
vts_normalizer = Normalizer(p=2.0, inputCol="selected", outputCol="norm")

In [11]:
# One-vs-rest wrapper around the base SVM, reading the "norm" features
# and the "label" column.
ovr = OneVsRest(
    featuresCol="norm",
    labelCol="label",
    classifier=base_svm,
)


This creates the two pipelines, one for each feature-selection approach.

In [12]:
# Both pipelines share the loaded feature stages and the OneVsRest classifier;
# they differ only in the selector/normalizer pair in between.
chi_pipeline = Pipeline(stages=[*stages_no_chi, chisq, chi_normalizer, ovr])
vts_pipeline = Pipeline(stages=[*stages_no_chi, vts, vts_normalizer, ovr])

Here we initialize the parameter grid with different parameters for the SVM:

- max. Iterations: 5, 10
- regularization Parameter: 0.1, 1.0, 10.0
- standardization: True, False 

We also create the evaluator with the F1 metric.


In [13]:
# Grid over the base SVM's hyper-parameters: 2 x 3 x 2 = 12 combinations.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(base_svm.maxIter, [5, 10])
grid_builder.addGrid(base_svm.regParam, [0.1, 1.0, 10.0])
grid_builder.addGrid(base_svm.standardization, [True, False])
param_grid = grid_builder.build()

# F1 score computed from the "label" and "prediction" columns.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)


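Here we build a function that runs a pipeline through `TrainValidationSplit` and records the best parameters as well as the F1 score. `TrainValidationSplit` holds out 20% of the training data to select the parameters; the best model is then scored on the test set.
We then run it for both pipelines, save the results and print them out.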

In [14]:
def run_tvs(pipe, param_grid, name):
    """Tune ``pipe`` with a train/validation split and score the winner on the test set.

    Uses the notebook-level ``evaluator``, ``train_data``, ``test_data`` and
    ``SEED``. ``TrainValidationSplit`` is fitted on ``train_data`` with
    ``trainRatio=0.8``; the separate ``valid_data`` split is not used here.

    Args:
        pipe: The pipeline (estimator) to tune.
        param_grid: Param maps to try, e.g. from ``ParamGridBuilder().build()``.
        name: Label used in the progress and result messages.

    Returns:
        Tuple ``(best_val, test_f1)``: the validation metric of the selected
        parameter combination and the metric of the best model on ``test_data``.
    """
    tvs = TrainValidationSplit(
        estimator=pipe,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        trainRatio=0.8,
        seed=SEED,
    )
    print(f"\n Fitting TVS for {name} …")
    model = tvs.fit(train_data)

    val_metrics = model.validationMetrics
    # Select the index the same way TrainValidationSplit selects `bestModel`
    # (argmax/argmin depending on the evaluator), so the reported parameters
    # always belong to the model scored below, even if a metric is NaN.
    pick_best = np.argmax if evaluator.isLargerBetter() else np.argmin
    best_i = int(pick_best(val_metrics))
    best_val = val_metrics[best_i]
    best_params = tvs.getEstimatorParamMaps()[best_i]

    # Score the refitted best model on the test split.
    test_pred = model.bestModel.transform(test_data)
    test_f1 = evaluator.evaluate(test_pred)

    print(f" {name}  Best val F1 = {best_val:.4f}  params={best_params}")
    print(f" {name}  Test F1     = {test_f1:.4f}")
    return best_val, test_f1

In [16]:
# Run the variance-threshold pipeline first, then the chi-squared one, and
# store the validation and test scores under "<key>" and "<key>_test".
results = {}
for key, pipeline, label in (
    ("var", vts_pipeline, "VTS"),
    ("chi2", chi_pipeline, "ChiSq"),
):
    val_score, test_score = run_tvs(pipeline, param_grid, label)
    results[key] = val_score
    results[f"{key}_test"] = test_score



 Fitting TVS for VTS …
25/05/13 01:58:33 WARN CacheManager: Asked to cache already cached data.
25/05/13 01:58:33 WARN CacheManager: Asked to cache already cached data.


                                                                                

25/05/13 02:00:01 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


[Stage 1800:>                                                       (0 + 2) / 2]

25/05/13 02:00:02 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:01:51 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


[Stage 2502:>                                                       (0 + 2) / 2]

25/05/13 02:01:58 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:03:44 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


[Stage 3258:>                                                       (0 + 2) / 2]

25/05/13 02:03:55 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:05:47 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:06:21 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:07:30 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:09:12 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:09:44 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:11:23 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:11:55 WARN BlockManager: Asked to remove block broadcast_14021, which does not exist
25/05/13 02:11:57 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:14:13 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


[Stage 8353:>                                                       (0 + 2) / 2]

25/05/13 02:14:20 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:16:08 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 


                                                                                

25/05/13 02:16:34 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:17:12 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:19:25 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:20:18 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:21:05 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 


                                                                                

25/05/13 02:21:53 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:24:35 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

25/05/13 02:26:49 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB


                                                                                

 VTS  Best val F1 = 0.4068  params={Param(parent='LinearSVC_6d0c326237d8', name='maxIter', doc='max number of iterations (>= 0).'): 5, Param(parent='LinearSVC_6d0c326237d8', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='LinearSVC_6d0c326237d8', name='standardization', doc='whether to standardize the training features before fitting the model.'): False}
 VTS  Test F1     = 0.4202

 Fitting TVS for ChiSq …


                                                                                

25/05/13 02:29:05 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:31:19 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:33:27 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:35:45 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:37:48 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:40:15 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:42:50 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:45:48 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:48:28 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:51:51 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:54:48 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 02:58:29 WARN DAGScheduler: Broadcasting large task binary with size 1235.0 KiB


                                                                                

25/05/13 03:00:57 WARN DAGScheduler: Broadcasting large task binary with size 1233.6 KiB




 ChiSq  Best val F1 = 0.3612  params={Param(parent='LinearSVC_6d0c326237d8', name='maxIter', doc='max number of iterations (>= 0).'): 10, Param(parent='LinearSVC_6d0c326237d8', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='LinearSVC_6d0c326237d8', name='standardization', doc='whether to standardize the training features before fitting the model.'): True}
 ChiSq  Test F1     = 0.3814


                                                                                

In [None]:
# Stop the Spark session now that all experiments have finished.
spark.stop()