In [1]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, ChiSqSelector, Normalizer, StringIndexer
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import getpass

# Pin both global RNGs (Python's `random` and NumPy's legacy global state) to
# one shared seed so repeated runs draw the same pseudo-random numbers.
seed = 42
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(seed)



In [2]:
# Obtain the SparkSession used by every later cell. getOrCreate() hands back
# an already-running session when there is one, otherwise it starts a new one
# under the application name given here.
spark = (
    SparkSession.builder
    .appName("DIC EX 2 - group 36")
    .getOrCreate()
)

print("Spark session initialized")



SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/05/12 11:55:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/12 11:55:34 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/05/12 11:55:34 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/05/12 11:55:34 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
25/05/12 11:55:34 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
25/05/12 11:55:34 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
25/05/12 11:55:34 WARN Utils: Service 'SparkUI' could not bind on port 4046. Attempting port 4047.
25/05/12 11:55:34 WARN Utils: Service 'SparkUI' could not bind on port 4047. Attempting port 4048.
25/05/12 11:55:34 WARN Utils: Service 'SparkUI' could not bind on port 4048. Attempting port 4049.
25/05/12 11:55:34 WARN Utils: Service 'SparkUI' could not bind on port 4049. Attempting port 4050.
25/05/12 1

In [3]:
# ----------------------------------------------------------------------------------
# DIC Ex 2 · Part 3  —  end-to-end pipeline with One-Vs-Rest SVM + TrainValidationSplit
# ----------------------------------------------------------------------------------

from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover,
    CountVectorizer, IDF, ChiSqSelector,
    Normalizer, StringIndexer
)
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder


In [4]:
# Build the HDFS location of the saved feature pipeline. Nothing is loaded
# here; the path sits under /user/<name>/ for whichever OS user runs the
# notebook, and the load itself happens in the next cell.
USER = getpass.getuser()
PIPE_PATH = "hdfs:///user/{}/models/feature_pipe_part2".format(USER)

In [5]:
# --------------------------------------------------------------------
# 0 Load the review data and the fitted feature pipeline
#   (edit data_path if your copy of the dev set lives elsewhere)
# --------------------------------------------------------------------
# Seed handed to randomSplit and TrainValidationSplit in the later cells.
SEED = 42
data_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"

# Generic reader with an explicit "json" source; equivalent to spark.read.json(...).
df = spark.read.format("json").load(data_path)

# PIPE_PATH is defined in the previous cell.
print("→ Loading fitted feature pipeline …")
feat_model = PipelineModel.load(PIPE_PATH)



                                                                                

→ Loading fitted feature pipeline …


                                                                                

In [6]:
print("Splitting data …")
# 70 % goes to training; the remaining 30 % is halved into validation and test.
train_data, temp = df.randomSplit([0.7, 0.3], seed=SEED)
valid_data, test_data = temp.randomSplit([0.5, 0.5], seed=SEED)
# NOTE(review): valid_data is only counted in this cell. None of the later
# cells visible in this notebook read it — TrainValidationSplit carves its own
# validation slice out of train_data (trainRatio). Confirm whether this
# hold-out is still wanted.

# Mark the training split for caching *before* the first action on it.
# cache() is lazy, so the count() just below is what materialises the cache;
# the many fits in the tuning cell can then reuse it instead of re-reading and
# re-splitting the JSON each time. (Previously cache() came after the counts,
# so the count itself was computed uncached.)
train_data.cache()

print(f"⚙️  train rows: {train_data.count()}")
print(f"🔍 valid rows: {valid_data.count()}")
print(f"🧪 test rows       : {test_data.count()}")

Splitting data …


                                                                                

⚙️  train rows: 55401


                                                                                

🔍 valid rows: 11838


[Stage 29:>                                                         (0 + 2) / 2]

🧪 test rows       : 11590


                                                                                

DataFrame[asin: string, category: string, helpful: array<bigint>, overall: double, reviewText: string, reviewTime: string, reviewerID: string, reviewerName: string, summary: string, unixReviewTime: bigint]

In [7]:
# Stages of the loaded feature pipeline. If the saved pipeline ends with a
# ChiSqSelectorModel, that last stage is dropped: a fresh ChiSqSelector is
# defined in the next cell so its numTopFeatures can be tuned.
stages_no_chi = feat_model.stages
if stages_no_chi[-1].__class__.__name__ == "ChiSqSelectorModel":
    stages_no_chi = stages_no_chi[:-1]

# Locate the TF-IDF stage by type instead of by position: what a fixed [-2]
# index points at depends on whether the slice above ran. The positional
# lookup is kept as the fallback so behaviour is unchanged for pipelines
# that contain no IDFModel stage.
idf_stage = next(
    (
        stage
        for stage in reversed(stages_no_chi)
        if stage.__class__.__name__ == "IDFModel"
    ),
    stages_no_chi[-2],
)

# Column name the new selector will read its feature vectors from.
idf_out = idf_stage.getOutputCol()
idf_out

'tf_idf'

In [8]:

# Trainable tail of the pipeline: chi-squared feature selection -> L2
# normalisation -> linear SVM wrapped in One-vs-Rest. Each stage reads the
# column written by the stage before it.

# Selector over the TF-IDF vectors; numTopFeatures is supplied by the
# hyper-parameter grid in a later cell.
chisq = ChiSqSelector(
    featuresCol=idf_out,
    outputCol="selected",
    labelCol="label",
)

# Rescale each selected-feature vector with the p=2 (Euclidean) norm.
normalizer = Normalizer(
    inputCol=chisq.getOutputCol(),
    outputCol="norm",
    p=2.0,
)

# Base binary classifier; regParam is supplied by the hyper-parameter grid.
base_svm = LinearSVC(
    featuresCol=normalizer.getOutputCol(),
    labelCol="label",
    predictionCol="prediction",
    maxIter=100,
)

# One-vs-Rest wrapper so the binary SVM can serve the multi-class label.
ovr = OneVsRest(
    classifier=base_svm,
    labelCol="label",
    featuresCol=normalizer.getOutputCol(),
)


In [9]:
# --------------------------------------------------------------------
# 2 Full pipeline: reused feature stages, then selector -> normaliser -> OvR SVM
# --------------------------------------------------------------------
pipeline = Pipeline(stages=[*stages_no_chi, chisq, normalizer, ovr])

In [10]:
# --------------------------------------------------------------------
# 3 Hyper-parameter grid (2 x 3 = 6 points)
# --------------------------------------------------------------------
grid_builder = ParamGridBuilder()
grid_builder.addGrid(chisq.numTopFeatures, [2000, 500])
grid_builder.addGrid(base_svm.regParam, [0.1, 1.0, 10.0])
param_grid = grid_builder.build()

# Scores each grid point by the F1 metric of "prediction" against "label".
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)

# trainRatio=0.8: every grid point is fitted on 80 % of the DataFrame passed
# to tvs.fit() and scored on the remaining 20 %.
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    trainRatio=0.8,
    seed=SEED,
)

In [11]:
# --------------------------------------------------------------------
# 4 Fit the tuning grid on the training split
#   (the test-set evaluation is done in the next cell)
# --------------------------------------------------------------------
print("⏳  Fitting TVS grid …")
tvs_model = tvs.fit(dataset=train_data)



⏳  Fitting TVS grid …


[Stage 41:>                                                         (0 + 2) / 2]

25/05/12 11:57:18 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/12 11:57:18 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

25/05/12 11:57:19 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
25/05/12 11:57:19 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


                                                                                

25/05/12 12:09:45 WARN DAGScheduler: Broadcasting large task binary with size 1237.1 KiB


                                                                                

25/05/12 12:21:04 WARN DAGScheduler: Broadcasting large task binary with size 1237.1 KiB


                                                                                

25/05/12 12:30:38 WARN DAGScheduler: Broadcasting large task binary with size 1237.1 KiB


                                                                                

25/05/12 13:03:44 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 
25/05/12 13:03:47 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 


                                                                                

25/05/12 13:04:22 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 


                                                                                

25/05/12 13:05:26 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 
25/05/12 13:05:29 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 


                                                                                

25/05/12 13:06:37 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 


                                                                                

25/05/12 13:09:33 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 


                                                                                

In [12]:
# One validation score per grid point; the code below indexes the scores and
# the param maps with the same position.
metrics = tvs_model.validationMetrics
param_maps = tvs_model.getEstimatorParamMaps()

# Position of the first grid point that reaches the highest validation score.
best_idx = max(range(len(metrics)), key=metrics.__getitem__)
best_val_f = metrics[best_idx]
best_pars = param_maps[best_idx]

print(f"🏅  Best validation F1 = {best_val_f:.4f}")
print("Best params           :", best_pars)

# Score the winning model once on the held-out test split.
print("\n🧪  Evaluating on test set …")
best_model = tvs_model.bestModel
test_pred = best_model.transform(test_data)
test_f1 = evaluator.evaluate(test_pred)
print("Test F1               :", round(test_f1, 4))

🏅  Best validation F1 = 0.5893
Best params           : {Param(parent='ChiSqSelector_b1c3f09f47b9', name='numTopFeatures', doc='Number of features that selector will select, ordered by ascending p-value. If the number of features is < numTopFeatures, then this will select all features.'): 2000, Param(parent='LinearSVC_5a196dc8785c', name='regParam', doc='regularization parameter (>= 0).'): 0.1}

🧪  Evaluating on test set …
25/05/12 13:24:29 WARN DAGScheduler: Broadcasting large task binary with size 1244.1 KiB


[Stage 31201:>                                                      (0 + 2) / 2]

Test F1               : 0.599


                                                                                

In [13]:
# Bare expression as the cell's last line so the notebook displays the repr
# of the fitted TrainValidationSplitModel.
tvs_model

TrainValidationSplitModel_429d1a4d3188