# Exercise 2 - Text Processing and Classification using Spark
## Group 29

## Part 2 

In [1]:
import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, ChiSqSelector}
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.feature.ChiSqSelectorModel
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.functions._

Intitializing Scala interpreter ...

Spark Web UI available at http://captain01.os.hpc.tuwien.ac.at:9999/proxy/application_1685516010423_1030
SparkContext available as 'sc' (version = 3.2.3, master = yarn, app id = application_1685516010423_1030)
SparkSession available as 'spark'


import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, ChiSqSelector}
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.feature.ChiSqSelectorModel
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.functions._


In [6]:
val df = spark.read.json("hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json")

df: org.apache.spark.sql.DataFrame = [asin: string, category: string ... 8 more fields]


In [2]:
// Step 1: Tokenize reviewText into unigrams
val tokenizer = new Tokenizer()
  .setInputCol("reviewText")
  .setOutputCol("words")

// Step 2: Apply various transformations: stopword removal, and special character removal
val stopwordsPath = "stopwords.txt"
val stopwords = sc.textFile(stopwordsPath).collect().toList
val stopwordRemover = new StopWordsRemover()
  .setStopWords(stopwords.toArray)
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("filtered_words")

// Step 3: Count the term frequency of each word in the filtered words
val countVectorizer = new CountVectorizer()
  .setInputCol(stopwordRemover.getOutputCol)
  .setOutputCol("rawFeatures")

// Step 4: Apply the IDF transformation to get the TF-IDF weighted features
val idf = new IDF()
  .setInputCol(countVectorizer.getOutputCol)
  .setOutputCol("tfidfFeatures")

// Step 5: Apply Chi-Square feature selection to select the top 2000 terms overall
val selector = new ChiSqSelector()
  .setFeaturesCol(idf.getOutputCol)
  .setLabelCol("overall")
  .setOutputCol("selectedFeatures")
  .setNumTopFeatures(2000)

tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_395435f43919
stopwordsPath: String = stopwords.txt
stopwords: List[String] = List(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike, book, books, both, brief, bulbs,...


In [3]:
val pipeline = new Pipeline()
  .setStages(Array(tokenizer, stopwordRemover, countVectorizer, idf, selector))

pipeline: org.apache.spark.ml.Pipeline = pipeline_82665673785a


In [7]:
val pipelineModel = pipeline.fit(df)
val transformedData = pipelineModel.transform(df)

pipelineModel: org.apache.spark.ml.PipelineModel = pipeline_82665673785a
transformedData: org.apache.spark.sql.DataFrame = [asin: string, category: string ... 13 more fields]


In [10]:
// Get the selected features/terms
val selectedTerms = pipelineModel.stages(4).asInstanceOf[ChiSqSelectorModel]
  .selectedFeatures
  .map(index => pipelineModel.stages(2).asInstanceOf[CountVectorizerModel].vocabulary(index))

// Write the selected terms to the file
scala.tools.nsc.io.File("output_ds_2.txt").writeAll(selectedTerms.mkString("\n"))

selectedTerms: Array[String] = Array("", great, good, it's, love, it., time, don't, -, i'm, recommend, bought, make, easy, back, find, work, made, buy, i've, didn't, found, works, book., nice, lot, makes, doesn't, long, bit, feel, loved, characters, years, give, thought, fit, hard, it,, author, things, set, 2, pretty, thing, price, small, perfect, highly, big, part, purchased, book,, sound, enjoyed, series, end, 3, size, excellent, however,, bad, well., happy, wanted, 5, world, times, that's, me., enjoy, interesting, ordered, you're, isn't, 4, amazon, loves, job, read., problem, order, takes, writing, received, wait, family, top, favorite, couldn't, character, great., short, worked, won't, kind, couple, wonderful, left, cover, money, battery, wasn't, felt, started, fits, purchase, revie...


## Part 3

In [17]:
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.StringIndexer

import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.StringIndexer


In [96]:
val df = spark.read.json("hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json")

df: org.apache.spark.sql.DataFrame = [asin: string, category: string ... 8 more fields]


In [97]:
val Array(train, test) = df.randomSplit(Array(0.8, 0.2), seed = 1234L)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [asin: string, category: string ... 8 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [asin: string, category: string ... 8 more fields]


In [19]:
// Step 1: Tokenize reviewText into unigrams
val tokenizer = new Tokenizer()
  .setInputCol("reviewText")
  .setOutputCol("words")

// Step 2: Apply various transformations: stopword removal, and special character removal
// val stopwords = StopWordsRemover.loadDefaultStopWords("english")
val stopwordsPath = "stopwords.txt"
val stopwords = sc.textFile(stopwordsPath).collect().toList
val stopwordRemover = new StopWordsRemover()
  .setStopWords(stopwords.toArray)
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("filtered_words")

// Step 3: Count the term frequency of each word in the filtered words
val countVectorizer = new CountVectorizer()
  .setInputCol(stopwordRemover.getOutputCol)
  .setOutputCol("rawFeatures")

// Step 4: Apply the IDF transformation to get the TF-IDF weighted features
val idf = new IDF()
  .setInputCol(countVectorizer.getOutputCol)
  .setOutputCol("tfidfFeatures")

// Step 5: Apply Chi-Square feature selection to select the top 2000 terms overall
val selector = new ChiSqSelector()
  .setFeaturesCol(idf.getOutputCol)
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")
  .setNumTopFeatures(2000)

// Step 6: Add vector length normalization
val normalizer = new Normalizer()
  .setInputCol("selectedFeatures")
  .setOutputCol("normalizedFeatures")
  .setP(2.0) // L2 norm

// Step 7: Convert the "category" column to a numeric type
val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("label")

// Define the SVM classifier
val svm = new LinearSVC()
  //.setLabelCol("label")
  // .setFeaturesCol("normalizedFeatures")
   .setMaxIter(10)
   .setRegParam(0.1)

// Step 8 Define the One-vs-Rest classifier
val ovr = new OneVsRest()
   .setClassifier(svm)
   .setFeaturesCol("normalizedFeatures")
   .setLabelCol("label")

val pipeline_2 = new Pipeline()
    .setStages(Array(tokenizer, stopwordRemover, countVectorizer, idf, indexer, selector, normalizer, ovr))

tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_9e0dacdd2a24
stopwordsPath: String = stopwords.txt
stopwords: List[String] = List(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike, book, books, both, brief, bulbs,...


In [99]:
// Fit the pipeline to training documents
val model = pipeline_2.fit(train)

model: org.apache.spark.ml.PipelineModel = pipeline_d37b793f627b


In [100]:
// Make predictions on the test data
val predictions = model.transform(test)

predictions: org.apache.spark.sql.DataFrame = [asin: string, category: string ... 17 more fields]


In [101]:
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("f1")


val f1 = evaluator.evaluate(predictions)

evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = MulticlassClassificationEvaluator: uid=mcEval_bcd399b031fb, metricName=f1, metricLabel=0.0, beta=1.0, eps=1.0E-15
f1: Double = 0.5868747088097422


## Using grid search for parameter optimization

In [31]:
// Apply downsampling
val downsampledDF = df.sample(fraction = 0.5, seed = 1234L)

downsampledDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [asin: string, category: string ... 8 more fields]


In [32]:
val Array(train_1, test_1) = downsampledDF.randomSplit(Array(0.8, 0.2), seed = 1234L)

train_1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [asin: string, category: string ... 8 more fields]
test_1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [asin: string, category: string ... 8 more fields]


In [33]:
// Create the parameter grid for the grid search
val paramGrid = new ParamGridBuilder()
  .addGrid(selector.numTopFeatures, Array(2000, 500))
  .addGrid(svm.regParam, Array(0.1, 0.01, 0.001))
  .addGrid(svm.standardization, Array(true, false))
  .addGrid(svm.maxIter, Array(10, 20))
  .build()

// Define the evaluator
val evaluator_2 = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("f1")

// Perform cross-validation with grid search
val cv = new CrossValidator()
  .setEstimator(pipeline_2)
  .setEvaluator(evaluator_2)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(5) // number of folds for cross-validation

paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linearsvc_76cea51520dc-maxIter: 10,
	chiSqSelector_b0672845a847-numTopFeatures: 2000,
	linearsvc_76cea51520dc-regParam: 0.1,
	linearsvc_76cea51520dc-standardization: true
}, {
	linearsvc_76cea51520dc-maxIter: 10,
	chiSqSelector_b0672845a847-numTopFeatures: 2000,
	linearsvc_76cea51520dc-regParam: 0.01,
	linearsvc_76cea51520dc-standardization: true
}, {
	linearsvc_76cea51520dc-maxIter: 10,
	chiSqSelector_b0672845a847-numTopFeatures: 2000,
	linearsvc_76cea51520dc-regParam: 0.001,
	linearsvc_76cea51520dc-standardization: true
}, {
	linearsvc_76cea51520dc-maxIter: 10,
	chiSqSelector_b0672845a847-numTopFeatures: 2000,
	linearsvc_76cea51520dc-regParam: 0.1,
	linearsvc_76cea51520dc-standardization: false
}, {
	linearsvc_76cea51520d...


In [None]:
// Fit the cross-validation model to the training data
val cvModel = cv.fit(train_1)

In [None]:
// Get the best model from the cross-validation
val bestModel = cvModel.bestModel

In [None]:
// Make predictions on the test data using the best model
val predictions = bestModel.transform(test)

// Evaluate the predictions using the evaluator
val f1Score = evaluator.evaluate(predictions)

// Print the F1 score on the test data
println(s"F1 score on the test data: $f1Score")