In [1]:
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, CountVectorizer, CountVectorizerModel, IDF, StringIndexer, ChiSqSelector, Normalizer}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit, TrainValidationSplitModel}
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

val SEED = 1228952

Intitializing Scala interpreter ...

Spark Web UI available at http://c100.local:8088/proxy/application_1596895008206_26694
SparkContext available as 'sc' (version = 2.4.0-cdh6.3.2, master = yarn, app id = application_1596895008206_26694)
SparkSession available as 'spark'


import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, CountVectorizer, CountVectorizerModel, IDF, StringIndexer, ChiSqSelector, Normalizer}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit, TrainValidationSplitModel}
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
SEED: Int = 1228952


# Load Data

In [5]:
// load dataframe and select relevant columns
val DEVSET = "hdfs:///user/pknees/amazon-reviews/full/reviews_devset.json"
val reviewsDf = spark.read.json(DEVSET).select("category", "reviewText")

DEVSET: String = hdfs:///user/pknees/amazon-reviews/full/reviews_devset.json
reviewsDf: org.apache.spark.sql.DataFrame = [category: string, reviewText: string]


In [3]:
// generate subsample of data for testing purposes
// val subset = reviewsDf.sample(fraction=0.01, withReplacement=false, seed=SEED)
// val reviewsDf = subset

In [None]:
reviewsDf.show()

+--------------------+--------------------+
|            category|          reviewText|
+--------------------+--------------------+
|Patio_Lawn_and_Garde|This was a gift f...|
|Patio_Lawn_and_Garde|This is a very ni...|
|Patio_Lawn_and_Garde|The metal base wi...|
|Patio_Lawn_and_Garde|For the most part...|
|Patio_Lawn_and_Garde|This hose is supp...|
|Patio_Lawn_and_Garde|This tool works v...|
|Patio_Lawn_and_Garde|This product is a...|
|Patio_Lawn_and_Garde|I was excited to ...|
|Patio_Lawn_and_Garde|I purchased the L...|
|Patio_Lawn_and_Garde|Never used a manu...|
|Patio_Lawn_and_Garde|Good price. Good ...|
|Patio_Lawn_and_Garde|I have owned the ...|
|Patio_Lawn_and_Garde|I had "won" a sim...|
|Patio_Lawn_and_Garde|The birds ate all...|
|Patio_Lawn_and_Garde|Bought last summe...|
|Patio_Lawn_and_Garde|I knew I had a mo...|
|Patio_Lawn_and_Garde|I was a little wo...|
|Patio_Lawn_and_Garde|I have used this ...|
|Patio_Lawn_and_Garde|I actually do not...|
|Patio_Lawn_and_Garde|Just what 

In [None]:
reviewsDf.count()

res2: Long = 78829


# Feature Creation Steps (Part 2)

In [None]:
// copy of feature creation steps from part 2
val indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("label")

val tokenizer = new RegexTokenizer()
    .setInputCol("reviewText")
    .setOutputCol("tokensRaw")
    .setPattern("[ \t0123456789.!?,;:()\\[\\]{}\\-_\"'`~#&*%$\\\\/]+")
    .setToLowercase(true)
    .setMinTokenLength(2)

val stopWordsRemover = new StopWordsRemover()
    .setInputCol("tokensRaw")
    .setOutputCol("tokens")
    .setStopWords(Array("a", "aa", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "ain", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "aren", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "b", "bb", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bibs", "book", "both", "brief", "but", "by", "c", "came", "can", "cannot", "cant", "car", "cause", "causes", "cd", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "course", "currently", "d", "definitely", "described", "despite", "did", "didn", "different", "do", "does", "doesn", "doing", "don", "done", "down", "downwards", "during", "e", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f", "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "g", "game", "game", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn", "happens", "hardly", "has", "hasn", "have", "haven", "having", "he", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "i", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isn", "it", "its", "itself", "j", "just", "k", "keep", "keeps", "kept", "know", "known", "knows", "l", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "life", "like", "liked", "likely", "little", "ll", "look", "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might", "mon", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "n", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "p", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "s", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldn", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "t", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "there", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "u", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "v", "value", "various", "ve", "very", "via", "viz", "vs", "want", "wants", "was", "wasn", "way", "we", "welcome", "well", "went", "were", "weren", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "won", "wonder", "would", "wouldn", "x", "y", "yes", "yet", "you", "your", "yours", "yourself", "yourselves", "z", "zero"))

val countVectorizer = new CountVectorizer()
    .setInputCol("tokens")
    .setOutputCol("featuresRaw")

val idf = new IDF()
    .setInputCol("featuresRaw")
    .setOutputCol("featuresWeighted")

val selector = new ChiSqSelector()
    .setLabelCol("label")
    .setFeaturesCol("featuresWeighted")
    .setOutputCol("features")

indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_ae3cb28b63aa
tokenizer: org.apache.spark.ml.feature.RegexTokenizer = regexTok_383b38ee05a1
stopWordsRemover: org.apache.spark.ml.feature.StopWordsRemover = stopWords_7cb044318e63
countVectorizer: org.apache.spark.ml.feature.CountVectorizer = cntVec_c1f99181f95c
idf: org.apache.spark.ml.feature.IDF = idf_565e7c106b26
selector: org.apache.spark.ml.feature.ChiSqSelector = chiSqSelector_f40b2ffd9229


# Normalizer

In [None]:
// apply normalizer to features using L2 norm
val normalizer = new Normalizer()
    .setInputCol("features")
    .setOutputCol("featuresNorm")
    .setP(2.0)  // L2 norm

normalizer: org.apache.spark.ml.feature.Normalizer = normalizer_5a634e02e072


# Classifier

In [None]:
// setup classifier and enable use SVM in OneVsRest for multi-class problems
val svc = new LinearSVC()
val ovr = new OneVsRest()
    .setClassifier(svc)
    .setFeaturesCol("featuresNorm")
    .setLabelCol("label")

svc: org.apache.spark.ml.classification.LinearSVC = linearsvc_ee66215b518b
ovr: org.apache.spark.ml.classification.OneVsRest = oneVsRest_5f2adc9d883c


# Pipeline

In [None]:
// setup pipeline with all steps for the text classification problem
val pipeline = new Pipeline()
    .setStages(Array(indexer, tokenizer, stopWordsRemover, countVectorizer, idf, selector, normalizer, ovr))

pipeline: org.apache.spark.ml.Pipeline = pipeline_762cbcb5029e


# Parameter Grid

In [None]:
// setup parameter grid for HyperParameter optimization using feasable parameters
val paramGrid = new ParamGridBuilder()
    .addGrid(selector.numTopFeatures, Array(4000, 400))
    .addGrid(svc.regParam, Array(0.1, 1, 10)) 
    .addGrid(svc.maxIter, Array(10, 1000))
    .addGrid(svc.standardization, Array(true, false))
    .build()

paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linearsvc_ee66215b518b-maxIter: 10,
	chiSqSelector_f40b2ffd9229-numTopFeatures: 4000,
	linearsvc_ee66215b518b-regParam: 0.1,
	linearsvc_ee66215b518b-standardization: true
}, {
	linearsvc_ee66215b518b-maxIter: 10,
	chiSqSelector_f40b2ffd9229-numTopFeatures: 400,
	linearsvc_ee66215b518b-regParam: 0.1,
	linearsvc_ee66215b518b-standardization: true
}, {
	linearsvc_ee66215b518b-maxIter: 1000,
	chiSqSelector_f40b2ffd9229-numTopFeatures: 4000,
	linearsvc_ee66215b518b-regParam: 0.1,
	linearsvc_ee66215b518b-standardization: true
}, {
	linearsvc_ee66215b518b-maxIter: 1000,
	chiSqSelector_f40b2ffd9229-numTopFeatures: 400,
	linearsvc_ee66215b518b-regParam: 0.1,
	linearsvc_ee66215b518b-standardization: true
}, {
	linearsvc_ee66215b518b-...

# Evaluator

In [45]:
// evaluator for multi-class problems using f1 score
val evaluator = new MulticlassClassificationEvaluator()
    .setMetricName("f1")

evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_57449ab8f980


# Hyper-parameter tuning

In [None]:
// to save resources we use TrainValidation Split instead of Cross Validation
val trainValidationSplit = new TrainValidationSplit()
    .setEstimator(pipeline)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8)
    .setParallelism(2)
    .setSeed(SEED)

trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_9e06415eaea1


# Running the Model

In [7]:
// split data in training and testing data (validation data is created by TrainValidationSplit for each parameter combination)
val Array(train, test) = reviewsDf.randomSplit(Array(0.9, 0.1), seed = SEED)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [category: string, reviewText: string]


In [None]:
// train model using Train Validation split and Grid Search
val tvModel = trainValidationSplit.fit(train)

In [None]:
// save model
tvModel.write.overwrite().save("/tmp/e1228952/my_model")

In [48]:
// load model from folder
val myModel = TrainValidationSplitModel.load("/tmp/e1228952/text-processing-model")

myModel: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_834b603bfe97


In [49]:
// predict labels
val predictions = myModel.transform(test)

predictions: org.apache.spark.sql.DataFrame = [category: string, reviewText: string ... 9 more fields]


In [50]:
predictions.select("label", "prediction").show()

+-----+----------+
|label|prediction|
+-----+----------+
| 10.0|      12.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|       0.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|       2.0|
| 10.0|      10.0|
| 10.0|      10.0|
| 10.0|      10.0|
+-----+----------+
only showing top 20 rows



In [51]:
// overall f1 score
val f1 = evaluator.evaluate(predictions)

f1: Double = 0.7228184192298264


In [53]:
val bestModel = myModel.bestModel.asInstanceOf[PipelineModel]

bestModel: org.apache.spark.ml.PipelineModel = pipeline_fc3776ea7eff


In [54]:
bestModel.explainParams()

res31: String = ""
