In [63]:
// Test: hdfs:///user/pknees/amazon-reviews/full/reviews_devset.json 
// Full: hdfs:///user/pknees/amazon-reviews/full/reviewscombined.json
val data_file = "hdfs:///user/pknees/amazon-reviews/full/reviewscombined.json"
val stopwords_file = "hdfs:///user/e12132344/stopwords.txt"

data_file: String = hdfs:///user/pknees/amazon-reviews/full/reviewscombined.json
stopwords_file: String = hdfs:///user/e12132344/stopwords.txt


In [60]:
!hadoop fs -rm -r hdfs:///user/e12132344/output_rdd
!rm output_rdd.txt

rm: `hdfs:///user/e12132344/output_rdd': No such file or directory


rm: cannot remove ‘output_rdd.txt’: No such file or directory



# Imports

In [61]:
import org.json4s._
import org.json4s.jackson.JsonMethods._
import java.util.StringTokenizer
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.HashSet

import org.json4s._
import org.json4s.jackson.JsonMethods._
import java.util.StringTokenizer
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.HashSet


## Functions & Parsing Stopwords

In [64]:
// Parsing Stop Words
val stop_words_file = sc.textFile(stopwords_file)
val stop_words: HashSet[String] = HashSet()
stop_words_file.collect().foreach(v => stop_words += v)

// Review Text -> tokens: List[String]
def tokenize(text: String): List[String] = {
    val tokenizer = new StringTokenizer(text, " ()[]{}.!?,;:+=-_\"'`~#@&*%€$§\\/")
    var tokens = new ListBuffer[String]()
     while (tokenizer.hasMoreTokens()) {
      tokens += tokenizer.nextToken().toLowerCase()
     }
    tokens.toList
}

// Get String from Option
def option_to_string(x: Option[String]): String = x match {
  case Some(s) => s
  case None => "?"
}

stop_words_file: org.apache.spark.rdd.RDD[String] = hdfs:///user/e12132344/stopwords.txt MapPartitionsRDD[399] at textFile at <console>:96
stop_words: scala.collection.mutable.HashSet[String] = Set(overall, at, seemed, baby, it, together, very, j, down, b, toward, used, enough, thereby, least, gotten, hopefully, being, relatively, except, certainly, yours, lately, serious, further, something, trying, sure, truly, kitchen, us, away, uses, contain, rd, these, ve, looking, where, necessary, toy, okay, described, nobody, as, appropriate, placed, on, is, hadn, already, a, having, be, movie, beside, different, respectively, just, former, everyone, ask, indicates, novel, entirely, lest, insofar, x, its, nothing, aa, mower, seem, allow, ever, use, corresponding, able, hereupon, several, install...

In [66]:
// Reading file 
val reviews_file = sc.textFile(file)

// Parsing JSON -> extracting "category" & "reaviewText" (tokenize -> case folding -> stop word filtering)
val reviews_json = reviews_file.map(line => parse(line).values.asInstanceOf[Map[String, String]])
val reviews = reviews_json.map(j => 
    (option_to_string(j.get("category")), tokenize(j.get("reviewText").toString()).filter(e => !stop_words(e)))) 

val num_docs = reviews_json.count()

// <category> -> [<term1>, <term2>, ... ] => [<category> -> <term1>, <category> -> <term2>, ...] 
val categories_tokens = reviews.flatMapValues(v => v)

val tokens = reviews.flatMap(j => j._2)
val categories = reviews.map(j => j._1)

// Count token, category, category&token
val count_tokens = tokens.map(v => (v,1)).reduceByKey((a,b) => a+b)
val count_category = categories.map(v => (v,1)).reduceByKey((a,b) => a+b)
val count_categoriestokens = categories_tokens.map(v => (v,1)).reduceByKey((a,b) => a+b)

// Joins count_categories_tokens with count_tokens ON token -> ReMap on category 
val count_tokens_categoriestokens = count_categories_tokens.map(v => (v._1._2, (v._1._1, v._2))).join(count_tokens)
    .map(v => (v._2._1._1, (v._1, v._2._1._2, v._2._2)))

/*
 * Joins count_category with count_tokens_categoriestokens ON category 
 * Calculate Chi^2 Value for every category X term
 * * ReMap on (category, (token, chi_square))
 */
val chi_square_values = count_tokens_categoriestokens.join(count_category).map(v => {
    val category = v._1
    val term = v._2._1._1
    val count_term_cat = v._2._1._2
    val count_term = v._2._1._3
    val count_cat = v._2._2
    
    val a: Double = count_term_cat
    val b: Double = count_term - count_term_cat
    val c: Double = count_cat - count_term_cat
    val d: Double = num_docs - count_cat - count_term + count_term_cat
    
    var chi_square: Double = (a*d) - (b*c)
    chi_square = chi_square * chi_square
    
    chi_square = chi_square / (a+b)
    chi_square = chi_square / (a+c)
    chi_square = chi_square / (b+d)
    chi_square = chi_square / (c+d)
    
    (category, (term, chi_square))
})

val result = chi_square_values.groupByKey().map(v => v._1.toString() + " " + v._2.toList.sortWith(_._2 > _._2).take(150).map(v => v._1+":"+v._2).mkString(" "))
result.saveAsTextFile("output_rdd")


reviews_file: org.apache.spark.rdd.RDD[String] = hdfs:///user/pknees/amazon-reviews/full/reviewscombined.json MapPartitionsRDD[405] at textFile at <console>:107
reviews_json: org.apache.spark.rdd.RDD[Map[String,String]] = MapPartitionsRDD[406] at map at <console>:110
reviews: org.apache.spark.rdd.RDD[(String, List[String])] = MapPartitionsRDD[407] at map at <console>:111
num_docs: Long = 78828876
categories_tokens: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[408] at flatMapValues at <console>:117
tokens: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[409] at flatMap at <console>:119
categories: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[410] at map at <console>:120
count_tokens: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[412] at reduceByKey at ...

In [67]:
!hadoop fs -getmerge hdfs:///user/e12132344/output_rdd output_rdd.txt