## Part 1) RDDs

In [None]:
// Initialize the Scala interpreter by evaluating `sc`, the SparkContext handle
// that later cells use (e.g. `sc.textFile`)
sc

In [2]:
// Read the review development set from HDFS into a DataFrame using the JSON reader
val dataPath = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
val data_ = spark.read.format("json").load(dataPath)

dataPath: String = hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json
data_: org.apache.spark.sql.DataFrame = [asin: string, category: string ... 8 more fields]


In [4]:
// Project every review row onto a (category, reviewText) pair.
// `getAs[String]` returns null when the field is absent for a record; a null
// review text is replaced by "" so that the later `toLowerCase` call on the
// text cannot throw a NullPointerException. The category is passed through as is.
val pairs = data_.rdd.map { row =>
    val category = row.getAs[String]("category")
    val text = Option(row.getAs[String]("reviewText")).getOrElse("")
    (category, text)
}

pairs: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[9] at map at <console>:25


In [5]:
// Read the stopword file and bring it to the driver: each line of the file
// becomes one element of the local array
val stopwordsPath: String = "stopwords.txt"
val stopwords: Array[String] = sc.textFile(stopwordsPath).collect()

stopwordsPath: String = stopwords.txt
stopwords: Array[String] = Array(a, aa, able, about, above, absorbs, accord, according, accordingly, across, actually, after, afterwards, again, against, ain, album, album, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, app, appear, appreciate, appropriate, are, aren, around, as, aside, ask, asking, associated, at, available, away, awfully, b, baby, bb, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, bibs, bike, book, books, both, brief, bulbs, but, by, c, came, camera, can, cannot, cant, car, case, cause, ca...


In [6]:
// Tokenise each review and keep the distinct non-stopword terms it contains,
// re-joined into a single space-separated string per review.
//  * The text is lower-cased and split on runs of characters outside [a-zA-Z<>^|].
//  * Empty tokens (produced when the text starts with a delimiter, or is empty)
//    are dropped so that "" is never carried along as a term.
//  * Stopwords are looked up in a Set instead of scanning the whole Array for
//    every token.
//  * Each term is kept once per review (`distinct`). The chi-square cells that
//    consume pairs_clean combine the term counts with the number of reviews (NN)
//    and the number of reviews per category, so the term counts have to be
//    document frequencies; counting every occurrence would make
//    C = (reviews in category) - A negative for frequently repeated terms.
// The result is cached because several later cells traverse it.
val pairs_clean = {
    val stopwordSet = stopwords.toSet
    pairs.mapValues { text =>
        text.toLowerCase
            .split("[^a-zA-Z<>^|]+")
            .filter(token => token.nonEmpty && !stopwordSet.contains(token))
            .distinct
            .mkString(" ")
    }.cache()
}

pairs_clean: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[12] at mapValues at <console>:26


In [7]:
// Inspect how many partitions the cleaned RDD is split into
pairs_clean.partitions.length

res2: Int = 2


In [8]:
// Count the (category, term) pairs obtained by splitting every cleaned review on
// single spaces. Splitting an empty string, or a string that starts with a
// space, yields "" tokens; these are not terms and are filtered out so that the
// empty string never receives a count (and later a chi-square score).
val categoryTermCount = pairs_clean
    .flatMapValues(cleaned => cleaned.split(" ").filter(_.nonEmpty))
    .map(categoryAndTerm => (categoryAndTerm, 1))
    .reduceByKey(_ + _)
// Re-key by term for the following join on the term: (term, (category, A)),
// where A is the count of the term within the category
val result_A = categoryTermCount.map { case ((category, term), count) => (term, (category, count)) }

categoryTermCount: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[15] at reduceByKey at <console>:25
result_A: org.apache.spark.rdd.RDD[(String, (String, Int))] = MapPartitionsRDD[16] at map at <console>:27


In [10]:
// Count every term over all categories: (term, count).
// The cleaned review text is split on single spaces; "" tokens (from an empty
// or space-prefixed string) are not terms and are dropped before counting.
val countWords = pairs_clean
    .values
    .flatMap(cleaned => cleaned.split(" ").filter(_.nonEmpty))
    .map(term => (term, 1))
    .reduceByKey(_ + _)

countWords: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[20] at reduceByKey at <console>:25


In [13]:
// Attach the overall count of each term to its per-category counts (join key: term)
val rddJoin_1 = result_A.join(countWords)
// Re-key by category for the next join: (category, (term, A, B)),
// where B = overall count of the term minus its count inside the category
val result_A_B = rddJoin_1.map { case (term, ((category, inCategory), overall)) =>
    (category, (term, inCategory, overall - inCategory))
}

rddJoin_1: org.apache.spark.rdd.RDD[(String, ((String, Int), Int))] = MapPartitionsRDD[26] at join at <console>:27
result_A_B: org.apache.spark.rdd.RDD[(String, (String, Int, Int))] = MapPartitionsRDD[27] at map at <console>:29


In [15]:
// Print a single sample record as a sanity check
result_A_B.take(1).foreach(record => println(record))

(Movies_and_TV,(vecindad,1,0))


In [16]:
// Number of reviews per category: (category, count)
val countCategory = pairs_clean.keys.map(category => (category, 1)).reduceByKey(_ + _)
// Join on the category so every (term, A, B) record also carries its category's
// review count; the joined RDD is persisted
val rddJoin_2 = result_A_B.join(countCategory).persist()

countCategory: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[29] at reduceByKey at <console>:26
rddJoin_2: org.apache.spark.rdd.RDD[(String, ((String, Int, Int), Int))] = MapPartitionsRDD[32] at join at <console>:28


In [19]:
// Total number of records (reviews) in pairs_clean
val NN = pairs_clean.count().toInt
// Complete the contingency counts per (category, term):
//   C = reviews in the category - A
//   D = NN - B - reviews in the category
// Resulting record: (category, (term, A, B, C, D))
val result_A_B_C_D = rddJoin_2.map { case (category, ((term, a, b), inCategory)) =>
    val c = inCategory - a
    val d = NN - b - inCategory
    (category, (term, a, b, c, d))
}

NN: Int = 78829
result_A_B_C_D: org.apache.spark.rdd.RDD[(String, (String, Int, Int, Int, Int))] = MapPartitionsRDD[33] at map at <console>:29


In [20]:
// Chi-square score of every (category, term) record from its counts A, B, C, D:
//     chi2 = NN * (A*D - B*C)^2 / ((A+B) * (A+C) * (B+D) * (C+D))
// The arithmetic is carried out in Double. Float has a 24-bit significand, so
// the products A*D and B*C of review counts are not represented exactly once
// they exceed 2^24, and their difference loses significant digits. The score is
// converted to Float only at the very end, so the element type of the RDD stays
// (category, (Float, term)).
// When the denominator is 0 (one of the marginal totals is 0) the score is set
// to 0 instead of producing NaN/Infinity, which would disturb the later sort.
val result_A_B_C_D_chi2 = result_A_B_C_D.map { case (category, (term, a, b, c, d)) =>
    val A = a.toDouble
    val B = b.toDouble
    val C = c.toDouble
    val D = d.toDouble
    val diff = A * D - B * C
    val bottom = (A + B) * (A + C) * (B + D) * (C + D)
    val score = if (bottom == 0.0) 0.0 else NN * diff * diff / bottom
    (category, (score.toFloat, term))
}

result_A_B_C_D_chi2: org.apache.spark.rdd.RDD[(String, (Float, String))] = MapPartitionsRDD[34] at map at <console>:26


In [40]:
// Gather all (chi2, term) entries of a category and order them by descending chi2
val chi_grouped = result_A_B_C_D_chi2
    .groupByKey()
    .mapValues(scored => scored.toList.sortBy { case (chi2, _) => -chi2 })
// Keep the 75 highest-scoring terms of each category; categories are sorted by key
val chi_grouped_75 = chi_grouped.mapValues(_.take(75)).sortByKey()

chi_grouped: org.apache.spark.rdd.RDD[(String, List[(Float, String)])] = MapPartitionsRDD[47] at mapValues at <console>:26
chi_grouped_75: org.apache.spark.rdd.RDD[(String, List[(Float, String)])] = ShuffledRDD[51] at sortByKey at <console>:28


In [41]:
// Save the result as a local text file: one line per category, each line being
// the `toString` form of the (category, List((chi2, term), ...)) tuple.
// java.nio is used instead of scala.tools.nsc.io.File, which is part of the
// Scala compiler's internal I/O layer rather than a public file API, and the
// charset is fixed to UTF-8 instead of depending on the platform default.
// An existing file of the same name is overwritten.
java.nio.file.Files.write(
    java.nio.file.Paths.get("output_rdd.txt"),
    chi_grouped_75.collect().mkString("\n").getBytes(java.nio.charset.StandardCharsets.UTF_8))