## Part 1 RDDs
Repeat the steps of Assignment 1, i.e. calculation of chi-square values and output of the sorted top terms per category, as well as the joined dictionary, using RDDs and transformations. Write the output to a file output_rdd.txt. Compare the generated output_rdd.txt with your generated output.txt from Assignment 1 and describe your observations briefly in the submission report (see Part 3).

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import json
import re

In [2]:
# Initialize (or reuse) the Spark context and session.
# getOrCreate keeps this cell idempotent: re-running it in a live kernel reuses
# the already running context instead of failing because a second SparkContext
# cannot be started in the same process.
conf = SparkConf().setAppName("ChiSquareAnalysis")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

SLF4J: Class path contains multiple SLF4J bindings.

In [3]:
# Display the session object to confirm that Spark was started
spark

### Preprocessing

In [4]:
# Input locations: the local stopword list and the review dev set on HDFS
stopwords_file = "stopwords.txt"
input_file = "hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json"

# Read the whitespace-separated stopwords into a set for fast membership tests
with open(stopwords_file, "r") as handle:
    stopwords = set(handle.read().split())

# One RDD element per line of the input file
reviews_rdd = sc.textFile(input_file)

In [5]:
# Peek at the first two raw lines of reviews_rdd (each line is one JSON-encoded review)
reviews_rdd.take(2)

['{"reviewerID": "A2VNYWOPJ13AFP", "asin": "0981850006", "reviewerName": "Amazon Customer \\"carringt0n\\"", "helpful": [6, 7], "reviewText": "This was a gift for my other husband.  He\'s making us things from it all the time and we love the food.  Directions are simple, easy to read and interpret, and fun to make.  We all love different kinds of cuisine and Raichlen provides recipes from everywhere along the barbecue trail as he calls it. Get it and just open a page.  Have at it.  You\'ll love the food and it has provided us with an insight into the culture that produced it. It\'s all about broadening horizons.  Yum!!", "overall": 5.0, "summary": "Delish", "unixReviewTime": 1259798400, "reviewTime": "12 3, 2009", "category": "Patio_Lawn_and_Garde"}',
 '{"reviewerID": "A2E5XXXC07AGA7", "asin": "B00002N66D", "reviewerName": "James", "helpful": [1, 1], "reviewText": "This is a very nice spreader.  It feels very solid and the pneumatic tires give it great maneuverability and handling over

In [6]:
def preprocess_text(text):
    """Return the set of distinct lowercase tokens found in ``text``.

    The text is lowercased and split on whitespace runs, digit runs and a fixed
    list of punctuation/symbol characters. Because the result is a set, every
    token appears at most once. Empty strings produced by adjacent delimiters
    are kept in the result, so callers have to filter them out.
    """
    delimiter_pattern = r'\s+|\d+|[(){}[\].!?,;:+=_"\'`~#@&*%€$§\\/\-]'
    return set(re.split(delimiter_pattern, text.lower()))

def valid_word(word):
    """Return ``word`` if it is a usable term, otherwise ``None``.

    A term is usable when it has more than one character and is not contained
    in the module-level ``stopwords`` collection. The result is intended to be
    used as a truthy/falsy filter condition.
    """
    if len(word) <= 1 or word in stopwords:
        return None
    return word
    
# Parse every input line into a dict (one JSON document per line)
json_rdd = reviews_rdd.map(json.loads)


def _category_term_pairs(review):
    """Return one (category, term) pair per distinct valid term of a review."""
    return [
        (review["category"], term)
        for term in preprocess_text(review['reviewText'])
        if valid_word(term)
    ]


# Each pair stands for "this term occurs in one review of this category"
word_category_rdd = json_rdd.flatMap(_category_term_pairs)

In [7]:
# Inspect one (category, term) pair
word_category_rdd.take(1)

[('Patio_Lawn_and_Garde', 'insight')]

In [8]:
# Count how many reviews of a category contain a term:
# tag every (category, term) pair with 1 and sum the tags per pair
combined_rdd = (
    word_category_rdd
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda total, one: total + one)
)

In [9]:
# Inspect one ((category, term), count) record
combined_rdd.take(1)

[(('Patio_Lawn_and_Garde', 'horizons'), 1)]

In [10]:
def _key_by_category(record):
    """Turn ((category, term), count) into (category, (term, count))."""
    (category, term), count = record
    return category, (term, count)


# Same counts as combined_rdd, keyed by category only
reduced_rdd = combined_rdd.map(_key_by_category)

In [11]:
# Intermediate result preview: for two categories, print the category name
# followed by five of its (term, count) entries
preview_groups = reduced_rdd.groupByKey().take(2)
for category, term_counts in preview_groups:
    print(category)
    for term, count in list(term_counts)[:5]:
        print(term, count)

Apps_for_Android
scripture 2
verse 2
tremendously 1
chapter 3
friendly 13
Book
discourse 16
feel 1440
anger 67
love 3812
consumed 26


### Intermediate step
Compute the counts that were stored in counters.txt in our MapReduce job: the total number of reviews and the number of reviews per category.

In [12]:
# Total number of reviews (one per input line);
# used further down as N in the chi-square computation
dataset_length = reviews_rdd.count()
dataset_length

78829

In [13]:
# Number of reviews per category: emit (category, 1) per review and sum per category
reviews_per_category_count_rdd = (
    json_rdd
    .map(lambda review: (review['category'], 1))
    .reduceByKey(lambda total, one: total + one)
)
reviews_per_category_count_rdd.collect()

[('Apps_for_Android', 2638),
 ('Book', 22507),
 ('Toys_and_Game', 2253),
 ('Office_Product', 1243),
 ('Digital_Music', 836),
 ('Automotive', 1374),
 ('Beauty', 2023),
 ('Kindle_Store', 3205),
 ('Electronic', 7825),
 ('Movies_and_TV', 4607),
 ('Tools_and_Home_Improvement', 1926),
 ('Grocery_and_Gourmet_Food', 1297),
 ('Musical_Instrument', 500),
 ('CDs_and_Vinyl', 3749),
 ('Clothing_Shoes_and_Jewelry', 5749),
 ('Home_and_Kitche', 4254),
 ('Cell_Phones_and_Accessorie', 3447),
 ('Pet_Supplie', 1235),
 ('Baby', 916),
 ('Health_and_Personal_Care', 2982),
 ('Patio_Lawn_and_Garde', 994),
 ('Sports_and_Outdoor', 3269)]

### Calculate Chi-Square

In [14]:
# A = number of documents in category c that contain term t.
# reduced_rdd already holds these counts; only the key changes to (category, term).
def _a_record(record):
    """Turn (category, (term, count)) into ((category, term), count)."""
    category, (term, count) = record
    return (category, term), count


A_value_rdd = reduced_rdd.map(_a_record)
A_value_rdd.take(1)

[(('Musical_Instrument', 'mic'), 24)]

In [15]:
# Number of documents containing each term, summed over all categories.
# The values of reduced_rdd are already (term, count) pairs, so they can be
# reduced by term directly.
B_value_complete_rdd = reduced_rdd.values().reduceByKey(lambda total, count: total + count)
B_value_complete_rdd.take(1)

[('means', 824)]

In [16]:
#B_value_temp_rdd = reduced_rdd.map(lambda x: (x[1][0],(x[0],x[1][1]))).join(B_value_complete_rdd)
#B_value_temp_rdd.take(1)

In [17]:
# B = number of documents NOT in category c that contain term t,
# obtained as (documents containing t over all categories) - A.
# A and B are kept together in one RDD so the join result can be reused later.
def _a_keyed_by_term(record):
    """Turn ((category, term), a) into (term, (category, a)) for the join on term."""
    (category, term), a = record
    return term, (category, a)


def _a_b_record(record):
    """Turn the joined (term, ((category, a), total)) into ((category, term), (a, b))."""
    term, ((category, a), total) = record
    return (category, term), (a, total - a)


A_B_value_rdd = (
    A_value_rdd
    .map(_a_keyed_by_term)
    .join(B_value_complete_rdd)
    .map(_a_b_record)
)
A_B_value_rdd.take(1)

[(('Musical_Instrument', 'items'), (5, 686))]

In [18]:
# C = number of documents in category c that do NOT contain term t:
# all reviews of the category minus the reviews of the category containing t (A)
def _c_record(record):
    """Turn the joined (category, ((term, a), category_size)) into ((category, term), c)."""
    category, ((term, a), category_size) = record
    return (category, term), category_size - a


C_value_rdd = reduced_rdd.join(reviews_per_category_count_rdd).map(_c_record)
C_value_rdd.take(1)

[(('Electronic', 'cable'), 7262)]

In [19]:
# D = number of documents NOT in category c that do NOT contain term t:
# total number of documents minus (A + B + C).
# All four values are stored together per (category, term) key.
def _a_b_c_d_record(record):
    """Turn the joined (key, ((a, b), c)) into (key, (a, b, c, d))."""
    key, ((a, b), c) = record
    return key, (a, b, c, dataset_length - (a + b + c))


A_B_C_D_values_rdd = A_B_value_rdd.join(C_value_rdd).map(_a_b_c_d_record)
A_B_C_D_values_rdd.take(1)

[(('Clothing_Shoes_and_Jewelry', 'outcome'), (1, 101, 5748, 72979))]

In [20]:
# Chi-square per (category, term):
#   chi2 = N * (A*D - B*C)^2 / ((A+B) * (A+C) * (B+D) * (C+D))
# with N = dataset_length. The result is re-keyed by category.
def _chi_square_record(record):
    """Turn ((category, term), (a, b, c, d)) into (category, (term, chi_square))."""
    (category, term), (a, b, c, d) = record
    numerator = dataset_length * (a * d - b * c) ** 2
    denominator = (a + b) * (a + c) * (b + d) * (c + d)
    return category, (term, numerator / denominator)


chi_square_rdd = A_B_C_D_values_rdd.map(_chi_square_record)

In [21]:
# Inspect one (category, (term, chi_square)) record
chi_square_rdd.take(1)

[('Movies_and_TV', ('scripture', 0.08225379139805572))]

In [22]:
# Global sort, descending by category name and then by chi-square value
def _category_then_score(record):
    """Sort key (category, chi_square) for a (category, (term, chi_square)) record."""
    category, (_, score) = record
    return category, score


chi_sort_rdd = chi_square_rdd.sortBy(_category_then_score, ascending=False)
chi_sort_rdd.take(10)

[('Toys_and_Game', ('toys', 1411.960520524251)),
 ('Toys_and_Game', ('loves', 1251.6935512104387)),
 ('Toys_and_Game', ('lego', 1219.5840386320606)),
 ('Toys_and_Game', ('son', 1183.031490681739)),
 ('Toys_and_Game', ('grandson', 953.6748919830555)),
 ('Toys_and_Game', ('dolls', 937.5559944440915)),
 ('Toys_and_Game', ('play', 847.7801996293716)),
 ('Toys_and_Game', ('birthday', 826.025153711533)),
 ('Toys_and_Game', ('kids', 799.8701275441781)),
 ('Toys_and_Game', ('christmas', 730.5924214814038))]

In [23]:
# Keep the 75 terms with the highest chi-square value for every category,
# with categories in alphabetical order.
# The ranking is done inside each group: groupByKey shuffles the records, so the
# order produced by an earlier sortBy is not guaranteed to be preserved in the
# grouped values and must not be relied on when slicing.
TOP_TERMS_PER_CATEGORY = 75


def _top_terms(term_scores):
    """Return the highest-scoring (term, chi_square) pairs of one category, best first."""
    # Ties on the score are broken alphabetically so repeated runs give the same result
    ranked = sorted(term_scores, key=lambda term_score: (-term_score[1], term_score[0]))
    return ranked[:TOP_TERMS_PER_CATEGORY]


chi_cropped_rdd = chi_square_rdd.groupByKey().mapValues(_top_terms).sortByKey()
chi_cropped_rdd.take(1)

[('Apps_for_Android',
  [('games', 3081.1493374842926),
   ('play', 2158.3694068201294),
   ('graphics', 1505.5108977351497),
   ('kindle', 1470.820942569012),
   ('addictive', 1311.905562727777),
   ('challenging', 1038.1284558527927),
   ('coins', 1002.6647889526382),
   ('addicting', 990.8441134974868),
   ('fire', 956.1470053110605),
   ('levels', 825.3813282736016),
   ('playing', 692.9340396014182),
   ('ads', 642.3969794099202),
   ('puzzles', 596.7716753070063),
   ('apps', 548.7810653104153),
   ('free', 500.9884786241356),
   ('bingo', 409.2358492981346),
   ('mahjong', 322.00891943980963),
   ('download', 303.8649278202287),
   ('faotd', 288.8577201586641),
   ('facebook', 282.51705437029005),
   ('downloaded', 262.77022492215735),
   ('hints', 242.61029019440056),
   ('solitaire', 211.6429957838186),
   ('android', 211.58105849598613),
   ('puzzle', 198.85582217352504),
   ('gameplay', 198.5123356770461),
   ('freezes', 189.67737127837006),
   ('unlock', 185.7521008338788),

In [60]:
# Collect the (category, top terms) rows once and derive everything else from
# this local copy. Running a second Spark job just to extract the words would
# recompute the whole lineage and could disagree with the rows already collected.
data = chi_cropped_rdd.collect()


def extract_words(record):
    """Return the terms of one (category, [(term, chi_square), ...]) record."""
    _, word_list = record
    return [word for word, _ in word_list]


# Joined dictionary: all distinct top terms over all categories, sorted alphabetically
words = sorted({word for row in data for word in extract_words(row)})

# File layout: one line per category (Python representation of the row), then
# all dictionary terms on a final line, each term followed by a single space.
# The encoding is set explicitly so terms with non-ASCII characters can be
# written regardless of the platform default.
with open("output_rdd.txt", "w", encoding="utf-8") as writer:
    writer.writelines(str(row) + '\n' for row in data)
    writer.write("".join(word + " " for word in words))

In [None]:
# Shut down the Spark session once all results have been written
spark.stop()