# DIC SS 2023 - Exercise 1


## Introduction
Goal: Calculate the chi-square measure for the terms found in the reviews and report the top 75 terms, using the file:

'reviews_devset.json', which is a small fraction of the large file 'reviewscombined.json', a set of item reviews taken from Amazon.



The value of $\chi^2_{tc}$ for a term $t$ and a category $c$ is given by the formula:

$\chi^2_{tc} = \frac{N(AD-BC)^2}{(A+B)(A+C)(B+D)(C+D)}$

where:

- A = number of items of category $c$ which contain the term $t$

- B = number of items not of category $c$ which contain the term $t$

- C = number of items of category $c$ which do not contain the term $t$

- D = number of items not of category $c$ which do not contain the term $t$

- N = total number of items (i.e. total number of reviews)



## Dataset

Here is an extract of the file "reviews_devset.json".

The only keys that are useful are "category" and "reviewText"


In [1]:
import json

file = 'reviews_devset.json'

# The file holds one JSON object per line rather than a single JSON document,
# so every line is parsed on its own. The context manager closes the handle
# as soon as the list is built.
with open(file, 'r', encoding='utf-8') as reviews_file:
    data = [json.loads(line) for line in reviews_file]

In [2]:
# Display the first parsed review to inspect the available keys.
data[0]

{'reviewerID': 'A2VNYWOPJ13AFP',
 'asin': '0981850006',
 'reviewerName': 'Amazon Customer "carringt0n"',
 'helpful': [6, 7],
 'reviewText': "This was a gift for my other husband.  He's making us things from it all the time and we love the food.  Directions are simple, easy to read and interpret, and fun to make.  We all love different kinds of cuisine and Raichlen provides recipes from everywhere along the barbecue trail as he calls it. Get it and just open a page.  Have at it.  You'll love the food and it has provided us with an insight into the culture that produced it. It's all about broadening horizons.  Yum!!",
 'overall': 5.0,
 'summary': 'Delish',
 'unixReviewTime': 1259798400,
 'reviewTime': '12 3, 2009',
 'category': 'Patio_Lawn_and_Garde'}

### Working Example for word_counts


In [3]:
### Working Example for word_counts applied to our json file


In [60]:
file = 'reviews_devset.json'

# Read every line (one review per line) and release the handle right away.
with open(file, 'r', encoding='utf-8') as reviews_file:
    a = reviews_file.readlines()

# Write the first 2000 reviews to a smaller file for quick experiments.
# The `with` block closes (and therefore flushes) the file, so the job that
# reads 'reduced.json' afterwards sees all of the lines.
with open('reduced.json', 'w', encoding='utf-8') as out:
    out.writelines(a[:2000])
    

In [5]:
# `a` is the list of lines returned by readlines().
print(type(a))

<class 'list'>


In [6]:
# Number of lines read from the reviews file.
len(a)

78829

In [61]:
%%file wordcount_jupyter.py

# Basic Word Count Map-Reduce in Python
### 
import mrjob
from mrjob.job import MRJob
from mrjob.step import MRStep
import json
import re
import os,sys


"""
Nice example
https://medium.com/datable/beginners-guide-for-mapreduce-with-mrjob-in-python-dbd2e7dd0f86
"""

# Read the list of stop words (one per line) that must be ignored in the
# review text. The file is read once and closed as soon as the list is built.
with open('stopwords.txt') as stop_words_file:
    stop_words = [line.replace('\n', '') for line in stop_words_file]

# Make sure the scratch directory exists; this is a no-op when it is already
# there.
os.makedirs('temp_out', exist_ok=True)


# holding the number of reviews per category
dic_category_counts = {}
   

class WordCounter(MRJob):
    """Map-reduce job counting, per category, the reviews that contain each term.

    Only ``mapper1`` is wired into ``steps`` at the moment. The other mappers,
    combiners and reducers make up the chi-square pipeline, which is still
    under construction (see the notes in ``mapper_chi`` and ``steps``).
    """

    # Number of terms kept per category by reducer_chi (the exercise asks for
    # the top 75).
    TOP_TERMS = 75

    # Splits a review on every run of characters that is neither an ASCII
    # letter nor one of < > ^ |. Compiled once instead of once per review.
    _TOKEN_SPLIT = re.compile('[^a-zA-Z<>^|]+')

    # Class-level defaults. mapper1_init replaces both with per-instance
    # values before the first record is mapped, so the shared dict below is
    # never mutated by a running job.
    N = 0
    dic_category_counts = {}

    def mapper1_init(self):
        """Reset the per-mapper counters and cache the stop words as a set.

        Registered as ``mapper_init`` of the first step, so it runs once per
        mapper task before any call to ``mapper1``.
        """
        self.N = 0
        self.dic_category_counts = {}
        # Set membership is O(1); the module-level list would be scanned for
        # every single token.
        self._stop_word_set = frozenset(stop_words)

    def mapper1(self, _, line):
        """Emit one record per distinct, non-stop-word term of a review.

        Args:
            _: input key, unused.
            line: one JSON-encoded review with at least the keys
                'reviewText' and 'category'.

        Yields:
            ``category, (word, 1, n_seen, n_seen_in_category)`` where the last
            two values are the running number of reviews this mapper instance
            has processed in total and for ``category``.
            Example: "Apps_for_Android"	["successor",1,1,1]

        NOTE(review): the two running counters are local to one mapper task;
        they are not global totals when the input is split over several tasks.
        """
        self.N += 1

        data = json.loads(line)
        review = data['reviewText']
        category = str(data['category'])

        # Update the number of reviews seen for this category.
        counts = self.dic_category_counts
        counts[category] = counts.get(category, 0) + 1

        # Tokenise, lower-case, drop one-character tokens and stop words.
        # Building a set keeps each term at most once per review.
        words = {
            token.lower()
            for token in self._TOKEN_SPLIT.split(review)
            if len(token) > 1
        } - self._stop_word_set

        for word in words:
            yield category, (word, 1, self.N, counts[category])

    def combiner1(self, cat, word_count):
        """Pre-aggregate the term counts of one category.

        Accepts the 4-element records produced by ``mapper1`` as well as plain
        ``(term, freq)`` pairs; only the first two elements are used.

        Yields:
            ``cat, (term, freq)``
            Example: "Apps_for_Android"	["developer",2]
        """
        term_freq_dict = {}
        for term, freq, *_ in word_count:
            term_freq_dict[term] = term_freq_dict.get(term, 0) + freq
        for term, freq in term_freq_dict.items():
            yield cat, (term, freq)

    def reducer1(self, category, term_freqs):
        """Collect all term counts of one category into a single dict.

        Like ``combiner1`` it tolerates both 4-element mapper records and
        ``(term, freq)`` pairs, because a combiner is not guaranteed to have
        processed every record.

        Yields:
            ``category, {term: freq, ...}``
            Example: "Patio_Lawn_and_Garde"	{"interpret":1,"yum":1,"provided":8,...}
        """
        term_freq_dict = {}
        for term, freq, *_ in term_freqs:
            term_freq_dict[term] = term_freq_dict.get(term, 0) + freq
        yield category, term_freq_dict

    def mapper2(self, category, term_freqs):
        """Re-key the per-category counts by term.

        Yields:
            ``term, (category, freq)``
            Example: "time"	["Apps_for_Android",162]
        """
        for term, freq in term_freqs.items():
            yield term, (category, freq)

    def combiner2(self, term, cat_freqs):
        """Pre-aggregate the per-category counts of one term.

        Yields:
            ``term, (category, freq)``
            Example: "ability"	["Apps_for_Android",7]
        """
        cat_freq_dict = {}
        for category, freq in cat_freqs:
            cat_freq_dict[category] = cat_freq_dict.get(category, 0) + freq
        for category, freq in cat_freq_dict.items():
            yield term, (category, freq)

    def reducer2(self, term, cat_freqs):
        """Collect the counts of one term over all categories.

        The counts are summed rather than assigned: several partial counts
        for the same category may reach the reducer.

        Yields:
            ``term, {category: freq, ...}``
            Example: "place"	{"Patio_Lawn_and_Garde":34,"Apps_for_Android":7}
        """
        cat_freq_dict = {}
        for category, freq in cat_freqs:
            cat_freq_dict[category] = cat_freq_dict.get(category, 0) + freq
        yield term, cat_freq_dict

    def mapper_chi(self, term, cat_freq_dicts):
        """Compute the chi-square value of ``term`` for each of its categories.

        chi^2 = N * (A*D - B*C)^2 / ((A+B) * (A+C) * (B+D) * (C+D)) with

        - A = number of items of category c which contain the term t
        - B = number of items not of category c which contain the term t
        - C = number of items of category c which do not contain the term t
        - D = number of items not of category c which do not contain the term t
        - N = total number of items (i.e. total number of reviews)

        Args:
            term: the term the counts belong to.
            cat_freq_dicts: ``{category: freq}`` as produced by ``reducer2``,
                e.g. {"Patio_Lawn_and_Garde":34,"Apps_for_Android":7}

        Yields:
            ``category, (term, chi)``

        TODO: C and D are still placeholders, so the values produced here are
        not valid chi-square statistics yet. Both need the number of reviews
        per category, which no step of the pipeline provides so far.
        NOTE(review): N is read from ``self.N``, a counter that only
        ``mapper1`` increments; confirm it holds the total number of reviews
        in the task that runs this mapper.
        """
        N = self.N  # total number of reviews (see NOTE above)

        # Number of reviews, over all categories, that contain the term.
        term_total = sum(cat_freq_dicts.values())

        for category, A in cat_freq_dicts.items():
            B = term_total - A

            C = 2  # missing
            D = 2  # missing

            chi = (N * ((A * D) - (B * C)) ** 2) / ((A + C) * (B + D) * (A + B) * (C + D))

            yield category, (term, chi)

    def reducer_chi(self, category, term_chi):
        """Keep the ``TOP_TERMS`` terms with the highest chi-square value.

        Args:
            category: the category the terms belong to.
            term_chi: iterable of ``(term, chi)`` pairs.

        Yields:
            ``category, {term: chi, ...}`` with the entries in descending
            order of chi; equal values are ordered by term, descending. An
            empty input yields an empty dict.
        """
        ranked = sorted(
            ((chi, term) for term, chi in term_chi),
            reverse=True,
        )[:self.TOP_TERMS]
        yield category, {term: chi for chi, term in ranked}

    def steps(self):
        """Return the steps of the job; currently only the first mapper runs."""
        return [
            MRStep(mapper_init=self.mapper1_init,
                   mapper=self.mapper1),
        ]

        # Intended full pipeline. It stays disabled until mapper_chi computes
        # real values for C, D and N:
        #
        # return [
        #     MRStep(mapper_init=self.mapper1_init,
        #            mapper=self.mapper1,
        #            combiner=self.combiner1,
        #            reducer=self.reducer1),
        #
        #     MRStep(mapper=self.mapper2,
        #            combiner=self.combiner2,
        #            reducer=self.reducer2),
        #
        #     MRStep(mapper=self.mapper_chi,
        #            reducer=self.reducer_chi),
        # ]
    
# Run the job when the file is executed as a script, e.g.
# `python wordcount_jupyter.py reduced.json`.
if __name__ == '__main__':
    WordCounter.run()

Overwriting wordcount_jupyter.py


In [62]:
#! python3.8 wordcount_jupyter.py reduced.json > output_jupyter.dat 
! python3.8 wordcount_jupyter.py reduced.json 


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/wordcount_jupyter.federico.20230502.201030.966057
Running step 1 of 1...
job output is in /tmp/wordcount_jupyter.federico.20230502.201030.966057/output
Streaming final output from /tmp/wordcount_jupyter.federico.20230502.201030.966057/output...
"Apps_for_Android"	["successor",1,1,1]
"Apps_for_Android"	["short",1,1,1]
"Apps_for_Android"	["storage",1,1,1]
"Apps_for_Android"	["accompanied",1,1,1]
"Apps_for_Android"	["wallpaper",1,1,1]
"Apps_for_Android"	["space",1,1,1]
"Apps_for_Android"	["issues",1,1,1]
"Apps_for_Android"	["android",1,1,1]
"Apps_for_Android"	["difficulty",1,1,1]
"Apps_for_Android"	["background",1,1,1]
"Apps_for_Android"	["account",1,1,1]
"Apps_for_Android"	["current",1,1,1]
"Apps_for_Android"	["give",1,1,1]
"Apps_for_Android"	["performs",1,1,1]
"Apps_for_Android"	["future",1,1,1]
"Apps_for_Android"	["application",1,1,1]
"Apps_for_Android"	["precious",1