# DIC EX2 - part 1

## Setup

### Initialize Spark context

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing one is reused if
# present) and keep a handle on its SparkContext for the RDD API used below.
builder = SparkSession.builder.appName("DIC EX 2 - group 36")
spark = builder.getOrCreate()
sc = spark.sparkContext

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/05/13 16:21:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/13 16:21:06 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


### Set path variables

In [2]:
# HDFS URI of the review dev set; it is read line by line with sc.textFile.
data_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
# Stopword list, one entry per line; it is read with the builtin open(),
# so the path is resolved on the driver's local file system.
stopwords_path = "stopwords.txt"
# File name for the final result file.
output_path = "output_rdd.txt"

### Load data

In [3]:
import json
# Every input line is one JSON document; parse it into a dict and keep the
# parsed RDD cached because several later jobs start from it.
review_lines = sc.textFile(data_path)
rdd_json = review_lines.map(json.loads).persist()
rdd_json.first()

                                                                                

{'reviewerID': 'A2VNYWOPJ13AFP',
 'asin': '0981850006',
 'reviewerName': 'Amazon Customer "carringt0n"',
 'helpful': [6, 7],
 'reviewText': "This was a gift for my other husband.  He's making us things from it all the time and we love the food.  Directions are simple, easy to read and interpret, and fun to make.  We all love different kinds of cuisine and Raichlen provides recipes from everywhere along the barbecue trail as he calls it. Get it and just open a page.  Have at it.  You'll love the food and it has provided us with an insight into the culture that produced it. It's all about broadening horizons.  Yum!!",
 'overall': 5.0,
 'summary': 'Delish',
 'unixReviewTime': 1259798400,
 'reviewTime': '12 3, 2009',
 'category': 'Patio_Lawn_and_Garde'}

## Prepare Data
### Tokenize, Lowercase and Remove Stopwords

In [4]:
import re, string

def load_stopwords(path: str) -> list[str]:
    """
    Load the stopword list from a UTF-8 text file.

    Each line is stripped of surrounding whitespace; blank lines are skipped
    and duplicate entries are collapsed.

    Args:
        path: Location of the stopword file (one stopword per line).

    Returns:
        The distinct stopwords as a sorted list, so the result is the same
        on every run regardless of string hash randomization.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Strip each line once, then drop the ones that end up empty.
        stripped_lines = (line.strip() for line in f)
        unique_words = {word for word in stripped_lines if word}
    return sorted(unique_words)

# Token delimiter: any run of whitespace, digits, or the punctuation and
# symbol characters listed in the character class.
re_split = re.compile(
    r"[\s\t\d\(\)\[\]\{\}\.\!\?\,\;\:\+\=\-\_\"\'`\~\#\@\&\*\%\€\$\§\\\/]+"
)
# Broadcast the stopwords as a frozenset: the executors test every token for
# membership, which is a hash lookup on a set but a linear scan on a list.
stop = frozenset(load_stopwords(stopwords_path))
stop_bc = sc.broadcast(stop)

def clean_tokens(row):
    """
    Turn one review record into its distinct (category, term) pairs.

    The review text is lowercased and split on ``re_split``. Pieces shorter
    than two characters and pieces found in the broadcast stopword collection
    are discarded. Each remaining term is emitted once per review.

    Args:
        row: Parsed review dict; the "category" and "reviewText" keys are read.

    Returns:
        A list of (category, term) tuples, one per distinct term.
    """
    category = row["category"]
    lowered = row["reviewText"].lower()
    stopwords = stop_bc.value

    # A set keeps each term at most once per review.
    unique_terms = {
        piece
        for piece in re_split.split(lowered)
        if len(piece) > 1 and piece not in stopwords
    }
    return [(category, term) for term in unique_terms]


# Flatten all reviews into (category, term) pairs and cache them, since the
# frequency counts below each start from this RDD.
data = (
    rdd_json
      .flatMap(clean_tokens)
      .persist()
)

data.take(10)

                                                                                

[('Patio_Lawn_and_Garde', 'insight'),
 ('Patio_Lawn_and_Garde', 'things'),
 ('Patio_Lawn_and_Garde', 'open'),
 ('Patio_Lawn_and_Garde', 'raichlen'),
 ('Patio_Lawn_and_Garde', 'horizons'),
 ('Patio_Lawn_and_Garde', 'food'),
 ('Patio_Lawn_and_Garde', 'make'),
 ('Patio_Lawn_and_Garde', 'broadening'),
 ('Patio_Lawn_and_Garde', 'cuisine'),
 ('Patio_Lawn_and_Garde', 'husband')]

## Frequency counts

In [5]:
# Count how often each (category, term) pair occurs. Terms are already
# de-duplicated per review, so this is the number of reviews of that
# category that contain the term.
pair_ones = data.map(lambda pair: (pair, 1))
cat_term_count = pair_ones.reduceByKey(lambda left, right: left + right).persist()

In [6]:
# term -> number of reviews containing it, over all categories.
# The result is collected to the driver as a plain dict.
term_ones = data.map(lambda pair: (pair[1], 1))
term_count = term_ones.reduceByKey(lambda left, right: left + right).collectAsMap()

                                                                                

In [7]:
# category -> number of reviews. This is counted on the raw review records
# (rdd_json) rather than on the token pairs, so that every review
# contributes exactly one to its category.
category_ones = rdd_json.map(lambda review: (review['category'], 1))
cat_count = category_ones.reduceByKey(lambda left, right: left + right).collectAsMap()

In [8]:
# Total number of reviews in the data set.
N = rdd_json.count()

# Make the driver-side lookup tables and the total available on the executors.
term_count_bc, cat_count_bc, N_bc = (
    sc.broadcast(payload) for payload in (term_count, cat_count, N)
)

## Chi-Squared Computation

In [9]:
def chisq(record):
    """
    Compute the chi-squared score for one (category, term) pair.

    The 2x2 contingency table is rebuilt from the pair count and the
    broadcast marginals (per-term count, per-category count, total N).

    Args:
        record: ``((category, term), count)`` as stored in ``cat_term_count``.

    Returns:
        ``(category, (term, chi2))``; chi2 is 0.0 when the denominator is zero.
    """
    (cat, term), both = record
    with_term = term_count_bc.value[term]
    in_cat = cat_count_bc.value[cat]
    total = N_bc.value

    # Remaining cells of the contingency table.
    term_elsewhere = with_term - both          # term present, other category
    cat_without_term = in_cat - both           # this category, term absent
    neither = total - both - term_elsewhere - cat_without_term

    denominator = (
        (both + term_elsewhere)
        * (both + cat_without_term)
        * (term_elsewhere + neither)
        * (cat_without_term + neither)
    )
    if not denominator:
        # Degenerate table: avoid dividing by zero.
        return (cat, (term, 0.0))

    numerator = total * (both * neither - term_elsewhere * cat_without_term) ** 2
    return (cat, (term, numerator / denominator))


rdd_chisq = cat_term_count.map(chisq)

## Top 75 terms per category

In [10]:
# Number of highest-scoring terms kept per category.
K = 75
# Group the (term, chi2) pairs by category and keep the K best of each.
# Sorting by (-chi2, term) orders by descending score and breaks ties
# alphabetically, so terms with equal scores no longer depend on the order
# in which groupByKey happens to deliver them and the cut at K is
# reproducible between runs.
top_per_cat = (
    rdd_chisq
      .groupByKey()
      .mapValues(lambda scored:
                 sorted(scored, key=lambda tc: (-tc[1], tc[0]))[:K])
      .persist()
)


In [11]:
def _format_category_line(entry):
    """Render one (category, [(term, chi2), ...]) entry as 'category<TAB>term:score ...'."""
    category, scored_terms = entry
    rendered = " ".join(f"{term}:{score:.4f}" for term, score in scored_terms)
    return f"{category}\t{rendered}"


# One output line per category, in alphabetical category order.
category_lines = top_per_cat.sortByKey().map(_format_category_line).collect()

# Merged dictionary: every selected term once, sorted, joined by single spaces.
selected_terms = top_per_cat.flatMap(lambda entry: [term for term, _ in entry[1]])
dict_line = " ".join(
    selected_terms
      .distinct()
      .sortBy(lambda term: term)
      .collect()
)


                                                                                

In [13]:
# Write one line per category followed by the merged dictionary line.
# The destination comes from the output_path variable set in the path
# configuration cell instead of repeating the file name as a literal.
with open(output_path, "w", encoding="utf-8") as f:
    f.write("\n".join(category_lines + [dict_line]))
 

## Test

In [14]:
# Spot check: the ten highest chi-squared terms of one arbitrary category.
patio_scores = rdd_chisq.filter(lambda rec: rec[0] == "Patio_Lawn_and_Garde")
patio_scores.takeOrdered(10, key=lambda rec: -rec[1][1])

                                                                                

[('Patio_Lawn_and_Garde', ('plants', 1375.2459845466976)),
 ('Patio_Lawn_and_Garde', ('lawn', 1202.315270475095)),
 ('Patio_Lawn_and_Garde', ('seeds', 1025.23041826743)),
 ('Patio_Lawn_and_Garde', ('yard', 970.6544432294293)),
 ('Patio_Lawn_and_Garde', ('garden', 863.0908397719647)),
 ('Patio_Lawn_and_Garde', ('seed', 816.1940705756133)),
 ('Patio_Lawn_and_Garde', ('feeder', 787.3686670546093)),
 ('Patio_Lawn_and_Garde', ('plant', 707.467245647197)),
 ('Patio_Lawn_and_Garde', ('gas', 678.5602454785391)),
 ('Patio_Lawn_and_Garde', ('grass', 622.2926746357932))]