# DIC EX2 - part 1

## Setup

### Initialize Spark context

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook and keep a handle on
# its SparkContext for the RDD-based cells.
spark = (
    SparkSession.builder
    .appName("DIC EX 2 - group 36")
    .getOrCreate()
)
sc = spark.sparkContext

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/05/12 17:11:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/12 17:11:11 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


### Set path variables

In [2]:
# Review data set that is read through the SparkContext.
data_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"

# Stopword list, opened with plain Python file I/O on the driver.
stopwords_path = "stopwords.txt"

# File name for the final result.
output_path = "output_rdd.txt"

### Load data

In [3]:
import json
# Parse every input line as one JSON document and cache the parsed records.
rdd_json = (
    sc.textFile(data_path)
      .map(json.loads)
      .persist()
)
rdd_json.first()

                                                                                

{'reviewerID': 'A2VNYWOPJ13AFP',
 'asin': '0981850006',
 'reviewerName': 'Amazon Customer "carringt0n"',
 'helpful': [6, 7],
 'reviewText': "This was a gift for my other husband.  He's making us things from it all the time and we love the food.  Directions are simple, easy to read and interpret, and fun to make.  We all love different kinds of cuisine and Raichlen provides recipes from everywhere along the barbecue trail as he calls it. Get it and just open a page.  Have at it.  You'll love the food and it has provided us with an insight into the culture that produced it. It's all about broadening horizons.  Yum!!",
 'overall': 5.0,
 'summary': 'Delish',
 'unixReviewTime': 1259798400,
 'reviewTime': '12 3, 2009',
 'category': 'Patio_Lawn_and_Garde'}

## Prepare Data
### Tokenize, Lowercase and Remove Stopwords

In [4]:
import re, string

def load_stopwords(path: str) -> list[str]:
    """
    Load the stopword list from a UTF-8 text file.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped and duplicate entries are collapsed.

    Args:
        path: Location of the stopword file (opened with the built-in `open`).

    Returns:
        The distinct stopwords in ascending order, so the result is the same
        on every run instead of depending on set iteration order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Strip each line once; the filter drops whitespace-only lines.
        unique_words = {word for word in map(str.strip, f) if word}
    return sorted(unique_words)

# Token delimiters: whitespace, digits and the punctuation/symbol characters
# listed in the class. The trailing `+` makes a run of consecutive delimiters
# act as a single split point.
re_split = re.compile(
    r"[ \t\r\n\d\(\)\[\]\{\}\.\!\?,;:+=\-_\"'`~#@&*%€$§\\/]+"
)
# Hold the stopwords in a frozenset: the membership test that runs once per
# token is then a hash lookup instead of a linear scan through a list.
stop = frozenset(load_stopwords(stopwords_path))
stop_bc = sc.broadcast(stop)

def clean_tokens(row):
    """
    Turn one review record into (category, token) pairs.

    The review text is split on `re_split`, empty pieces are discarded, the
    remaining tokens are lower-cased, and tokens contained in the broadcast
    stopword collection are dropped. Token order and repeated tokens are kept.

    Args:
        row: Parsed review dict. "category" is required; "reviewText" may be
            absent or None, in which case the review contributes no tokens.

    Returns:
        A list of (category, token) tuples, possibly empty.

    Raises:
        KeyError: If the record has no "category" field.
    """
    cat = row["category"]
    # A record without text would otherwise raise inside the Spark task and
    # fail the whole job, so treat it as an empty review.
    text = row.get("reviewText") or ""
    stop = stop_bc.value
    # NOTE(review): single-character tokens are kept unless the stopword file
    # lists them — confirm this is intended.
    lowered = (piece.lower() for piece in re_split.split(text) if piece)
    return [(cat, token) for token in lowered if token not in stop]

# Explode every review into its (category, token) pairs and cache the result,
# because several aggregations below start from it.
data = (
    rdd_json
      .flatMap(clean_tokens)
      .persist()
)

data.take(10)

                                                                                

[('Patio_Lawn_and_Garde', 'gift'),
 ('Patio_Lawn_and_Garde', 'husband'),
 ('Patio_Lawn_and_Garde', 'making'),
 ('Patio_Lawn_and_Garde', 'things'),
 ('Patio_Lawn_and_Garde', 'time'),
 ('Patio_Lawn_and_Garde', 'love'),
 ('Patio_Lawn_and_Garde', 'food'),
 ('Patio_Lawn_and_Garde', 'directions'),
 ('Patio_Lawn_and_Garde', 'simple'),
 ('Patio_Lawn_and_Garde', 'easy')]

## Frequency counts

In [5]:
# n_ct: number of occurrences of every (category, term) pair. Cached because
# the chi-squared step maps over it later.
cat_term_freq = (
    data.map(lambda pair: (pair, 1))
        .reduceByKey(lambda left, right: left + right)
        .persist()
)

In [6]:
# n_*t: occurrences of each term summed over all categories, as {term: count}.
# Summing the already aggregated (category, term) counts gives the same totals
# as counting individual tokens, but works on one record per distinct pair
# instead of one record per token.
term_totals = (
    cat_term_freq
      .map(lambda kv: (kv[0][1], kv[1]))   # ((c, t), n_ct) -> (t, n_ct)
      .reduceByKey(lambda a, b: a + b)     # (t, n_*t)
      .collectAsMap()
)

                                                                                

In [7]:
# n_c*: number of term occurrences in each category, as {category: count}.
# Summing the already aggregated (category, term) counts gives the same totals
# as counting individual tokens, but works on one record per distinct pair
# instead of one record per token.
cat_totals = (
    cat_term_freq
      .map(lambda kv: (kv[0][0], kv[1]))   # ((c, t), n_ct) -> (c, n_ct)
      .reduceByKey(lambda a, b: a + b)     # (c, n_c*)
      .collectAsMap()
)

                                                                                

In [8]:
# n: grand total of term occurrences, i.e. the per-category totals added up.
n_total = sum(count for count in cat_totals.values())

In [9]:
# Wrap the driver-side lookup dicts and the grand total in broadcast
# variables; `chisq` reads them through `.value`.
term_totals_bc = sc.broadcast(term_totals)   # {term: n_*t}
cat_totals_bc = sc.broadcast(cat_totals)     # {category: n_c*}
n_bc = sc.broadcast(n_total)                 # n

## Chi-Squared Computation

In [10]:
def chisq(record):
    """
    Score one (category, term) count against its expected frequency.

    Args:
        record: ((category, term), n_ct) where n_ct is the observed count of
            the term inside the category.

    Returns:
        (category, (term, score)) with score = (n_ct - E)^2 / E and
        E = n_t * n_c / n. The score is 0.0 when E is zero.
    """
    (category, word), observed = record
    term_count = term_totals_bc.value[word]
    category_count = cat_totals_bc.value[category]
    grand_total = n_bc.value
    # NOTE(review): only the (term, category) cell is scored here, not all
    # four cells of a 2x2 contingency table — confirm that is intended.
    expected = term_count * category_count / grand_total
    if not expected:
        return (category, (word, 0.0))
    score = (observed - expected) ** 2 / expected
    return (category, (word, score))

rdd_chisq = cat_term_freq.map(chisq)

## Top 75 terms per category

In [11]:
K = 75


def _top_k_by_score(scored_terms):
    """Return the K (term, chi2) pairs with the largest chi2, best first."""
    ranked = sorted(scored_terms, key=lambda pair: pair[1], reverse=True)
    return ranked[:K]


def _format_category_line(cat_terms):
    """Render one category as `<category> term:score term:score ...`."""
    category, best_terms = cat_terms
    rendered = " ".join(f"{term}:{chi:.6f}" for term, chi in best_terms)
    return category + " " + rendered


# All scores of a category are grouped and ranked in memory on one executor.
top_per_cat = (
    rdd_chisq
      .groupByKey()
      .mapValues(_top_k_by_score)
      .persist()
)

category_lines = top_per_cat.map(_format_category_line)

In [12]:
# Merge the selected terms of all categories into one duplicate-free list
# (a plain list of strings, despite the name).
joined_dict = (
    top_per_cat
      .flatMap(lambda cat_terms: [pair[0] for pair in cat_terms[1]])
      .distinct()
      .collect()
)

                                                                                

In [13]:
# Put the merged terms into ascending order (the list itself is updated in
# place) and flatten them into one space-separated line.
joined_dict[:] = sorted(joined_dict)
dict_line = " ".join(joined_dict)

In [14]:
# collect() returns the category lines in RDD partition order, not by name.
# Sort them on the leading category token so the file is identical across
# runs; the merged dictionary line always comes last.
all_lines = sorted(
    category_lines.collect(),
    key=lambda line: line.split(" ", 1)[0],
) + [dict_line]

# Write to the configured `output_path` instead of repeating the file name.
with open(output_path, "w", encoding="utf-8") as f:
    f.write("\n".join(all_lines))

 

## Test

In [15]:
# Spot check: the ten highest-scoring terms of one arbitrarily chosen category.
patio_scores = rdd_chisq.filter(lambda rec: rec[0] == "Patio_Lawn_and_Garde")
patio_scores.takeOrdered(10, key=lambda rec: -rec[1][1])

                                                                                

[('Patio_Lawn_and_Garde', ('plants', 2776.0402487639853)),
 ('Patio_Lawn_and_Garde', ('feeder', 2137.1335792021955)),
 ('Patio_Lawn_and_Garde', ('lawn', 2067.362695269985)),
 ('Patio_Lawn_and_Garde', ('yard', 1974.2790226500385)),
 ('Patio_Lawn_and_Garde', ('seeds', 1960.277872189091)),
 ('Patio_Lawn_and_Garde', ('seed', 1955.2784922235153)),
 ('Patio_Lawn_and_Garde', ('grass', 1931.802541885193)),
 ('Patio_Lawn_and_Garde', ('smoker', 1820.405663778677)),
 ('Patio_Lawn_and_Garde', ('traps', 1721.5338462230538)),
 ('Patio_Lawn_and_Garde', ('pool', 1659.1594264969642))]

## Check whether the output directory already exists; if so, delete it with the commands below before rerunning

In [16]:
!hdfs dfs -ls

Found 6 items
drwxr-xr-x   - e12412672 supergroup          0 2025-05-12 17:11 .sparkStaging
drwxr-xr-x   - e12412672 supergroup          0 2025-03-26 20:38 books
drwxr-xr-x   - e12412672 supergroup          0 2025-05-11 21:12 feature_pipe_part2
drwxr-xr-x   - e12412672 supergroup          0 2025-05-11 21:16 models
drwxr-xr-x   - e12412672 supergroup          0 2025-05-12 16:20 output_rdd.txt
drwxr-xr-x   - e12412672 supergroup          0 2025-03-26 20:40 tmp


In [17]:
#!hdfs dfs -rm -r