# DIC EX2 - part 2

## Setup

### Initialize Spark context

In [28]:
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover,
    CountVectorizer, IDF, ChiSqSelector, StringIndexer
)

In [29]:
from pyspark.sql import SparkSession


# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("DIC EX 2 - group 36")
    .getOrCreate()
)

### Set path variables

In [30]:
# Presumably the random seed for reproducible runs; no other cell visible in
# this notebook references it.
SEED = 42
DEV_JSON = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"         # HDFS URI of the review dev set
# Relative save location for the fitted pipeline. The save cell further down
# rebinds SAVE_PATH to a per-user HDFS path before writing the model.
SAVE_PATH = "feature_pipe_part2"

### Load data

In [32]:
# Read the dev set from the location configured in DEV_JSON rather than a
# hard-coded relative path, so this cell does not depend on a private copy of
# the file being present in the working / home directory.
df = spark.read.json(DEV_JSON)
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- category: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



## Build pipeline

### Tokenize using regex

In [33]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Split reviewText into tokens, using runs of whitespace, digits and the listed
# punctuation/symbol characters as delimiters (RegexTokenizer's default
# gaps=True treats the pattern as the separator).
# The pattern is a raw string: its backslashes are regex escapes for Spark's
# regex engine, not Python string escapes. Written as a plain literal they
# trigger Python's "invalid escape sequence" warning.
tokenizer = RegexTokenizer(
    inputCol="reviewText",
    outputCol="tokens",
    pattern=r"[\s\t\d\(\)\[\]\{\}\.\!\?\,\;\:\+\=\-\_\"\'`\~\#\@\&\*\%\€\$\§\\\/]+",
)


### Remove stopwords

In [34]:
from pyspark.ml.feature import StopWordsRemover

def load_stopwords(path: str = "stopwords.txt") -> list[str]:
    """
    Load a stopword list from a text file containing one word per line.

    Args:
        path: File to read, decoded as UTF-8. Defaults to "stopwords.txt",
            resolved against the current working directory.

    Returns:
        The distinct, whitespace-stripped, non-empty lines of the file,
        sorted so that the result is identical on every run.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        # Consume the generator inside the `with` block: it reads lazily from f.
        unique_words = {word for word in stripped if word}
    # A set has no stable order; sorting keeps the returned list reproducible.
    return sorted(unique_words)



# Remove stopwords from the token arrays; the word list is read by
# load_stopwords() from its default file.
stopper = StopWordsRemover(
    stopWords=load_stopwords(),
    inputCol="tokens",
    outputCol="tokens_filt",
)

### Calculate token counts and idf

In [35]:
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer

# Term counts over the stopword-filtered tokens. The vocabulary is capped at
# 20,000 terms, and a term must occur in at least 5 documents to be kept.
tf = CountVectorizer(
    minDF=5,
    vocabSize=20_000,
    inputCol="tokens_filt",
    outputCol="tf",
)

# Re-weight the raw term counts by inverse document frequency.
idf = IDF(outputCol="tf_idf", inputCol="tf")


In [36]:
# Encode the category string as a numeric "label" column.
# handleInvalid="skip" drops rows whose category cannot be indexed.
label_indexer = StringIndexer(
    handleInvalid="skip",
    inputCol="category",
    outputCol="label",
)

### Calculate chi-square values and select the top 2,000 features

In [37]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType



# Chi-square feature selection against the category label:
# keep the 2,000 highest-ranked TF-IDF features.
chisq = ChiSqSelector(
    numTopFeatures=2_000,
    labelCol="label",
    featuresCol="tf_idf",
    outputCol="selected_features",
)



In [38]:
from pyspark.ml import Pipeline, PipelineModel

# Stage order matters: each stage reads a column written by an earlier one.
feature_pipe = Pipeline(
    stages=[
        tokenizer,      # reviewText -> tokens
        stopper,        # tokens -> tokens_filt
        tf,             # tokens_filt -> tf
        idf,            # tf -> tf_idf
        label_indexer,  # category -> label
        chisq,          # tf_idf + label -> selected_features
    ]
)

print("Pipeline running")
# Fit every stage on the loaded review data.
feature_model = feature_pipe.fit(df)



Pipeline running


In [13]:
from pyspark.ml import PipelineModel
import getpass

# Save the fitted pipeline to HDFS under the current user's directory;
# overwrite() replaces a model left at that path by an earlier run.
USER = getpass.getuser()
SAVE_PATH = f"hdfs:///user/{USER}/models/feature_pipe_part2"

(
    feature_model
    .write()
    .overwrite()
    .save(SAVE_PATH)
)

### Get top tokens

In [42]:
from pyspark.ml.feature import CountVectorizerModel, ChiSqSelectorModel

# Look the fitted stages up by type rather than by their position in the pipeline.
cv_model = next(
    filter(lambda s: isinstance(s, CountVectorizerModel), feature_model.stages)
)
vocab = cv_model.vocabulary

sel_model = next(
    filter(lambda s: isinstance(s, ChiSqSelectorModel), feature_model.stages)
)
all_selected = sel_model.selectedFeatures

# Map every selected feature index back to its vocabulary term.
# NOTE: despite its name, this list holds one term per selected feature
# (the selector is configured for 2,000 features), not 75.
top75_tokens = list(map(vocab.__getitem__, all_selected))

# Write the terms space-separated on a single line.
with open("output_ds.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(top75_tokens))

25/04/26 22:00:09 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB


                                                                                

['amazon', 'author', 'back', 'bad', 'big', 'bit', 'bought', 'buy', 'character', 'characters', 'day', 'easy', 'end', 'enjoyed', 'excellent', 'family', 'feel', 'find', 'fit', 'found', 'give', 'good', 'great', 'happy', 'hard', 'high', 'highly', 'interesting', 'job', 'light', 'long', 'lot', 'love', 'loved', 'made', 'make', 'makes', 'man', 'money', 'music', 'nice', 'part', 'people', 'perfect', 'pretty', 'price', 'problem', 'purchase', 'purchased', 'put', 'quality', 'quot', 'reading', 'real', 'recommend', 'review', 'series', 'set', 'size', 'small', 'sound', 'thing', 'things', 'thought', 'time', 'times', 'wanted', 'watch', 'work', 'works', 'world', 'worth', 'written', 'year', 'years']


### Write tokens to file

In [None]:
# `output_path` and `words` were not defined anywhere in this notebook, so this
# cell raised NameError on a fresh run. Bind them to the output file and token
# list used by the "Get top tokens" cell.
# NOTE(review): confirm these are the intended values.
output_path = "output_ds.txt"
words = top75_tokens

# Sorted, so the file content does not depend on the selector's index order.
# Explicit UTF-8, matching the earlier write of the same file.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(" ".join(sorted(words)))