# Homework setup (file download, etc)

In [1]:
# Download the four input files used by this notebook into the current working
# directory; `-O` keeps each remote file name (agnews_clean.csv, w.csv, bias.csv,
# data_for_svm.csv), which is how the later cells refer to them.
!curl -O https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/agnews_clean.csv
!curl -O https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/w.csv
!curl -O https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/bias.csv
!curl -O https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/data_for_svm.csv

In [2]:
# Import Spark session to initialize and manage the Spark application
from pyspark.sql import SparkSession

# Import NLTK and stopwords for text preprocessing
import nltk
from nltk.corpus import stopwords

# Import PySpark functions for working with columns and defining UDFs
from pyspark.sql.functions import udf, col

# Import data types for use with UDFs
from pyspark.sql.types import ArrayType, StringType

# Import numerical and data processing libraries
import numpy as np
import pandas as pd

# Import additional PySpark functions for transformations
import pyspark.sql.functions as F
from pyspark.sql.functions import col, explode, length, collect_list, when

In [3]:
# Fetch the NLTK stopword corpus; stopwords.words('english') is read from it
# later when the filter list is built.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Task 1: tf-idf definition

In [4]:
# Start a local Spark session named "AG news" that uses every available core,
# or reuse the session that is already running
spark = SparkSession.builder.master("local[*]").appName("AG news").getOrCreate()

In [5]:
# Load the cleaned AG News dataset from the working directory and parse the
# 'filtered' column, which is stored as a JSON string, into an array of strings.
# F, ArrayType and StringType are already provided by the notebook's import
# cell, so they are not re-imported here.
agnews = (
    spark.read.csv(
        "agnews_clean.csv",
        inferSchema=True,
        header=True,
    )
    .withColumn("filtered", F.from_json("filtered", ArrayType(StringType())))
)

## Task 1: Design the MapReduce functions for calculating the tf-idf measure.

In [6]:
# Corpus-wide frequency of every token, most frequent first.
# Stored under its own name so that it is not confused with (or overwritten by)
# the per-document `word_counts` table that is built further down.
corpus_word_counts = (
    agnews
    .select(F.explode(F.col("filtered")).alias("word"))  # one row per token occurrence
    .groupBy("word")                                     # one group per distinct token
    .count()                                             # occurrences across all documents
    .orderBy("count", ascending=False)                   # most frequent tokens first
)

# Display the most frequent tokens so noisy or meaningless ones can be spotted
corpus_word_counts.show(20)


The most frequent tokens include noise such as `39`, `1`, `quot`, `gt`, and `lt`, alongside ordinary stop words. These are pretty meaningless, so I will remove them.

In addition to the standard stop words, I will remove `39`, `quot`, `u`, `b`, `lt`, `gt`, and `qt` (`lt` and `gt` may have been less-than and greater-than signs). I will also get rid of `ap` and `reuters`, as these are just the names of news sources.

In [7]:
# Dataset-specific tokens to drop in addition to the ordinary stopwords
custom_list = {"39", "quot", "u", "b", "lt", "gt", "qt", "ap", "reuters"}

# NLTK's standard English stopwords, de-duplicated into a set
stopword_list = set(stopwords.words('english'))

# Everything to remove: the union of both sets
full_filter_list = stopword_list | custom_list

The combined filter list contains 207 words, which I will remove from every document in the dataset.

In [8]:
# One row per (document, token): flatten each document's token array and carry
# along its identifier, taken from the `_c0` column and renamed to `doc_id`
words_df = agnews.select(
    F.col("_c0").alias("doc_id"),
    F.explode("filtered").alias("word"),
)

# A token survives only if it is longer than two characters and is not in the
# combined stopword / custom filter list
is_long_enough = F.length("word") > 2
is_filtered_out = F.col("word").isin(full_filter_list)
words_df_clean = words_df.where(is_long_enough & ~is_filtered_out)

# Reassemble the surviving tokens into a single list per document
agnews_cleaned = (
    words_df_clean
    .groupBy("doc_id")
    .agg(F.collect_list("word").alias("filtered"))
)

In [9]:
# Flatten the cleaned token lists again: one row per (doc_id, word) occurrence
words_df = agnews_cleaned.select(
    "doc_id",
    F.explode("filtered").alias("word"),
)

In [10]:
# Occurrences of each word within each document
word_counts = (
    words_df
    .groupBy("doc_id", "word")
    .agg(F.count("*").alias("word_count"))
)

# Number of (cleaned) words in each document
total_words = (
    words_df
    .groupBy("doc_id")
    .agg(F.count("*").alias("total_words"))
)

In [11]:
# Term frequency: attach each document's length to its per-word counts, then
# divide the word's count by the total number of words in that document
tf = (
    word_counts
    .join(total_words, on="doc_id")
    .withColumn("tf", F.col("word_count") / F.col("total_words"))
)

In [12]:
# Document frequency: the number of distinct documents containing each word.
# Collapsing to distinct (doc_id, word) pairs first makes a word count at most
# once per document, however often it repeats there.
doc_freq = (
    words_df
    .select("doc_id", "word")
    .distinct()
    .groupBy("word")
    .agg(F.count("*").alias("doc_freq"))
)

In [13]:
# Total number of documents in the loaded dataset
num_docs = agnews.count()

# Inverse document frequency per word: log(num_docs / doc_freq)
docs_per_occurrence = F.lit(num_docs) / F.col("doc_freq")
idf = doc_freq.withColumn("idf", F.log(docs_per_occurrence))

## Task 2: Calculate tf-idf measure for each row in the agnews_clean.csv. Save the measures in a new column.

In [14]:
# TF-IDF: bring each (doc_id, word) term frequency together with that word's
# IDF, then multiply the two
tfidf = (
    tf
    .join(idf, on="word")
    .withColumn("tfidf", F.col("tf") * F.col("idf"))
)

## Task 3: Print out the tf-idf measure for the first 5 documents.

Shown below are the TF-IDF scores for the first five documents, sorted by how important each word is in its document.
These scores tell us which words are the most unique and meaningful in each article. For example, in document 0, the word “cynics” has the highest score because it doesn’t show up in many other documents (only 5 in total), even though it only appears once. On the other hand, words like “black” or “wall” appear in many more documents across the dataset, so their IDF is lower; “wall” still scores highly in document 0 only because it occurs twice there. TF-IDF helps highlight the key terms that make each document stand out.

In [15]:
# TF-IDF rows for the first 5 documents (doc_id < 5), sorted by document and
# then by TF-IDF score in descending order
first_five_tfidf = (
    tfidf
    .filter(col("doc_id") < 5)
    .orderBy("doc_id", "tfidf", ascending=[True, False])
)

# Size show() by the filtered subset rather than by tfidf.count(): counting the
# full table would evaluate every (doc, word) row in the corpus just to decide
# how many rows of a five-document subset to print.
first_five_tfidf.show(n=first_five_tfidf.count(), truncate=False)

+--------------+------+----------+-----------+--------------------+--------+------------------+-------------------+
|word          |doc_id|word_count|total_words|tf                  |doc_freq|idf               |tfidf              |
+--------------+------+----------+-----------+--------------------+--------+------------------+-------------------+
|cynics        |0     |1         |15         |0.06666666666666667 |5       |10.147217737458726|0.6764811824972484 |
|wall          |0     |2         |15         |0.13333333333333333 |1277    |4.604386793860288 |0.6139182391813717 |
|claw          |0     |1         |15         |0.06666666666666667 |16      |8.984066927653044 |0.5989377951768696 |
|dwindling     |0     |1         |15         |0.06666666666666667 |34      |8.230295125276665 |0.548686341685111  |
|sellers       |0     |1         |15         |0.06666666666666667 |41      |8.043083583188519 |0.5362055722125679 |
|ultra         |0     |1         |15         |0.06666666666666667 |76   

# Task 2: SVM objective function

## Task 1: Design the MapReduce functions required to calculate the loss function.


In [16]:
# Read the full dataset (no header row): every column but the last is a
# feature, the last column is the label
data = pd.read_csv("data_for_svm.csv", header=None)
X, y = data.iloc[:, :-1].to_numpy(), data.iloc[:, -1].to_numpy()

# SVM weights, flattened into a 1-D array
w = pd.read_csv("w.csv", header=None).to_numpy().flatten()

# Bias term: the single value stored in the first cell of bias.csv
b = pd.read_csv("bias.csv", header=None).iloc[0, 0]

## Task 2: Using these functions, create a function loss_SVM(w, b, X, y) to calculate the SVM objective for a given choice of w, b with data stored in X, y.


In [17]:
def loss_SVM(w, b, X, y, lam=0.1):
    """Compute the soft-margin SVM objective.

    The value returned is ``lam * ||w||^2 + mean_i max(0, 1 - y_i * (x_i . w + b))``.

    Parameters
    ----------
    w : array-like
        Weight vector with one entry per feature. Any shape is accepted
        (e.g. ``(d,)``, ``(d, 1)`` or ``(1, d)``); it is flattened to 1-D.
    b : float
        Bias term.
    X : array-like
        Feature matrix of shape ``(n, d)``. A single 1-D sample is treated as
        one row.
    y : array-like
        Labels, one per row of ``X`` (the hinge loss formula assumes they are
        encoded as -1 / +1). Flattened to 1-D.
    lam : float, optional
        Regularization strength multiplying ``||w||^2``. Defaults to 0.1.

    Returns
    -------
    float
        Regularization term plus the average hinge loss.

    Raises
    ------
    ValueError
        If the number of columns of ``X`` differs from the length of ``w``, or
        the number of rows of ``X`` differs from the length of ``y``.
    """
    # Flatten w and y: a column vector such as (d, 1) would otherwise make
    # `y * (X @ w + b)` broadcast to an (n, n) matrix and silently return the
    # wrong average instead of failing.
    w = np.asarray(w).ravel()
    y = np.asarray(y).ravel()
    X = np.atleast_2d(np.asarray(X))

    if X.shape[1] != w.shape[0]:
        raise ValueError(
            f"X has {X.shape[1]} feature columns but w has {w.shape[0]} entries"
        )
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"X has {X.shape[0]} rows but y has {y.shape[0]} labels"
        )

    reg_term = lam * np.linalg.norm(w) ** 2      # lambda * ||w||^2
    margins = 1 - y * (X @ w + b)                # 1 - y_i * (w^T x_i + b)
    hinge_losses = np.maximum(0, margins)        # max(0, margin) per example
    avg_hinge = np.mean(hinge_losses)            # average over all examples
    return reg_term + avg_hinge

## Task 3: You are given the following dataset data_for_svm.csv, where the first 64 columns contain X and the last column contains y. Using the weights and bias provided in w.csv and bias.csv, calculate the objective value.

In [18]:
# Evaluate the SVM objective for the provided weights and bias on (X, y),
# using the function's default regularization strength
loss = loss_SVM(w=w, b=b, X=X, y=y)

# Report the objective value
print("SVM loss:", loss)

SVM loss: 1.0000454245191739


The SVM objective is about 1. It is the sum of the regularization penalty (λ‖w‖²) and the average hinge loss, so with the provided weights and bias both terms together come to roughly 1. Some predictions fall within or beyond the margin and contribute hinge loss. Note, however, that an example also contributes a hinge loss of exactly 1 whenever its score wᵀx + b is 0, so an objective near 1 does not by itself show that the weights and bias fit the data well; comparing the predictions with the labels would be needed to judge the fit.

In [19]:
# Load the dataset for prediction from the local directory (no header row)
svm_raw = spark.read.csv("data_for_svm.csv", header=False, inferSchema=True)

# Rename every column in a single pass (Spark's default _c0, _c1, ... become
# c0, c1, ...) instead of chaining one withColumnRenamed call per column
data_pred = svm_raw.toDF(*[f"c{i}" for i in range(len(svm_raw.columns))])

# All columns except the last are features (the last one is the label), the
# same split used for the pandas X / y above; derived rather than hard-coded
n_features = len(data_pred.columns) - 1

# Linear score w^T x + b, built as a Spark column expression
dot_product = sum(col(f"c{i}") * float(w[i]) for i in range(n_features)) + float(b)

# SVM decision rule: +1 when the score is non-negative, otherwise -1
data_pred = data_pred.withColumn("prediction", when(dot_product >= 0, 1).otherwise(-1))

# Show the first few predictions
data_pred.select("prediction").show()

+----------+
|prediction|
+----------+
|        -1|
|        -1|
|        -1|
|         1|
|        -1|
|         1|
|        -1|
|        -1|
|         1|
|        -1|
|         1|
|        -1|
|        -1|
|        -1|
|         1|
|         1|
|         1|
|        -1|
|         1|
|        -1|
+----------+
only showing top 20 rows



Displayed above are the SVM predictions for the first 20 observations in the dataset.

In [20]:
# Persist the features, label and prediction columns as CSV with a header row.
# Spark writes a directory named 'svm_predictions' containing part files, and
# "overwrite" replaces any output left by a previous run.
(
    data_pred.write
    .mode("overwrite")
    .option("header", True)
    .csv("svm_predictions")
)

# AI disclosure

I used Generative Artificial Intelligence (ChatGPT) to support parts of the coding and documentation for this assignment. Specifically, I used ChatGPT to:

- Add comments to my code for clarity and readability, especially for Spark transformations like withColumn, groupBy, and explode.
Prompt example: “Can you add basic comments to this PySpark code chunk that calculates TF-IDF?”

- Confirm and edit the overall flow of my TF-IDF pipeline — including how to calculate term frequency (TF), document frequency (DF), inverse document frequency (IDF), and finally compute TF-IDF.
Prompt example: “Can you help me build a TF-IDF pipeline using PySpark from tokenized text?”

- Clarify the steps involved in computing TF-IDF using PySpark, and confirm that my approach matched MapReduce logic.
Prompt example: “Did I calculate TF-IDF for each row in the dataset and save the result in a new column?”

- Help write the SVM loss function
Prompt example: “Am I writing this SVM loss function correctly?”