In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./data/sampled_2018.csv')

In [5]:
df.head()['categories']

0    ["Dining & Entertaining", "Dinnerware & Servew...
1    ["Dining & Entertaining", "Dinnerware & Servew...
2              ["Home & Kitchen", "Bath", "Bath Rugs"]
3              ["Home & Kitchen", "Bath", "Bath Rugs"]
4              ["Home & Kitchen", "Bath", "Bath Rugs"]
Name: categories, dtype: object

In [4]:
df.columns

Index(['rating', 'title_x', 'text', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase', 'month',
       'main_category', 'title_y', 'average_rating', 'rating_number',
       'features', 'description', 'price', 'store', 'categories', 'brand',
       'material', 'color', 'capacity', 'style', 'pattern',
       'care_instructions', 'unit_count', 'dimensions', 'num_items',
       'item_weight', 'best_sellers_rank', 'discontinued',
       'date_first_available'],
      dtype='object')

In [22]:
df[['rating', 'title_x', 'text','timestamp','helpful_vote','verified_purchase','categories','price','parent_asin','features']]#['features'][3]

Unnamed: 0,rating,title_x,text,timestamp,helpful_vote,verified_purchase,categories,price,parent_asin,features
0,5.0,"Very nice set, is very temperamental but made ...",Really nice and cute. I bought these to put on...,2018-01-12 23:07:11.420,19,1,"[""Dining & Entertaining"", ""Dinnerware & Servew...",29.99,B00M2IAPXK,"[""Condiment and dipping bowl set includes beau..."
1,5.0,Really like it,"I really like it, always receive good comments...",2018-12-15 20:13:58.764,0,1,"[""Dining & Entertaining"", ""Dinnerware & Servew...",19.82,B01BGQ82LU,"[""Arch de dip chip & dip bowl"", ""Party ready c..."
2,1.0,Frayed material the first week! Not a good pro...,Junk it started to fray the same week I bought...,2018-01-29 19:39:33.495,0,1,"[""Home & Kitchen"", ""Bath"", ""Bath Rugs""]",14.96,B018CWUO3K,"[""Skid resistant: safe & reliable pvc construc..."
3,1.0,Small over priced crappy product. Would never buy,Small over priced crappy product. Would never ...,2018-01-21 21:42:14.843,2,1,"[""Home & Kitchen"", ""Bath"", ""Bath Rugs""]",19.34,B09K4WX5ND,"[""Fuzzy Microfiber"", ""Imported"", ""SUPER SOFT: ..."
4,1.0,One Star,Returned,2018-02-07 18:14:13.772,0,1,"[""Home & Kitchen"", ""Bath"", ""Bath Rugs""]",,B00LRYF0XA,"[""30\""X46\"" Size Bath Rug Perfect for Shower, ..."
...,...,...,...,...,...,...,...,...,...,...
51930,5.0,Easy to install and great,This took just a few minutes to install and ma...,2018-12-26 05:27:32.269,0,1,[],62.95,B00N109ZPM,"[""Dedicated smoke mode with user-controlled fe..."
51931,5.0,Bigger than expected,These were larger capacity than expected. Won’...,2018-12-15 16:29:55.529,0,1,[],,B06W2K6XW4,[]
51932,5.0,turned out well,I liked the fact I could turn them around also...,2018-12-03 20:57:58.357,0,1,[],,B07C3NWYSS,[]
51933,5.0,Cute,My toddler doesn't go to sleep without this pi...,2018-12-04 10:45:26.514,0,1,[],26.45,B06XBG6V51,[]


In [65]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, when
from pyspark.sql.types import FloatType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("AmazonReviewDataCleaning") \
    .getOrCreate()

def clean_amazon_reviews(df):
    """
    Function to clean Amazon review data at scale using PySpark.
    Parameters:
        df (DataFrame): Input Spark DataFrame containing Amazon review data.
    Returns:
        DataFrame: Cleaned Spark DataFrame.
    """

    # 1. Drop duplicates
    df = df.dropDuplicates()

    # 2. Handle missing values
    # - Drop reviews with missing text or rating
    df = df.filter(col("text").isNotNull() & col("rating").isNotNull())    

    # - Replace null lists with empty lists in 'categories' and 'features'
    df = df.withColumn("categories", split(regexp_replace(col("categories").cast("string"), "null", "[]"), ", "))
    df = df.withColumn("features", split(regexp_replace(col("features").cast("string"), "null", "[]"), ", "))

    # - Handle missing values by replacing NaN with 'unknown', helpful_vote with 0
    df = df.fillna({"title_x": "unknown", "helpful_vote": 0})
    df = df.withColumn("price", 
                       when(col("price").isNull() | (col("price").cast("string") == "NaN"), "unknown")
                       .otherwise(col("price").cast("string")))
    
    # 3. Clean and normalize text columns
    # - Remove extra whitespaces in 'title_x' and 'text' columns
    df = df.withColumn("title_x", regexp_replace(col("title_x"), r"\s+", " "))
    df = df.withColumn("text", regexp_replace(col("text"), r"\s+", " "))

    return df


col_list = ['rating', 'title_x', 'text','timestamp','helpful_vote','verified_purchase','categories','price','parent_asin','features']
# Assuming your data is in a Spark DataFrame called `df_raw`
df_raw = spark.createDataFrame(df[col_list].copy())

print("Null values in price before cleaning:", df_raw.filter(col("price").isNull()).count())

df_cleaned = clean_amazon_reviews(df_raw)

print("Null values in price after cleaning:", df_cleaned.filter(col("price") == "unknown").count())
# Show the cleaned data
df_cleaned.show()


24/10/26 17:11:52 WARN TaskSetManager: Stage 84 contains a task of very large size (2717 KiB). The maximum recommended task size is 1000 KiB.


Null values in price before cleaning: 0


24/10/26 17:11:52 WARN TaskSetManager: Stage 87 contains a task of very large size (2717 KiB). The maximum recommended task size is 1000 KiB.


Null values in price after cleaning: 20218


24/10/26 17:11:52 WARN TaskSetManager: Stage 93 contains a task of very large size (2717 KiB). The maximum recommended task size is 1000 KiB.


+------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-------+-----------+--------------------+
|rating|             title_x|                text|           timestamp|helpful_vote|verified_purchase|          categories|  price|parent_asin|            features|
+------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-------+-----------+--------------------+
|   5.0|Great for the price!|Bought these for ...|2018-08-18 23:07:...|           0|                1|[["Home & Kitchen...|  29.99| B07YJ4GB3Y|[["Chenille", "So...|
|   1.0|Started to smell ...|Started to smell ...|2018-01-25 23:20:...|           0|                1|[["Home & Kitchen...|  15.99| B07VS4K6P5|[["100% Pvc", "SH...|
|   2.0|Terrible DO NOT BUY!|Cool idea, but ab...|2018-08-09 03:25:...|           0|                0|[["Home & Kitchen...|unknown| B077V6329T|[["2018 update 25...|
|   5.0|  

In [64]:
df_cleaned[['price']].show()

24/10/26 17:11:08 WARN TaskSetManager: Stage 81 contains a task of very large size (2717 KiB). The maximum recommended task size is 1000 KiB.


+------+
| price|
+------+
| 29.99|
| 15.99|
|   NaN|
|  5.95|
|  19.9|
|  9.95|
|  9.95|
|   NaN|
| 28.83|
|   NaN|
|   NaN|
|   NaN|
| 44.99|
|   8.7|
|239.44|
|   NaN|
| 61.37|
| 37.03|
|   NaN|
| 13.49|
+------+
only showing top 20 rows



In [46]:
from pyspark.sql.functions import countDistinct
df_cleaned.select(countDistinct("text")).show()

24/10/26 16:52:35 WARN TaskSetManager: Stage 26 contains a task of very large size (2717 KiB). The maximum recommended task size is 1000 KiB.

+--------------------+
|count(DISTINCT text)|
+--------------------+
|               49204|
+--------------------+



                                                                                

In [32]:
df_temp = df[col_list].copy()

In [72]:
df_temp[['price']==pd.isna()]

Unnamed: 0,price
0,29.99
1,19.82
2,14.96
3,19.34
4,
...,...
51930,62.95
51931,
51932,
51933,26.45


In [44]:
df_temp[['text']].value_counts()

text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
Love it                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [50]:
df_cleaned.count()

24/10/26 16:54:32 WARN TaskSetManager: Stage 48 contains a task of very large size (2717 KiB). The maximum recommended task size is 1000 KiB.


51935