In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./data/sampled_data_2018_2019.csv')

In [3]:
df.head()['categories']

0                                                  NaN
1    Home & Kitchen,Heating, Cooling & Air Quality,...
2    Home & Kitchen,Home Décor Products,Clocks,Spec...
3    Home & Kitchen,Kitchen & Dining,Coffee, Tea & ...
4    Home & Kitchen,Kitchen & Dining,Coffee, Tea & ...
Name: categories, dtype: object

In [4]:
df.columns

Index(['review_month', 'rating', 'parent_asin', 'asin', 'helpful_vote', 'text',
       'timestamp', 'title', 'user_id', 'verified_purchase',
       'review_date_timestamp', 'main_category', 'product_name', 'categories',
       'price', 'average_rating', 'rating_number', 'year'],
      dtype='object')

In [6]:
# df[['rating', 'title', 'text','timestamp','helpful_vote','verified_purchase','categories','price','parent_asin','features','care_instructions']]#['features'][3]

In [7]:
# df[['care_instructions']].value_counts()

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, when
from pyspark.sql.types import FloatType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("AmazonReviewDataCleaning") \
    .getOrCreate()

def clean_amazon_reviews(df):
    """
    Function to clean Amazon review data at scale using PySpark.
    Parameters:
        df (DataFrame): Input Spark DataFrame containing Amazon review data.
    Returns:
        DataFrame: Cleaned Spark DataFrame.
    """

    # 1. Drop duplicates
    df = df.dropDuplicates()

    # 2. Handle missing values
    # - Drop reviews with missing text or rating
    df = df.filter(col("text").isNotNull() & col("rating").isNotNull())    

    # - Replace null lists with empty lists in 'categories' and 'features'
    # df = df.withColumn("categories", split(regexp_replace(col("categories").cast("string"), "null", "[]"), ", "))

    # - Handle missing values by replacing NaN with 'unknown', helpful_vote with 0
    df = df.fillna({"title": "unknown", "helpful_vote": 0})
    df = df.withColumn("price", 
                       when(col("price").isNull() | (col("price").cast("string") == "NaN"), "unknown")
                       .otherwise(col("price").cast("string")))
    
    # 3. Clean and normalize text columns
    # - Remove extra whitespaces in 'title' and 'text' columns
    df = df.withColumn("title", regexp_replace(col("title"), r"\s+", " "))
    df = df.withColumn("text", regexp_replace(col("text"), r"\s+", " "))

    return df

# df = pd.read_csv('./data/sampled_2018.csv')

# col_list = ['rating', 'title', 'text','timestamp','helpful_vote','verified_purchase','categories','price','parent_asin']
# Assuming your data is in a Spark DataFrame called `df_raw`
df_raw = spark.createDataFrame(df.copy()) #[col_list]

print("Null values in price before cleaning:", df_raw.filter(col("price").isNull()).count())

df_cleaned = clean_amazon_reviews(df_raw)

print("Null values in price after cleaning:", df_cleaned.filter(col("price") == "unknown").count())
# Show the cleaned data
df_cleaned.show()


24/10/29 11:41:25 WARN Utils: Your hostname, Legion resolves to a loopback address: 127.0.1.1; using 10.0.0.86 instead (on interface wlo1)
24/10/29 11:41:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/29 11:41:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/29 11:41:33 WARN TaskSetManager: Stage 0 contains a task of very large size (5354 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Null values in price before cleaning: 0


24/10/29 11:41:34 WARN TaskSetManager: Stage 3 contains a task of very large size (5354 KiB). The maximum recommended task size is 1000 KiB.
24/10/29 11:41:36 WARN TaskSetManager: Stage 9 contains a task of very large size (5354 KiB). The maximum recommended task size is 1000 KiB.


Null values in price after cleaning: 51130


                                                                                

+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+-------+--------------+-------------+----+
|review_month|rating|parent_asin|      asin|helpful_vote|                text|    timestamp|               title|             user_id|verified_purchase|review_date_timestamp|       main_category|        product_name|          categories|  price|average_rating|rating_number|year|
+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+-------+--------------+-------------+----+
|           2|   1.0| B077J535KS|B077J535KS|          11|Was no good at li...|1519341036620|         Junk, sorry|AHHS5673RG6HRG6TG...|             true|        

In [9]:
df_raw.printSchema()

root
 |-- review_month: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- text: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- verified_purchase: boolean (nullable = true)
 |-- review_date_timestamp: string (nullable = true)
 |-- main_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- price: double (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: long (nullable = true)
 |-- year: long (nullable = true)



In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, when
from pyspark.sql.types import FloatType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("AmazonReviewDataCleaning") \
    .getOrCreate()

def clean_amazon_reviews(df):
    """
    Function to clean Amazon review data at scale using PySpark.
    Parameters:
        df (DataFrame): Input Spark DataFrame containing Amazon review data.
    Returns:
        DataFrame: Cleaned Spark DataFrame.
    """

    # 1. Drop duplicates
    df = df.dropDuplicates()

    # 2. Handle missing values
    # - Drop reviews with missing text or rating
    df = df.filter(col("text").isNotNull() & col("rating").isNotNull())    

    # - Replace null lists with empty lists in 'categories' and 'features'
    # df = df.withColumn("categories", split(regexp_replace(col("categories").cast("string"), "null", "[]"), ", "))
    # df = df.withColumn("features", split(regexp_replace(col("features").cast("string"), "null", "[]"), ", "))

    # - Handle missing values by replacing NaN with 'unknown', helpful_vote with 0
    df = df.fillna({"title": "unknown", "helpful_vote": 0})
    df = df.withColumn("price", 
                       when(col("price").isNull() | (col("price").cast("string") == "NaN"), "unknown")
                       .otherwise(col("price").cast("string")))
    
    # 3. Clean and normalize text columns
    # - Remove extra whitespaces in 'title' and 'text' columns
    df = df.withColumn("title", regexp_replace(col("title"), r"\s+", " "))
    df = df.withColumn("text", regexp_replace(col("text"), r"\s+", " "))

    return df

# Read the data directly into a Spark DataFrame
df_raw = spark.read.csv(
    './data/sampled_data_2018_2019.csv',  # Path to your CSV file
    header=True,                # Use the first row as headers
    inferSchema=True,          # Automatically infer the data types
    quote='"',                  # Specify double quotes as the quote character
    escape='"',                 # Escape character for quotes within quoted strings
    # mode='DROPMALFORMED'       # Drop rows that are malformed
)
# Select relevant columns
# col_list = ['rating', 'title', 'text', 'timestamp', 'helpful_vote', 'verified_purchase', 'categories', 'price', 'parent_asin']
# df_raw = df_raw.select(*col_list)

# Print null values in price before cleaning
print("Null values in price before cleaning:", df_raw.filter(col("price").isNull()).count())

# Clean the data
df_cleaned = clean_amazon_reviews(df_raw)

# Print null values in price after cleaning
print("Null values in price after cleaning:", df_cleaned.filter(col("price") == "unknown").count())

# Show the cleaned data
df_cleaned.show()


Null values in price before cleaning: 51147
Null values in price after cleaning: 51124
+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+------+--------------+-------------+----+
|review_month|rating|parent_asin|      asin|helpful_vote|                text|    timestamp|               title|             user_id|verified_purchase|review_date_timestamp|       main_category|        product_name|          categories| price|average_rating|rating_number|year|
+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+------+--------------+-------------+----+
|           2|   1.0| B07VWSCGC2|B00KIG3QVG|           3|Looks great until..

In [19]:
df_cleaned.show()

+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+------+--------------+-------------+----+
|review_month|rating|parent_asin|      asin|helpful_vote|                text|    timestamp|               title|             user_id|verified_purchase|review_date_timestamp|       main_category|        product_name|          categories| price|average_rating|rating_number|year|
+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+------+--------------+-------------+----+
|           2|   1.0| B07VWSCGC2|B00KIG3QVG|           3|Looks great until...|1551322890901|Don’t waste your ...|AFX5APGYM7BMT7Z7U...|             true|           

In [12]:
from pyspark.sql.functions import countDistinct
df_cleaned.select(countDistinct("text")).show()

24/10/29 11:42:34 WARN TaskSetManager: Stage 20 contains a task of very large size (5354 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+
|count(DISTINCT text)|
+--------------------+
|              131470|
+--------------------+



In [13]:
df_raw.printSchema()

root
 |-- rating: string (nullable = true)
 |-- title_x: string (nullable = true)
 |-- text: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- helpful_vote: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- month: string (nullable = true)
 |-- main_category: string (nullable = true)
 |-- title_y: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: string (nullable = true)
 |-- features: string (nullable = true)
 |-- description: string (nullable = true)
 |-- price: string (nullable = true)
 |-- store: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- material: string (nullable = true)
 |-- color: string (nullable = true)
 |-- capacity: string (nullable = true)
 |-- style: string (nullable = true)
 |-- pattern: string (nullab

In [32]:
df_temp = df[col_list].copy()

In [77]:
df_temp['price'].isna().value_counts()

price
False    31717
True     20218
Name: count, dtype: int64

In [44]:
df_temp[['text']].value_counts()

text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
Love it                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [50]:
df_cleaned.count()

24/10/26 16:54:32 WARN TaskSetManager: Stage 48 contains a task of very large size (2717 KiB). The maximum recommended task size is 1000 KiB.


51935

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, when

def create_spark_session():
    """
    Create a Spark session.
    Returns:
        SparkSession: Initialized Spark session.
    """
    spark = SparkSession.builder \
        .appName("AmazonReviewDataCleaning") \
        .getOrCreate()
    return spark

def clean_amazon_reviews(spark, file_path):
    """
    Function to clean Amazon review data at scale using PySpark.
    
    Parameters:
        spark (SparkSession): The Spark session.
        file_path (str): Path to the input CSV file.
    
    Returns:
        DataFrame: Cleaned Spark DataFrame.
    """

    # Read CSV as Spark DataFrame
    df_raw = spark.read.csv(file_path, header=True, inferSchema=True)

    # Filter for specific columns
    # col_list = ['rating', 'title', 'text', 'timestamp', 'helpful_vote', 'verified_purchase', 'categories', 'price', 'parent_asin']
    
    # review_month,rating,parent_asin,asin,helpful_vote,text,timestamp,title,user_id,verified_purchase,review_date_timestamp,main_category,product_name,categories,price,average_rating,rating_number,year

    # df_raw = df_raw.select(col_list)

    # 1. Drop duplicates
    df_raw = df_raw.dropDuplicates()

    # 2. Handle missing values
    # - Drop reviews with missing text or rating
    df_raw = df_raw.filter(col("text").isNotNull() & col("rating").isNotNull())    

    # - Replace null lists with empty lists in 'categories'
    # df_raw = df_raw.withColumn("categories", split(regexp_replace(col("categories").cast("string"), "null", "[]"), ", "))

    # - Handle missing values by replacing NaN with 'unknown', helpful_vote with 0
    df_raw = df_raw.fillna({"title": "unknown", "helpful_vote": 0})
    # df_raw = df_raw.withColumn("price", 
    #                    when(col("price").isNull() | (col("price").cast("string") == "NaN"), "unknown")
    #                    .otherwise(col("price").cast("string")))
    
    # 3. Clean and normalize text columns
    # - Remove extra whitespaces in 'title' and 'text' columns
    df_raw = df_raw.withColumn("title", regexp_replace(col("title"), r"\s+", " "))
    df_raw = df_raw.withColumn("text", regexp_replace(col("text"), r"\s+", " "))

    return df_raw


In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, when

# Initialize Spark session
spark = SparkSession.builder \
    .appName("AmazonReviewDataCleaning") \
    .getOrCreate()

def clean_amazon_reviews(df):
    # Remove duplicates and filter out rows with null ratings or texts
    col_list = ['rating', 'title', 'text', 'timestamp', 'helpful_vote', 'verified_purchase', 'price', 'parent_asin']
    df = df.select(col_list)


    df = df.dropDuplicates()
    df = df.filter(col("text").isNotNull() & col("rating").isNotNull())
    
    # # Clean categories and features columns
    # df = df.withColumn("categories", 
    #                    split(regexp_replace(col("categories").cast("string"), "null", "[]"), ", "))
    # df = df.withColumn("features", 
    #                    split(regexp_replace(col("features").cast("string"), "null", "[]"), ", "))
    
    # Fill nulls
    df = df.fillna({"title": "unknown", "helpful_vote": 0})
    
    # Clean price column
    df = df.withColumn("price", 
                       when(col("price").isNull() | (col("price").cast("string") == "NaN"), "unknown")
                       .otherwise(col("price").cast("string")))
    
    # Clean text and title columns
    df = df.withColumn("title", regexp_replace(col("title_x"), r"\s+", " "))
    df = df.withColumn("text", regexp_replace(col("text"), r"\s+", " "))
    
    return df

# Read the data
df_raw = spark.read.csv(
    'staging/data/sampled_2018.csv', 
    header=True, 
    inferSchema=True,
    quote='"',  # Handle quoted strings correctly
    escape='"'
    # mode='DROPMALFORMED',  # Drop malformed lines
)

# Trim column names
df_raw = df_raw.toDF(*(c.strip() for c in df_raw.columns))

# Clean the data
df_cleaned = clean_amazon_reviews(df_raw)

# Show the cleaned data
df_cleaned.show(truncate=False)

# Optionally convert to Pandas for better readability
df_cleaned_pd = df_cleaned.toPandas()
print(df_cleaned_pd)


+------+---------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------+-----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|rating|title_x                                                  |text                                                                                                                      

In [21]:
df_cleaned.show()

+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+------+--------------+-------------+----+
|review_month|rating|parent_asin|      asin|helpful_vote|                text|    timestamp|               title|             user_id|verified_purchase|review_date_timestamp|       main_category|        product_name|          categories| price|average_rating|rating_number|year|
+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+------+--------------+-------------+----+
|           2|   1.0| B07VWSCGC2|B00KIG3QVG|           3|Looks great until...|1551322890901|Don’t waste your ...|AFX5APGYM7BMT7Z7U...|             true|           

### Labeling

In [25]:
df_cleaned.write.option("header", True).csv("./data/pyspark/prelabel/cleaned_data_pyspark.csv", mode="overwrite")
del df_cleaned

In [2]:
from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, regexp_replace, split, when
# from pyspark.sql.types import FloatType

spark = SparkSession.builder \
    .appName("AmazonReviewDataCleaning") \
    .getOrCreate()

df_cleaned = spark.read.option("header", True).csv("./data/pyspark/prelabel/cleaned_data_pyspark.csv")

24/10/29 12:14:09 WARN Utils: Your hostname, Legion resolves to a loopback address: 127.0.1.1; using 10.0.0.86 instead (on interface wlo1)
24/10/29 12:14:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/29 12:14:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
df_cleaned#.show()

DataFrame[review_month: string, rating: string, parent_asin: string, asin: string, helpful_vote: string, text: string, timestamp: string, title: string, user_id: string, verified_purchase: string, review_date_timestamp: string, main_category: string, product_name: string, categories: string, price: string, average_rating: string, rating_number: string, year: string]

In [13]:
# df_pandas['rating'][0]

'1.0'

In [4]:
from snorkel.labeling import labeling_function

POSITIVE = 1
NEGATIVE = -1
NEUTRAL = 0


# Define additional keywords for positive and negative sentiments
positive_words = ["excellent", "amazing", "great", "fantastic", "love", "wonderful", "perfect", "satisfied", "good", "best", "recommend"]
negative_words = ["bad", "poor", "terrible", "awful", "disappointed", "waste", "horrible", "worst", "not worth", "refund", "broken", "defective"]

# Labeling function for positive sentiment based on text
@labeling_function()
def lf_contains_positive_words(row):
    return POSITIVE if any(word in row.text.lower() for word in positive_words) else NEUTRAL

# Labeling function for negative sentiment based on text
@labeling_function()
def lf_contains_negative_words(row):
    return NEGATIVE if any(word in row.text.lower() for word in negative_words) else NEUTRAL

# Combine with previous rating-based labeling functions
@labeling_function()
def lf_high_rating(row):
    return POSITIVE if row.rating >= 4 else NEUTRAL

@labeling_function()
def lf_low_rating(row):
    return NEGATIVE if row.rating <= 2 else NEUTRAL

# Assemble the expanded labeling functions
lfs = [lf_high_rating, lf_low_rating, lf_contains_positive_words, lf_contains_negative_words]


In [9]:
from snorkel.labeling import PandasLFApplier#, LabelModel
df_pandas = df_cleaned.toPandas()

# Apply labeling functions to the DataFrame
applier = PandasLFApplier(lfs)
L = applier.apply(df_pandas)

# Train Snorkel’s LabelModel to combine these weak labels
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train=L, n_epochs=500, log_freq=100)

# Predict final labels using Snorkel’s model
df_pandas['label_snorkel_rating'] = label_model.predict(L)

  0%|          | 1/140735 [00:00<00:29, 4782.56it/s]                            


TypeError: '>=' not supported between instances of 'str' and 'int'