In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./data/sampled_data_2018_2019.csv')

In [3]:
df.head()['categories']

0                                                  NaN
1    Home & Kitchen,Heating, Cooling & Air Quality,...
2    Home & Kitchen,Home Décor Products,Clocks,Spec...
3    Home & Kitchen,Kitchen & Dining,Coffee, Tea & ...
4    Home & Kitchen,Kitchen & Dining,Coffee, Tea & ...
Name: categories, dtype: object

In [4]:
df.columns

Index(['review_month', 'rating', 'parent_asin', 'asin', 'helpful_vote', 'text',
       'timestamp', 'title', 'user_id', 'verified_purchase',
       'review_date_timestamp', 'main_category', 'product_name', 'categories',
       'price', 'average_rating', 'rating_number', 'year'],
      dtype='object')

In [6]:
# df[['rating', 'title', 'text','timestamp','helpful_vote','verified_purchase','categories','price','parent_asin','features','care_instructions']]#['features'][3]

In [7]:
# df[['care_instructions']].value_counts()

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, when
from pyspark.sql.types import FloatType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("AmazonReviewDataCleaning") \
    .getOrCreate()

def clean_amazon_reviews(df):
    """
    Function to clean Amazon review data at scale using PySpark.
    Parameters:
        df (DataFrame): Input Spark DataFrame containing Amazon review data.
    Returns:
        DataFrame: Cleaned Spark DataFrame.
    """

    # 1. Drop duplicates
    df = df.dropDuplicates()

    # 2. Handle missing values
    # - Drop reviews with missing text or rating
    df = df.filter(col("text").isNotNull() & col("rating").isNotNull())    

    # - Replace null lists with empty lists in 'categories' and 'features'
    # df = df.withColumn("categories", split(regexp_replace(col("categories").cast("string"), "null", "[]"), ", "))

    # - Handle missing values by replacing NaN with 'unknown', helpful_vote with 0
    df = df.fillna({"title": "unknown", "helpful_vote": 0})
    df = df.withColumn("price", 
                       when(col("price").isNull() | (col("price").cast("string") == "NaN"), "unknown")
                       .otherwise(col("price").cast("string")))
    
    # 3. Clean and normalize text columns
    # - Remove extra whitespaces in 'title' and 'text' columns
    df = df.withColumn("title", regexp_replace(col("title"), r"\s+", " "))
    df = df.withColumn("text", regexp_replace(col("text"), r"\s+", " "))

    return df

# df = pd.read_csv('./data/sampled_2018.csv')

# col_list = ['rating', 'title', 'text','timestamp','helpful_vote','verified_purchase','categories','price','parent_asin']
# Assuming your data is in a Spark DataFrame called `df_raw`
df_raw = spark.createDataFrame(df.copy()) #[col_list]

print("Null values in price before cleaning:", df_raw.filter(col("price").isNull()).count())

df_cleaned = clean_amazon_reviews(df_raw)

print("Null values in price after cleaning:", df_cleaned.filter(col("price") == "unknown").count())
# Show the cleaned data
df_cleaned.show()


24/10/29 11:41:25 WARN Utils: Your hostname, Legion resolves to a loopback address: 127.0.1.1; using 10.0.0.86 instead (on interface wlo1)
24/10/29 11:41:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/29 11:41:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/29 11:41:33 WARN TaskSetManager: Stage 0 contains a task of very large size (5354 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Null values in price before cleaning: 0


24/10/29 11:41:34 WARN TaskSetManager: Stage 3 contains a task of very large size (5354 KiB). The maximum recommended task size is 1000 KiB.
24/10/29 11:41:36 WARN TaskSetManager: Stage 9 contains a task of very large size (5354 KiB). The maximum recommended task size is 1000 KiB.


Null values in price after cleaning: 51130


                                                                                

+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+-------+--------------+-------------+----+
|review_month|rating|parent_asin|      asin|helpful_vote|                text|    timestamp|               title|             user_id|verified_purchase|review_date_timestamp|       main_category|        product_name|          categories|  price|average_rating|rating_number|year|
+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+-------+--------------+-------------+----+
|           2|   1.0| B077J535KS|B077J535KS|          11|Was no good at li...|1519341036620|         Junk, sorry|AHHS5673RG6HRG6TG...|             true|        

In [9]:
df_raw.printSchema()

root
 |-- review_month: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- text: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- verified_purchase: boolean (nullable = true)
 |-- review_date_timestamp: string (nullable = true)
 |-- main_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- price: double (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: long (nullable = true)
 |-- year: long (nullable = true)



In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, when
from pyspark.sql.types import FloatType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("AmazonReviewDataCleaning") \
    .getOrCreate()

def clean_amazon_reviews(df):
    """
    Function to clean Amazon review data at scale using PySpark.
    Parameters:
        df (DataFrame): Input Spark DataFrame containing Amazon review data.
    Returns:
        DataFrame: Cleaned Spark DataFrame.
    """

    # 1. Drop duplicates
    df = df.dropDuplicates()

    # 2. Handle missing values
    # - Drop reviews with missing text or rating
    df = df.filter(col("text").isNotNull() & col("rating").isNotNull())    

    # - Replace null lists with empty lists in 'categories' and 'features'
    # df = df.withColumn("categories", split(regexp_replace(col("categories").cast("string"), "null", "[]"), ", "))
    # df = df.withColumn("features", split(regexp_replace(col("features").cast("string"), "null", "[]"), ", "))

    # - Handle missing values by replacing NaN with 'unknown', helpful_vote with 0
    df = df.fillna({"title": "unknown", "helpful_vote": 0})
    df = df.withColumn("price", 
                       when(col("price").isNull() | (col("price").cast("string") == "NaN"), "unknown")
                       .otherwise(col("price").cast("string")))
    
    # 3. Clean and normalize text columns
    # - Remove extra whitespaces in 'title' and 'text' columns
    df = df.withColumn("title", regexp_replace(col("title"), r"\s+", " "))
    df = df.withColumn("text", regexp_replace(col("text"), r"\s+", " "))

    return df

# Read the data directly into a Spark DataFrame
df_raw = spark.read.csv(
    './data/sampled_data_2018_2019.csv',  # Path to your CSV file
    header=True,                # Use the first row as headers
    inferSchema=True,          # Automatically infer the data types
    quote='"',                  # Specify double quotes as the quote character
    escape='"',                 # Escape character for quotes within quoted strings
    # mode='DROPMALFORMED'       # Drop rows that are malformed
)
# Select relevant columns
# col_list = ['rating', 'title', 'text', 'timestamp', 'helpful_vote', 'verified_purchase', 'categories', 'price', 'parent_asin']
# df_raw = df_raw.select(*col_list)

# Print null values in price before cleaning
print("Null values in price before cleaning:", df_raw.filter(col("price").isNull()).count())

# Clean the data
df_cleaned = clean_amazon_reviews(df_raw)

# Print null values in price after cleaning
print("Null values in price after cleaning:", df_cleaned.filter(col("price") == "unknown").count())

# Show the cleaned data
df_cleaned.show()


24/10/30 07:47:43 WARN Utils: Your hostname, Legion resolves to a loopback address: 127.0.1.1; using 10.0.0.86 instead (on interface wlo1)
24/10/30 07:47:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/30 07:47:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Null values in price before cleaning: 51147


                                                                                

Null values in price after cleaning: 51124
+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+------+--------------+-------------+----+
|review_month|rating|parent_asin|      asin|helpful_vote|                text|    timestamp|               title|             user_id|verified_purchase|review_date_timestamp|       main_category|        product_name|          categories| price|average_rating|rating_number|year|
+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+------+--------------+-------------+----+
|           2|   1.0| B07VWSCGC2|B00KIG3QVG|           3|Looks great until...|1551322890901|Don’t waste your ...|AFX5APG

In [2]:
df_cleaned.filter(col("rating").isNull()).show()


+------------+------+-----------+----+------------+----+---------+-----+-------+-----------------+---------------------+-------------+------------+----------+-----+--------------+-------------+----+
|review_month|rating|parent_asin|asin|helpful_vote|text|timestamp|title|user_id|verified_purchase|review_date_timestamp|main_category|product_name|categories|price|average_rating|rating_number|year|
+------------+------+-----------+----+------------+----+---------+-----+-------+-----------------+---------------------+-------------+------------+----------+-----+--------------+-------------+----+
+------------+------+-----------+----+------------+----+---------+-----+-------+-----------------+---------------------+-------------+------------+----------+-----+--------------+-------------+----+



In [4]:
# from pyspark.sql.functions import countDistinct
# df_cleaned.select(countDistinct("text")).show()

In [5]:
# df_raw.printSchema()

In [6]:
# df_temp = df[col_list].copy()

In [7]:
# df_temp['price'].isna().value_counts()

In [8]:
# df_temp[['text']].value_counts()

In [9]:
# df_cleaned.count()

In [10]:
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, regexp_replace, split, when

# def create_spark_session():
#     """
#     Create a Spark session.
#     Returns:
#         SparkSession: Initialized Spark session.
#     """
#     spark = SparkSession.builder \
#         .appName("AmazonReviewDataCleaning") \
#         .getOrCreate()
#     return spark

# def clean_amazon_reviews(spark, file_path):
#     """
#     Function to clean Amazon review data at scale using PySpark.
    
#     Parameters:
#         spark (SparkSession): The Spark session.
#         file_path (str): Path to the input CSV file.
    
#     Returns:
#         DataFrame: Cleaned Spark DataFrame.
#     """

#     # Read CSV as Spark DataFrame
#     df_raw = spark.read.csv(file_path, header=True, inferSchema=True)

#     # Filter for specific columns
#     # col_list = ['rating', 'title', 'text', 'timestamp', 'helpful_vote', 'verified_purchase', 'categories', 'price', 'parent_asin']
    
#     # review_month,rating,parent_asin,asin,helpful_vote,text,timestamp,title,user_id,verified_purchase,review_date_timestamp,main_category,product_name,categories,price,average_rating,rating_number,year

#     # df_raw = df_raw.select(col_list)

#     # 1. Drop duplicates
#     df_raw = df_raw.dropDuplicates()

#     # 2. Handle missing values
#     # - Drop reviews with missing text or rating
#     df_raw = df_raw.filter(col("text").isNotNull() & col("rating").isNotNull())    

#     # - Replace null lists with empty lists in 'categories'
#     # df_raw = df_raw.withColumn("categories", split(regexp_replace(col("categories").cast("string"), "null", "[]"), ", "))

#     # - Handle missing values by replacing NaN with 'unknown', helpful_vote with 0
#     df_raw = df_raw.fillna({"title": "unknown", "helpful_vote": 0})
#     # df_raw = df_raw.withColumn("price", 
#     #                    when(col("price").isNull() | (col("price").cast("string") == "NaN"), "unknown")
#     #                    .otherwise(col("price").cast("string")))
    
#     # 3. Clean and normalize text columns
#     # - Remove extra whitespaces in 'title' and 'text' columns
#     df_raw = df_raw.withColumn("title", regexp_replace(col("title"), r"\s+", " "))
#     df_raw = df_raw.withColumn("text", regexp_replace(col("text"), r"\s+", " "))

#     return df_raw


In [11]:
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, regexp_replace, split, when

# # Initialize Spark session
# spark = SparkSession.builder \
#     .appName("AmazonReviewDataCleaning") \
#     .getOrCreate()

# def clean_amazon_reviews(df):
#     # Remove duplicates and filter out rows with null ratings or texts
#     col_list = ['rating', 'title', 'text', 'timestamp', 'helpful_vote', 'verified_purchase', 'price', 'parent_asin']
#     df = df.select(col_list)


#     df = df.dropDuplicates()
#     df = df.filter(col("text").isNotNull() & col("rating").isNotNull())
    
#     # # Clean categories and features columns
#     # df = df.withColumn("categories", 
#     #                    split(regexp_replace(col("categories").cast("string"), "null", "[]"), ", "))
#     # df = df.withColumn("features", 
#     #                    split(regexp_replace(col("features").cast("string"), "null", "[]"), ", "))
    
#     # Fill nulls
#     df = df.fillna({"title": "unknown", "helpful_vote": 0})
    
#     # Clean price column
#     df = df.withColumn("price", 
#                        when(col("price").isNull() | (col("price").cast("string") == "NaN"), "unknown")
#                        .otherwise(col("price").cast("string")))
    
#     # Clean text and title columns
#     df = df.withColumn("title", regexp_replace(col("title_x"), r"\s+", " "))
#     df = df.withColumn("text", regexp_replace(col("text"), r"\s+", " "))
    
#     return df

# # Read the data
# df_raw = spark.read.csv(
#     'staging/data/sampled_2018.csv', 
#     header=True, 
#     inferSchema=True,
#     quote='"',  # Handle quoted strings correctly
#     escape='"'
#     # mode='DROPMALFORMED',  # Drop malformed lines
# )

# # Trim column names
# df_raw = df_raw.toDF(*(c.strip() for c in df_raw.columns))

# # Clean the data
# df_cleaned = clean_amazon_reviews(df_raw)

# # Show the cleaned data
# df_cleaned.show(truncate=False)

# # Optionally convert to Pandas for better readability
# df_cleaned_pd = df_cleaned.toPandas()
# print(df_cleaned_pd)


In [21]:
df_cleaned.show()

+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+------+--------------+-------------+----+
|review_month|rating|parent_asin|      asin|helpful_vote|                text|    timestamp|               title|             user_id|verified_purchase|review_date_timestamp|       main_category|        product_name|          categories| price|average_rating|rating_number|year|
+------------+------+-----------+----------+------------+--------------------+-------------+--------------------+--------------------+-----------------+---------------------+--------------------+--------------------+--------------------+------+--------------+-------------+----+
|           2|   1.0| B07VWSCGC2|B00KIG3QVG|           3|Looks great until...|1551322890901|Don’t waste your ...|AFX5APGYM7BMT7Z7U...|             true|           

### Labeling

In [25]:
df_cleaned.write.option("header", True).csv("./data/pyspark/prelabel/cleaned_data_pyspark.csv", mode="overwrite")
del df_cleaned

In [12]:
from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, regexp_replace, split, when
# from pyspark.sql.types import FloatType

spark = SparkSession.builder \
    .appName("AmazonReviewDataCleaning") \
    .getOrCreate()

df_cleaned = spark.read.option("header", True).csv("./data/pyspark/prelabel/cleaned_data_pyspark.csv")

In [13]:
df_cleaned#.show()

DataFrame[review_month: string, rating: string, parent_asin: string, asin: string, helpful_vote: string, text: string, timestamp: string, title: string, user_id: string, verified_purchase: string, review_date_timestamp: string, main_category: string, product_name: string, categories: string, price: string, average_rating: string, rating_number: string, year: string]

In [14]:
# df_pandas['rating'][0]

In [15]:
import pandas as pd
from snorkel.labeling import labeling_function, LFApplier
from snorkel.labeling.model import LabelModel

# Define sentiment labels
POSITIVE = 1
NEGATIVE = -1
NEUTRAL = 0

# Define additional keywords for positive and negative sentiments
positive_words = ["excellent", "amazing", "great", "fantastic", "love", "wonderful", "perfect", "satisfied", "good", "best", "recommend"]
negative_words = ["bad", "poor", "terrible", "awful", "disappointed", "waste", "horrible", "worst", "not worth", "refund", "broken", "defective"]

# Labeling function for positive sentiment based on text
@labeling_function()
def lf_contains_positive_words(row):
    return POSITIVE if row["text"] and any(word in row["text"].lower() for word in positive_words) else NEUTRAL

# Labeling function for negative sentiment based on text
@labeling_function()
def lf_contains_negative_words(row):
    return NEGATIVE if row["text"] and any(word in row["text"].lower() for word in negative_words) else NEUTRAL

# Labeling function based on rating
@labeling_function()
def lf_high_rating(row):
    return POSITIVE if row["rating"] >= 4 else NEUTRAL

@labeling_function()
def lf_low_rating(row):
    return NEGATIVE if row["rating"] <= 2 else NEUTRAL

# List of labeling functions
lfs = [lf_high_rating, lf_low_rating, lf_contains_positive_words, lf_contains_negative_words]

# Convert DataFrame to pandas, ensure rating column is numeric, and then to dictionary format
df_pandas = df_cleaned.toPandas()
df_pandas['rating'] = pd.to_numeric(df_pandas['rating'], errors='coerce')
data_points = df_pandas.to_dict(orient="records")  # Convert to list of dictionaries

# Apply labeling functions
applier = LFApplier(lfs)
L = applier.apply(data_points)

# Train LabelModel and predict
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train=L, n_epochs=500, log_freq=100)

# Assign predicted labels
df_pandas['label_snorkel_rating'] = label_model.predict(L)


140735it [00:01, 120435.75it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=6.172]
INFO:root:[100 epochs]: TRAIN:[loss=0.009]
INFO:root:[200 epochs]: TRAIN:[loss=0.003]
 49%|████▉     | 246/500 [00:00<00:00, 2458.83epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.001]
INFO:root:[400 epochs]: TRAIN:[loss=0.001]
100%|██████████| 500/500 [00:00<00:00, 2782.07epoch/s]
INFO:root:Finished Training


In [200]:
import pandas as pd
from snorkel.labeling import labeling_function, LFApplier
from snorkel.labeling.model import LabelModel
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline

# Initialize VADER and RoBERTa sentiment analyzers
vader_analyzer = SentimentIntensityAnalyzer()
# roberta_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

# Define sentiment labels
POSITIVE = 1
NEGATIVE = -1
NEUTRAL = 0

# Define keywords for positive and negative sentiment
positive_words = ["excellent", "amazing", "great", "fantastic", "love", "wonderful", "perfect", "satisfied", "good", "best", "recommend"]
negative_words = ["bad", "poor", "terrible", "awful", "disappointed", "waste", "horrible", "worst", "not worth", "refund", "broken", "defective"]

# Refined Ratio-based labeling function with a narrower neutral range
@labeling_function()
def lf_positive_negative_ratio(row):
    if row["text"]:
        positive_count = sum(row["text"].lower().count(word) for word in positive_words)
        negative_count = sum(row["text"].lower().count(word) for word in negative_words)
        
        if positive_count > 0 and negative_count > 0:
            ratio = positive_count / negative_count
            if 0.9 <= ratio <= 1.1:  # Narrower neutral range
                return NEUTRAL
            elif ratio > 1.1:
                return POSITIVE
            else:
                return NEGATIVE
        elif positive_count > 0:  # If only positive words are found
            return POSITIVE
        elif negative_count > 0:  # If only negative words are found
            return NEGATIVE
    return NEUTRAL  # Return NEUTRAL if no positive or negative words are found

# Labeling function based on high rating
@labeling_function()
def lf_high_rating(row):
    return POSITIVE if row["rating"] >= 4 else NEUTRAL  # Adjusted to include ratings of 4 and above

# Labeling function based on low rating (rating 2 or below as NEGATIVE)
@labeling_function()
def lf_low_rating(row):
    return NEGATIVE if row["rating"] <= 2 else NEUTRAL  # Broadened to capture low ratings

# VADER-based labeling function with stricter thresholds
@labeling_function()
def lf_vader_sentiment(row):
    if row["text"]:
        score = vader_analyzer.polarity_scores(row["text"])["compound"]
        if score >= 0.05:  # Slightly higher threshold for positive
            return POSITIVE
        elif score <= -0.05:  # Slightly lower threshold for negative
            return NEGATIVE
    return NEUTRAL
# @labeling_function()
# def lf_vader_sentiment(row):
#     if row["text"]:
#         scores = vader_analyzer.polarity_scores(row["text"])
#         pos_score = scores["pos"]
#         neg_score = scores["neg"]
#         neu_score = scores["neu"]
        
#         # Set thresholds based on typical compound threshold for positive, neutral, and negative
#         if pos_score >= 0.3:# and neu_score <= 0.6 and neg_score <= 0.1:
#             return POSITIVE
#         elif neg_score >= 0.5:# and neu_score <= 0.6 and pos_score <= 0.1:
#             return NEGATIVE
#         else:
#             return NEUTRAL  # Treat as neutral if it doesn’t strongly meet the positive or negative criteria
#     return NEUTRAL


# List of labeling functions
# lfs = [lf_positive_negative_ratio, lf_high_rating, lf_low_rating, lf_vader_sentiment]
lfs = [lf_high_rating, lf_low_rating, lf_vader_sentiment]

# Convert DataFrame to pandas, ensure rating column is numeric, and then to dictionary format
df_pandas = df_cleaned.toPandas()
df_pandas['rating'] = pd.to_numeric(df_pandas['rating'], errors='coerce')
data_points = df_pandas.to_dict(orient="records")  # Convert to list of dictionaries

# Apply labeling functions
applier = LFApplier(lfs)
L = applier.apply(data_points)

# Train LabelModel and predict
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train=L, n_epochs=500, log_freq=100)

# Assign predicted labels
df_pandas['label_snorkel_rating'] = label_model.predict(L)

# Display the labeled DataFrame
df_pandas.head()


140735it [00:10, 14035.74it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=3.907]
INFO:root:[100 epochs]: TRAIN:[loss=0.004]
INFO:root:[200 epochs]: TRAIN:[loss=0.001]
INFO:root:[300 epochs]: TRAIN:[loss=0.000]
 64%|██████▍   | 320/500 [00:00<00:00, 3192.94epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 3204.18epoch/s]
INFO:root:Finished Training


Unnamed: 0,review_month,rating,parent_asin,asin,helpful_vote,text,timestamp,title,user_id,verified_purchase,review_date_timestamp,main_category,product_name,categories,price,average_rating,rating_number,year,label_snorkel_rating
0,2,1.0,B07VWSCGC2,B00KIG3QVG,3,Looks great until you get the glasses. Bought ...,1551323000000.0,Don’t waste your money😡😡,AFX5APGYM7BMT7Z7UGDP6AWB7FXA,True,2019-02-27,Amazon Home,Libbey Stemless 12-Piece Wine Glass Party Set ...,"Home & Kitchen,Kitchen & Dining,Dining & Enter...",32.99,4.7,4948.0,2019.0,2
1,2,1.0,B08FSP9ZQT,B015JT4AWA,0,Didn't work after setting up. Wouldn't turn at...,1551115000000.0,Didn't work.,AHLSDEJV73T2G7SMAOYCDZASYWFA,True,2019-02-25,Amazon Home,e-Flame USA Regal Freestanding Electric Firepl...,"Home & Kitchen,Heating, Cooling & Air Quality,...",299.99,4.0,760.0,2019.0,0
2,2,1.0,B0BT2ZM8D8,B0030EH7S6,1,There really is nothing I like about the machi...,1517768000000.0,There really is nothing I like about the machine,AHJQUMJPSY5GKYDPGQCQU73VH66Q,True,2018-02-04,Amazon Home,Cuisinart CJE-500 Compact Juice Extractor Blac...,"Home & Kitchen,Kitchen & Dining,Small Applianc...",99.95,4.4,4162.0,2018.0,2
3,2,1.0,B07JN7TDV6,B002BDTETW,0,over half of them were broken upon receive,1549441000000.0,DON'T BUY,AELMQWNMJCPY6HJRA4HPZNGFV7XA,True,2019-02-06,Amazon Home,Sterilite 16668004 90 Quart/85 Liter Storage B...,"Home & Kitchen,Storage & Organization,Baskets,...",44.34,4.3,582.0,2019.0,2
4,4,3.0,B0BGP7T3DP,B015DRQ36E,0,The lid is very tight and in order to take if ...,1554930000000.0,This is a nice canister BUT the lid if very ha...,AFU7L5DBEBJUUFFFUWBM7E7GA2XA,True,2019-04-10,Amazon Home,Utopia Kitchen Compost Bin for Kitchen Counter...,"Home & Kitchen,Storage & Organization,Trash, R...",21.99,4.6,15740.0,2019.0,0


In [18]:
import pandas as pd
from snorkel.labeling import labeling_function, LFApplier
from snorkel.labeling.model import LabelModel
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline

# Initialize VADER and RoBERTa sentiment analyzers
vader_analyzer = SentimentIntensityAnalyzer()
# roberta_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

# Define sentiment labels
POSITIVE = 1
NEGATIVE = -1
NEUTRAL = 0

# Define keywords for positive and negative sentiment
positive_words = ["excellent", "amazing", "great", "fantastic", "love", "wonderful", "perfect", "satisfied", "good", "best", "recommend"]
negative_words = ["bad", "poor", "terrible", "awful", "disappointed", "waste", "horrible", "worst", "not worth", "refund", "broken", "defective"]

# Refined Ratio-based labeling function with a narrower neutral range
@labeling_function()
def lf_positive_negative_ratio(row):
    if row["text"]:
        positive_count = sum(row["text"].lower().count(word) for word in positive_words)
        negative_count = sum(row["text"].lower().count(word) for word in negative_words)
        
        if positive_count > 0 and negative_count > 0:
            ratio = positive_count / negative_count
            if 0.9 <= ratio <= 1.1:  # Narrower neutral range
                return NEUTRAL
            elif ratio > 1.1:
                return POSITIVE
            else:
                return NEGATIVE
        elif positive_count > 0:  # If only positive words are found
            return POSITIVE
        elif negative_count > 0:  # If only negative words are found
            return NEGATIVE
    return NEUTRAL  # Return NEUTRAL if no positive or negative words are found

# Labeling function based on high rating
@labeling_function()
def lf_high_rating(row):
    return POSITIVE if row["rating"] >= 4 else NEUTRAL  # Adjusted to include ratings of 4 and above

# Labeling function based on low rating (rating 2 or below as NEGATIVE)
@labeling_function()
def lf_low_rating(row):
    return NEGATIVE if row["rating"] <= 2 else NEUTRAL  # Broadened to capture low ratings

# VADER-based labeling function with stricter thresholds
@labeling_function()
def lf_vader_sentiment(row):
    if row["text"]:
        score = vader_analyzer.polarity_scores(row["text"])["compound"]
        if score >= 0.05:  # Slightly higher threshold for positive
            return POSITIVE
        elif score <= -0.05:  # Slightly lower threshold for negative
            return NEGATIVE
    return NEUTRAL

# List of labeling functions
# lfs = [lf_positive_negative_ratio, lf_high_rating, lf_low_rating, lf_vader_sentiment]
lfs = [lf_vader_sentiment, lf_high_rating, lf_low_rating]

# Convert DataFrame to pandas, ensure rating column is numeric, and then to dictionary format
df_pandas = df_cleaned.toPandas()
df_pandas['rating'] = pd.to_numeric(df_pandas['rating'], errors='coerce')
data_points = df_pandas.to_dict(orient="records")  # Convert to list of dictionaries

# Apply labeling functions
applier = LFApplier(lfs)
L = applier.apply(data_points)

# Train LabelModel and predict
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train=L, n_epochs=500, log_freq=100)

# Assign predicted labels
df_pandas['label_snorkel_rating'] = label_model.predict(L)

# Display the labeled DataFrame
df_pandas.head()


140735it [00:09, 14188.83it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=3.542]
INFO:root:[100 epochs]: TRAIN:[loss=0.005]
INFO:root:[200 epochs]: TRAIN:[loss=0.001]
INFO:root:[300 epochs]: TRAIN:[loss=0.000]
 61%|██████    | 305/500 [00:00<00:00, 3047.99epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 2899.51epoch/s]
INFO:root:Finished Training


Unnamed: 0,review_month,rating,parent_asin,asin,helpful_vote,text,timestamp,title,user_id,verified_purchase,review_date_timestamp,main_category,product_name,categories,price,average_rating,rating_number,year,label_snorkel_rating
0,2,1.0,B07VWSCGC2,B00KIG3QVG,3,Looks great until you get the glasses. Bought ...,1551323000000.0,Don’t waste your money😡😡,AFX5APGYM7BMT7Z7UGDP6AWB7FXA,True,2019-02-27,Amazon Home,Libbey Stemless 12-Piece Wine Glass Party Set ...,"Home & Kitchen,Kitchen & Dining,Dining & Enter...",32.99,4.7,4948.0,2019.0,2
1,2,1.0,B08FSP9ZQT,B015JT4AWA,0,Didn't work after setting up. Wouldn't turn at...,1551115000000.0,Didn't work.,AHLSDEJV73T2G7SMAOYCDZASYWFA,True,2019-02-25,Amazon Home,e-Flame USA Regal Freestanding Electric Firepl...,"Home & Kitchen,Heating, Cooling & Air Quality,...",299.99,4.0,760.0,2019.0,0
2,2,1.0,B0BT2ZM8D8,B0030EH7S6,1,There really is nothing I like about the machi...,1517768000000.0,There really is nothing I like about the machine,AHJQUMJPSY5GKYDPGQCQU73VH66Q,True,2018-02-04,Amazon Home,Cuisinart CJE-500 Compact Juice Extractor Blac...,"Home & Kitchen,Kitchen & Dining,Small Applianc...",99.95,4.4,4162.0,2018.0,2
3,2,1.0,B07JN7TDV6,B002BDTETW,0,over half of them were broken upon receive,1549441000000.0,DON'T BUY,AELMQWNMJCPY6HJRA4HPZNGFV7XA,True,2019-02-06,Amazon Home,Sterilite 16668004 90 Quart/85 Liter Storage B...,"Home & Kitchen,Storage & Organization,Baskets,...",44.34,4.3,582.0,2019.0,2
4,4,3.0,B0BGP7T3DP,B015DRQ36E,0,The lid is very tight and in order to take if ...,1554930000000.0,This is a nice canister BUT the lid if very ha...,AFU7L5DBEBJUUFFFUWBM7E7GA2XA,True,2019-04-10,Amazon Home,Utopia Kitchen Compost Bin for Kitchen Counter...,"Home & Kitchen,Storage & Organization,Trash, R...",21.99,4.6,15740.0,2019.0,0


In [19]:
# Define mapping based on cardinality and label index
label_map = {0: 'NEUTRAL', 1: 'POSITIVE', 2: 'NEGATIVE'}

# Convert numeric predictions to descriptive labels in `df_pandas`
df_pandas['label_snorkel_rating'] = [label_map[label] for label in label_model.predict(L)]

# Verify results by checking the first few predictions with descriptive labels
df_pandas[['text', 'rating', 'label_snorkel_rating']].head()


Unnamed: 0,text,rating,label_snorkel_rating
0,Looks great until you get the glasses. Bought ...,1.0,NEGATIVE
1,Didn't work after setting up. Wouldn't turn at...,1.0,NEUTRAL
2,There really is nothing I like about the machi...,1.0,NEGATIVE
3,over half of them were broken upon receive,1.0,NEGATIVE
4,The lid is very tight and in order to take if ...,3.0,NEUTRAL


In [22]:
for i in range(100):
    print(df_pandas['text'][
        (df_pandas['label_snorkel_rating'] == 'NEUTRAL') & 
        (df_pandas['rating'] == 3)
    ].iloc[i])#[3]


The lid is very tight and in order to take if off , it takes two hands and a grip around the handle. This should be fixed over all however it fits for what I need.
Purchased on sale for $29.99, and in two weeks, I'm happy because I don't have to hold a button down for a minute, just touch and continue preparing breakfast. I had it on the finest grind, and it got stuck day one, so I set it back one and no problems since. Plus, it measures for me. 4.5 is perfect for my needs, and the hopper holds more than a week's worth of beans.<br /><br />I'm so thankful not to have to push down the inconvenient plunger on a Capresso blade grinder--which, by the way, necessitated I do 2 batches: dump the uncompressed ground coffee into the lid, to dig out the fine grind stuck in the bottom, then finish grinding the rest...a painful mess, every single day). I even had a special spatula I used to keep from dulling the blades with metal.
I was so excited about this! I had high expectations but I shouldn'

In [15]:
for i in range(100):
    print(df_pandas['text'][
        (df_pandas['label_snorkel_rating'] == 'NEUTRAL') & 
        (df_pandas['rating'] == 4) |         (df_pandas['rating'] == 5)

    ].iloc[i])#[3]


Love this and for the price!
I bought these as a gift for a friend. On receipt, I checked to make sure nothing was cracked / broken during shipping but these are well packed. Everything looked perfect. I picked one up and it was quite heavy and felt good in the hand. My friend really liked them as well!
Good product. Very durable
Overall I like it a lot. It looks classy and has lots of options to hang stuff.<br />If one of the shelves was a bit wider it could fit a soap dish sitting flat<br />and I would like the suction cups to be movable to adjust to avoid tile joints.<br />I'm glad I bought it.
I had similar results to others, trying to simply use it out of the box failed with anything like an egg. Heat, add butter or oil, add egg, insta-stick and a huge mess. :( Other things like ground meat, etc did the same, even after repeated use. So after a year or so decided to smooth it out. Started with 100 grit sandpaper, then 600 wet, then 2000 wet, then finally finish with toothpaste. Th

In [17]:
for i in range(100):
    print(df_pandas['text'][
        (df_pandas['label_snorkel_rating'] == 'POSITIVE')
    ].iloc[i])#[3]


Very thin but love it for the price
They matched my daughters theme perfect
good for keep at home for family
KNife is good and sharp as this price, kNife is well finish from the spine. The price you pay is worth what you get. But the handle is slippery could be better.
Great
Love everything about this can except for the closing mechanism- you have to make sure you lay the bag just right against the lid or the lid won’t close- Although it’s a very night trash can, I think it’s overpriced
Nice
Brought back fond memories of days gone by.<br />Good quality construction.<br />Inscence not as good smelling as I remember.<br />Scotch tape holding roof on for shipping left tape residue and lifted finish.
It seems like it has issues reading under 100mg.
Love this and for the price!
I bought these as a gift for a friend. On receipt, I checked to make sure nothing was cracked / broken during shipping but these are well packed. Everything looked perfect. I picked one up and it was quite heavy and 

In [7]:
df_pandas[
        (df_pandas['label_snorkel_rating'] == 'NEUTRAL') & 
        (df_pandas['rating'] == 5)
    ].shape, df_pandas[
        (df_pandas['label_snorkel_rating'] == 'NEUTRAL') & 
        (df_pandas['rating'] == 1)
    ].shape

((7493, 19), (2376, 19))

In [8]:
df_pandas[df_pandas['label_snorkel_rating'] == 'NEUTRAL']['rating'].value_counts()

rating
3.0    8480
5.0    7493
1.0    2376
4.0    1470
2.0    1069
Name: count, dtype: int64

In [9]:
df_pandas[df_pandas['label_snorkel_rating'] == 'POSITIVE']['rating'].value_counts()


rating
5.0    90326
4.0    13463
Name: count, dtype: int64

In [10]:
df_pandas[df_pandas['label_snorkel_rating'] == 'NEGATIVE']['rating'].value_counts()


rating
1.0    10812
2.0     5240
Name: count, dtype: int64

In [206]:
df_pandas['rating'].value_counts()

rating
5.0    97819
4.0    14933
1.0    13188
3.0     8480
2.0     6309
Name: count, dtype: int64

In [207]:
# for i in range(100):
#     print(df_pandas['text'][(df_pandas['label_snorkel_rating'] == 'NEUTRAL')].iloc[i])

In [208]:
df_pandas['label_snorkel_rating'].value_counts()

label_snorkel_rating
POSITIVE    103789
NEUTRAL      20894
NEGATIVE     16052
Name: count, dtype: int64

In [11]:
df_pandas['label_snorkel_rating'].value_counts()

label_snorkel_rating
POSITIVE    103789
NEUTRAL      20894
NEGATIVE     16052
Name: count, dtype: int64

In [30]:
df_pandas[['rating','label_snorkel_rating','text']]#.value_counts()#['rating'][0]

Unnamed: 0,label_snorkel_rating,text
0,0,Looks great until you get the glasses. Bought ...
1,-1,Didn't work after setting up. Wouldn't turn at...
2,-1,There really is nothing I like about the machi...
3,-1,over half of them were broken upon receive
4,0,The lid is very tight and in order to take if ...
...,...,...
140730,-1,I guess all the 5 stars are when the product i...
140731,-1,This aired didn't even come with a air pump.Ho...
140732,-1,This does not work! Bed bugs got out and we st...
140733,-1,Super cheap - so disappointed- don’t expect it...


In [53]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lower, when
from pyspark.sql.types import IntegerType
from snorkel.labeling import labeling_function, LFApplier
from snorkel.labeling.model import LabelModel


# Initialize Spark session
spark = SparkSession.builder.appName("SentimentLabeling").getOrCreate()

# Define sentiment labels
POSITIVE = 1
NEGATIVE = -1
NEUTRAL = 0

# Define keywords for positive and negative sentiment
positive_words = ["excellent", "amazing", "great", "fantastic", "love", "wonderful", "perfect", "satisfied", "good", "best", "recommend"]
negative_words = ["bad", "poor", "terrible", "awful", "disappointed", "waste", "horrible", "worst", "not worth", "refund", "broken", "defective"]

# Load your data
# df_cleaned = spark.read.csv("your_file.csv", header=True, inferSchema=True)

# Convert "rating" column to integer if it isn't already
df_cleaned = df_cleaned.withColumn("rating", col("rating").cast(IntegerType()))

# Labeling function for positive sentiment based on text
@labeling_function()
def lf_contains_positive_words(row):
    return POSITIVE if row["text"] and any(word in row["text"].lower() for word in positive_words) else NEUTRAL

# Labeling function for negative sentiment based on text
@labeling_function()
def lf_contains_negative_words(row):
    return NEGATIVE if row["text"] and any(word in row["text"].lower() for word in negative_words) else NEUTRAL

# Labeling function based on rating
@labeling_function()
def lf_high_rating(row):
    return POSITIVE if row["rating"] >= 4 else NEUTRAL

@labeling_function()
def lf_low_rating(row):
    return NEGATIVE if row["rating"] <= 2 else NEUTRAL

# List of labeling functions
lfs = [lf_high_rating, lf_low_rating, lf_contains_positive_words, lf_contains_negative_words]

# Convert Spark DataFrame to list of dictionaries for Snorkel compatibility
data_points = df_cleaned.select("text", "rating").rdd.map(lambda row: row.asDict()).collect()

# Apply labeling functions
applier = LFApplier(lfs)
L = applier.apply(data_points)

# Train LabelModel and predict
label_model = LabelModel(cardinality=3, verbose=True)  # cardinality=3 for three labels
label_model.fit(L_train=L, n_epochs=500, log_freq=100)

# Get predictions as a list
predictions = label_model.predict(L)

# Convert predictions to a Spark DataFrame with an index
predictions_df = spark.createDataFrame([(int(pred),) for pred in predictions], ["label_snorkel_rating"])

# Add an index column to both DataFrames
df_cleaned = df_cleaned.withColumn("index", F.monotonically_increasing_id())
predictions_df = predictions_df.withColumn("index", F.monotonically_increasing_id())

# Join on index and drop extra column
df_cleaned = df_cleaned.join(predictions_df, on="index").drop("index")

# Optional: Map integer labels to text labels
df_cleaned = df_cleaned.withColumn(
    "sentiment_label",
    when(df_cleaned.label_snorkel_rating == 1, "POSITIVE")
    .when(df_cleaned.label_snorkel_rating == 0, "NEUTRAL")
    .when(df_cleaned.label_snorkel_rating == -1, "NEGATIVE")
)

# Show final DataFrame
df_cleaned.select("label_snorkel_rating", "sentiment_label").show()


4514it [00:00, 122499.86it/s]


TypeError: '>=' not supported between instances of 'NoneType' and 'int'

In [48]:
label_counts = df_cleaned.groupBy("label_snorkel_rating").count()

# Display the counts
label_counts.show()


+--------------------+------+
|label_snorkel_rating| count|
+--------------------+------+
|                   0|131825|
|                   1|  3306|
+--------------------+------+



In [54]:
df_cleaned.filter(col("rating").isNull()).show()


+--------------------+------+--------------------+--------------------+------------+----+---------+-------+-------+-----------------+---------------------+-------------+------------+----------+-------+--------------+-------------+----+
|        review_month|rating|         parent_asin|                asin|helpful_vote|text|timestamp|  title|user_id|verified_purchase|review_date_timestamp|main_category|product_name|categories|  price|average_rating|rating_number|year|
+--------------------+------+--------------------+--------------------+------------+----+---------+-------+-------+-----------------+---------------------+-------------+------------+----------+-------+--------------+-------------+----+
|Assorted colors -...|  NULL|                NULL|                 3.9|         284|2019|     NULL|unknown|   NULL|             NULL|                 NULL|         NULL|        NULL|      NULL|unknown|          NULL|         NULL|NULL|
|              4 Cup"|  NULL|                NULL|      

In [19]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
import re
import nltk
from nltk.tokenize import sent_tokenize

# Download NLTK punkt only once in the driver
nltk.download('punkt')

# Initialize Spark session
spark = SparkSession.builder.appName("Extract Sentences").getOrCreate()

# Broadcast punkt tokenizer to worker nodes
punkt_model = nltk.data.load('tokenizers/punkt/english.pickle')
sc = spark.sparkContext
broadcast_punkt = sc.broadcast(punkt_model)

# Sample DataFrame creation (replace with your actual DataFrame)
# df = spark.createDataFrame([...])

# Define aspects and corresponding keywords
aspects_keywords = {
    "Product Quality": ["quality", "durability", "build", "craftsmanship", "material"],
    "Usability": ["easy to use", "user-friendly", "install", "setup", "interface"],
    "Customer Service": ["customer service", "support", "response", "help", "assistance"],
    "Pricing": ["price", "cost", "value", "expensive", "cheap"],
    "Shipping and Delivery": ["shipping", "delivery", "arrive", "packaging", "tracking"]
}

# Function to extract sentences for each aspect
def extract_sentences(text, aspect):
    # Use the broadcasted punkt model to tokenize text into sentences
    sentences = broadcast_punkt.value.tokenize(text)
    # Filter sentences that contain any keywords from the specific aspect
    aspect_sentences = [s for s in sentences if any(keyword in s.lower() for keyword in aspects_keywords[aspect])]
    return aspect_sentences

# Create a new DataFrame to hold results
aspect_sentences = []

# Iterate over each aspect and extract sentences
for aspect in aspects_keywords.keys():
    df_aspect = df_cleaned.rdd.flatMap(lambda row: [(row.asin, row.parent_asin, row.user_id, aspect, sentence) 
                                             for sentence in extract_sentences(row.text, aspect)])
    aspect_sentences.extend(df_aspect.collect())

# Create a new DataFrame from the extracted sentences
df_extracted_sentences = spark.createDataFrame(aspect_sentences, ["asin", "parent_asin", "user_id", "aspect", "sentence"])

# Show the resulting DataFrame
df_extracted_sentences.show(truncate=False)

# Optionally, save the extracted sentences to a CSV file
# df_extracted_sentences.write.csv("extracted_sentences.csv", header=True)

# Stop Spark session
spark.stop()


[nltk_data] Downloading package punkt to /home/hrs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/hrs/nltk_data'
    - '/home/hrs/anaconda3/envs/mlops_exp/nltk_data'
    - '/home/hrs/anaconda3/envs/mlops_exp/share/nltk_data'
    - '/home/hrs/anaconda3/envs/mlops_exp/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [31]:
import pandas as pd
import nltk

# Download the punkt tokenizer if not already done
nltk.download('punkt')

# df = pd.DataFrame(data)

# Define aspects and corresponding keywords
aspects_keywords = {
    "Product Quality": ["quality", "durability", "build", "craftsmanship", "material"],
    "Usability": ["easy to use", "user-friendly", "install", "setup", "interface"],
    "Customer Service": ["customer service", "support", "response", "help", "assistance"],
    "Pricing": ["price", "cost", "value", "expensive", "cheap"],
    "Shipping and Delivery": ["shipping", "delivery", "arrive", "packaging", "tracking"]
}

# Function to extract sentences for each aspect
def extract_sentences(text, aspect):
    # Tokenize text into sentences
    sentences = nltk.tokenize.sent_tokenize(text)
    # Filter sentences that contain any keywords from the specific aspect
    aspect_sentences = [s for s in sentences if any(keyword in s.lower() for keyword in aspects_keywords[aspect])]
    return aspect_sentences

# Create a new DataFrame to hold results
aspect_sentences = []

# Iterate over each aspect and extract sentences
for aspect in aspects_keywords.keys():
    for _, row in df_pandas.iterrows():
        sentences = extract_sentences(row['text'], aspect)
        for sentence in sentences:
            aspect_sentences.append({
                "asin": row['asin'],
                "parent_asin": row['parent_asin'],
                "user_id": row['user_id'],
                "aspect": aspect,
                "sentence": sentence
            })

# Create a new DataFrame from the extracted sentences
df_extracted_sentences = pd.DataFrame(aspect_sentences)

# Display the resulting DataFrame
print(df_extracted_sentences)


[nltk_data] Downloading package punkt to /home/hrs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/hrs/nltk_data'
    - '/home/hrs/anaconda3/envs/mlops_exp/nltk_data'
    - '/home/hrs/anaconda3/envs/mlops_exp/share/nltk_data'
    - '/home/hrs/anaconda3/envs/mlops_exp/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [34]:
print(nltk.data.path)
nltk.tokenize.sent_tokenize("DO NOT BUY! The quality is awful, but I liked the packaging.")

['/home/hrs/nltk_data', '/home/hrs/anaconda3/envs/mlops_exp/nltk_data', '/home/hrs/anaconda3/envs/mlops_exp/share/nltk_data', '/home/hrs/anaconda3/envs/mlops_exp/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/hrs/nltk_data'
    - '/home/hrs/anaconda3/envs/mlops_exp/nltk_data'
    - '/home/hrs/anaconda3/envs/mlops_exp/share/nltk_data'
    - '/home/hrs/anaconda3/envs/mlops_exp/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
