In [1]:
import pandas as pd
import os

## Helper Functions

In [2]:
# Target column layout (named after the Apify export; the rest of the notebook
# works with these names).
apify_format = ["productAsin", "ratingScore", "reviewTitle",
                 "reviewReaction", "reviewDescription", "isVerified"]

# Column names found in the ScrapeHero CSV files. The order matters: position i
# here is renamed to position i of apify_format.
scrapehero_format = ["asin", "review_rating", "review_header",
                 "no_of_people_reacted_helpful", "review_text", "badge"]

def alignFormat(data):
    """Convert a ScrapeHero review frame to the ``apify_format`` column layout.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column listed in ``scrapehero_format``; any other
        columns are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with the columns of
        ``apify_format``. ``isVerified`` holds the string ``"TRUE"`` where the
        original ``badge`` was exactly ``"Verified Purchase"`` and ``"FALSE"``
        otherwise, including for missing badges.

    Raises
    ------
    KeyError
        If any column of ``scrapehero_format`` is missing from ``data``.
    """
    # Work on an explicit copy so the assignment below never writes into a
    # view of the caller's frame.
    aligned = data[scrapehero_format].copy()

    # Vectorized and positional: unlike a row-by-row ``.at[label, ...]`` write,
    # this stays correct when index labels repeat.
    verified = aligned["badge"].eq("Verified Purchase")
    aligned["badge"] = verified.map({True: "TRUE", False: "FALSE"})

    # Rename all columns in one pass using the positional pairing of the lists.
    return aligned.rename(columns=dict(zip(scrapehero_format, apify_format)))
    
def add_column(data, category_name):
    """Tag every row of ``data`` with ``category_name`` in a ``category`` column.

    The frame is modified in place and the very same object is returned.
    When ``category_name`` is ``None`` nothing is added and ``data`` comes
    back unchanged.
    """
    if category_name is None:
        return data

    # One entry per row, all referring to the same label.
    data['category'] = [category_name for _ in range(len(data))]
    return data

def check_remove_duplicates(new_data):
    """Drop rows whose ``reviewDescription`` repeats an earlier row.

    The frame is edited in place (the first occurrence of each text is kept,
    index labels are not renumbered) and the same object is returned.
    """
    key_columns = ['reviewDescription']

    # Nothing to do when every review text is already unique.
    if not new_data.duplicated(subset=key_columns).any():
        return new_data

    new_data.drop_duplicates(subset=key_columns, inplace=True)
    return new_data
    
def visualize_duplicates(data):
    """Return the rows whose ``reviewDescription`` repeats an earlier row.

    The first occurrence of each text is not part of the result. A one-line
    count of the returned rows is printed as a side effect; ``data`` itself
    is not changed.
    """
    repeat_mask = data.duplicated(['reviewDescription'])
    duplicates = data[repeat_mask]

    count = len(duplicates)
    print("Visualizing " + str(count) + " duplicate reviews:")

    return duplicates

def save_to_json(data, json_filename, category_name, export):
    """Optionally label ``data`` with a category and write it out as JSON.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to label and export; passed through ``add_column``.
    json_filename : str
        Output path without suffix; ``"_preprocessed.json"`` is appended.
    category_name : str or None
        Forwarded to ``add_column``; ``None`` adds no category column.
    export : bool
        When falsy, no file is written.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``add_column``.
    """
    new_data = add_column(data, category_name)

    if export:
        # The content is JSON, so the file gets a .json suffix (this used to
        # reuse the ".csv" suffix from save_to_csv).
        new_data.to_json(json_filename + "_preprocessed.json")

    return new_data

def save_to_csv(data, csv_filename, category_name, export):
    """Optionally label ``data`` with a category and write it out as CSV.

    ``data`` is first passed through ``add_column`` with ``category_name``.
    If ``export`` is truthy the result is written, without the index, to
    ``csv_filename`` plus the suffix ``"_preprocessed.csv"``. The labelled
    frame is returned either way.
    """
    labelled = add_column(data, category_name)

    if not export:
        return labelled

    target_path = csv_filename + "_preprocessed.csv"
    labelled.to_csv(target_path, index=False)
    return labelled

# Import negative reviews crawled from ScrapeHero

In [3]:
# Folder holding the ScrapeHero exports
dir_path = r'/Users/bryson/Documents/bryson/NTU/modules/CZ4045_NLP/negative-reviews'

# Collect (and echo) every regular file directly inside dir_path.
# NOTE(review): there is no extension filter, so non-CSV entries in the folder
# are listed and collected as well.
books_csv_file_names = []
for entry in os.listdir(dir_path):
    # sub-directories are skipped
    if not os.path.isfile(os.path.join(dir_path, entry)):
        continue
    print(entry)
    books_csv_file_names.append(entry)

.DS_Store
NegativeReviewDetails_HnE_2.csv
NegativeReviewDetails_Children_4.csv
NegativeReviewDetails_Children_5.csv
NegativeReviewDetails_HnE_3.csv
NegativeReviewDetails_HnE_1.csv
NegativeReviewDetails_Children_7.csv
NegativeReviewDetails_Children_6.csv
NegativeReviewDetails_HnE_4.csv
NegativeReviewDetails_Children_2.csv
NegativeReviewDetails_Children_3.csv
NegativeReviewDetails_HnE_5.csv
NegativeReviewDetails_HnE_7.csv
NegativeReviewDetails_Children_1.csv
NegativeReviewDetails_HnE_6.csv


## Export Settings and Summary Tools

In [4]:
# Shared state for the loading cells: output column order, the list of
# per-file frames, a running review count, and the export switch.
column_format = [
    "productAsin",
    "ratingScore",
    "reviewTitle",
    "reviewReaction",
    "reviewDescription",
    "isVerified",
    "category",
]
final_df_list = []
reviews_crawled = 0
export_to_csv = False

In [5]:
# Every ScrapeHero export to load, as (file stem, category label), in the same
# order the files were previously loaded one cell at a time.
review_sources = (
    [("NegativeReviewDetails_HnE_" + str(part), "humor_entertainment") for part in range(1, 8)]
    + [("NegativeReviewDetails_Children_" + str(part), "children") for part in range(1, 8)]
)
file_extension = ".csv"

# Start from a clean slate so that re-running this cell cannot double count
# reviews or append the same frames a second time.
reviews_crawled = 0
final_df_list = []

for file_name, category_name in review_sources:
    book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
    book_reviews_df = alignFormat(book_reviews_df)

    # Counted before any de-duplication; the summary cell subtracts the
    # de-duplicated length from this total.
    reviews_crawled += len(book_reviews_df)
    book_reviews_df = add_column(book_reviews_df, category_name)
    final_df_list.append(book_reviews_df[column_format])

# Concatenate data and remove duplicates

In [19]:
# Stack the per-file frames into a single table with a fresh running index
final_df_list_concatenated = pd.concat(final_df_list, ignore_index = True)

# Pull out the repeated reviews and order them by text so that identical
# descriptions end up next to each other in the display
duplicates = visualize_duplicates(final_df_list_concatenated).sort_values(by='reviewDescription')
duplicates

Visualizing 14 duplicate reviews:


Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category
739,606389830,1.0,One Star,1.0,Boring,True,humor_entertainment
746,606389830,1.0,Boring,,Boring,True,humor_entertainment
837,606389830,1.0,One Star,4.0,Boring,True,humor_entertainment
1018,1250069831,1.0,Boring,,Boring,True,humor_entertainment
2046,60935464,1.0,One Star,4.0,Boring,True,children
1799,145217380X,1.0,One Star,6.0,Dumb,False,children
1731,145217380X,1.0,One Star,31.0,Stupid book,False,children
303,B019MMUA8S,1.0,Waste of money,,Waste of money,True,humor_entertainment
1467,1472290690,1.0,boring,1.0,boring,False,humor_entertainment
2043,60935464,1.0,boring,1.0,boring,True,children


In [20]:
# Drop repeated review texts (first occurrence kept, index labels unchanged)
final_df_list_concatenated = check_remove_duplicates(final_df_list_concatenated)

# Honour the notebook-wide export_to_csv switch instead of a hard-coded False;
# the category column is already present, hence None.
save_to_csv(final_df_list_concatenated, "concatenated_duplicates_removed_negative", None, export_to_csv)

final_df_list_concatenated

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category
0,1647222613,1.0,Could this BE any crappier?,,This thing is awful- my girlfriend is a huge F...,TRUE,humor_entertainment
1,1647222613,1.0,Find something else!!,,Definitely not what I was expecting! You get a...,TRUE,humor_entertainment
2,1647222613,1.0,Waste of money,,Very disappointed for the price you pay all yo...,TRUE,humor_entertainment
3,1647222613,1.0,Don't buy it. Complete garbage!,,Complete waste of money. I knew going into thi...,TRUE,humor_entertainment
4,1647222613,1.0,Not worth it,,Very few items that are worth keeping or givin...,FALSE,humor_entertainment
...,...,...,...,...,...,...,...
2983,399255370,1.0,Terrible for children.,10.0,I wouldn't read this to anyone under the age o...,FALSE,children
2984,399255370,1.0,I am not happy.,,My book came covered in dirt. I was only able ...,FALSE,children
2985,399255370,1.0,... just don't understand why this book has so...,2.0,I just don't understand why this book has so m...,TRUE,children
2986,399255370,1.0,If you like wasting money then perhaps these a...,,These are beyond rubbish. Only the ends of the...,TRUE,children


In [21]:
# Total whitespace-separated tokens across all remaining review texts;
# each value is passed through str() first, so non-string entries are counted too
words = sum(
    len(str(review).split())
    for review in final_df_list_concatenated["reviewDescription"]
)

# Crawled Negative Data Summary

In [22]:
# Whatever was crawled but is no longer in the table was dropped as a duplicate
duplicates_removed = reviews_crawled - len(final_df_list_concatenated)

print('Number of crawled reviews: ', reviews_crawled, sep='')
print('Number of removed duplicate reviews: ', duplicates_removed, sep='')
print('Number of unique negative reviews: ', len(final_df_list_concatenated), sep='')
print('Number of words: ', words, sep='')

Number of crawled reviews: 2988
Number of removed duplicate reviews: 14
Number of unique negative reviews: 2974
Number of words: 219181
