In [1]:
import pandas as pd
import os

## Helper Functions

In [2]:
apify_format = ["productAsin", "ratingScore", "reviewTitle",
                 "reviewReaction", "reviewDescription", "isVerified"]

scrapehero_format = ["asin", "review_rating", "review_header",
                 "no_of_people_reacted_helpful", "review_text", "badge"]

def alignFormat(data):
    data = data[scrapehero_format]

    for index, row in data.iterrows():
        if row["badge"] == "Verified Purchase":
            data.at[index,"badge"] = "TRUE"
        else:
            data.at[index,"badge"] = "FALSE"

    for i in range(len(apify_format)):
        data = data.rename(columns = {scrapehero_format[i]:apify_format[i]})
    
    return data
    
def add_column(data, category_name):
    if category_name is not None:
        category = category_name
        category_list = [category]*len(data)
        data['category'] = category_list
    
    return data

def check_remove_duplicates(new_data):
    has_duplicate = new_data.duplicated(subset=['reviewDescription']).any()
    if has_duplicate:
        new_data.drop_duplicates(subset=['reviewDescription'], inplace=True)
        
    return new_data
    
def visualize_duplicates(data):
    duplicates = data[data.duplicated(['reviewDescription'])]
    print("Visualizing " + str(len(duplicates)) + " duplicate reviews:")

    return duplicates

def save_to_json(data, json_filename, category_name, export):
    new_data = add_column(data, category_name)

    if export:
        new_data.to_json(json_filename + ".json")
    
    return new_data

def save_to_csv(data, csv_filename, category_name, export):
    new_data = add_column(data, category_name)
    
    if export:
        new_data.to_csv(csv_filename + ".csv", index=False)
    
    return new_data

# Import negative reviews crawled from ScrapeHero

In [3]:
# Import configuration
dir_path = r'/Users/bryson/Documents/bryson/NTU/modules/CZ4045_NLP/data/negative-reviews'

books_csv_file_names = []
for path in os.listdir(dir_path):
    # check if current path is a file
    if os.path.isfile(os.path.join(dir_path, path)):
        print(path)
        books_csv_file_names.append(path)

NegativeReviews_Children_6.csv
NegativeReviews_Romance_5.csv
NegativeReviews_Romance_4.csv
NegativeReviews_Children_7.csv
NegativeReviews_Children_5.csv
NegativeReviews_Romance_6.csv
NegativeReviews_Romance_7.csv
NegativeReviews_Children_4.csv
.DS_Store
NegativeReviews_Romance_3.csv
NegativeReviews_Romance_2.csv
NegativeReviews_Children_1.csv
NegativeReviews_Children_3.csv
NegativeReviews_Romance_1.csv
NegativeReviews_Children_2.csv
NegativeReviews_HnE_1.csv
NegativeReviews_HnE_2.csv
NegativeReviews_HnE_3.csv
NegativeReviews_HnE_7.csv
NegativeReviews_HnE_6.csv
NegativeReviews_HnE_4.csv
NegativeReviews_HnE_5.csv
NegativeReviews_Mystery_3.csv
NegativeReviews_Mystery_2.csv
NegativeReviews_Romance_9.csv
NegativeReviews_Romance_8.csv
NegativeReviews_Mystery_1.csv


## Export Settings and Summary Tools

In [4]:
# Summary and export settings
reviews_crawled = 0
export_to_csv = False
final_df_list = []
column_format = ["productAsin", "ratingScore", "reviewTitle",
                 "reviewReaction", "reviewDescription", "isVerified", "category"]

## Crawling from Humor & Entertainment Book Category

In [5]:
file_name = "NegativeReviews_HnE_1"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "humor_entertainment")
final_df_list.append(book_reviews_df[column_format])

In [6]:
file_name = "NegativeReviews_HnE_2"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "humor_entertainment")
final_df_list.append(book_reviews_df[column_format])

In [7]:
file_name = "NegativeReviews_HnE_3"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "humor_entertainment")
final_df_list.append(book_reviews_df[column_format])

In [8]:
file_name = "NegativeReviews_HnE_4"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "humor_entertainment")
final_df_list.append(book_reviews_df[column_format])

In [9]:
file_name = "NegativeReviews_HnE_5"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "humor_entertainment")
final_df_list.append(book_reviews_df[column_format])

In [10]:
file_name = "NegativeReviews_HnE_6"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "humor_entertainment")
final_df_list.append(book_reviews_df[column_format])

In [11]:
file_name = "NegativeReviews_HnE_7"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "humor_entertainment")
final_df_list.append(book_reviews_df[column_format])

## Crawling from Children Book Category

In [12]:
file_name = "NegativeReviews_Children_1"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [13]:
file_name = "NegativeReviews_Children_2"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [14]:
file_name = "NegativeReviews_Children_3"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [15]:
file_name = "NegativeReviews_Children_4"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [16]:
file_name = "NegativeReviews_Children_5"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [17]:
file_name = "NegativeReviews_Children_6"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [18]:
file_name = "NegativeReviews_Children_7"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

## Crawling from Mystery Book Category

In [19]:
file_name = "NegativeReviews_Mystery_1"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [20]:
file_name = "NegativeReviews_Mystery_2"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [21]:
file_name = "NegativeReviews_Mystery_3"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

## Crawling from Romance Book Category

In [22]:
file_name = "NegativeReviews_Romance_1"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [23]:
file_name = "NegativeReviews_Romance_2"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [24]:
file_name = "NegativeReviews_Romance_3"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [25]:
file_name = "NegativeReviews_Romance_4"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [26]:
file_name = "NegativeReviews_Romance_5"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [27]:
file_name = "NegativeReviews_Romance_6"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [28]:
file_name = "NegativeReviews_Romance_7"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

In [29]:
file_name = "NegativeReviews_Romance_8"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
book_reviews_df = add_column(book_reviews_df, "children")
final_df_list.append(book_reviews_df[column_format])

# Concatenate datas and remove duplicates

In [30]:
final_df_list_concatenated = pd.concat(final_df_list, ignore_index = True)
duplicates = visualize_duplicates(final_df_list_concatenated)

duplicates = duplicates.sort_values(by='reviewDescription')
duplicates

Visualizing 119 duplicate reviews:


Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category
5940,1982137452,1.0,Disappointed,,* Spoilers * Not sure why the reviews were so ...,FALSE,children
5957,1982137452,1.0,I Disapointment,,"A choice of my book club, and a groaned most o...",TRUE,children
5959,1982137452,1.0,This book stinks,4.0,"A stupid story with shallow, predictable chara...",TRUE,children
5914,1982137452,1.0,Ugh,1.0,"A super boring, tedious version of Sex in the ...",TRUE,children
5947,1982137452,1.0,In Five Years?????,4.0,A very disappointing book. All sad. Nothing go...,TRUE,children
...,...,...,...,...,...,...,...
5358,63215381,1.0,TERRIBLE QUALITY,,,TRUE,children
5637,1984806750,1.0,The cover came riped,1.0,,TRUE,children
5685,1984806750,1.0,The cover of the book is ripped,,,TRUE,children
5725,1984806750,1.0,Package was SOAKING wet and now my pages are w...,,,TRUE,children


In [31]:
final_df_list_concatenated = check_remove_duplicates(final_df_list_concatenated)
save_to_csv(final_df_list_concatenated, "final_books_dataset_duplicates_removed_negative", None, True)

final_df_list_concatenated

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category
0,1647222613,1.0,Could this BE any crappier?,,This thing is awful- my girlfriend is a huge F...,TRUE,humor_entertainment
1,1647222613,1.0,Find something else!!,,Definitely not what I was expecting! You get a...,TRUE,humor_entertainment
2,1647222613,1.0,Waste of money,,Very disappointed for the price you pay all yo...,TRUE,humor_entertainment
3,1647222613,1.0,Don't buy it. Complete garbage!,,Complete waste of money. I knew going into thi...,TRUE,humor_entertainment
4,1647222613,1.0,Not worth it,,Very few items that are worth keeping or givin...,FALSE,humor_entertainment
...,...,...,...,...,...,...,...
6199,1250316774,1.0,Just awful.,,"Stupid plot, insipid characters. Nothing about...",TRUE,children
6200,1250316774,1.0,So much blablabla,,Don’t like the type of book that don’t focus j...,TRUE,children
6201,1250316774,1.0,"boring, predictable, 2 dimensional characters",,I should have read more reviews before purchas...,FALSE,children
6202,1250316774,1.0,Cringe,1.0,Characters taken right from a terrible rejecte...,TRUE,children


In [32]:
words = 0
for review in final_df_list_concatenated["reviewDescription"]:
    words += len(str(review).split())

# Crawled Negative Data Summary

In [33]:
duplicates_removed = reviews_crawled - len(final_df_list_concatenated)

print('Number of crawled reviews: ' + str(reviews_crawled))
print('Number of removed duplicate reviews: ' + str(duplicates_removed))
print('Number of unique negative reviews: ' + str(len(final_df_list_concatenated)))
print('Number of words: ' + str(words))

Number of crawled reviews: 6204
Number of removed duplicate reviews: 119
Number of unique negative reviews: 6085
Number of words: 398054
