In [None]:
import pandas as pd
import os

## Helper Functions

In [None]:
# Target column names (presumably the Apify export schema, going by the
# variable name -- confirm). Every frame is renamed to these.
apify_format = ["productAsin", "ratingScore", "reviewTitle",
                 "reviewReaction", "reviewDescription", "isVerified"]

# Source column names of the ScrapeHero export. The order matters: position i
# here is renamed to position i of ``apify_format``.
scrapehero_format = ["asin", "review_rating", "review_header",
                 "no_of_people_reacted_helpful", "review_text", "badge"]

def alignFormat(data):
    """Convert a ScrapeHero review frame to the ``apify_format`` column layout.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns listed in ``scrapehero_format``.
        Extra columns are dropped. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the six ``apify_format`` columns. ``isVerified`` holds
        the string "TRUE" where the source ``badge`` equals
        "Verified Purchase" and "FALSE" otherwise (including missing badges).

    Raises
    ------
    KeyError
        If any ``scrapehero_format`` column is missing from ``data``.
    """
    # Work on an explicit copy so the assignment below never writes through
    # to (or warns about) the caller's frame.
    aligned = data[scrapehero_format].copy()

    # Vectorised replacement for the per-row loop: a boolean mask mapped to
    # the string flags. Replacing the whole column also sidesteps dtype
    # clashes when ``badge`` was read as an all-NaN float column.
    is_verified = aligned["badge"].eq("Verified Purchase")
    aligned["badge"] = is_verified.map({True: "TRUE", False: "FALSE"})

    # Rename every column in one call using the positional pairing.
    return aligned.rename(columns=dict(zip(scrapehero_format, apify_format)))
    
def add_column(data, category_name):
    """Attach a constant ``category`` column to ``data`` and return it.

    When ``category_name`` is ``None`` the frame is returned untouched.
    Otherwise the column is written onto ``data`` itself (in place), holding
    ``category_name`` once per row, and that same frame object is returned.
    """
    if category_name is None:
        return data

    # One label per existing row.
    data['category'] = [category_name for _ in range(len(data))]
    return data

def check_remove_duplicates(new_data):
    """Drop rows whose ``reviewDescription`` repeats an earlier row.

    The frame is modified in place (the first occurrence of each text is
    kept) and the same object is returned. Nothing is touched when no
    duplicated text exists.
    """
    key_columns = ['reviewDescription']

    if not new_data.duplicated(subset=key_columns).any():
        return new_data

    new_data.drop_duplicates(subset=key_columns, inplace=True)
    return new_data
    
def visualize_duplicates(data):
    """Return the rows of ``data`` whose ``reviewDescription`` is a repeat.

    Only second and later occurrences are selected (the first occurrence of
    each text is not reported). A one-line count is printed as a side effect.
    """
    repeat_mask = data.duplicated(['reviewDescription'])
    repeated_rows = data[repeat_mask]

    print("Visualizing " + str(len(repeated_rows)) + " duplicate reviews:")
    return repeated_rows

def save_to_json(data, json_filename, category_name, export):
    """Optionally tag ``data`` with a category and write it to JSON.

    ``add_column`` is applied first (a no-op when ``category_name`` is
    ``None``). If ``export`` is truthy the frame is written to
    ``json_filename + ".json"``. The tagged frame is returned either way.
    """
    tagged = add_column(data, category_name)
    if not export:
        return tagged

    tagged.to_json(json_filename + ".json")
    return tagged

def save_to_csv(data, csv_filename, category_name, export):
    """Optionally tag ``data`` with a category and write it to CSV.

    ``add_column`` is applied first (a no-op when ``category_name`` is
    ``None``). If ``export`` is truthy the frame is written, without its
    index, to ``csv_filename + ".csv"``. The tagged frame is returned either
    way.
    """
    tagged = add_column(data, category_name)
    if not export:
        return tagged

    tagged.to_csv(csv_filename + ".csv", index=False)
    return tagged

# Import negative reviews crawled from ScrapeHero

In [None]:
# Import configuration
# Folder holding the negative-review CSV files. Set the NEGATIVE_REVIEWS_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original hard-coded location is used.
dir_path = os.environ.get(
    "NEGATIVE_REVIEWS_DIR",
    r'/Users/bryson/Documents/bryson/NTU/modules/CZ4045_NLP/data/negative-reviews',
)

# List the regular files found in the folder (sub-directories are skipped).
# Sorted so the printed listing is the same on every run and platform.
books_csv_file_names = []
for path in sorted(os.listdir(dir_path)):
    # check if current path is a file
    if os.path.isfile(os.path.join(dir_path, path)):
        print(path)
        books_csv_file_names.append(path)

## Export Settings and Summary Tools

In [None]:
# Summary and export settings

# Column order every per-file frame is reduced to before it is collected.
column_format = [
    "productAsin",
    "ratingScore",
    "reviewTitle",
    "reviewReaction",
    "reviewDescription",
    "isVerified",
    "category",
]

# Accumulators filled in by the loader cells below.
final_df_list = []      # one aligned, category-tagged frame per CSV file
reviews_crawled = 0     # running total of rows loaded, before de-duplication

# Export toggle.
export_to_csv = False

## Crawling from Humor & Entertainment Book Category

In [None]:
# Humor & Entertainment, file 1: read, align to the shared schema, tag.
file_name = "NegativeReviews_HnE_1"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "humor_entertainment",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Humor & Entertainment, file 2: read, align to the shared schema, tag.
file_name = "NegativeReviews_HnE_2"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "humor_entertainment",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Humor & Entertainment, file 3: read, align to the shared schema, tag.
file_name = "NegativeReviews_HnE_3"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "humor_entertainment",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Humor & Entertainment, file 4: read, align to the shared schema, tag.
file_name = "NegativeReviews_HnE_4"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "humor_entertainment",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Humor & Entertainment, file 5: read, align to the shared schema, tag.
file_name = "NegativeReviews_HnE_5"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "humor_entertainment",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Humor & Entertainment, file 6: read, align to the shared schema, tag.
file_name = "NegativeReviews_HnE_6"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "humor_entertainment",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Humor & Entertainment, file 7: read, align to the shared schema, tag.
file_name = "NegativeReviews_HnE_7"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "humor_entertainment",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

## Crawling from Children Book Category

In [None]:
# Children, file 1: read, align to the shared schema, tag.
file_name = "NegativeReviews_Children_1"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "children",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Children, file 2: read, align to the shared schema, tag.
file_name = "NegativeReviews_Children_2"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "children",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Children, file 3: read, align to the shared schema, tag.
file_name = "NegativeReviews_Children_3"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "children",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Children, file 4: read, align to the shared schema, tag.
file_name = "NegativeReviews_Children_4"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "children",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Children, file 5: read, align to the shared schema, tag.
file_name = "NegativeReviews_Children_5"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "children",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Children, file 6: read, align to the shared schema, tag.
file_name = "NegativeReviews_Children_6"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "children",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

In [None]:
# Children, file 7: read, align to the shared schema, tag.
file_name = "NegativeReviews_Children_7"
file_extension = ".csv"
book_reviews_df = add_column(
    alignFormat(pd.read_csv(f"{dir_path}/{file_name}{file_extension}")),
    "children",
)

# Collect the frame and count its rows towards the crawl total.
final_df_list.append(book_reviews_df[column_format])
reviews_crawled += len(book_reviews_df)

## Crawling from Mystery Book Category

In [None]:
# Mystery, file 1: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Mystery_1"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every mystery review.
# NOTE(review): confirm "mystery" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "mystery")
final_df_list.append(book_reviews_df[column_format])

In [None]:
# Mystery, file 2: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Mystery_2"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every mystery review.
# NOTE(review): confirm "mystery" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "mystery")
final_df_list.append(book_reviews_df[column_format])

In [None]:
# Mystery, file 3: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Mystery_3"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every mystery review.
# NOTE(review): confirm "mystery" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "mystery")
final_df_list.append(book_reviews_df[column_format])

## Crawling from Romance Book Category

In [None]:
# Romance, file 1: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Romance_1"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every romance review.
# NOTE(review): confirm "romance" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "romance")
final_df_list.append(book_reviews_df[column_format])

In [None]:
# Romance, file 2: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Romance_2"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every romance review.
# NOTE(review): confirm "romance" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "romance")
final_df_list.append(book_reviews_df[column_format])

In [None]:
# Romance, file 3: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Romance_3"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every romance review.
# NOTE(review): confirm "romance" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "romance")
final_df_list.append(book_reviews_df[column_format])

In [None]:
# Romance, file 4: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Romance_4"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every romance review.
# NOTE(review): confirm "romance" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "romance")
final_df_list.append(book_reviews_df[column_format])

In [None]:
# Romance, file 5: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Romance_5"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every romance review.
# NOTE(review): confirm "romance" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "romance")
final_df_list.append(book_reviews_df[column_format])

In [None]:
# Romance, file 6: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Romance_6"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every romance review.
# NOTE(review): confirm "romance" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "romance")
final_df_list.append(book_reviews_df[column_format])

In [None]:
# Romance, file 7: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Romance_7"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every romance review.
# NOTE(review): confirm "romance" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "romance")
final_df_list.append(book_reviews_df[column_format])

In [None]:
# Romance, file 8: read the CSV and align it to the shared schema.
file_name = "NegativeReviews_Romance_8"
file_extension = ".csv"
book_reviews_df = pd.read_csv(dir_path + '/' + file_name + file_extension)
book_reviews_df = alignFormat(book_reviews_df)

reviews_crawled += len(book_reviews_df)
# Tag with this section's own category. The label used to be a copy-pasted
# "children", which mislabelled every romance review.
# NOTE(review): confirm "romance" matches the label used by downstream consumers.
book_reviews_df = add_column(book_reviews_df, "romance")
final_df_list.append(book_reviews_df[column_format])

# Concatenate data and remove duplicates

In [None]:
# Stack every per-file frame into one table with a fresh 0..n-1 index.
final_df_list_concatenated = pd.concat(final_df_list, ignore_index = True)

# Pull out the repeated review texts and order them so identical texts sit
# next to each other in the preview.
duplicates = visualize_duplicates(final_df_list_concatenated).sort_values(by='reviewDescription')
duplicates

In [None]:
# Drop repeated review texts (the helper edits the frame in place).
final_df_list_concatenated = check_remove_duplicates(final_df_list_concatenated)

# Write the de-duplicated table to disk; no category column is added here.
# NOTE(review): export is forced on with a literal True, so the
# `export_to_csv` setting is not consulted -- confirm that is intended.
save_to_csv(
    data=final_df_list_concatenated,
    csv_filename="final_books_dataset_duplicates_removed_negative",
    category_name=None,
    export=True,
)

final_df_list_concatenated

In [None]:
# Total whitespace-separated tokens across all review texts. Each value goes
# through str() first, so a missing review is counted as the single token "nan".
words = sum(
    len(str(review).split())
    for review in final_df_list_concatenated["reviewDescription"]
)

# Crawled Negative Data Summary

In [None]:
# Rows lost to de-duplication = rows loaded minus rows remaining.
unique_reviews = len(final_df_list_concatenated)
duplicates_removed = reviews_crawled - unique_reviews

print('Number of crawled reviews: ' + str(reviews_crawled))
print('Number of removed duplicate reviews: ' + str(duplicates_removed))
print('Number of unique negative reviews: ' + str(unique_reviews))
print('Number of words: ' + str(words))