In [17]:
import pandas as pd
import numpy as np
import re
import ast

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  

## Preprocessing Book Catalog Data

In [18]:
books = pd.read_csv(r"C:\Users\ThinkPad\OneDrive\Kuliah_Ghana\SEMESTER_3\PBP\process_data\archive (1)\books_data.csv")

In [20]:
# filter out books with less than 5 ratings
book = books.dropna()

allowed_categories = ["Cooking", "Sports & Recreation", "Technology & Engineering", "Science",
                      "Foreign Language Study", "Travel",
                      "Computers", "Architecture", "Business & Economics", "Biography"]
book = book[book["ratingsCount"] <= 5]

book["categories"] = book["categories"].apply(lambda row: ', '.join(ast.literal_eval(row)))
book["authors"] = book["authors"].apply(lambda row: ', '.join(ast.literal_eval(row)))
book["publishedDate"] = book["publishedDate"].apply(lambda row: row.split("-")[0])
book["ratingsCount"] = book["ratingsCount"].astype(int)
book = book[book["categories"].isin(allowed_categories)]

# change column names
new_columns = ["title", "description", "author", "image_link", "preview_link", 
               "publisher", "published_date", "info_link", "category", "rating"]   
book.columns = new_columns

# sampling 50 books
sampled_book = book.sample(n=50, random_state=2000)
sampled_book.head(2)

Unnamed: 0,title,description,author,image_link,preview_link,publisher,published_date,info_link,category,rating
596,Extreme Exploits: Advanced Defenses Against Ha...,Provides information on how hackers target exp...,"Andrew Vladimirov, Konstantin Gavrilenko, Andr...",http://books.google.com/books/content?id=ue1SA...,http://books.google.nl/books?id=ue1SAAAAMAAJ&q...,McGraw Hill Professional,2006,http://books.google.nl/books?id=ue1SAAAAMAAJ&d...,Computers,1
127075,Fatal Harvest (Fatal Harvest Series #1),Fatal Harvest takes an unprecedented look at o...,Andrew Kimbrell,http://books.google.com/books/content?id=plTcV...,http://books.google.nl/books?id=plTcVDph_SQC&p...,Island Press,2002,http://books.google.nl/books?id=plTcVDph_SQC&d...,Business & Economics,2


In [21]:
# check the distribution of ratings
sampled_book.rating.value_counts()

1    21
2    14
3     9
4     5
5     1
Name: rating, dtype: int64

In [22]:
# remove non-ascii characters
sampled_book = sampled_book.reset_index(drop=True)

text_col = ["title", "description", "author", "category", "publisher"]
for col in text_col:
    sampled_book[col].replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

In [23]:
# save the sampled book data
sampled_book.to_csv("book_sampled.csv", index=False)

## Preprocessing Book Review Data

In [24]:
rating_ori = pd.read_csv(r"C:\Users\ThinkPad\OneDrive\Kuliah_Ghana\SEMESTER_3\PBP\process_data\archive (1)/Books_rating.csv")

In [25]:
# filter books
rating = rating_ori.dropna()
rating = rating[rating["Title"].isin(sampled_book["title"])].drop(["Id", "Price", "review/helpfulness", 
                                                                   "review/time", "User_id"], axis=1)

# sample 50 reviews
rating_sampled = rating.sample(n=50, random_state=1)

# change column names
new_columns_rating = ["book_title", "reviewer_name", "review_score", "review_summary", "review_text"]
rating_sampled.columns = new_columns_rating

In [26]:
rating_sampled["review_date"] = pd.to_datetime(["2023-01-01" for _ in range(len(rating_sampled))])
rating_sampled["review_score"] = rating_sampled["review_score"].astype(int)
rating_sampled = rating_sampled.reset_index(drop=True)
rating_sampled.head(2)

Unnamed: 0,book_title,reviewer_name,review_score,review_summary,review_text,review_date
0,The Whole Foods Allergy Cookbook: Two Hundred ...,Michelle M. Pagan,1,Waste of time and money,This cookbook is a waste of money. The recipes...,2023-01-01
1,A Taste of Haiti (Hippocrene Cookbook Library),willie31,1,Disappointed,It seems that some ingredients were not correc...,2023-01-01


In [27]:
# save the sampled rating data
rating_sampled.to_csv("rating_sampled.csv", index=False)