In [1]:
import pandas as pd
import numpy as np
import re

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  

## Preprocessing Book Catalog Data

In [2]:
books = pd.read_csv(r"C:\Users\ThinkPad\OneDrive\Kuliah_Ghana\SEMESTER_3\PBP\process_data\archive (1)\books_data.csv")

In [3]:
# filter out books with less than 5 ratings
book = books.dropna()

allowed_categories = ["Cooking", "Sports & Recreation", "Technology & Engineering", "Science",
                      "Foreign Language Study", "Travel", "Family & Relationships",
                      "Computers", "Architecture", "Business & Economics", "Biography"]
book = book[book["ratingsCount"] <= 5]

book["categories"] = book["categories"].apply(lambda row: re.findall(r"\['(.*?)'\]", row)[0] 
                                                   if len(re.findall(r"\['(.*?)'\]", row)) >= 1 else np.nan)
book["authors"] = book["authors"].apply(lambda row: re.findall(r"\['(.*?)'\]", row)[0] 
                                                   if len(re.findall(r"\['(.*?)'\]", row)) >= 1 else np.nan)
book["publishedDate"] = book["publishedDate"].apply(lambda row: row.split("-")[0])
book["ratingsCount"] = book["ratingsCount"].astype(int)
book = book[book["categories"].isin(allowed_categories)]

# change column names
new_columns = ["title", "description", "author", "image_link", "preview_link", 
               "publisher", "published_date", "info_link", "category", "rating"]   
book.columns = new_columns

# sampling 50 books
sampled_book = book.sample(n=50, random_state=2023)
sampled_book.head(2)

Unnamed: 0,title,description,author,image_link,preview_link,publisher,published_date,info_link,category,rating
197756,THE CORMORANT.,"Christian Beamish, a former editor at The Surf...",Christian Beamish,http://books.google.com/books/content?id=kQLnB...,http://books.google.com/books?id=kQLnBAAAQBAJ&...,Patagonia,2013,https://play.google.com/store/books/details?id...,Sports & Recreation,1
132745,"Girl Time: A Celebration of Chick Flicks, Bad ...",Presents a series of sixteen stories that are ...,Laura Jensen Walker,http://books.google.com/books/content?id=TgThm...,http://books.google.com/books?id=TgThmc48TuMC&...,Fleming H Revell Company,2004,http://books.google.com/books?id=TgThmc48TuMC&...,Family & Relationships,1


In [4]:
# check the distribution of ratings
sampled_book.rating.value_counts()

1    28
2    12
3     6
5     2
4     2
Name: rating, dtype: int64

In [5]:
# remove non-ascii characters
sampled_book = sampled_book.reset_index(drop=True)

text_col = ["title", "description", "author", "category", "publisher"]
for col in text_col:
    sampled_book[col].replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

In [6]:
# save the sampled book data
sampled_book.to_csv("book_sampled.csv", index=False)

## Preprocessing Book Review Data

In [7]:
rating_ori = pd.read_csv(r"C:\Users\ThinkPad\OneDrive\Kuliah_Ghana\SEMESTER_3\PBP\process_data\archive (1)/Books_rating.csv")

In [8]:
# filter books
rating = rating_ori.dropna()
rating = rating[rating["Title"].isin(sampled_book["title"])].drop(["Id", "Price", "review/helpfulness", 
                                                                   "review/time", "User_id"], axis=1)

# sample 50 reviews
rating_sampled = rating.sample(n=50, random_state=1)

# change column names
new_columns_rating = ["book_title", "reviewer_name", "review_score", "review_summary", "review_text"]
rating_sampled.columns = new_columns_rating

In [9]:
rating_sampled["review_date"] = pd.to_datetime(["2023-01-01" for _ in range(len(rating_sampled))])
rating_sampled["review_score"] = rating_sampled["review_score"].astype(int)
rating_sampled = rating_sampled.reset_index(drop=True)
rating_sampled.head(2)

Unnamed: 0,book_title,reviewer_name,review_score,review_summary,review_text,review_date
0,French Made Simple: Learn to speak and underst...,Mache,5,french made simple,after searching for a french manual our teache...,2023-01-01
1,Programming Jabber: Extending XML Messaging (O...,Julian Missig,5,DJ delivers an excellent primer into the world...,"As a disclaimer, I have been involved with Jab...",2023-01-01


In [10]:
# save the sampled rating data
rating_sampled.to_csv("rating_sampled.csv", index=False)