In [30]:
import pandas as pd
import json

## Load & Pre-Process Dataset

In [33]:
# Read just the four columns this notebook uses from the raw CSV.
data_df = pd.read_csv(
    '../data/raw/amazon_popular_books_dataset.csv',
    usecols=['title', 'categories', 'rating', 'reviews_count'],
)

# Pre-processing: the raw rating column is text, so keep its first three
# characters and parse them as a number.
data_df['rating'] = pd.to_numeric(data_df['rating'].str[:3])

# Settings read by the weighted-rating calculation.
m = 100  # minimum-review threshold used as the weighting constant
C = data_df['rating'].mean()  # mean rating over every book in the dataset

# define a function to calculate the weighted rating
def weighted_rating(row, min_reviews=None, mean_rating=None):
    """Blend a book's own rating with a baseline rating, weighted by review count.

    The result is ``(v / (v + m)) * R + (m / (v + m)) * C`` where ``v`` is the
    review count, ``R`` the book's rating, ``m`` is ``min_reviews`` and ``C``
    is ``mean_rating``. With ``v == 0`` the result equals ``C``; as ``v`` grows
    relative to ``m`` the result approaches ``R``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'reviews_count'`` and ``'rating'``: a DataFrame
        row, a dict, or a whole DataFrame. Only key lookup and arithmetic are
        used, so a DataFrame yields a Series of element-wise results.
    min_reviews : number, optional
        Weighting constant ``m``. Defaults to the notebook-level ``m``.
    mean_rating : number, optional
        Baseline rating ``C``. Defaults to the notebook-level ``C``.

    Returns
    -------
    The weighted rating: a scalar for a single row, a Series for a DataFrame.

    Notes
    -----
    The result is undefined when ``reviews_count + min_reviews`` is zero.
    """
    # Fall back to the notebook-level settings so existing calls such as
    # ``data_df.apply(weighted_rating, axis=1)`` keep working unchanged.
    if min_reviews is None:
        min_reviews = m
    if mean_rating is None:
        mean_rating = C

    v = row['reviews_count']
    R = row['rating']
    total = v + min_reviews
    return (v / total) * R + (min_reviews / total) * mean_rating

# Compute the weighted rating for every book in one vectorized call.
# weighted_rating only performs column lookups and arithmetic, so passing the
# whole frame gives the same values as a row-by-row apply(axis=1) without the
# per-row Python overhead.
data_df['weighted_rating'] = weighted_rating(data_df)

# Remove "Books" entry from categories: each value is a JSON-encoded list,
# and its first element is dropped after parsing.
data_df['categories'] = data_df['categories'].apply(
    lambda raw_categories: json.loads(raw_categories)[1:]
)

data_df

Unnamed: 0,rating,reviews_count,title,categories,weighted_rating
0,4.6,13451,Wuthering Heights (Collins Classics),"[Literature & Fiction, Genre Fiction]",4.600165
1,4.8,16628,THE DAYS THE CRAYONS QUIT,"[Children's Books, Literature & Fiction]",4.798938
2,4.8,11275,War Lord: Book 13 (The Last Kingdom Series),"[Literature & Fiction, Genre Fiction]",4.798438
3,4.8,15520,Code Name Bananas: The hilarious and epic new ...,"[Children's Books, Literature & Fiction]",4.798863
4,4.8,10884,Skincare: The award-winning ultimate no-nonsen...,"[Crafts, Hobbies & Home, Home Improvement & De...",4.798383
...,...,...,...,...,...
2264,4.2,17923,Unspeakable Things,"[Mystery, Thriller & Suspense, Thrillers & Sus...",4.202343
2265,4.8,25304,What to Expect When You’re Expecting,"[Health, Fitness & Dieting, Women's Health, Pr...",4.799301
2266,4.7,11040,The Home Edit: A Guide to Organizing and Reali...,"[Crafts, Hobbies & Home, Home Improvement & De...",4.699303
2267,4.4,28030,The Family Upstairs: A Novel,"[Mystery, Thriller & Suspense, Thrillers & Sus...",4.400790


## Get Unique Categories

In [39]:
# Flatten the per-book category lists into a single Series (one category per
# entry) and keep each distinct value once, in order of first appearance.
categories = data_df['categories'].explode().unique()
categories

array(['Literature & Fiction', 'Genre Fiction', "Children's Books",
       'Crafts, Hobbies & Home', 'Home Improvement & Design',
       'Mythology & Folk Tales', 'Science Fiction & Fantasy', 'Classics',
       'Business & Money', 'Investing', 'Mystery, Thriller & Suspense',
       'Thrillers & Suspense', 'Christian Books & Bibles',
       'Christian Living', 'Health, Fitness & Dieting',
       'Diets & Weight Loss', 'Humor & Entertainment', 'Humor',
       'Biographies & Memoirs', 'Arts & Literature',
       'Arts, Music & Photography', 'Politics & Social Sciences',
       'Politics & Government', 'Self-Help', 'Happiness',
       'Teen & Young Adult', 'History', 'World',
       'Religion & Spirituality', 'New Age & Spirituality',
       'Growing Up & Facts of Life', 'United States',
       'Cookbooks, Food & Wine', 'Cooking Education & Reference',
       'Reference', 'Writing, Research & Publishing Guides',
       'Comics & Graphic Novels', 'Graphic Novels', 'Dramas & Plays',
       '