In [31]:
import pandas as pd
import json
import clips

## Load & Pre-Process Dataset

In [32]:
data_df = pd.read_csv('../data/raw/amazon_popular_books_dataset.csv', usecols=['title', 'categories', 'rating', 'reviews_count'])

# Pre-process Dataset
data_df['rating'] = pd.to_numeric(data_df['rating'].str.slice(0, 3))

# Remove rows with duplicate title
data_df = data_df.drop_duplicates(subset='title', keep="last")

# Add weighted rating
m = 100  # set the minimum number of reviews to 100
C = data_df['rating'].mean()  # calculate the mean rating across all books

# define a function to calculate the weighted rating
def weighted_rating(row):
    v = row['reviews_count']
    R = row['rating']
    return (v / (v+m)) * R + (m / (v+m)) * C

# apply the function to each row of the dataframe to calculate the weighted rating
data_df['weighted_rating'] = data_df.apply(weighted_rating, axis=1)

# Remove "Books" entry from categories
data_df['categories'] = data_df['categories'].apply(lambda categories: json.loads(categories)[1:])

data_df

Unnamed: 0,rating,reviews_count,title,categories,weighted_rating
0,4.6,13451,Wuthering Heights (Collins Classics),"[Literature & Fiction, Genre Fiction]",4.600184
1,4.8,16628,THE DAYS THE CRAYONS QUIT,"[Children's Books, Literature & Fiction]",4.798953
2,4.8,11275,War Lord: Book 13 (The Last Kingdom Series),"[Literature & Fiction, Genre Fiction]",4.798461
3,4.8,15520,Code Name Bananas: The hilarious and epic new ...,"[Children's Books, Literature & Fiction]",4.798879
5,4.8,20453,The Hobbit & The Lord of the Rings Boxed Set,"[Literature & Fiction, Mythology & Folk Tales]",4.799148
...,...,...,...,...,...
2264,4.2,17923,Unspeakable Things,"[Mystery, Thriller & Suspense, Thrillers & Sus...",4.202358
2265,4.8,25304,What to Expect When You’re Expecting,"[Health, Fitness & Dieting, Women's Health, Pr...",4.799311
2266,4.7,11040,The Home Edit: A Guide to Organizing and Reali...,"[Crafts, Hobbies & Home, Home Improvement & De...",4.699326
2267,4.4,28030,The Family Upstairs: A Novel,"[Mystery, Thriller & Suspense, Thrillers & Sus...",4.400800


## Choose First 10 book titles

In [33]:
first_10_titles = data_df.head(10)['title'].tolist()
first_10_titles

['Wuthering Heights (Collins Classics)',
 'THE DAYS THE CRAYONS QUIT',
 'War Lord: Book 13 (The Last Kingdom Series)',
 'Code Name Bananas: The hilarious and epic new children’s book from multi-million bestselling author David Walliams',
 'The Hobbit & The Lord of the Rings Boxed Set',
 'The Chronicles of Narnia (Box Set)',
 'The Giving Tree',
 'The Intelligent Investor Rev Ed.: The Definitive Book on Value Investing',
 'Mere Christianity',
 'The One and Only Ivan']

## Get Unique Categories

In [34]:
categories = data_df.explode('categories')['categories'].unique()

## Create CLIPS environment

In [44]:
env = clips.Environment()

# Book Template
BOOK_TEMPLATE_STRING = """
(deftemplate book
    (slot title (type STRING))
    (slot category (type STRING))
    (slot w_rating (type FLOAT)))
"""
env.build(BOOK_TEMPLATE_STRING)

# Categorized Book Template
CATEGORIZED_BOOK_TEMPLATE_STRING = """
(deftemplate categorized-book
    (slot title (type STRING))
    (slot category (type STRING)))
"""
env.build(CATEGORIZED_BOOK_TEMPLATE_STRING)

# Uncategorized Book Template
UNCATEGORIZED_BOOK_TEMPLATE_STRING = """
(deftemplate uncategorized-book
    (slot title (type STRING)))
"""
env.build(UNCATEGORIZED_BOOK_TEMPLATE_STRING)

# Recommended Book Template
RECOMMENDED_BOOK_TEMPLATE_STRING = """
(deftemplate recommended-book
    (slot title (type STRING))
    (slot w_rating (type FLOAT)))
"""
env.build(RECOMMENDED_BOOK_TEMPLATE_STRING)

# Category Counter Template
CATEGORY_COUNTER_TEMPLATE_STRING = """
(deftemplate category-counter
    (slot category (type STRING))
    (slot count (type INTEGER)))
"""
env.build(CATEGORY_COUNTER_TEMPLATE_STRING)

# Categorized Books Rule
CATEGORIZED_BOOKS_RULE_STRING = """
(defrule categorize-books
    "Categorize books that have a category."
    (declare (salience 50))
    (book (title ?title) (category ?category))
    =>
    (assert (categorized-book (title ?title) (category ?category))))
"""
env.build(CATEGORIZED_BOOKS_RULE_STRING)

# Count Category Rule
COUNT_CATEGORY_RULE_STRING = """
(defrule count-category
    "Counter for each category."
    (declare (salience 40))
    ?categorized-book-adr <- (categorized-book (title ?title) (category ?category))
    ?category-counter-adr <- (category-counter (count ?count) (category ?category-title&:(eq ?category-title ?category)))
    ?uncategorized-book-adr <- (uncategorized-book (title ?uncategorized-title&:(eq ?uncategorized-title ?title)))
    =>
    (retract ?categorized-book-adr ?category-counter-adr)
    (assert (category-counter (category ?category) (count (+ ?count 1)))))
"""
env.build(COUNT_CATEGORY_RULE_STRING)

# Highest Category Count Rule
HIGHEST_CATEGORY_COUNT_RULE_STRING = """
(defrule highest-category-count
    "Find the category with the highest count."
    (declare (salience 30))
    (category-counter (count ?n1) (category ?category))
    (not (category-counter (count ?n2&:(> ?n2 ?n1))))
    =>
    (assert (highest-category ?category)))
"""
env.build(HIGHEST_CATEGORY_COUNT_RULE_STRING)

# Rule that picks only one "highest-category"
PICK_ONE_HIGHEST_CATEGORY_RULE_STRING = """
(defrule pick-one-highest-category
    "Picks only one 'highest category', just in case there are two categories with the same count."
    (declare (salience 20))
    (highest-category ?category)
    (not (highest-category-picked))
    =>
    (assert (most-popular-category ?category))
    (assert (highest-category-picked)))
"""
env.build(PICK_ONE_HIGHEST_CATEGORY_RULE_STRING)

# Return Recommended Books Rule
RETURN_RECOMMENDED_BOOKS_RULE_STRING = """
(defrule return-recommended-books
    "Return recommended books."
    (declare (salience 10))
    (book (title ?title) (category ?category) (w_rating ?w_rating))
    (most-popular-category ?most-popular-category&:(eq ?most-popular-category ?category))
    =>
    (assert (recommended-book (title ?title) (w_rating ?w_rating))))
"""
env.build(RETURN_RECOMMENDED_BOOKS_RULE_STRING)

book_template = env.find_template('book')
uncategorized_book_template = env.find_template('uncategorized-book')
category_counter_template = env.find_template('category-counter')

# Assert each category and set its count to 0
for category in categories:
    category_counter_template.assert_fact(category=category, count=0)

# Assert each book and its respective category
for _, row in data_df.iterrows():
    for category in row['categories']:
        book_template.assert_fact(
            title=row['title'],
            category=category,
            w_rating=row['weighted_rating']
        )

# Assert each book from the first 10 titles as an uncategorized book
for title in first_10_titles:
    uncategorized_book_template.assert_fact(title=title)

env.run()

4665

## Recommended Books

In [45]:
most_popular_category = None
recommended_books = []

for fact in env.facts():
    if fact.template.name == 'most-popular-category':
        most_popular_category = str(fact)[24:-2]
    if fact.template.name == 'recommended-book':
        fact_str = str(fact)
        book_title = fact_str[26:fact_str.index('")')]
        w_rating = float(fact_str[fact_str.index('(w_rating') + 10:-2])

        recommended_books.append({
            'book_title': book_title,
            'w_rating': w_rating
        })

# Sort by weighted rating
recommended_books = map(
    lambda book: book['book_title'],
    sorted(recommended_books, key=lambda book: book['w_rating'], reverse=True))

# Remove books that are in the original chosen 10 book titles
recommended_books = list(set(recommended_books) - set(first_10_titles))[:10]

print('Most Popular Category: ')
print(most_popular_category)
print()

print('Top 10 Recommended Books: ')
recommended_books

Most Popular Category: 
Literature & Fiction

Top 10 Recommended Books: 


['The One',
 'Project Hail Mary: A Novel',
 'Milk and Honey',
 'Six of Crows (Six of Crows, 1)',
 'The Rose Code: A Novel',
 'Americanah',
 'The Black Book',
 'A Wanted Man: A Jack Reacher Novel',
 'The Couple Next Door: A Novel',
 'The Alice Network: A Novel']

## Test Implementation

In [53]:
# Get most popular category of the first 10 books
first_10_book_categories = data_df.head(10).explode('categories')['categories'].tolist()
first_10_most_popular_category =  max(
    set(first_10_book_categories), 
    key=first_10_book_categories.count
)

# Test that the most popular category of the first 10 books is 
# the same as the returned most popular category
assert first_10_most_popular_category == most_popular_category

# Test that all recommended books contain the most popular category in the dataset
for book_title in recommended_books:
    assert most_popular_category in data_df[
        data_df['title'] == book_title
    ]['categories'].tolist()[0]
