In [1]:
 # install dependencies
!pip install mlxtend
!pip install efficient-apriori
!pip install kagglehub



In [5]:
import kagglehub
path = kagglehub.dataset_download("mohamedbakhet/amazon-books-reviews")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\aemal\.cache\kagglehub\datasets\mohamedbakhet\amazon-books-reviews\versions\1


In [7]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os

from efficient_apriori import apriori
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
final_path = os.path.join(path, "Books_rating.csv")

In [9]:
# To create baskets I have used: the set of books reviewed by a same user, using books as items
cols_to_use = ['User_id', 'Title']
df = pd.read_csv(final_path, usecols=cols_to_use)
print("Total rows:", len(df))
df.head()


Total rows: 3000000


Unnamed: 0,Title,User_id
0,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD
1,Dr. Seuss: American Icon,A30TK6U7DNS82R
2,Dr. Seuss: American Icon,A3UH4UZ4RSVO82
3,Dr. Seuss: American Icon,A2MVUWT453QH61
4,Dr. Seuss: American Icon,A22X4XUPKF66MR


In [11]:
# Clean data
df.dropna(subset=['User_id', 'Title'], inplace=True)
df.drop_duplicates(subset=['User_id', 'Title'], inplace=True)
df['Title'] = df['Title'].str.strip().str.lower()
print("Rows after cleaning:", len(df))
df.head()


Rows after cleaning: 2115811


Unnamed: 0,Title,User_id
0,its only art if its well hung!,AVCGYZL8FQQTD
1,dr. seuss: american icon,A30TK6U7DNS82R
2,dr. seuss: american icon,A3UH4UZ4RSVO82
3,dr. seuss: american icon,A2MVUWT453QH61
4,dr. seuss: american icon,A22X4XUPKF66MR


In [13]:
# Create baskets
baskets = df.groupby('User_id')['Title'].apply(list).tolist()
baskets = [basket for basket in baskets if len(basket) > 1]
print(f"Total baskets created: {len(baskets)}")
print("Sample basket:", baskets[0])


Total baskets created: 279562
Sample basket: ['the richest man in babylon', 'the richest man in babylon (babylonian parables dealing with the principles of finance, etc)', 'the richest man in babylon', 'attitude 101']


In [15]:
# Run A-Priori using efficient-apriori
min_support = 0.005
min_confidence = 0.5
itemsets, rules = apriori(baskets, min_support=min_support, min_confidence=min_confidence)

# Show sample rules
for rule in sorted(rules, key=lambda r: r.confidence, reverse=True):
    print(rule)


{george orwell 1984} -> {1984} (conf: 1.000, supp: 0.005, lift: 194.140, conv: 994849085.355)
{nineteen eighty-four} -> {1984} (conf: 1.000, supp: 0.005, lift: 194.140, conv: 994849085.355)
{1984} -> {nineteen eighty-four} (conf: 1.000, supp: 0.005, lift: 194.140, conv: 994849085.355)
{george orwell 1984} -> {nineteen eighty-four} (conf: 1.000, supp: 0.005, lift: 194.140, conv: 994849085.355)
{the great gatsby} -> {great gatsby (everyman)} (conf: 1.000, supp: 0.005, lift: 197.152, conv: 994927779.884)
{the great gatsby (leading english literature library)} -> {great gatsby (everyman)} (conf: 1.000, supp: 0.005, lift: 197.152, conv: 994927779.884)
{pride & prejudice (classic library)} -> {pride & prejudice (new windmill)} (conf: 1.000, supp: 0.006, lift: 159.204, conv: 993718745.752)
{pride & prejudice (classic library)} -> {pride & prejudice (penguin classics)} (conf: 1.000, supp: 0.006, lift: 159.023, conv: 993711591.704)
{pride & prejudice (classic library)} -> {pride and prejudice} 

In [17]:
def recommend_books(input_book, rules_list, top_n=5):
    """
    Recommend books using efficient-apriori rule objects.
    Prints book suggestions based on input_book.
    """
    input_book = input_book.lower().strip()
    input_set = frozenset([input_book])
    matching_rules = [r for r in rules_list if input_set.issubset(r.lhs)]
    matching_rules.sort(key=lambda r: (r.confidence, r.lift), reverse=True)
    recommendations = []
    for rule in matching_rules:
        for book in rule.rhs:
            if book != input_book and book not in recommendations:
                recommendations.append(book)
            if len(recommendations) >= top_n:
                break
        if len(recommendations) >= top_n:
            break

    # Print recommendations
    if recommendations:
        print(f"📚 Based on your interest in '{input_book}', you may also like:")
        for i, name in enumerate(recommendations, 1):
            print(f"{i}. {name}")
    else:
        print(f"❗ Sorry, no strong recommendations found for '{input_book}'.")


In [23]:
recommend_books('pride and prejudice', rules)
# Other names to try out:
# 1- pride & prejudice (penguin classics)
# 2- 1984
# 3- the hobbit
# 4- great gatsby (everyman)

📚 Based on your interest in 'pride and prejudice', you may also like:
1. pride & prejudice (new windmill)
2. pride & prejudice (penguin classics)
3. pride & prejudice (classic library)
