# Search Engine

The datasets are huge (2–5 GB when compressed), so loading them into memory all at once would be inefficient. Instead, we stream the data line by line, reading directly from the gzip archive without unzipping the file first.

In [8]:
import gzip
# Open the compressed dump directly and read only its first line, to inspect
# the record format without extracting the archive.
with gzip.open("goodreads_books.json.gz", 'r') as f:
    line = f.readline()  # returned as bytes (see the b'...' output below)
line  # last expression -> shown as the cell output

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

This is a single line containing the metadata of a single book.

In [9]:
import json
# The line read above is a standalone JSON document; parse it to inspect the available fields.
json.loads(line) 

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [13]:
# function to fetch the required metadata for each of the books
def fetch_metadata(line):
    """Parse one JSON record and keep only the fields the notebook needs.

    Parameters
    ----------
    line : str or bytes
        A single JSON document describing one book.

    Returns
    -------
    dict
        Keys ``book_id``, ``title``, ``ratings``, ``url`` and ``cover_image``,
        with the values copied unchanged from the record.

    Raises
    ------
    KeyError
        If the record lacks one of the source fields.
    """
    record = json.loads(line)
    # output key -> key in the raw record
    field_sources = {
        "book_id": "book_id",
        "title": "title_without_series",
        "ratings": "ratings_count",
        "url": "url",
        "cover_image": "image_url",
    }
    return {target: record[source] for target, source in field_sources.items()}

In [14]:
books_data = []
with gzip.open("goodreads_books.json.gz", 'r') as f:
    # Stream the archive one record at a time; iterating the file object
    # yields successive lines until end of file.
    for line in f:
        metadata = fetch_metadata(line)
        try:
            ratings = int(metadata["ratings"])
        except ValueError:
            # the ratings count is not an integer string -> skip this record
            continue
        if ratings > 20: # keep only books with more than 20 ratings
            books_data.append(metadata)

In [15]:
import pandas as pd

# Build the catalogue frame from the list of per-book dicts collected above:
# one row per book, one column per metadata key.
books = pd.DataFrame.from_dict(books_data)
books.head()

Unnamed: 0,book_id,title,ratings,url,cover_image
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...


In [16]:
# Column dtypes, non-null counts and memory footprint of the catalogue.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1171891 entries, 0 to 1171890
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   book_id      1171891 non-null  object
 1   title        1171891 non-null  object
 2   ratings      1171891 non-null  object
 3   url          1171891 non-null  object
 4   cover_image  1171891 non-null  object
dtypes: object(5)
memory usage: 44.7+ MB


In [17]:
# checking for na values: number of missing entries per column
books.isna().sum()

book_id        0
title          0
ratings        0
url            0
cover_image    0
dtype: int64

In [20]:
# converting ratings count to numeric (integers)
# NOTE(review): 'int' resolved to int32 in the recorded output below; use an
# explicit 'int64' if counts could ever exceed the 32-bit range.
books['ratings'] = books['ratings'].astype('int')
books.dtypes

book_id        object
title          object
ratings         int32
url            object
cover_image    object
dtype: object

In [25]:
# processing book titles: lower-case, keep only [a-z0-9 ], collapse runs of
# whitespace to a single space and trim the ends.
# Raw strings are used for the patterns so that `\s` is passed to the regex
# engine verbatim instead of being an invalid string escape sequence.
books['processed_title'] = (
    books['title']
    .str.lower()
    .str.replace(r'[^a-z0-9 ]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
books.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,processed_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook


In [26]:
# checking for missing titles: a title made up entirely of characters outside
# [a-z0-9 ] (e.g. written in a non-Latin script) is reduced to an empty string
# by the cleaning step; show a few of them.
books[books.processed_title.str.len()==0].head()

Unnamed: 0,book_id,title,ratings,url,cover_image,processed_title
28,28492512,Очир эрхшээгч,82,https://www.goodreads.com/book/show/28492512,https://images.gr-assets.com/books/1452330740m...,
36,1907185,چنین گفت زرتشت,159,https://www.goodreads.com/book/show/1907185._,https://images.gr-assets.com/books/1397137012m...,
37,1907184,بوف کور,704,https://www.goodreads.com/book/show/1907184._,https://images.gr-assets.com/books/1457787423m...,
51,18283091,คัมภีร์ร้อยใจ,23,https://www.goodreads.com/book/show/18283091,https://images.gr-assets.com/books/1394964151m...,
61,316994,روانکاو و داستان‌های دیگر,57,https://www.goodreads.com/book/show/316994._,https://s.gr-assets.com/assets/nophoto/book/11...,


In [28]:
# removing such lines: a book with an empty cleaned title has no searchable terms.
# The original index labels are kept (the index is not reset), so rows have to be
# addressed by position (.iloc) when aligning with row-ordered arrays.
books = books[books.processed_title.str.len()>0]
books.shape

(1134094, 6)

In [29]:
# Persist the cleaned catalogue so it can be reloaded later without re-parsing the raw dump.
books.to_json('books.json')

The search engine is built on the TF-IDF values of the terms in each book title.

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Default settings are used. NOTE: the default token_pattern only keeps tokens
# of two or more characters, so single-digit series numbers ("1", "7", ...)
# do not contribute to the vectors.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the cleaned titles and encode every
# title; row i of `tfidf` corresponds to the i-th row (by position) of `books`.
tfidf = vectorizer.fit_transform(books.processed_title)

In [33]:
# Inspect the sparse matrix: one row per book, one column per vocabulary term.
tfidf

<1134094x213798 sparse matrix of type '<class 'numpy.float64'>'
	with 5563366 stored elements in Compressed Sparse Row format>

In [36]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

query = "harry potter"
processed = re.sub('[^a-z0-9 ]', '', query.lower().strip()) # same processing done to the titles
query_vec = vectorizer.transform([processed])  # encode the query with the fitted vocabulary
similarity = cosine_similarity(query_vec, tfidf).flatten()  # one score per row of tfidf
# argpartition selects the 10 highest-scoring positions but does not order them
# by score, so the rows shown are the top 10 in arbitrary order.
indices = np.argpartition(similarity, -10)[-10:] # gives indices of top 10 similar rows
results = books.iloc[indices]  # positional lookup, matching the row order used to fit tfidf
results

Unnamed: 0,book_id,title,ratings,url,cover_image,processed_title
575636,1508373,Гарри Поттер и философский камень (Harry Potte...,67,https://www.goodreads.com/book/show/1508373._,https://s.gr-assets.com/assets/nophoto/book/11...,harry potter 1
414996,86940,"هاري بوتر وحجر الفيلسوف (Harry Potter, #1)",1290,https://www.goodreads.com/book/show/86940._,https://images.gr-assets.com/books/1327275224m...,harry potter 1
1056149,29546135,ჰარი პოტერი და სიკვდილის საჩუქრები (Harry Pott...,21,https://www.goodreads.com/book/show/29546135,https://images.gr-assets.com/books/1458298071m...,harry potter 7
421292,2089953,"הארי פוטר וגביע האש (Harry Potter, #4)",32,https://www.goodreads.com/book/show/2089953._,https://images.gr-assets.com/books/1432676097m...,harry potter 4
889764,6352403,"הארי פוטר ומסדר עוף החול (Harry Potter, #5)",32,https://www.goodreads.com/book/show/6352403,https://images.gr-assets.com/books/1320636301m...,harry potter 5
889765,6352404,"הארי פוטר ואוצרות המוות (Harry Potter, #7)",33,https://www.goodreads.com/book/show/6352404,https://images.gr-assets.com/books/1281662787m...,harry potter 7
1019673,6304297,"הארי פוטר והנסיך חצוי-הדם (Harry Potter, #6)",74,https://www.goodreads.com/book/show/6304297--,https://images.gr-assets.com/books/1349347442m...,harry potter 6
569477,8479215,هری پاتر و فرمان ققنوس - کتاب پنجم (Harry Pott...,26,https://www.goodreads.com/book/show/8479215--,https://images.gr-assets.com/books/1279770756m...,harry potter 5
605227,22601967,"ჰარი პოტერი და ფენიქსის ორდენი (Harry Potter, #5)",33,https://www.goodreads.com/book/show/22601967,https://images.gr-assets.com/books/1404053183m...,harry potter 5
535434,8683527,"แฮร์รี่ พอตเตอร์กับศิลาอาถรรพ์ (Harry Potter, #1)",84,https://www.goodreads.com/book/show/8683527,https://s.gr-assets.com/assets/nophoto/book/11...,harry potter 1


In [37]:
# Same pipeline as the previous query, with a different search string.
query = "twilight"
processed = re.sub('[^a-z0-9 ]', '', query.lower().strip()) # same processing done to the titles
query_vec = vectorizer.transform([processed])
similarity = cosine_similarity(query_vec, tfidf).flatten()
# positions of the 10 best scores, not ordered by score
indices = np.argpartition(similarity, -10)[-10:] # gives indices of top 10 similar rows
results = books.iloc[indices]
results

Unnamed: 0,book_id,title,ratings,url,cover_image,processed_title
88144,5953576,"Twilight (Twilight, #1)",72,https://www.goodreads.com/book/show/5953576-tw...,https://s.gr-assets.com/assets/nophoto/book/11...,twilight twilight 1
283784,6421355,"Twilight (Twilight, #1)",83,https://www.goodreads.com/book/show/6421355-tw...,https://s.gr-assets.com/assets/nophoto/book/11...,twilight twilight 1
862330,10244455,"Twilight (Twilight, #1)",172,https://www.goodreads.com/book/show/10244455-t...,https://s.gr-assets.com/assets/nophoto/book/11...,twilight twilight 1
1103451,11890319,Twilight,37,https://www.goodreads.com/book/show/11890319-t...,https://images.gr-assets.com/books/1315418855m...,twilight
271722,1026149,Twilight,36,https://www.goodreads.com/book/show/1026149.Tw...,https://s.gr-assets.com/assets/nophoto/book/11...,twilight
1152725,9979062,"Twilight (Twilight, #1)",42,https://www.goodreads.com/book/show/9979062-tw...,https://images.gr-assets.com/books/1488319289m...,twilight twilight 1
763406,5999961,"Twilight (Twilight, #1)",398,https://www.goodreads.com/book/show/5999961-tw...,https://s.gr-assets.com/assets/nophoto/book/11...,twilight twilight 1
1125587,8516508,"خسوف (Twilight, #3)",111,https://www.goodreads.com/book/show/8516508-tw...,https://images.gr-assets.com/books/1315986976m...,twilight 3
69540,12024,"Twilight (Twilight, #1)",2405,https://www.goodreads.com/book/show/12024.Twil...,https://s.gr-assets.com/assets/nophoto/book/11...,twilight twilight 1
501045,762743,"Twilight (Twilight, #1)",7239,https://www.goodreads.com/book/show/762743.Twi...,https://images.gr-assets.com/books/1318007961m...,twilight twilight 1


We can see that the same book appears several times (as separate entries for the same title). We can mitigate this by sorting the matches by their number of ratings and keeping only the most-rated rows.

In [38]:
query = "twilight"
processed = re.sub('[^a-z0-9 ]', '', query.lower().strip()) # same processing done to the titles
query_vec = vectorizer.transform([processed])
similarity = cosine_similarity(query_vec, tfidf).flatten()
indices = np.argpartition(similarity, -10)[-10:] # gives indices of top 10 similar rows
results = books.iloc[indices]
# Rank the 10 candidates by popularity and keep the 5 most-rated ones.
# This reorders the matches; rows sharing a title are not merged.
results = results.sort_values("ratings", ascending=False)
results.head(5)

Unnamed: 0,book_id,title,ratings,url,cover_image,processed_title
501045,762743,"Twilight (Twilight, #1)",7239,https://www.goodreads.com/book/show/762743.Twi...,https://images.gr-assets.com/books/1318007961m...,twilight twilight 1
69540,12024,"Twilight (Twilight, #1)",2405,https://www.goodreads.com/book/show/12024.Twil...,https://s.gr-assets.com/assets/nophoto/book/11...,twilight twilight 1
763406,5999961,"Twilight (Twilight, #1)",398,https://www.goodreads.com/book/show/5999961-tw...,https://s.gr-assets.com/assets/nophoto/book/11...,twilight twilight 1
862330,10244455,"Twilight (Twilight, #1)",172,https://www.goodreads.com/book/show/10244455-t...,https://s.gr-assets.com/assets/nophoto/book/11...,twilight twilight 1
1125587,8516508,"خسوف (Twilight, #3)",111,https://www.goodreads.com/book/show/8516508-tw...,https://images.gr-assets.com/books/1315986976m...,twilight 3


We can convert this to a function and do some html formatting for url and cover image columns.

In [43]:
def url_format(url):
    """Render ``url`` as an HTML link labelled "Goodreads" that opens in a new tab.

    The value is HTML-escaped before being placed in the ``href`` attribute, so
    quotes or angle brackets in the data cannot break out of the attribute.

    Parameters
    ----------
    url : str
        Target address of the link.

    Returns
    -------
    str
        An ``<a>`` element as an HTML string.
    """
    import html  # local import: keeps the function usable on its own

    return f'<a target="_blank" href="{html.escape(str(url), quote=True)}">Goodreads</a>'

def display_image(image):
    """Render ``image`` (an image URL) as a 50-pixel-wide HTML ``<img>`` tag.

    The value is HTML-escaped before being placed in the ``src`` attribute, so
    quotes or angle brackets in the data cannot break out of the attribute.

    Parameters
    ----------
    image : str
        Address of the image to show.

    Returns
    -------
    str
        An ``<img>`` element as an HTML string.
    """
    import html  # local import: keeps the function usable on its own

    return f'<img src="{html.escape(str(image), quote=True)}" width=50></img>'

def search(query, vectorizer, top_k=10, n_results=5):
    """Return the most-rated books whose cleaned title best matches ``query``.

    Reads the notebook-level ``tfidf`` matrix and ``books`` frame; row i of
    ``tfidf`` must correspond to the i-th row (by position) of ``books``.

    Parameters
    ----------
    query : str
        Free-text search string.
    vectorizer : fitted vectorizer
        The vectorizer that produced ``tfidf``; used to encode the query.
    top_k : int, default 10
        Number of highest-similarity candidates considered.
    n_results : int, default 5
        Number of rows returned after ranking the candidates by ``ratings``.

    Returns
    -------
    pandas Styler
        Up to ``n_results`` rows, with ``url`` and ``cover_image`` rendered as HTML.
        Empty when no title shares a term with the query.
    """
    # Same normalisation as the titles: lower-case, keep [a-z0-9 ], collapse whitespace.
    processed = re.sub('[^a-z0-9 ]', '', query.lower())
    processed = re.sub(r'\s+', ' ', processed).strip()
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask argpartition for more rows than exist (it raises otherwise).
    k = min(top_k, similarity.size)
    if k > 0:
        indices = np.argpartition(similarity, -k)[-k:]  # top k, unordered
        # A zero score means the title shares no term with the query: not a match.
        indices = indices[similarity[indices] > 0]
    else:
        indices = np.array([], dtype=int)

    results = books.iloc[indices].sort_values("ratings", ascending=False)
    return results.head(n_results).style.format({'url': url_format, 'cover_image': display_image})

In [55]:
# Example query against the fitted vectorizer.
search('scarlet pimpernel', vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,processed_title
41813,15833451,The Scarlet Pimpernel,1154,Goodreads,,the scarlet pimpernel
898339,20598360,The Scarlet Pimpernel,395,Goodreads,,the scarlet pimpernel
598791,18297740,The Scarlet Pimpernel,120,Goodreads,,the scarlet pimpernel
1170788,7137166,The Scarlet Pimpernel,109,Goodreads,,the scarlet pimpernel
334077,1444468,The Scarlet Pimpernel,82,Goodreads,,the scarlet pimpernel


# Recommendation Engine

In [56]:
# Seed list for the recommendations: book_id values (kept as strings) of books I like.
books_i_like = ['4837599', '762743', '86940', '9571725', '1392526', '15833451']

In [None]:
# Lookup table built from the two comma-separated columns of book_id_map.csv:
# first column -> second column (both kept as strings).
book_id_map = {}
with open("book_id_map.csv", "r") as f:
    for line in f:
        # `csv_id` rather than `id`, so the builtin id() is not shadowed in the kernel.
        csv_id, book_id = line.rstrip('\n').split(',')
        book_id_map[csv_id] = book_id


In [62]:
liked_books = set(books_i_like)  # set membership instead of scanning the list on every row
similar_users = set() # fetching users who have rated the books I like >= 4
with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        # `csv_id` rather than `id`, so the builtin id() is not shadowed in the kernel.
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in similar_users:
            continue  # already selected: no need to inspect more of this user's rows

        try:
            rating = int(rating)
        except ValueError:
            continue  # rating field is not an integer (e.g. a header row)

        book_id = book_id_map[csv_id]  # translate the CSV id into a book_id

        if (book_id in liked_books) and (rating >= 4):
            similar_users.add(user_id)
        

In [65]:
books_rated_by_similar_users = [] # fetching the books rated by the above users
with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        # `csv_id` rather than `id`, so the builtin id() is not shadowed in the kernel.
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in similar_users:
            book_id = book_id_map[csv_id]  # translate the CSV id into a book_id
            # rating is stored as the raw string read from the file
            books_rated_by_similar_users.append([user_id, book_id, rating])


In [68]:
import pandas as pd  # import under the alias this cell actually uses

# One row per interaction of a similar user: (user_id, book_id, rating).
recommended_books = pd.DataFrame(books_rated_by_similar_users, columns=['user_id', 'book_id', 'rating'])
recommended_books.head()

Unnamed: 0,user_id,book_id,rating
0,356,13597801,5
1,356,13641946,4
2,356,13101894,4
3,356,762743,4
4,356,42900,5


In [69]:
# Count how often each book appears among the similar users' interactions and keep the 10 most frequent.
top_recommended_books = recommended_books.book_id.value_counts().head(10) # top 10 recommended books
top_recommended_books.head()  # preview only the first five of those ten

2767052    699
1162543    674
762743     654
6148028    634
428263     620
Name: book_id, dtype: int64

In [70]:
# Ids of the ten most frequent books among the similar users' interactions.
# Derived directly from `recommended_books` (instead of rebinding the previous
# value of `top_recommended_books`) so that this cell can be re-run safely.
top_recommended_books = recommended_books.book_id.value_counts().head(10).index.values

In [71]:
# Reload the cleaned catalogue that was written to books.json earlier.
# NOTE(review): read_json re-infers dtypes, so book_id presumably comes back
# numeric (hence the cast to str in a later cell) — confirm.
books = pd.read_json("books.json")
books.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,processed_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook


In [73]:
# Normalise both sides to strings so that the membership test in the next cell
# compares values of the same type.
top_recommended_books = [str(book_id) for book_id in top_recommended_books]
books['book_id'] = books['book_id'].astype('str')

In [74]:
# Catalogue rows of the ten most frequent books among the similar users.
books[books.book_id.isin(top_recommended_books)]

Unnamed: 0,book_id,title,ratings,url,cover_image,processed_title
65983,7260188,"Mockingjay (The Hunger Games, #3)",1743362,https://www.goodreads.com/book/show/7260188-mo...,https://images.gr-assets.com/books/1358275419m...,mockingjay the hunger games 3
255053,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1
316186,49041,"New Moon (Twilight, #2)",1161751,https://www.goodreads.com/book/show/49041.New_...,https://images.gr-assets.com/books/1361039440m...,new moon twilight 2
361102,1162543,"Breaking Dawn (Twilight, #4)",1078310,https://www.goodreads.com/book/show/1162543.Br...,https://images.gr-assets.com/books/1361039438m...,breaking dawn twilight 4
501045,762743,"Twilight (Twilight, #1)",7239,https://www.goodreads.com/book/show/762743.Twi...,https://images.gr-assets.com/books/1318007961m...,twilight twilight 1
746012,6148028,"Catching Fire (The Hunger Games, #2)",1854746,https://www.goodreads.com/book/show/6148028-ca...,https://images.gr-assets.com/books/1358273780m...,catching fire the hunger games 2
786647,3,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://www.goodreads.com/book/show/3.Harry_Po...,https://images.gr-assets.com/books/1474154022m...,harry potter and the sorcerers stone harry pot...
808272,136251,Harry Potter and the Deathly Hallows (Harry Po...,1784684,https://www.goodreads.com/book/show/136251.Har...,https://images.gr-assets.com/books/1474171184m...,harry potter and the deathly hallows harry pot...
852869,428263,"Eclipse (Twilight, #3)",1146155,https://www.goodreads.com/book/show/428263.Ecl...,https://images.gr-assets.com/books/1361038355m...,eclipse twilight 3
891116,15881,Harry Potter and the Chamber of Secrets (Harry...,1821802,https://www.goodreads.com/book/show/15881.Harr...,https://images.gr-assets.com/books/1474169725m...,harry potter and the chamber of secrets harry ...


The books listed here are simply popular books in general. We want a list that is actually tailored to our taste, so we penalise each recommended book by its overall popularity (its total number of ratings).

In [75]:
# Per-book interaction counts among the similar users, as a two-column frame.
all_recommended_books = recommended_books.book_id.value_counts().reset_index()
# Name the columns explicitly rather than relying on what reset_index() produces.
all_recommended_books.columns = ["book_id", "book_count"]
all_recommended_books["book_id"] = all_recommended_books['book_id'].astype('str')

# Inner join: books that are not in the cleaned catalogue are dropped.
all_recommended_books = all_recommended_books.merge(books, on="book_id", how="inner")
all_recommended_books.head()

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,processed_title
0,2767052,699,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1
1,1162543,674,"Breaking Dawn (Twilight, #4)",1078310,https://www.goodreads.com/book/show/1162543.Br...,https://images.gr-assets.com/books/1361039438m...,breaking dawn twilight 4
2,762743,654,"Twilight (Twilight, #1)",7239,https://www.goodreads.com/book/show/762743.Twi...,https://images.gr-assets.com/books/1318007961m...,twilight twilight 1
3,6148028,634,"Catching Fire (The Hunger Games, #2)",1854746,https://www.goodreads.com/book/show/6148028-ca...,https://images.gr-assets.com/books/1358273780m...,catching fire the hunger games 2
4,428263,620,"Eclipse (Twilight, #3)",1146155,https://www.goodreads.com/book/show/428263.Ecl...,https://images.gr-assets.com/books/1361038355m...,eclipse twilight 3


In [76]:
# penalised_score = book_count**2 / ratings, i.e. the count among similar users
# weighted by the ratio of that count to the book's overall number of ratings.
all_recommended_books["penalised_score"] = all_recommended_books["book_count"] * all_recommended_books["book_count"]/all_recommended_books["ratings"]


In [78]:
# Ten books with the highest penalised score.
all_recommended_books.sort_values('penalised_score', ascending=False).head(10)

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,processed_title,penalised_score
2,762743,654,"Twilight (Twilight, #1)",7239,https://www.goodreads.com/book/show/762743.Twi...,https://images.gr-assets.com/books/1318007961m...,twilight twilight 1,59.084956
102,86940,228,"هاري بوتر وحجر الفيلسوف (Harry Potter, #1)",1290,https://www.goodreads.com/book/show/86940._,https://images.gr-assets.com/books/1327275224m...,harry potter 1,40.297674
51,9571725,284,Murder on the Orient Express,2136,https://www.goodreads.com/book/show/9571725-mu...,https://images.gr-assets.com/books/1409605149m...,murder on the orient express,37.7603
2104,24909347,45,"Obsidio (The Illuminae Files, #3)",82,https://www.goodreads.com/book/show/24909347-o...,https://images.gr-assets.com/books/1501704611m...,obsidio the illuminae files 3,24.695122
201,7809996,165,"هاري بوتر وحجرة الأسرار (Harry Potter, #2)",1117,https://www.goodreads.com/book/show/7809996,https://s.gr-assets.com/assets/nophoto/book/11...,harry potter 2,24.373321
320,49839,137,"هاري بوتر وكأس النار (Harry Potter, #4)",957,https://www.goodreads.com/book/show/49839._,https://images.gr-assets.com/books/1312417199m...,harry potter 4,19.61233
304,49869,141,"هاري بوتر وسجين أزكابان (Harry Potter, #3)",1023,https://www.goodreads.com/book/show/49869._,https://images.gr-assets.com/books/1329651788m...,harry potter 3,19.434018
382,70355,125,"هاري بوتر وجماعة العنقاء (Harry Potter, #5)",955,https://www.goodreads.com/book/show/70355._,https://images.gr-assets.com/books/1351790790m...,harry potter 5,16.361257
604,1392526,97,The Da Vinci Code,695,https://www.goodreads.com/book/show/1392526.Th...,https://images.gr-assets.com/books/1361688983m...,the da vinci code,13.538129
8647,36307629,17,"King of Scars (King of Scars, #1)",22,https://www.goodreads.com/book/show/36307629-k...,https://images.gr-assets.com/books/1506962795m...,king of scars king of scars 1,13.136364


Note that now, books with low number of ratings have also turned up in the list. Also, some of these books are already in the liked book list. 

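Now, we can apply a cut-off on `book_count`, i.e. on how many times a book appears among the similar users' interactions, so that books supported by only a handful of similar users are left out.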

In [79]:
# Keep only books that appear more than 75 times among the similar users'
# interactions, ranked by penalised score (best first).
popular_recommended_books = all_recommended_books[all_recommended_books.book_count>75].sort_values('penalised_score', ascending=False)


In [82]:
# Exclude the seed books themselves, then render the first 10 rows with the
# url and cover_image columns formatted as HTML.
popular_recommended_books[~popular_recommended_books.book_id.isin(books_i_like)].head(10).style.format({'url': url_format, 'cover_image': display_image})


Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,processed_title,penalised_score
201,7809996,165,"هاري بوتر وحجرة الأسرار (Harry Potter, #2)",1117,Goodreads,,harry potter 2,24.373321
320,49839,137,"هاري بوتر وكأس النار (Harry Potter, #4)",957,Goodreads,,harry potter 4,19.61233
304,49869,141,"هاري بوتر وسجين أزكابان (Harry Potter, #3)",1023,Goodreads,,harry potter 3,19.434018
382,70355,125,"هاري بوتر وجماعة العنقاء (Harry Potter, #5)",955,Goodreads,,harry potter 5,16.361257
366,563012,128,"New Moon (Twilight, #2)",2497,Goodreads,,new moon twilight 2,6.561474
923,6138606,76,"The Seeker (The Host, #2)",1600,Goodreads,,the seeker the host 2,3.61
643,332572,93,"Eclipse (Twilight, #3)",3681,Goodreads,,eclipse twilight 3,2.349633
820,685386,80,"Club Dead (Sookie Stackhouse, #3)",5890,Goodreads,,club dead sookie stackhouse 3,1.086587
909,685385,76,"Living Dead in Dallas (Sookie Stackhouse, #2)",5347,Goodreads,,living dead in dallas sookie stackhouse 2,1.080232
404,10929432,121,"The Archived (The Archived, #1)",17028,Goodreads,,the archived the archived 1,0.859819
