In [1]:
import gzip

# Read only the first line of the gzipped dump so one record can be inspected
# without loading the whole file.
with gzip.open("goodreads_books.json.gz") as f:
    line = f.readline()

In [2]:
import json

# Decode the sample line read above and display it to see which fields a
# record carries.
data = json.loads(line)
data

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [3]:
def parse_fields(line):
    """Pull the book attributes this notebook needs out of one JSON record.

    Args:
        line: A single JSON-encoded book record (one line of the dump).

    Returns:
        A dict with the keys ``book_id``, ``title``, ``ratings``, ``url`` and
        ``cover_image``; the values are copied from the record unchanged.

    Raises:
        KeyError: If the record is missing one of the source fields.
    """
    record = json.loads(line)
    # Output key -> key in the raw record.
    source_keys = {
        "book_id": "book_id",
        "title": "title_without_series",
        "ratings": "ratings_count",
        "url": "url",
        "cover_image": "image_url",
    }
    return {out_key: record[src_key] for out_key, src_key in source_keys.items()}

In [5]:
# Stream the full dump and keep the parsed fields of every book that has more
# than MIN_RATINGS ratings.
MIN_RATINGS = 15

books_titles = []
with gzip.open("goodreads_books.json.gz", 'r') as f:
    # Iterating the handle yields one record per line and stops at end of file.
    for line in f:
        fields = parse_fields(line)
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            # The rating count is taken from the JSON as-is; skip records
            # where it cannot be read as an integer.
            continue
        if ratings > MIN_RATINGS:
            books_titles.append(fields)

In [6]:
import pandas as pd

# One row per retained book; columns come from the keys of the parsed records.
titles = pd.DataFrame(books_titles)

In [17]:
# Number of books currently in the table.
len(titles)

1302659

In [11]:
# The rating counts were copied from the JSON without conversion; make the
# column numeric so it can be sorted by value.
titles["ratings"] = pd.to_numeric(titles["ratings"])

In [13]:
# Build the normalised title: drop every character that is not an ASCII
# letter, a digit or a space.
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)

In [14]:
# Lower-case the normalised title.
titles["mod_title"] = titles["mod_title"].str.lower()

In [15]:
# Collapse each run of whitespace into a single space. The pattern is a raw
# string so the backslash reaches the regex engine as written instead of
# relying on an invalid string-literal escape.
titles["mod_title"] = titles["mod_title"].str.replace(r"\s+", " ", regex=True)

In [16]:
# Keep only rows whose normalised title is not the empty string.
titles = titles[titles["mod_title"].str.len() > 0]

In [18]:
# Persist the cleaned titles table to disk.
titles.to_json("books_titles.json")

In [20]:
# Sanity check: confirm that `titles` is still a DataFrame after the filtering.
type(titles)

pandas.core.frame.DataFrame

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser with default settings.
vectorizer = TfidfVectorizer()

# Fit on the normalised titles and keep the resulting matrix. search() maps
# positions in this matrix back to rows with titles.iloc, so `titles` must not
# be reordered or filtered after this point without refitting.
tfidf = vectorizer.fit_transform(titles["mod_title"])

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    """Render a URL as an HTML link labelled "Goodreads" that opens in a new tab.

    Args:
        val: The URL to link to; it is inserted into the ``href`` as-is.

    Returns:
        The HTML anchor markup as a string.
    """
    # The template has a single placeholder, so only one argument is passed.
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    """Render an image URL as a 50px-wide thumbnail linking to the full image.

    Args:
        val: The image URL; it is used for both the link target and the
            image source, inserted as-is.

    Returns:
        The HTML markup as a string.
    """
    thumbnail_template = '<a href="{}"><img src="{}" width=50></img></a>'
    link_target = image_source = val
    return thumbnail_template.format(link_target, image_source)

def search(query, vectorizer):
    """Find books whose normalised title best matches a free-text query.

    Relies on two notebook globals: ``tfidf`` (the fitted title matrix) and
    ``titles`` (the DataFrame whose rows line up positionally with ``tfidf``).

    Args:
        query: Free-text title to look for.
        vectorizer: The fitted vectoriser that produced ``tfidf``.

    Returns:
        A pandas Styler over at most five rows: the ten most similar titles
        are selected, ordered by rating count (highest first), and the top
        five are kept. The ``url`` and ``cover_image`` columns are rendered
        as HTML.
    """
    # Normalise the query exactly like the titles were normalised when
    # building ``mod_title``: lower-case, strip everything but ASCII
    # letters/digits/spaces, collapse whitespace.
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    processed = re.sub(r"\s+", " ", processed)

    # Vectorise the normalised query (not the raw one) so it is compared
    # against the titles on equal terms.
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition needs k to be within bounds, so cap it at the corpus size.
    top_k = min(10, similarity.shape[0])
    indices = np.argpartition(similarity, -top_k)[-top_k:]

    results = titles.iloc[indices].sort_values("ratings", ascending=False)
    return results.head(5).style.format({'url': make_clickable, 'cover_image': show_image})

In [41]:
# Example lookup; the query uses the "grey" spelling.
search("picture of dorian grey", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
1117410,12346651,The Picture of Dorian Grey,6199,Goodreads,,the picture of dorian grey
853406,22515335,The Picture of Dorian Gray,234,Goodreads,,the picture of dorian gray
798634,14574838,The Picture of Dorian Grey,154,Goodreads,,the picture of dorian grey
931170,34506163,The Picture of Dorian Grey,107,Goodreads,,the picture of dorian grey
17441,35113385,The Picture of Dorian Gray,90,Goodreads,,the picture of dorian gray


In [None]:
# Presumably Goodreads book_id values (kept as strings) of books the user
# liked — TODO confirm how this list is consumed further down.
liked_books = ["6882","331567","8349198","4069","2547","14082","12346651"]