In [45]:
import gzip

# Peek at the first line of the compressed dump to inspect one record.
# gzip.open is called without a mode, so `line` is read in gzip's default mode.
with gzip.open("goodreads_books.json.gz") as f:
    line = f.readline()

In [46]:
import json

# Decode the sampled line so the available keys can be inspected.
data = json.loads(line)
# Bare expression: the notebook displays the parsed record as the cell output.
data

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [47]:
def parse_fields(line):
    """Extract the handful of fields used downstream from one JSON record.

    Args:
        line: One JSON-encoded book record, as accepted by ``json.loads``.

    Returns:
        dict with the keys ``book_id``, ``title``, ``ratings``, ``url`` and
        ``cover_image`` (in that order), copied unchanged from the record.

    Raises:
        KeyError: if the record lacks one of the source keys.
    """
    record = json.loads(line)
    # (output key, key in the source record), in output order.
    key_map = (
        ("book_id", "book_id"),
        ("title", "title_without_series"),
        ("ratings", "ratings_count"),
        ("url", "url"),
        ("cover_image", "image_url"),
    )
    return {target: record[source] for target, source in key_map}

In [48]:
# Stream the whole dump one record at a time and keep only books whose
# ratings count is greater than 5.
books_titles = []
with gzip.open("goodreads_books.json.gz") as f:
    # Iterating the file object yields one line per step and stops at EOF,
    # replacing the manual readline()/break loop.
    for line in f:
        fields = parse_fields(line)
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            # Ratings count that int() cannot parse: skip this record.
            continue
        if ratings > 5:
            books_titles.append(fields)

KeyboardInterrupt: 

In [None]:
import pandas as pd

# books_titles is a list of per-book dicts; build one row per dict.
titles = pd.DataFrame(books_titles)

In [None]:
# Convert the ratings column to a numeric dtype so it can be sorted numerically later.
titles["ratings"] = pd.to_numeric(titles["ratings"])

In [None]:
# Build the search key: drop every character that is not an ASCII letter,
# digit or space (this also removes punctuation and non-Latin characters).
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)

In [None]:
# Lower-case the search key.
titles["mod_title"] = titles["mod_title"].str.lower()

In [None]:
# Collapse runs of whitespace into a single space.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
titles["mod_title"] = titles["mod_title"].str.replace(r"\s+", " ", regex=True)

In [None]:
# Keep only rows whose normalised title is not the empty string.
titles = titles[titles["mod_title"].str.len() > 0]

In [None]:
# Persist the cleaned table to books_titles.json in the working directory.
titles.to_json("books_titles.json")

In [None]:
!pip install -U scikit-learn scipy matplotlib

Collecting scikit-learn
  Downloading scikit_learn-1.1.3-cp310-cp310-win_amd64.whl (7.5 MB)
     ---------------------------------------- 7.5/7.5 MB 203.4 kB/s eta 0:00:00
Collecting scipy
  Downloading scipy-1.9.3-cp310-cp310-win_amd64.whl (40.1 MB)
     -------------------------------------- 40.1/40.1 MB 338.8 kB/s eta 0:00:00
Collecting matplotlib
  Downloading matplotlib-3.6.2-cp310-cp310-win_amd64.whl (7.2 MB)
     ---------------------------------------- 7.2/7.2 MB 175.5 kB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.0.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     ------------------------------------ 298.0/298.0 kB 383.6 kB/s eta 0:00:00
Collecting cycler>=0.10
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting contourpy>=1.0.1
  Downloading contourpy-1.0.6-cp310-cp310-win_amd64.whl (163 kB)
     ------------------------------------ 163.6/163.6 kB 272.9 kB/s eta 0:00:00
C


[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: C:\Users\kalya\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the normalised titles and encode them (one row per title).
tfidf = vectorizer.fit_transform(titles["mod_title"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens ``val`` in a new tab."""
    return f'<a target="_blank" href="{val}">Goodreads</a>'

def show_image(val):
    """Return HTML for a 50px-wide image at ``val`` that links to the same URL."""
    thumbnail = f'<img src="{val}" width=50></img>'
    return f'<a href="{val}">{thumbnail}</a>'

def search(query,vectorizer):
    """Find titles similar to ``query`` and show the most-rated matches.

    Relies on the notebook-level ``tfidf`` matrix and ``titles`` frame.

    Args:
        query: Free-text search string.
        vectorizer: Fitted vectorizer used to encode the query; it must be
            the one that produced ``tfidf``.

    Returns:
        A pandas Styler over at most five rows of ``titles``, ordered by
        ``ratings`` descending, with ``url`` and ``cover_image`` rendered
        as HTML.
    """
    # Normalise the query the same way the titles were normalised, and
    # actually use the normalised text (the raw query was vectorized before).
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition(..., -k) only guarantees that the last k positions hold
    # the k largest values, so the slice must use the same k. Clamp k so a
    # corpus with fewer than 10 rows does not raise.
    k = min(10, similarity.size)
    if k:
        indices = np.argpartition(similarity, -k)[-k:]
    else:
        indices = np.array([], dtype=int)

    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)

    return results.head(5).style.format({'url': make_clickable, 'cover_image': show_image})

In [None]:
!pip install jinja2


Collecting jinja2
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-2.1.1-cp310-cp310-win_amd64.whl (17 kB)
Installing collected packages: MarkupSafe, jinja2
Successfully installed MarkupSafe-2.1.1 jinja2-3.1.2



[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: C:\Users\kalya\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [51]:
# Example query; the styled result table is displayed as the cell output.
search("Harry potter", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
1762728,10,"Harry Potter Collection (Harry Potter, #1-6)",25245,Goodreads,,harry potter collection harry potter 16
630902,86940,"هاري بوتر وحجر الفيلسوف (Harry Potter, #1)",1290,Goodreads,,harry potter 1
1422621,49869,"هاري بوتر وسجين أزكابان (Harry Potter, #3)",1023,Goodreads,,harry potter 3
26669,49839,"هاري بوتر وكأس النار (Harry Potter, #4)",957,Goodreads,,harry potter 4
1230739,70355,"هاري بوتر وجماعة العنقاء (Harry Potter, #5)",955,Goodreads,,harry potter 5


: 