In [None]:
import gzip

# Stream the gzip archive directly (no need to unpack it first) and keep only
# its first line as a sample record.
with gzip.open("goodreads_books.json.gz") as gz_file:
    line = gz_file.readline()  # a single readline() call: just the first line

In [None]:
# Show the raw first record exactly as it was read from the archive.
print(line)

In [None]:
import json
# Decode the raw JSON text of the sample line into a Python object so its
# fields can be inspected.
data = json.loads(line)
data  # bare last expression: the notebook displays the decoded record

In [None]:
def parse_fields(line):
    """Pick the fields this notebook needs out of one JSON record.

    Parameters
    ----------
    line : str or bytes
        One JSON document, as read from the json.gz file.

    Returns
    -------
    dict
        Keys ``book_id``, ``title``, ``ratings``, ``url`` and ``cover_image``,
        holding the record's ``book_id``, ``title_without_series``,
        ``ratings_count``, ``url`` and ``image_url`` values respectively.

    Raises
    ------
    KeyError
        If the record lacks one of the source keys.
    """
    record = json.loads(line)
    # (output key, key in the source record), in output order
    wanted = (
        ("book_id", "book_id"),
        ("title", "title_without_series"),
        ("ratings", "ratings_count"),
        ("url", "url"),
        ("cover_image", "image_url"),
    )
    return {out_key: record[src_key] for out_key, src_key in wanted}

In [None]:
# Build the in-memory list of book records (nothing is written to disk here):
# keep the parsed fields of every book with more than MIN_RATINGS ratings.
MIN_RATINGS = 5  # books with this many ratings or fewer are skipped

books_titles = []
with gzip.open("goodreads_books.json.gz") as f:
    # Iterate over the stream directly. A dedicated loop variable leaves the
    # sample `line` read earlier in the notebook untouched, so the cells that
    # print/parse it can be re-run after this one.
    for raw_line in f:
        fields = parse_fields(raw_line)
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            # ratings value is not parseable as an integer: skip the record
            continue
        if ratings > MIN_RATINGS:
            books_titles.append(fields)

In [None]:
import pandas as pd
# One row per collected book record; the "ratings" column is then converted
# to a numeric dtype (presumably it arrives as text -- the loader parses it
# with int()).
titles = pd.DataFrame(books_titles).assign(
    ratings=lambda frame: pd.to_numeric(frame["ratings"])
)


In [None]:
# Inspect the table of collected books.
titles

In [None]:
# Add a normalised copy of each title ("mod_title") for text matching, drop
# rows whose normalised title is empty, and save the result.
mod_title = (
    titles["title"]
    .str.replace("[^a-zA-Z0-9 ]", "", regex=True)  # keep only ASCII letters, digits and spaces
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)  # raw string: "\s" is an invalid escape in a plain literal
    .str.strip()  # otherwise a punctuation-only title collapses to " " and passes the length check
)
# assign() returns a new frame, so re-running this cell on the already
# filtered `titles` does not write into a slice of the previous frame.
titles = titles.assign(mod_title=mod_title)
titles = titles[titles["mod_title"].str.len() > 0]  # titles with no ASCII alphanumerics are dropped here
titles.to_json("books_titles.json")  # persist the cleaned table

In [None]:
# Inspect the table again, now including the "mod_title" column.
titles

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the normalised titles and build the TF-IDF matrix.
# Rows follow the order of `titles`, which is why search() can map matrix row
# positions back to books with titles.iloc.
tfidf = vectorizer.fit_transform(titles["mod_title"])

In [None]:
import html
import re

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def make_clickable(val):
    """Return an HTML anchor that opens ``val`` in a new browser tab.

    ``val`` is converted to text and HTML-escaped before being placed in the
    ``href`` attribute, so quotes or angle brackets in the value cannot break
    out of the attribute.

    Parameters
    ----------
    val : object
        The link target (normally a URL string).

    Returns
    -------
    str
        ``<a target="_blank" href="...">Go to website</a>`` markup.
    """
    href = html.escape(str(val), quote=True)
    return '<a target="_blank" href="{}">Go to website</a>'.format(href)

def show_image(val):
    """Return HTML showing the image at ``val`` as a 50px-wide thumbnail link.

    The thumbnail links to the image itself. ``val`` is converted to text and
    HTML-escaped before being placed in the attributes. ``<img>`` is a void
    element, so no closing ``</img>`` tag is emitted.

    Parameters
    ----------
    val : object
        The image location (normally a URL string).

    Returns
    -------
    str
        ``<a href="..."><img src="..." width=50></a>`` markup.
    """
    src = html.escape(str(val), quote=True)
    return '<a href="{0}"><img src="{0}" width=50></a>'.format(src)

def search(query, vectorizer, top_n=5):
    """Return the books whose titles best match ``query`` as a styled table.

    The query is normalised the same way as the stored titles (lower-cased,
    everything except ASCII letters, digits and spaces removed), vectorised
    with the already fitted ``vectorizer`` and compared with every row of the
    notebook-level ``tfidf`` matrix by cosine similarity. The ``top_n`` most
    similar rows of the notebook-level ``titles`` frame are selected and then
    ordered by their ``ratings`` column, highest first.

    Parameters
    ----------
    query : str
        Free-text search string.
    vectorizer : fitted vectorizer
        Must be the object that produced ``tfidf`` so the vectors share a
        vocabulary; only ``transform`` is called (no re-fitting).
    top_n : int, default 5
        Maximum number of matches to return. Capped at the number of
        available titles.

    Returns
    -------
    pandas Styler
        The matching rows, with ``url`` rendered as a link and
        ``cover_image`` rendered as a thumbnail.
    """
    # Normalise the query and actually use the normalised text for matching.
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more elements than exist, so clamp.
    k = max(0, min(top_n, similarity.size))
    if k:
        indices = np.argpartition(similarity, -k)[-k:]  # positions of the k largest scores (unordered)
    else:
        indices = np.array([], dtype=int)

    # Row positions in `tfidf` correspond to row positions in `titles`.
    results = titles.iloc[indices].sort_values("ratings", ascending=False)

    return results.style.format({'url': make_clickable, 'cover_image': show_image})

In [None]:
# Example query using the fitted vectorizer.
search("harry potter", vectorizer)