In [1]:
import pandas as pd
import gzip
import json
import os
import shutil

Datasets directory

In [2]:
# Directory holding the external input archives. The path is relative, so it
# is resolved against the notebook's current working directory.
DIR = '../data/external/'

Loading datasets

In [3]:
books_path = os.path.join(DIR, 'goodreads_books_history_biography.json.gz')

# pandas decompresses gzip on the fly, so the archive does not have to be
# inflated into a temporary file first. This also avoids leaving an unclosed
# gzip handle and a stray temp file behind when parsing fails.
books = pd.read_json(books_path, lines=True, compression='gzip')

In [4]:
# Two-row preview, transposed so that every field is listed on its own line.
books.head(n=2).transpose()

Unnamed: 0,0,1
isbn,1599150603,184737297X
text_reviews_count,7,15
series,[],[169353]
country_code,US,US
language_code,,
popular_shelves,"[{'count': '56', 'name': 'to-read'}, {'count':...","[{'count': '159', 'name': 'to-read'}, {'count'..."
asin,,
is_ebook,false,false
average_rating,4.13,3.93
kindle_asin,B00DU10PUG,B007YLTG5I


To streamline the handling of book information, we define a dedicated function, `upgrade_desc`, together with two small helpers (`get_shelves` and `get_smlr_bks`). It consolidates all relevant details of a book into a single text block, simplifying access to and management of this information for our analysis and processing.

In [5]:
def get_shelves(x):
    """Render a book's shelves as space-separated ``name*count`` tokens.

    Parameters
    ----------
    x : iterable of dict
        Shelf records, each providing a ``'name'`` and a ``'count'`` entry.

    Returns
    -------
    str
        For example ``"to-read*56 history*4"``; an empty string when ``x``
        contains no records.

    Raises
    ------
    KeyError
        If a record lacks ``'name'`` or ``'count'``.
    """
    # Read the fields by key instead of by position in ``dict.values()`` so
    # the result does not depend on the key order inside each record.
    return ' '.join(f"{shelf['name']}*{shelf['count']}" for shelf in x)

def get_smlr_bks(similar_books, catalog=None):
    """Return the titles of the books referenced by ``similar_books``.

    Parameters
    ----------
    similar_books : list
        Book ids; each element must be convertible with ``int``.
    catalog : pandas.DataFrame, optional
        Frame with ``book_id`` and ``title`` columns used for the lookup.
        When omitted, the notebook-level ``books`` frame is used, which keeps
        existing single-argument calls (e.g. ``Series.map``) working.

    Returns
    -------
    str
        ``"None"`` when ``similar_books`` is an empty list. Otherwise the
        matching titles joined with ``". "``, in the row order of the catalog
        (an empty string when none of the ids is present).
    """
    if similar_books == []:
        return "None"
    if catalog is None:
        catalog = books
    wanted_ids = [int(bk) for bk in similar_books]
    # Select only the ``title`` column for the matching rows; filtering the
    # whole frame first would materialise every column on each call.
    titles = catalog.loc[catalog['book_id'].isin(wanted_ids), 'title'].values
    return '. '.join(map(str, titles))

def upgrade_desc(DataFrame):
    """Build one descriptive text string per book from its metadata columns.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame providing the columns read below (``description``,
        ``publisher``, the ``publication_*`` columns, ``num_pages``,
        ``country_code``, ``text_reviews_count``, ``popular_shelves``,
        ``is_ebook``, ``average_rating``, ``similar_books``,
        ``language_code``, ``title`` and ``title_without_series``).
        The frame is not modified.

    Returns
    -------
    pandas.Series
        Series named ``text`` with the same index as ``DataFrame``.
    """
    def as_text(column):
        # Missing values become '' and everything else is stringified. A single
        # NaN therefore cannot turn the whole row's text into NaN, and a
        # numeric-typed column cannot raise on string concatenation.
        values = DataFrame[column]
        return values.astype(object).where(values.notna(), '').astype(str)

    # NOTE(review): the year is emitted before the day, and both title columns
    # share the ". Title: " label. Both are kept exactly as before so the
    # generated text is unchanged -- confirm whether either is intended.
    pieces = [
        "Description: " + as_text('description'),
        " Publisher: " + as_text('publisher'),
        '. Publication Month: ' + as_text('publication_month'),
        ". Publication Year: " + as_text('publication_year'),
        '. Publication Day: ' + as_text('publication_day'),
        ". Num Pages: " + as_text('num_pages'),
        ". Country Code: " + as_text('country_code'),
        ". Review Count: " + as_text('text_reviews_count'),
        '. Shelves: ' + DataFrame['popular_shelves'].map(get_shelves),
        ". Is Ebook: " + as_text('is_ebook'),
        ". Average Rating: " + as_text('average_rating'),
        ". Similar Books: " + DataFrame['similar_books'].map(get_smlr_bks),
        ". Language: " + as_text('language_code'),
        ". Title: " + as_text('title'),
        ". Title: " + as_text('title_without_series'),
    ]

    # Concatenate the pieces directly instead of deep-copying the whole input
    # frame just to attach and return a single column.
    text = pieces[0]
    for piece in pieces[1:]:
        text = text + piece
    return text.rename('text')
# Try the text builder on the first row only; the result is kept in `_`,
# which the following cell prints.
_ = upgrade_desc(books.head(1))

In [6]:
# Show the generated text of the first (and only) row of the one-row sample
# stored in `_`.
print(_.iloc[0])

Description: Relates in vigorous prose the tale of Aeneas, the legendary ancestor of Romulus, who escaped from the burning city of Troy and wandered the Mediterranean for years before settling in Italy. Patterned after the Iliad and the Odyssey, the Aeneid was composed as an epic poem by Virgil, to glorify the imperial city of Rome. Publisher: Yesterday's Classics. Publication Month: 9. Publication Year: 2006. Publication Day: 13. Num Pages: 162. Country Code: US. Review Count: 7. Shelves: to-read*56 currently-reading*10 history*4 classics*3 level-4to5*2 school-books*2 yesterday*2 school*2 classic*2 read-alouds*2 children-s-literature*1 classic-ed*1 waldorf-homeschool*1 6th-grade*1 myths-folktales-fairytales*1 abandoned*1 tapestry-of-grace-year-1-dialectic*1 default*1 myth*1 fantasy*1 14*1 school-room*1 history-intermediate*1 rome*1 literature*1 classical-world*1 childrens-literature*1 willem-s*1 children-s-fiction*1 elem-middle*1 digital*1 youth-lit*1 teaching-english*1 english*1 my-b

In [7]:
# Build the text block for every book and store it in the `text` column of
# `books` (the frame is modified in place).
books['text'] = upgrade_desc(books)

Save final dataset

In [10]:
# Keep only the columns that are written to the interim dataset.
df = books.loc[
    :,
    ['title', 'title_without_series', 'average_rating', 'text', 'url', 'image_url'],
]

In [None]:
# Write the selected columns as one JSON array of row objects.
df.to_json('../data/interim/books.json', orient='records')