In [52]:
import pandas as pd
import gzip
import json
import os
import shutil
import calendar

Datasets directory

In [2]:
# Folder holding the raw Goodreads dumps; the relative path is resolved against
# the notebook's working directory.
DIR = '../data/external/'

Loading datasets

In [3]:
books_path = os.path.join(DIR, 'goodreads_books_history_biography.json.gz')

# Read the gzipped JSON-lines dump directly. pandas decompresses on the fly, so
# no temporary file is written into the data folder and no gzip handle is left
# open (or temp file left behind) if parsing fails.
books = pd.read_json(books_path, lines=True, compression='gzip')

In [4]:
review_path = os.path.join(DIR, 'goodreads_reviews_history_biography.json.gz')

# Read the gzipped JSON-lines dump directly. pandas decompresses on the fly, so
# no temporary file is written into the data folder and no gzip handle is left
# open (or temp file left behind) if parsing fails.
reviews = pd.read_json(review_path, lines=True, compression='gzip')

To streamline the handling of book information, we define a dedicated function, `upgrade_desc`, further below. It consolidates all relevant details of a book (description, publication data, shelves, similar books, reviews and title) into a single text block, simplifying access and management of this information for our analysis and processing.

Because we are still testing and refining our model's performance, we work with a small random sample of the dataset (1,000 books drawn with a fixed seed, so the sample is reproducible). This lets us assess our strategies and techniques efficiently, without the computational overhead of processing the entire dataset. As we progress and gain confidence in our approach, we will extend our analyses to the full dataset for more comprehensive insights.

In [79]:
# Reproducible 1,000-row random subset of the books used while prototyping.
smp = books.sample(n=1000, random_state=1)

In [102]:
def get_shelves(x):
    """Expand shelf entries into one space-separated string of shelf names.

    Every element of ``x`` is a mapping; its first value is read as a count
    and its second value as the shelf name (in the mapping's own order). The
    name, followed by a space, is repeated ``count`` times, and the resulting
    per-shelf chunks are joined with a single space.
    """
    shelf_fields = (list(shelf.values()) for shelf in x)
    return ' '.join(
        f"{fields[1]} " * int(fields[0]) for fields in shelf_fields
    )

def get_reviews(x):
    """Render every review of book ``x`` as one line of text.

    Looks up the rows of the module-level ``reviews`` frame whose ``book_id``
    equals ``x`` and formats each as
    ``Review: ...; Votes: ...; Comments: ...; Rating: ...;``. The lines are
    joined with ``' \\n'``; a book without reviews yields an empty string.
    """
    fields = ['review_text', 'n_votes', 'n_comments', 'rating']
    matching = reviews.loc[reviews['book_id'] == x, fields]
    lines = []
    for text, votes, comments, rating in matching.values:
        pieces = [
            "Review: " + text,
            "Votes: " + str(votes),
            "Comments: " + str(comments),
            "Rating: " + str(rating),
        ]
        lines.append('; '.join(pieces) + ';')
    return ' \n'.join(lines)

def to_int(x):
    """Convert ``x`` to ``int`` when possible, otherwise return it unchanged.

    Values that ``int()`` rejects (non-numeric strings, ``None``, NaN,
    infinities, ...) are handed back as-is instead of raising.
    """
    try:
        return int(x)
    except (ValueError, TypeError, OverflowError):
        # Not convertible: let the caller decide what to do with the raw value.
        return x

def get_mth(x):
    """Return the month name for month number ``x``, or ``'Unknown'``.

    ``x`` may be an int or anything ``to_int`` can convert. The name is taken
    from ``calendar.month_name`` (locale-dependent). Anything that is not a
    month number in 1..12 -- empty strings, missing values, 0, negative or
    too-large numbers -- yields ``'Unknown'``.
    """
    month = to_int(x)
    # Explicit range check: month_name[0] is '' and negative indices would
    # silently wrap around to a real month name.
    if isinstance(month, int) and 1 <= month <= 12:
        return calendar.month_name[month]
    return 'Unknown'

def get_smlr_bks(similar_books):
    """Return the titles of the given similar books as one string.

    ``similar_books`` holds book ids; each is converted with ``int`` and
    looked up in the ``book_id`` column of the module-level ``books`` frame.
    The matching titles are joined with ``'. '``. An empty list yields the
    literal string ``"None"``.
    """
    if similar_books == []:
        return "None"

    wanted_ids = list(map(int, similar_books))
    matched_titles = books.loc[books['book_id'].isin(wanted_ids), 'title']
    return '. '.join(str(title) for title in matched_titles.values)

def upgrade_desc(DataFrame):
    """Build one descriptive text block per book.

    Reads the columns ``description``, ``publisher``, ``publication_month``,
    ``publication_year``, ``publication_day``, ``num_pages``, ``country_code``,
    ``text_reviews_count``, ``popular_shelves``, ``is_ebook``,
    ``average_rating``, ``similar_books``, ``language_code``, ``book_id``,
    ``title`` and ``title_without_series`` and concatenates them, each behind
    its label, into a single string per row.

    Scalar columns are converted to text cell by cell: missing values become
    an empty string and non-string values are stringified, so a numeric or
    missing cell neither raises nor blanks out the whole row.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Book records; the frame is only read, never modified.

    Returns
    -------
    pandas.Series
        Series named ``'text'`` sharing the index of ``DataFrame``.
    """
    df = DataFrame

    def as_text(column):
        # '' for missing cells, str(...) for everything else.
        return df[column].map(
            lambda value: '' if pd.isna(value) else str(value)
        ).astype(str)

    # Order of the pieces defines the layout of the final text block.
    parts = [
        "Description: " + as_text('description'),
        ";\nPUBLISHER: " + as_text('publisher'),
        ';\nPUBLICATION MONTH: ' + df['publication_month'].map(get_mth),
        ";\nPUBLICATION YEAR: " + as_text('publication_year'),
        ';\nPUBLICATION DAY: ' + as_text('publication_day'),
        ";\nNUMBER OF PAGES: " + as_text('num_pages'),
        ";\nCOUNTRY CODE: " + as_text('country_code'),
        ";\nREVIEW COUNT: " + as_text('text_reviews_count'),
        ';\nSHELVES: ' + df['popular_shelves'].map(get_shelves),
        ";\nIS EBOOK: " + as_text('is_ebook'),
        ";\nAVERAGE RATING: " + as_text('average_rating'),
        ";\nSIMILAR BOOKS: " + df['similar_books'].map(get_smlr_bks),
        ";\nLANGUAGE: " + as_text('language_code'),
        ";\nREVIEWS: {" + df['book_id'].map(get_reviews) + '}',
        ".\nTITLE: " + as_text('title'),
        ".\nTITLE WITHOUT SERIES: " + as_text('title_without_series'),
    ]

    text = parts[0]
    for part in parts[1:]:
        text = text + part
    return text.rename('text')
# Smoke test: build the text block for a single sampled book.
_ = upgrade_desc(smp.iloc[3:4])

In [103]:
# Show the generated text of the single previewed book.
print(_.iloc[0])

Description: Neni chvile odpocinku - boj o svedsky trun Orm a muzi z jeho prisezneho bratrstva ve ctvrtem dile Lowovy vikinske sagy bojuji... Prisezne bratrstvo zbohatlo a ziskalo si vehlas i uctu ve vikinskem svete. Slava ale prinasi take zavist a nepratele. Ormovo sidlo v Hestrengu je opet napadeno a on se musi i se svymi muzi stahnout do hor. Prisezni bratri ale nejsou zvykli vzdavat se. Vrati se, ale brzy si uvedomi, ze utocnici jen zdanlive chteli loupit. Ve hre je totiz mnohem vetsi korist - prave sjednocene Svedsko - a Orm s priseznymi bratry strezi kralovnu Sigrith, ktera ma brzy porodit dedice trunu.;
PUBLISHER: BB art;
PUBLICATION MONTH: Unknown;
PUBLICATION YEAR: 2013;
PUBLICATION DAY: ;
NUMBER OF PAGES: ;
COUNTRY CODE: US;
REVIEW COUNT: 1;
SHELVES: to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-read to-re

In [104]:
# Generate the text block for every sampled book and store it as a new
# 'text' column on smp (assigned in place).
smp['text'] = upgrade_desc(smp)

Save final dataset

In [None]:
# Keep only the columns that are written to the output file.
df = smp.loc[:, ['title', 'text', 'book_id']]

In [None]:
# Write the selected columns as a JSON array with one object per book.
df.to_json('../data/interim/books.json', orient='records')