# Calculate Basic Statistics of the Datasets

In [1]:
import gzip
import json
import numpy as np
import pandas as pd
import os
# Show floats with thousands separators (e.g. 1,234.5) whenever pandas renders them.
pd.options.display.float_format = '{:,}'.format

**Specify your data directories here (`DIR` for the full book graph, `DIR_GENRE` for the per-genre files):**

In [2]:
# Directory containing the full book-graph files (books, works, authors, series).
DIR = './'
# Directory containing the per-genre book and interaction files.
DIR_GENRE = './genre/'

**This function counts the records (one per line) in a gzip-compressed input file**

In [3]:
def count_lines(file_name):
    """Return the number of lines in the gzip-compressed file `file_name`.

    A short message is printed before the scan starts and another once it
    has finished.
    """
    print('counting file:', file_name)
    with gzip.open(file_name) as stream:
        # Only the number of lines matters; their content is never inspected.
        total = sum(1 for _ in stream)
    print('done!')
    return total

**Calculate basic statistics of the book graph**

In [4]:
# Count the records of each book-graph file. The files are scanned in the
# listed order, so the progress messages appear in that order too.
_book_graph_files = ['goodreads_books.json.gz',
                     'goodreads_book_works.json.gz',
                     'goodreads_book_authors.json.gz',
                     'goodreads_book_series.json.gz']
n_book, n_work, n_author, n_series = [
    count_lines(os.path.join(DIR, _file_name)) for _file_name in _book_graph_files
]

counting file: ./goodreads_books.json.gz
done!
counting file: ./goodreads_book_works.json.gz
done!
counting file: ./goodreads_book_authors.json.gz
done!
counting file: ./goodreads_book_series.json.gz
done!


In [5]:
# One row per entity type of the book graph, with a single 'count' column.
# The counts are stored as floats -- presumably so that the float display
# format configured in the import cell applies to them (TODO confirm).
df_book_stats = pd.DataFrame(
    {'count': [n_book, n_work, n_author, n_series]},
    index=['# book', '# work', '# author', '# series'],
    dtype=float,
)
display(df_book_stats)

Unnamed: 0,count
# book,2360655.0
# work,1521962.0
# author,829529.0
# series,400390.0


**This function will extract statistics from interaction files**

In [6]:
def count_interactions(file_name):
    """Tally interaction statistics from a gzip-compressed JSON-lines file.

    Each line is parsed as one JSON object, from which the fields
    'is_read', 'rating', 'review_text_incomplete' and 'user_id' are read.
    A progress marker is printed every 1,000,000 lines.

    Returns a 5-tuple:
        (number of records,
         sum of int(is_read) over all records,
         number of records with rating > 0,
         number of records whose review_text_incomplete is not '',
         number of distinct user_id values)
    """
    print('counting file:', file_name)
    shelved = read = rated = reviewed = 0
    seen_users = set()
    print('current line:', end='')
    with gzip.open(file_name) as stream:
        for line_no, raw in enumerate(stream):
            record = json.loads(raw)
            # The marker shows how many records were counted *before* this one.
            if line_no % 1000000 == 0:
                print(line_no, end=',')
            shelved = line_no + 1
            read += int(record['is_read'])
            if record['rating'] > 0:
                rated += 1
            if record['review_text_incomplete'] != '':
                reviewed += 1
            seen_users.add(record['user_id'])
    print('complete')
    print('done!')
    return shelved, read, rated, reviewed, len(seen_users)

**Now we can calculate basic statistics for each genre:**

In [7]:
# Genre subsets to summarise; each name is spliced into the per-genre file names.
genre_list = [
    'children',
    'comics_graphic',
    'fantasy_paranormal',
    'history_biography',
    'mystery_thriller_crime',
    'poetry',
    'romance',
    'young_adult',
]

def count_all_genres(genre_list):
    """Gather book and interaction counts for every genre into one DataFrame.

    For each genre name, the matching books file is line-counted with
    `count_lines` and the matching interactions file is summarised with
    `count_interactions`; both files are looked up under `DIR_GENRE`.

    Returns a float DataFrame indexed by genre with the columns
    '# book', '# user', '# shelve', '# read', '# rate' and '# review'.
    """
    stat_columns = ['# book', '# user', '# shelve', '# read', '# rate', '# review']
    rows = []
    for genre in genre_list:
        books_path = os.path.join(DIR_GENRE, 'goodreads_books_'+genre+'.json.gz')
        interactions_path = os.path.join(DIR_GENRE, 'goodreads_interactions_'+genre+'.json.gz')
        # Books first, interactions second, so the progress messages keep their order.
        book_total = count_lines(books_path)
        shelved, read, rated, reviewed, user_total = count_interactions(interactions_path)
        # count_interactions reports the user count last; the table wants it second.
        rows.append([book_total, user_total, shelved, read, rated, reviewed])
    return pd.DataFrame(rows, dtype = float, columns = stat_columns, index = genre_list)

In [8]:
# Reads every per-genre books and interactions file in full, so this cell can take a while to run.
df_stats_by_genre = count_all_genres(genre_list)

counting file: ./genre/goodreads_books_children.json.gz
done!
counting file: ./genre/goodreads_interactions_children.json.gz
current line:0,1000000,2000000,3000000,4000000,5000000,6000000,7000000,8000000,9000000,10000000,complete
done!
counting file: ./genre/goodreads_books_comics_graphic.json.gz
done!
counting file: ./genre/goodreads_interactions_comics_graphic.json.gz
current line:0,1000000,2000000,3000000,4000000,5000000,6000000,7000000,complete
done!
counting file: ./genre/goodreads_books_fantasy_paranormal.json.gz
done!
counting file: ./genre/goodreads_interactions_fantasy_paranormal.json.gz
current line:0,1000000,2000000,3000000,4000000,5000000,6000000,7000000,8000000,9000000,10000000,11000000,12000000,13000000,14000000,15000000,16000000,17000000,18000000,19000000,20000000,21000000,22000000,23000000,24000000,25000000,26000000,27000000,28000000,29000000,30000000,31000000,32000000,33000000,34000000,35000000,36000000,37000000,38000000,39000000,40000000,41000000,42000000,43000000,440

In [19]:
# Derive per-book and per-user averages for each interaction type, adding them
# as new columns, then show the table rounded to two decimals with one column
# per genre.
for _t in ['# shelve', '# read', '# rate', '# review']:
    for _unit, _total_col in [('book', '# book'), ('user', '# user')]:
        df_stats_by_genre[_t+'/'+_unit] = df_stats_by_genre[_t]/df_stats_by_genre[_total_col]
display(df_stats_by_genre.round(2).T)

Unnamed: 0,children,comics_graphic,fantasy_paranormal,history_biography,mystery_thriller_crime,poetry,romance,young_adult
# book,124082.0,89411.0,258585.0,302935.0,219235.0,36514.0,335449.0,93398.0
# user,542145.0,342415.0,726932.0,761215.0,676075.0,377799.0,655454.0,644686.0
# shelve,10059349.0,7347630.0,55397550.0,31479229.0,24799896.0,2734350.0,42792856.0,34919254.0
# read,6626989.0,4764133.0,27904041.0,13436575.0,12524984.0,1313610.0,21174642.0,15722749.0
# rate,6384470.0,4514094.0,26193771.0,12379895.0,11715518.0,1229059.0,19701197.0,14731908.0
# review,736682.0,544371.0,3444043.0,2074497.0,1856053.0,155414.0,3585643.0,2405359.0
# shelve/book,81.07,82.18,214.23,103.91,113.12,74.88,127.57,373.88
# shelve/user,18.55,21.46,76.21,41.35,36.68,7.24,65.29,54.16
# read/book,53.41,53.28,107.91,44.35,57.13,35.98,63.12,168.34
# read/user,12.22,13.91,38.39,17.65,18.53,3.48,32.31,24.39
