# Calculate Basic Statistics of Detailed Reviews

In [1]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
# Render floats in displayed DataFrames with thousands separators (e.g. 1,234.0).
pd.options.display.float_format = '{:,}'.format

**Specify your data directories here:**

In [2]:
# Directory that is joined with the full review file names
# ('goodreads_reviews_dedup.json.gz' and 'goodreads_reviews_spoiler.json.gz').
DIR = './'
# Directory that is joined with the per-genre file names ('goodreads_reviews_<genre>.json.gz').
DIR_GENRE = './genre/'

**Calculate statistics of book reviews (without spoiler tags)**

In [3]:
def count_reviews(file_name):
    """Count reviews, distinct books and distinct users in a gzipped JSON-lines file.

    Every non-blank line is parsed as a JSON object that must provide the keys
    'book_id' and 'user_id'. Blank or whitespace-only lines are ignored instead
    of aborting the scan with a JSONDecodeError. Progress is printed to stdout
    before every 1,000,000th review is counted.

    Parameters
    ----------
    file_name : str or path-like
        Path of the gzip-compressed file holding one JSON object per line.

    Returns
    -------
    tuple of (int, int, int)
        Number of reviews, number of distinct 'book_id' values and number of
        distinct 'user_id' values.
    """
    print('counting file:', file_name)
    n_review = 0
    book_set, user_set = set(), set()
    print('current line: ', end='')
    with gzip.open(file_name) as fin:
        for l in fin:
            # A blank line carries no record; skipping it keeps a stray empty
            # line from crashing a scan over millions of reviews.
            if not l.strip():
                continue
            d = json.loads(l)
            if n_review % 1000000 == 0:
                print(n_review, end=',')
            n_review += 1
            book_set.add(d['book_id'])
            user_set.add(d['user_id'])
    print('complete')
    print('done!')
    return n_review, len(book_set), len(user_set)

In [4]:
# Scan 'goodreads_reviews_dedup.json.gz' and show the three totals as a one-column table.
n_review, n_book, n_user = count_reviews(
    os.path.join(DIR, 'goodreads_reviews_dedup.json.gz'))
df_stats_review = pd.DataFrame(
    {'count': [n_review, n_book, n_user]},
    index=['# review', '# book', '# user'],
    dtype=float,  # floats so the '{:,}' display format configured above applies
)
display(df_stats_review)

counting file: ./goodreads_reviews_dedup.json.gz
current line: 0,1000000,2000000,3000000,4000000,5000000,6000000,7000000,8000000,9000000,10000000,11000000,12000000,13000000,14000000,15000000,complete
done!


Unnamed: 0,count
# review,15739967.0
# book,2080190.0
# user,465323.0


**Calculate statistics of the review subset with spoiler tags**

In [5]:
def count_spoilers(file_name):
    """Count reviews, sentences and spoiler flags in a gzipped JSON-lines file.

    Every non-blank line is parsed as a JSON object that must provide the keys
    'review_sentences', 'has_spoiler', 'book_id' and 'user_id'. Each entry of
    'review_sentences' must unpack into exactly two items; the first one is
    added to the spoiler-sentence total (presumably a 0/1 flag paired with the
    sentence text - confirm against the dataset description). Blank or
    whitespace-only lines are ignored instead of aborting the scan with a
    JSONDecodeError. Progress is printed to stdout before every 1,000,000th
    review is counted.

    Parameters
    ----------
    file_name : str or path-like
        Path of the gzip-compressed file holding one JSON object per line.

    Returns
    -------
    tuple of six ints
        (reviews, sentences, reviews with 'has_spoiler' set, sum of the
        per-sentence flags, distinct 'book_id' values, distinct 'user_id' values).
    """
    print('counting file:', file_name)
    n_review, n_sentence, n_spoiler_review, n_spoiler_sentence = 0, 0, 0, 0
    book_set, user_set = set(), set()
    print('current line: ', end='')
    with gzip.open(file_name) as fin:
        for l in fin:
            # A blank line carries no record; skipping it keeps a stray empty
            # line from crashing the whole scan.
            if not l.strip():
                continue
            d = json.loads(l)
            if n_review % 1000000 == 0:
                print(n_review, end=',')
            n_review += 1
            for sentence_flag, _ in d['review_sentences']:
                n_sentence += 1
                n_spoiler_sentence += sentence_flag
            # int() turns a JSON true/false into 1/0 before accumulating.
            n_spoiler_review += int(d['has_spoiler'])
            book_set.add(d['book_id'])
            user_set.add(d['user_id'])
    print('complete')
    print('done!')
    return n_review, n_sentence, n_spoiler_review, n_spoiler_sentence, len(book_set), len(user_set)

In [6]:
# Scan 'goodreads_reviews_spoiler.json.gz' and show its six totals as a one-column table.
res = count_spoilers(os.path.join(DIR, 'goodreads_reviews_spoiler.json.gz'))
df_stats_spoiler = pd.DataFrame(
    {'count': list(res)},
    index=[
        '# review', '# sentence', '# spoiler review', '# spoiler sentence',
        '# book', '# user',
    ],
    dtype=float,  # floats so the '{:,}' display format configured above applies
)
display(df_stats_spoiler)

counting file: ./goodreads_reviews_spoiler.json.gz
current line: 0,1000000,complete
done!


Unnamed: 0,count
# review,1378033.0
# sentence,17672655.0
# spoiler review,89627.0
# spoiler sentence,569724.0
# book,25475.0
# user,18892.0


**Calculate basic statistics of review files for each genre:**

In [7]:
# Genre names: each one is inserted into 'goodreads_reviews_<genre>.json.gz' to
# locate that genre's review file, and is used as a row label of the per-genre table.
genre_list = ['children', 'comics_graphic', 'fantasy_paranormal', 'history_biography',
              'mystery_thriller_crime', 'poetry', 'romance', 'young_adult']

def count_all_genres(genre_list):
    """Run `count_reviews` on each genre's review file and tabulate the results.

    For every name in `genre_list`, the file
    'goodreads_reviews_<genre>.json.gz' inside DIR_GENRE is scanned.

    Returns a float DataFrame indexed by genre with the columns
    '# review', '# book' and '# user'.
    """
    # Lazy generator: each file is opened only when its counts are requested,
    # so the files are processed one after another in list order.
    genre_paths = (
        os.path.join(DIR_GENRE, 'goodreads_reviews_' + genre + '.json.gz')
        for genre in genre_list
    )
    rows = [
        [reviews, books, users]
        for reviews, books, users in map(count_reviews, genre_paths)
    ]
    return pd.DataFrame(
        rows,
        index=genre_list,
        columns=['# review', '# book', '# user'],
        dtype=float,
    )

In [8]:
# Count reviews, distinct books and distinct users for every genre file, then show the table.
df_stats_by_genre = count_all_genres(genre_list)
display(df_stats_by_genre)

counting file: ./genre/goodreads_reviews_children.json.gz
current line: 0,complete
done!
counting file: ./genre/goodreads_reviews_comics_graphic.json.gz
current line: 0,complete
done!
counting file: ./genre/goodreads_reviews_fantasy_paranormal.json.gz
current line: 0,1000000,2000000,3000000,complete
done!
counting file: ./genre/goodreads_reviews_history_biography.json.gz
current line: 0,1000000,2000000,complete
done!
counting file: ./genre/goodreads_reviews_mystery_thriller_crime.json.gz
current line: 0,1000000,complete
done!
counting file: ./genre/goodreads_reviews_poetry.json.gz
current line: 0,complete
done!
counting file: ./genre/goodreads_reviews_romance.json.gz
current line: 0,1000000,2000000,3000000,complete
done!
counting file: ./genre/goodreads_reviews_young_adult.json.gz
current line: 0,1000000,2000000,complete
done!


Unnamed: 0,# review,# book,# user
children,734640.0,123946.0,92667.0
comics_graphic,542338.0,89311.0,59347.0
fantasy_paranormal,3424641.0,258212.0,256088.0
history_biography,2066193.0,302346.0,238450.0
mystery_thriller_crime,1849236.0,218987.0,203655.0
poetry,154555.0,36412.0,47400.0
romance,3565378.0,334957.0,198141.0
young_adult,2389900.0,93267.0,209152.0
