In [2]:
import pandas as pd
import gzip
import json

# Lift the column cap so wide frames are rendered without truncation.
pd.options.display.max_columns = None

# Introduction to the interactions file


The interactions file contains 228_648_343 rows of interactions.  
It contains information on a user's interactions with different books.  
This includes whether a user has read the book, has rated the book, and has reviewed the book or not.

To find the number of lines in a CSV file, run the following command in a terminal:

`wc -l <filename>.csv`

In [3]:
# Relative location of the raw interactions CSV.
interactions_csv = "../raw_data/goodreads_interactions.csv"

In [13]:
# Read only the first 100_000 rows; the full file is far too large to load whole.
# Reuse the `interactions_csv` path instead of repeating the literal so the two cannot drift apart.
interactions = pd.read_csv(interactions_csv, nrows=100_000)

In [14]:
interactions.user_id.nunique()  # Unique users in the 100_000-row sample loaded above

228

In [15]:
interactions.columns    # column names of the loaded interactions sample

Index(['user_id', 'book_id', 'is_read', 'rating', 'is_reviewed'], dtype='object')

In [16]:
# Show 4 randomly chosen rows; no random_state is passed, so the rows differ between runs.
interactions.sample(4)

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
83213,196,3015,1,3,0
26172,48,20018,0,0,0
32935,75,11146,1,4,0
49646,119,31907,0,0,0


# Introduction to the books graph dataset

The books graph dataset has many columns and is therefore more expensive to load: 100k rows took about 15 seconds. To count the rows in the file, run the following command in a terminal:

`gzcat <path/to/file.json.gz> | wc -l`

The dataset has 2_360_655 rows.

In [8]:
def load_data(filename, head=10_000):
    """Read a gzip-compressed JSON Lines file into a list of parsed records.

    Parameters
    ----------
    filename : str or path-like
        Path to a gzip file holding one JSON document per line.
    head : int or None, default 10_000
        Maximum number of lines to read from the start of the file.
        ``None`` reads the whole file.

    Returns
    -------
    list
        The parsed JSON value of each line read, in file order. Contains at
        most ``head`` items (fewer if the file is shorter).
    """
    data = []
    with gzip.open(filename) as f:
        for count, line in enumerate(f):
            # Test the limit before parsing so that exactly `head` records
            # come back, and `head=0` yields an empty list.
            if head is not None and count >= head:
                break
            data.append(json.loads(line))
    return data

In [9]:
# Tabulate the records that load_data returns for head=100_000.
books_graph = pd.DataFrame(
    data=load_data('../raw_data/goodreads_books.json.gz', head=100_000)
)

In [10]:
# Show one randomly chosen value of the 'link' column (no random_state, so it varies between runs).
books_graph['link'].sample(1)

37468    https://www.goodreads.com/book/show/249505.Jul...
Name: link, dtype: object

In [11]:
# Column names of the loaded books data.
books_graph.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [12]:
# Number of distinct values in the 'isbn' column of the loaded rows (nunique skips NaN by default).
books_graph.isbn.nunique()

58621