# Data Exploration

## 0 Global Settings

In [1]:
import gzip
import itertools
import json
import os
import re
import sys
import zlib
from typing import List, Dict

import numpy as np
import pandas as pd

# Record the interpreter and library versions this notebook was executed with,
# one "<name> version: <value>" line per component, in a single write.
print(
    "\n".join(
        f"{component} version: {number}"
        for component, number in (
            ("System", sys.version),
            ("Pandas", pd.__version__),
            ("Numpy", np.__version__),
            ("json", json.__version__),
            ("re", re.__version__),
        )
    )
)
print("Setup Complete")

System version: 3.11.5 (tags/v3.11.5:cce6ba9, Aug 24 2023, 14:38:34) [MSC v.1936 64 bit (AMD64)]
Pandas version: 2.2.2
Numpy version: 1.26.4
json version: 2.0.9
re version: 2.2.1
Setup Complete


## 1 Display Sample Records

### 1.1 Specify Dataset Directory

In [2]:
# Directory holding the raw gzipped JSON datasets; a relative path, so it is
# resolved against the notebook's working directory. The loading cell joins
# each dataset file name onto it.
path = '../data/raw/'

### 1.2 Load Datasets

In [3]:
def load_sample_data(file_name: str, head: int = 500) -> List[Dict]:
    """
    Load a sample of records from a gzipped, line-delimited JSON file.

    The file is opened as UTF-8 text and each of the first ``head`` lines is
    parsed as one JSON document. Problems are reported with ``print`` and an
    empty list is returned instead of raising, so the notebook can keep
    running when a dataset is missing or damaged.

    Args:
        file_name (str): The path to the gzipped JSON file.
        head (int, optional): The maximum number of lines to load. Defaults to 500.

    Returns:
        list: The parsed records in file order, or ``[]`` if the file is
        missing, is not a readable gzip archive, or contains a line that
        cannot be decoded.
    """
    # Besides bad JSON / bad UTF-8, a damaged archive surfaces as one of three
    # gzip-level errors: BadGzipFile (not gzip / failed integrity check),
    # EOFError (truncated stream) or zlib.error (corrupt compressed data).
    undecodable = (
        json.JSONDecodeError,
        UnicodeDecodeError,
        gzip.BadGzipFile,
        EOFError,
        zlib.error,
    )
    try:
        with gzip.open(file_name, 'rt', encoding='utf-8') as f:
            # islice stops after `head` lines, so the rest of the file is never read.
            data = [json.loads(line) for line in itertools.islice(f, head)]
    except FileNotFoundError:
        # Must precede the tuple above: BadGzipFile and FileNotFoundError are
        # both OSError subclasses, and a missing file deserves its own message.
        print(f"Error: {file_name} not found.")
        return []
    except undecodable:
        print(f"Error: Failed to decode {file_name}.")
        return []
    return data


In [4]:
# Load a sample of each dataset (up to load_sample_data's default `head`
# lines per file). The archives are read in the order listed, and the
# results are unpacked into one variable per dataset.
books, interactions, genres, authors = (
    load_sample_data(os.path.join(path, archive))
    for archive in (
        'goodreads_books_children.json.gz',
        'goodreads_interactions_children.json.gz',
        'goodreads_book_genres_initial.json.gz',
        'goodreads_book_authors.json.gz',
    )
)


### 1.3 Display Sample Records of Books/Interactions/Genres/Authors

In [5]:
# Fixed seed so that Restart & Run All shows the same sample records every time.
sample_rng = np.random.default_rng(42)


def show_sample_record(label: str, records: List[Dict]) -> None:
    """
    Print a header for one dataset and display one randomly chosen record.

    Args:
        label (str): Dataset name shown in the header line.
        records (list): Loaded records to sample from; may be empty.
    """
    print(f' == sample record ({label}) ==')
    if not records:
        # load_sample_data returns [] when a file could not be read; sampling
        # from an empty list would raise ValueError and abort the cell.
        print(f'(no {label} records loaded)')
        return
    # Draw an index and look the record up in the list directly, rather than
    # handing the whole list of dicts to NumPy to sample from.
    display(records[sample_rng.integers(len(records))])


show_sample_record('books', books)
show_sample_record('interactions', interactions)
show_sample_record('genres', genres)
show_sample_record('authors', authors)


 == sample record (books) ==


{'isbn': '0062333917',
 'text_reviews_count': '62',
 'series': [],
 'country_code': 'US',
 'language_code': 'eng',
 'popular_shelves': [{'count': '1124', 'name': 'to-read'},
  {'count': '50', 'name': 'classics'},
  {'count': '43', 'name': 'currently-reading'},
  {'count': '33', 'name': 'fairy-tales'},
  {'count': '24', 'name': 'fantasy'},
  {'count': '21', 'name': 'fiction'},
  {'count': '21', 'name': 'picture-books'},
  {'count': '17', 'name': 'books-i-own'},
  {'count': '16', 'name': 'children'},
  {'count': '16', 'name': 'childhood'},
  {'count': '15', 'name': 'childrens'},
  {'count': '13', 'name': 'المكتبة-الخضراء'},
  {'count': '13', 'name': 'fairy-tale'},
  {'count': '11', 'name': 'romance'},
  {'count': '10', 'name': 'to-buy'},
  {'count': '9', 'name': 'fairytales'},
  {'count': '8', 'name': 'owned'},
  {'count': '8', 'name': 'graphic-novels'},
  {'count': '8', 'name': 'wish-list'},
  {'count': '8', 'name': 'art'},
  {'count': '7', 'name': 'short-stories'},
  {'count': '7', 'na

 == sample record (interactions) ==


{'user_id': '4b3636a043e5c99fa27ac897ccfa1151',
 'book_id': '5',
 'review_id': '55151a7045624b118cef7792ecb04857',
 'is_read': True,
 'rating': 5,
 'review_text_incomplete': '',
 'date_added': 'Tue Jun 14 15:29:54 -0700 2011',
 'date_updated': 'Tue Jun 14 15:30:02 -0700 2011',
 'read_at': '',
 'started_at': ''}

 == sample record (genres) ==


{'book_id': '6375329',
 'genres': {'fiction': 2,
  'history, historical fiction, biography': 2,
  'romance': 1}}

 == sample record (authors) ==


{'average_rating': '3.88',
 'author_id': '41194',
 'text_reviews_count': '1251',
 'name': 'Judith Tarr',
 'ratings_count': '29656'}