# EDA of initial data download

In [None]:
from datetime import date
import gzip
import json
from pathlib import Path

import altair as alt
import arrow
import lux
import pandas as pd

In [None]:
# Folder with the raw input files; the path is relative to the notebook's
# working directory.
input_dir = Path(
    "../data/raw/20210527_initial_data_children_biography/"
)

In [None]:
# Genre names; load_ratings substitutes each one into an input file name.
genres = [
    "children",
    "history_biography",
]

## Plot ratings distribution by genre

In [None]:
def load_ratings(genres, max_lines=None):
    """Yield one record per interaction that carries a positive rating.

    For every genre, reads ``goodreads_interactions_{genre}.json.gz`` from
    ``input_dir`` line by line (one JSON object per line) and skips every
    interaction whose ``rating`` is not greater than 0.

    Args:
        genres: Iterable of genre names used to build the input file names.
        max_lines: Optional cap on the number of lines read from *each* genre
            file, useful for quick experiments on a sample. ``None`` (the
            default) reads every file completely.

    Yields:
        Tuples ``(user_id, book_id, month, genre, rating)`` where ``month`` is
        a ``datetime.date`` set to the first day of the month taken from the
        interaction's ``date_updated`` field.
    """
    for genre in genres:
        input_file = input_dir / f"goodreads_interactions_{genre}.json.gz"
        # Pin the encoding: text-mode gzip.open otherwise falls back to the
        # platform's locale encoding, which is not UTF-8 everywhere.
        with gzip.open(input_file, "rt", encoding="utf-8") as fin:
            for line_no, line in enumerate(fin):
                if max_lines is not None and line_no >= max_lines:
                    break
                interaction = json.loads(line)
                if interaction["rating"] > 0:
                    updated = arrow.get(
                        interaction["date_updated"],
                        "ddd MMM DD HH:mm:ss Z YYYY",
                    ).date()
                    # Truncate to the first of the month so that ratings can
                    # be bucketed per month downstream.
                    yield (
                        interaction["user_id"],
                        interaction["book_id"],
                        updated.replace(day=1),
                        genre,
                        interaction["rating"],
                    )

In [None]:
# Materialise the rating stream into a frame, turn the month column into a
# pandas datetime column, and let pandas pick the best dtypes for every column.
rating_df = (
    pd.DataFrame(
        load_ratings(genres),
        columns=["user_id", "book_id", "month", "genre", "rating"],
    )
    .assign(month=lambda frame: pd.to_datetime(frame["month"]))
    .convert_dtypes()
)

In [None]:
# Number of ratings for every (genre, rating value) pair, as a flat table with
# presentation-friendly column names for plotting.
ratings_per_genre = (
    rating_df.groupby(["genre", "rating"])
    .size()
    .reset_index(name="Count")
    .rename(columns={"genre": "Genre", "rating": "Rating"})
)

In [None]:
# One bar-chart facet per genre: rating value (ordinal) on x, count on y.
alt.Chart(ratings_per_genre).mark_bar().encode(
    alt.X("Rating:O"),
    alt.Y("Count:Q"),
    alt.Column("Genre:N"),
)

Labeling ratings $\leq 3$ as negative and $> 3$ as positive might be a first thing to try.

Based on my own experience, there is also a self-justification aspect to be aware of when interpreting ratings: once I have spent the time to read a whole book, I am unlikely to rate it less than 3 even if I did not really like it, because rating it $< 3$ would mean admitting that I wasted a lot of time reading it in the first place.

## Plot ratings over time

In [None]:
# Number of ratings for every (genre, month) pair, as a flat table with
# presentation-friendly column names for plotting.
nrating_per_genre_month = (
    rating_df.groupby(["genre", "month"])
    .size()
    .reset_index(name="N_ratings")
    .rename(columns={"genre": "Genre", "month": "Month"})
)

In [None]:
# Bare expression as the cell's last statement so the notebook renders the
# per-genre, per-month rating counts for a quick visual check.
nrating_per_genre_month

In [None]:
# One bar-chart facet per genre: month (temporal) on x, number of ratings on y.
alt.Chart(nrating_per_genre_month).mark_bar().encode(
    alt.X("Month:T"),
    alt.Y("N_ratings:Q"),
    alt.Column("Genre:N"),
)

## Plot (cumulative) distribution of rating count for books and users 