# EDA of initial data download

In [1]:
from datetime import date
import gzip
import json
from pathlib import Path

import altair as alt
import arrow
import pandas as pd

In [2]:
# Folder containing the raw per-genre Goodreads dumps; the path is relative
# to the directory the notebook is run from.
input_dir = Path(
    "../data/raw/20210527_initial_data_children_biography/"
)

In [3]:
# Genre suffixes; each one selects a `goodreads_interactions_{genre}.json.gz` file.
genres = [
    "children",
    "history_biography",
]

## Plot ratings distribution by genre to define "good" and "bad" ratings

In [4]:
def load_ratings(genres, limit=None):
    """Stream the rated interactions from the per-genre Goodreads dumps.

    For every genre, ``goodreads_interactions_{genre}.json.gz`` is read from
    the module-level ``input_dir``, one JSON object per line.

    Parameters
    ----------
    genres : iterable of str
        Genre suffixes used to build the input file names.
    limit : int, optional
        If given, only the first ``limit`` lines of each genre file are read,
        which is useful for quick experiments. ``None`` (the default) reads
        every line.

    Yields
    ------
    tuple
        ``(user_id, book_id, month, genre, rating)``, where ``month`` is a
        ``datetime.date`` pinned to the first day of the month in which the
        interaction was last updated (``date_updated``).
    """
    for genre in genres:
        input_file = input_dir / f"goodreads_interactions_{genre}.json.gz"
        # Explicit encoding so the result does not depend on the platform locale.
        with gzip.open(input_file, "rt", encoding="utf-8") as fin:
            for i, line in enumerate(fin):
                if limit is not None and i >= limit:
                    break
                record = json.loads(line)
                # Skip interactions whose rating is not positive
                # (presumably 0 marks "no rating given" -- TODO confirm).
                if record["rating"] > 0:
                    updated = arrow.get(record["date_updated"], "ddd MMM DD HH:mm:ss Z YYYY").date()
                    # Truncate to month granularity for the per-month aggregation.
                    first_day_month = date(year=updated.year, month=updated.month, day=1)
                    yield record["user_id"], record["book_id"], first_day_month, genre, record["rating"]

In [5]:
# Materialise the streamed ratings, turn the month into a proper datetime
# column and let pandas pick the best nullable dtypes for the rest.
rating_df = (
    pd.DataFrame(
        load_ratings(genres),
        columns=["user_id", "book_id", "month", "genre", "rating"],
    )
    .assign(month=lambda frame: pd.to_datetime(frame["month"]))
    .convert_dtypes()
)

In [6]:
# Number of ratings for every (genre, rating value) pair, as a flat table.
ratings_per_genre = (
    rating_df.groupby(["genre", "rating"])
    .size()
    .reset_index(name="Count")
    .rename(columns={"genre": "Genre", "rating": "Rating"})
)

In [7]:
# One bar panel per genre: rating value on x, number of ratings on y.
rating_encodings = {"x": "Rating:O", "y": "Count:Q", "column": "Genre:N"}
alt.Chart(ratings_per_genre).mark_bar().encode(**rating_encodings)

### Decision

Labeling ratings $\leq 3$ as negative and $> 3$ as positive might be a first thing to try.

Based on my own experience, there is also a self-justification aspect to be aware of when interpreting ratings: once I have spent the time to read a whole book, rating it less than 3 becomes unlikely even if I did not really like it, because rating it $< 3$ would mean admitting that I wasted a lot of time reading the book in the first place.

## Plot ratings over time to define temporal train/dev/test splits

In [8]:
# Number of ratings for every (genre, month) pair, as a flat table.
nrating_per_genre_month = (
    rating_df.groupby(["genre", "month"])
    .size()
    .reset_index(name="N_ratings")
    .rename(columns={"genre": "Genre", "month": "Month"})
)

In [9]:
# Order chronologically; rebind instead of mutating in place so the cell
# does not silently alter a frame that earlier cells produced.
nrating_per_genre_month = nrating_per_genre_month.sort_values("Month")

In [10]:
# Show the earliest and the latest months of the sorted table.
for preview in (nrating_per_genre_month.head(), nrating_per_genre_month.tail()):
    print(preview)

                 Genre      Month  N_ratings
0             children 2006-08-01          2
134  history_biography 2006-09-01          2
135  history_biography 2006-10-01          1
136  history_biography 2006-11-01          1
1             children 2006-11-01          4
                 Genre      Month  N_ratings
266  history_biography 2017-09-01     141815
267  history_biography 2017-10-01     118018
132           children 2017-10-01      50612
133           children 2017-11-01        889
268  history_biography 2017-11-01       2346


In [11]:
# One bar panel per genre: month on x, number of ratings on y.
monthly_encodings = {"x": "Month:T", "y": "N_ratings:Q", "column": "Genre:N"}
alt.Chart(nrating_per_genre_month).mark_bar().encode(**monthly_encodings)

### Decision

We will use the months July+August 2017 as dev set and September+October 2017 as test set. The training set ranges from August 2006 to June 2017.

The number of ratings given per month is fairly steady from 2012 onwards, with many ratings given each month. There is, however, a drop in the number of ratings in October 2017 and, more significantly, in November 2017, probably because the dataset was generated by collecting Goodreads data over multiple weeks in this period.

## Find threshold for books and users with too few ratings to make a meaningful prediction

In [12]:
# Number of ratings given by each user, as a flat two-column table.
nrating_per_user = (
    rating_df.groupby("user_id")
    .size()
    .reset_index(name="N_ratings")
    .rename(columns={"user_id": "User"})
)

In [13]:
# Most active users first; rebind instead of mutating in place so the cell
# does not silently alter a frame that earlier cells produced.
nrating_per_user = nrating_per_user.sort_values("N_ratings", ascending=False)

In [14]:
# Show the most and the least active users of the sorted table.
for preview in (nrating_per_user.head(), nrating_per_user.tail()):
    print(preview)

                                    User  N_ratings
131092  2fe1c721ba1cc6c30d4c09965bb82f78      10578
541421  c5823767a1a164cd8e9d029f1806f2aa       6955
394554  9003d274774f4c47e62f77600b08ac1d       5377
633702  e74579ee17545a6c4595ab0e43fce8a8       5337
232323  54da946caf0be4f0eedf6728da39bac7       4981
                                    User  N_ratings
232974  551a6df2dc37586bb610aa4408a536b2          1
639428  e9603e55d5b6ef8050ad1c11420a3338          1
639426  e9602ce2025f6a5f94e00ff8a52b41ef          1
232967  5519d9416db7c2fef5485bb2bbb3380b          1
420380  9979738b01ba7628eed487d3c52da57b          1


In [15]:
# Users with fewer than `cutoff` ratings would be dropped; report how many that is.
cutoff = 5
# Vectorised count of the boolean mask (the builtin sum() would iterate
# over the Series element by element in Python).
removed = int((nrating_per_user["N_ratings"] < cutoff).sum())
initial = len(nrating_per_user)
print(f"Cutoff {cutoff} removes {removed} of {initial} ({removed/initial:%}) users.")

Cutoff 5 removes 221020 of 701418 (31.510455%) users.


In [16]:
# Number of ratings received by each book, as a flat two-column table.
nrating_per_book = (
    rating_df.groupby("book_id")
    .size()
    .reset_index(name="N_ratings")
    .rename(columns={"book_id": "Book"})
)

In [17]:
# Most rated books first; rebind instead of mutating in place so the cell
# does not silently alter a frame that earlier cells produced.
nrating_per_book = nrating_per_book.sort_values("N_ratings", ascending=False)

In [18]:
# Show the most and the least rated books of the sorted table.
for preview in (nrating_per_book.head(), nrating_per_book.tail()):
    print(preview)

         Book  N_ratings
230302   2657     191018
319267      5     172207
317552  48855     122493
3447    10210      95682
298404   3636      93057
            Book  N_ratings
268866  31348245          1
268865  31348232          1
268855   3134669          1
268854  31346594          1
209077  25001101          1


In [19]:
# Books with fewer than `cutoff` ratings would be dropped; report how many that is.
cutoff = 5
# Vectorised count of the boolean mask (the builtin sum() would iterate
# over the Series element by element in Python).
removed = int((nrating_per_book["N_ratings"] < cutoff).sum())
initial = len(nrating_per_book)
print(f"Cutoff {cutoff} removes {removed} of {initial} ({removed/initial:%}) books.")

Cutoff 5 removes 184179 of 418154 (44.045734%) books.


### Decision

After playing with cutoffs, a cutoff of 5, i.e. removing books and users with fewer than 5 associated ratings, seems reasonable. This cutoff will still yield a dataset including most books and users but exclude entities that have too few ratings to make meaningful predictions.