In [45]:
import json
import csv
import os


# Export the first `max_rows` records of a JSON-lines file to CSV.
# Each non-blank input line is parsed as one JSON object; the CSV header is
# taken from the keys of the first record.
input_path = "../raw_data/goodreads_books.json"
output_path = "../raw_data/goodreads_books_50k.csv"
max_rows = 50000

# Count rows actually written (not lines read) so that blank lines do not
# consume the row budget and the final message reports the true total.
rows_written = 0
fieldnames = None
writer = None

with open(input_path, "r", encoding="utf-8") as fin, \
     open(output_path, "w", encoding="utf-8", newline="") as fout:

    for line in fin:
        if rows_written >= max_rows:
            break

        # Skip blank lines anywhere in the file, including before the first record.
        if not line.strip():
            continue

        obj = json.loads(line)

        if writer is None:
            # The first record defines the column order of the CSV.
            fieldnames = list(obj.keys())
            # NOTE(review): per the csv docs, DictWriter raises ValueError by
            # default if a later record has a key not in `fieldnames`.
            writer = csv.DictWriter(fout, fieldnames=fieldnames)
            writer.writeheader()

        # Nested lists/dicts are written via str(), i.e. as Python literals;
        # later cells parse them back with ast.literal_eval.
        writer.writerow(obj)
        rows_written += 1

        if rows_written % 10000 == 0:
            print(f"finished {rows_written} rows...")

print(f"\nfinished {rows_written} rows → {output_path}")

finished 10000 rows...
finished 20000 rows...
finished 30000 rows...
finished 40000 rows...
finished 50000 rows...

finished 50000 rows → ../raw_data/goodreads_books_50k.csv


In [46]:
import pandas as pd
# Load the CSV sample written by the export cell above into a DataFrame.
df = pd.read_csv("../raw_data/goodreads_books_50k.csv")

In [47]:
# (rows, columns) of the loaded sample.
df.shape

(50000, 29)

In [48]:
# List every column name in the loaded sample.
df.columns.tolist()

['isbn',
 'text_reviews_count',
 'series',
 'country_code',
 'language_code',
 'popular_shelves',
 'asin',
 'is_ebook',
 'average_rating',
 'kindle_asin',
 'similar_books',
 'description',
 'format',
 'link',
 'authors',
 'publisher',
 'num_pages',
 'publication_day',
 'isbn13',
 'publication_month',
 'edition_information',
 'publication_year',
 'url',
 'image_url',
 'book_id',
 'ratings_count',
 'work_id',
 'title',
 'title_without_series']

In [49]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,312853122.0,1.0,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",,False,4.0,,...,9.0,,1984.0,https://www.goodreads.com/book/show/5333265-w-...,https://images.gr-assets.com/books/1310220028m...,5333265,3.0,5400751.0,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film
1,743509986.0,6.0,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",,False,3.23,B000FC0PBC,...,10.0,Abridged,2001.0,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,1333909,10.0,1323437.0,Good Harbor,Good Harbor
2,,7.0,['189911'],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,,...,,Book Club Edition,1987.0,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140.0,8948723.0,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."


In [50]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   isbn                  29140 non-null  object 
 1   text_reviews_count    49991 non-null  float64
 2   series                50000 non-null  object 
 3   country_code          49992 non-null  object 
 4   language_code         27657 non-null  object 
 5   popular_shelves       50000 non-null  object 
 6   asin                  9935 non-null   object 
 7   is_ebook              49992 non-null  object 
 8   average_rating        49991 non-null  float64
 9   kindle_asin           21532 non-null  object 
 10  similar_books         50000 non-null  object 
 11  description           41352 non-null  object 
 12  format                36292 non-null  object 
 13  link                  49991 non-null  object 
 14  authors               50000 non-null  object 
 15  publisher          

In [51]:
# Fraction of missing values per column, sorted from most to least missing.
missing_ratio = (
    df.isna()
    .mean()
    .sort_values(ascending=False)
)
missing_ratio

edition_information     0.90880
asin                    0.80130
kindle_asin             0.56936
language_code           0.44686
publication_day         0.43354
isbn                    0.41720
publication_month       0.37222
isbn13                  0.32780
num_pages               0.32356
publisher               0.27712
format                  0.27416
publication_year        0.25422
description             0.17296
work_id                 0.00018
url                     0.00018
ratings_count           0.00018
link                    0.00018
average_rating          0.00018
text_reviews_count      0.00018
country_code            0.00016
is_ebook                0.00016
image_url               0.00016
similar_books           0.00000
series                  0.00000
popular_shelves         0.00000
authors                 0.00000
book_id                 0.00000
title                   0.00000
title_without_series    0.00000
dtype: float64

A lot of information is missing in "edition_information" (also not helpful), "asin" (Amazon ID, can be deleted), "kindle_asin" (Kindle Amazon ID), "publication_day", "publication_month", "format", "publisher", and "isbn". These columns could be removed.

In [52]:
# "description" is important; about 17% of its values are missing, so replace
# the missing values with an empty string (this modifies df in place).
df["description"] = df["description"].fillna("")


In [53]:
# Top 20 language codes by frequency, counting NaN as its own category.
df["language_code"].value_counts(dropna=False).head(20)

language_code
NaN      22343
eng      15218
en-US     2004
en-GB     1210
spa       1160
ita       1020
ara        855
fre        643
ger        607
ind        581
por        536
nl         375
tur        306
per        247
fin        245
swe        223
gre        215
cze        213
en-CA      189
jpn        160
Name: count, dtype: int64

In [None]:
# Keep only English-language books plus books whose language is unknown;
# every other language is filtered out.
english_codes = ["eng", "en-US", "en-GB", "en-CA", "unknown"]

# Missing language codes are labelled "unknown" so they pass the filter.
df["language_code_clean"] = df["language_code"].fillna("unknown")

# .copy() gives an independent frame, so later column assignments on df_eng
# do not touch df. The raw "language_code" column is still present here.
df_eng = df.loc[df["language_code_clean"].isin(english_codes)].copy()
df_eng.shape

(40964, 29)

In [55]:
# Inspect the raw "authors" values (stringified lists of dicts, per the output).
df_eng["authors"].head(3)

0    [{'author_id': '604031', 'role': ''}]
1    [{'author_id': '626222', 'role': ''}]
2     [{'author_id': '10333', 'role': ''}]
Name: authors, dtype: object

In [56]:
# Parse the stringified "authors", "popular_shelves", and "similar_books" columns into Python objects
import ast
def parse_list_dict(s):
    """Parse a stringified Python list such as "[{'author_id': '1', 'role': ''}]".

    Parameters
    ----------
    s : object
        Cell value, normally a string containing a Python literal.

    Returns
    -------
    list
        The parsed list, or ``[]`` when ``s`` cannot be parsed as a literal
        (malformed text, NaN, None) or does not evaluate to a list.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Catch only what literal_eval raises on bad input; a bare `except`
        # would also swallow KeyboardInterrupt and SystemExit.
        return []
    # Callers iterate over the result, so anything that is not a list
    # (e.g. the literal "5") is treated as unparseable.
    return parsed if isinstance(parsed, list) else []

In [57]:
# Parse the stringified "authors" column, then collect the "author_id" of
# every entry into a flat list per book.
df_eng["authors_parsed"] = df_eng["authors"].apply(parse_list_dict)
df_eng["author_ids"] = df_eng["authors_parsed"].apply(
    lambda records: [record.get("author_id") for record in records]
)

In [58]:
# Parse the stringified "popular_shelves" column, then collect the "name" of
# every shelf entry into a flat list per book.
df_eng["shelves_parsed"] = df_eng["popular_shelves"].apply(parse_list_dict)
df_eng["shelf_names"] = df_eng["shelves_parsed"].apply(
    lambda shelves: [shelf.get("name") for shelf in shelves]
)

In [59]:
def parse_simple_list(s):
    """Parse a stringified flat Python list such as "['123', '456']".

    Parameters
    ----------
    s : object
        Cell value, normally a string containing a Python literal.

    Returns
    -------
    list
        The parsed list, or ``[]`` when ``s`` cannot be parsed as a literal
        (malformed text, NaN, None) or does not evaluate to a list.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Catch only what literal_eval raises on bad input; a bare `except`
        # would also swallow KeyboardInterrupt and SystemExit.
        return []
    # Anything that is not a list is treated as unparseable.
    return parsed if isinstance(parsed, list) else []

# Parse the stringified "similar_books" column into Python lists.
df_eng["similar_books_parsed"] = df_eng["similar_books"].apply(parse_simple_list)

In [65]:
# (rows, columns) of df_eng after the parsed columns were added.
df_eng.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
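# Disabled: assigning the result of drop(..., inplace=True) sets df_eng to None,
# because drop returns None when inplace=True (the cause of the AttributeError here).
#df_eng=df_eng.drop(columns=["authors", "authors_parsed", "popular_shelves", "shelves_parsed", "similar_books"], inplace=True)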

AttributeError: 'NoneType' object has no attribute 'drop'

In [None]:
# Remove columns that are not useful for analysis: sparse or redundant
# metadata, plus raw/intermediate columns whose parsed versions are kept.
cols_to_drop = [
    "edition_information",
    "asin",
    "kindle_asin",
    "publication_day",
    "publication_month",
    "format",
    "publisher",
    # The trailing comma matters: without it Python joins "isbn" and
    # "authors" into the single string "isbnauthors".
    "isbn",  # isbn13 is kept
    "authors",
    "authors_parsed",
    "popular_shelves",
    "shelves_parsed",
    "similar_books",
    "language_code",
]
# drop() without inplace=True returns a new DataFrame and leaves df_eng
# unchanged; with inplace=True it returns None, which made df_new None.
df_new = df_eng.drop(columns=cols_to_drop)

AttributeError: 'NoneType' object has no attribute 'drop'

In [None]:
# Preview the first five rows of df_eng.
df_eng.head(5)

AttributeError: 'NoneType' object has no attribute 'head'