# Goodreads Book Reviews Analysis - Numerical Data Exploration

## Project Overview
This project aims to analyze **Goodreads book reviews**, focusing on **1-star ratings** to understand patterns in harsh reviews. The analysis is divided into two parts:
1. **Numerical Data Analysis** (Current Stage) - Examining numerical factors such as star ratings, review counts, and genre distributions.
2. **Natural Language Processing (NLP) Analysis** (Next Stage) - Exploring book descriptions and text reviews to identify sentiment patterns.

## Adding dataset with text reviews

In [None]:
import pandas as pd
import json
import gzip

# Convert the gzipped JSON-lines review dump to CSV in fixed-size batches so the
# whole file never has to be held in memory at once.
chunk_size = 10000
chunks = []
# Track the header explicitly: the first write (whichever it is) creates/truncates
# the file and emits the header, every later write appends without one. This keeps
# the header present even when the input has fewer than chunk_size records, and
# stops a re-run of this cell from appending a second copy of the data.
header_written = False

with gzip.open("./Data/goodreads_reviews_dedup.json.gz", "rt", encoding="utf-8") as f:
    for line in f:  # one JSON record per line
        chunks.append(json.loads(line))  # parse the JSON text into a dict

        # flush a full batch to disk
        if len(chunks) == chunk_size:
            df_chunk = pd.DataFrame(chunks)
            df_chunk.to_csv(
                "goodreads_reviews",
                mode="a" if header_written else "w",
                index=False,
                header=not header_written,
            )
            header_written = True
            chunks = []

# flush the final, partially filled batch (if any)
if chunks:
    df_chunk = pd.DataFrame(chunks)
    df_chunk.to_csv(
        "goodreads_reviews",
        mode="a" if header_written else "w",
        index=False,
        header=not header_written,
    )


In [None]:
df_reviews = pd.read_csv("goodreads_reviews")

In [None]:
df_reviews.head()

In [None]:
df_reviews.info()

In [None]:
df_reviews['book_id'].duplicated().any()

In [None]:
import pandas as pd
import json
import gzip

# Convert the gzipped JSON-lines book metadata dump to CSV in fixed-size batches
# so the whole file never has to be held in memory at once.
chunk_size = 10000
chunks = []
# Track the header explicitly: the first write (whichever it is) creates/truncates
# the file and emits the header, every later write appends without one. This keeps
# the header present even when the input has fewer than chunk_size records, and
# stops a re-run of this cell from appending a second copy of the data.
header_written = False

with gzip.open("./Data/goodreads_books.json.gz", "rt", encoding="utf-8") as f:
    for line in f:  # one JSON record per line
        chunks.append(json.loads(line))  # parse the JSON text into a dict

        # flush a full batch to disk
        if len(chunks) == chunk_size:
            df_chunk = pd.DataFrame(chunks)
            df_chunk.to_csv(
                "goodreads_books",
                mode="a" if header_written else "w",
                index=False,
                header=not header_written,
            )
            header_written = True
            chunks = []

# flush the final, partially filled batch (if any)
if chunks:
    df_chunk = pd.DataFrame(chunks)
    df_chunk.to_csv(
        "goodreads_books",
        mode="a" if header_written else "w",
        index=False,
        header=not header_written,
    )

In [None]:
df_books = pd.read_csv("goodreads_books")

In [None]:
df_books.head(10)

In [None]:
df_books.info()

In [None]:
print(df_books.columns)

In [None]:
df_merged = df_reviews.merge(df_books, on="book_id", how="inner")

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_merged.head(10)

In [None]:
print(df_merged.columns)

In [None]:
# Columns from the merged frame that are dropped before further analysis.
# Each label is listed exactly once ('read_at' and 'similar_books' used to be repeated).
unused_columns = [
    'user_id',
    'date_added',
    'read_at',
    'started_at',
    'date_updated',
    'kindle_asin',
    'work_id',
    'n_comments',
    'asin',
    'similar_books',
    'series',
    'publication_month',
    'publication_day',
    'edition_information',
    'is_ebook',
]
df_merged = df_merged.drop(columns=unused_columns)


In [None]:
df_merged.info()

In [None]:
df_merged=df_merged.drop(columns=['format', 'num_pages', 'isbn13', 'link', 'title_without_series'])

In [None]:
df_merged['review_id'].duplicated().any()

In [None]:
(df_merged['text_reviews_count']== 0).any()

In [None]:
df_merged[df_merged['text_reviews_count'] == 0]
#?? maybe outdated text review count

In [None]:
df_merged[df_merged['rating'] == 0]
#reviews that have text but no star rating was left? I am choosing to leave these out of analysis

In [None]:
df_merged= df_merged[df_merged['rating'].notna() & (df_merged['rating'] !=0)]

In [None]:
#for this analysis I will only be focusing on English reviews
#NOTE: this step only removes rows with no text in review_text or description; non-English rows are NOT removed here and still need a language filter. I don't think dropping these rows will hurt because the df is so large
df_merged= df_merged.dropna(subset=['review_text','description'])

In [None]:
df_merged.head()

In [None]:
#cleaning popular shelves column
print(df_merged['popular_shelves'].iloc[0])

In [None]:
#seeing which shelves have the highest counts
import ast
from collections import Counter

#function that extracts shelf names from the string-encoded lists of shelf dictionaries
def shelf_names(shelves_str):
    """Return the shelf names stored in a stringified list of shelf dicts.

    Args:
        shelves_str: text of a Python literal such as
            "[{'count': '3', 'name': 'to-read'}, ...]".

    Returns:
        The 'name' value of every dict entry that has one, in original order.
        An empty list is returned when the value cannot be parsed as a literal
        (e.g. NaN or truncated text) or when it does not evaluate to a list,
        so one bad row cannot abort a long-running counting loop.
    """
    try:
        shelves_list = ast.literal_eval(shelves_str)  # convert the string to a list of dicts
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []

    if not isinstance(shelves_list, list):
        return []

    # skip entries that are not dicts instead of failing on `'name' in <non-dict>`
    return [shelf['name'] for shelf in shelves_list
            if isinstance(shelf, dict) and 'name' in shelf]

shelf_counter = Counter()

In [None]:
#very large operation (takes about 100 minutes to run)
for row in df_merged['popular_shelves'].dropna():
    shelf_counter.update(shelf_names(row))

print(shelf_counter.most_common(30))

In [None]:
import random

unique_shelves = list(shelf_counter.keys())
print(f"unique names: {len(unique_shelves)}")

In [None]:
print(shelf_counter.most_common(1000))

In [None]:
def normalize_shelf(name):
    """Return *name* trimmed, lower-cased, and with each space turned into a hyphen."""
    trimmed = name.strip()
    lowered = trimmed.lower()
    # splitting on a literal space and re-joining replaces every single space
    return "-".join(lowered.split(" "))

In [None]:
#Filtering shelf names

In [None]:
#cleaning the author column
print(df_merged['authors'].iloc[0])

In [None]:
#there is already a language code column but it's not thorough. Try langdetect to fill in the missing values
from langdetect import detect
df_merged['dec']

In [None]:
#checking for final cleaning steps to slim down the dataset further before splitting, then saving to a csv

In [None]:
#split df into manageable chunks for further analysis

In [None]:
# Write one CSV per star rating. Rows with rating 0 (no star rating) were already
# filtered out of df_merged above, so only ratings 1-5 are exported; starting at 0
# would just create a header-only 0star_reviews.csv.
# NOTE: the DataFrame index is written as well, so it comes back as an
# 'Unnamed: 0' column when these files are loaded with pd.read_csv.
for star in range(1, 6):
    df_star = df_merged[df_merged['rating'] == star]
    df_star.to_csv(f"{star}star_reviews.csv")

In [None]:
import zipfile
import os

# Files to bundle and the archive they are written to.
csv_files = ["./Data/1star_reviews.csv"]
zip_path = "./Data/1star_reviews.zip"

# Store each CSV under its bare file name (no directory part) using DEFLATE compression.
with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for csv_path in csv_files:
        archive.write(csv_path, arcname=os.path.basename(csv_path))

# last expression of the cell: displays the archive path in the notebook
zip_path

In [None]:
#assigning them to variables then checking size

df_5star = pd.read_csv("./Data/5star_reviews.csv")
df_5star.info()

In [None]:
df_4star = pd.read_csv("./Data/4star_reviews.csv")
df_4star.info()

In [None]:
df_3star = pd.read_csv("./Data/3star_reviews.csv")
df_3star.info()

In [None]:
df_2star = pd.read_csv("./Data/2star_reviews.csv")
df_2star.info()

In [3]:
import pandas as pd
import json
import gzip
import ast
from collections import Counter

In [None]:
pip install "numpy<2"

In [5]:
df_1star = pd.read_csv("./1star_reviews.csv")
df_1star.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419874 entries, 0 to 419873
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Unnamed: 0          419874 non-null  int64  
 1   book_id             419874 non-null  int64  
 2   review_id           419874 non-null  object 
 3   rating              419874 non-null  int64  
 4   review_text         419874 non-null  object 
 5   n_votes             419874 non-null  int64  
 6   isbn                328665 non-null  object 
 7   text_reviews_count  419874 non-null  float64
 8   country_code        419874 non-null  object 
 9   language_code       340979 non-null  object 
 10  popular_shelves     419874 non-null  object 
 11  average_rating      419874 non-null  float64
 12  description         419874 non-null  object 
 13  authors             419874 non-null  object 
 14  publisher           347484 non-null  object 
 15  publication_year    358879 non-nul

In [7]:
# taking a sample of the smallest rating dataset to test for cleaning
sample_1star= df_1star.sample(10000, random_state=42)

In [None]:
#cleaning popular shelves column
print(sample_1star['popular_shelves'].iloc[0])

In [9]:
#seeing which shelves have the highest counts
#function that extracts shelf names from the string-encoded lists of shelf dictionaries
def shelf_names(shelves_str):
    """Return the shelf names stored in a stringified list of shelf dicts.

    Args:
        shelves_str: text of a Python literal such as
            "[{'count': '3', 'name': 'to-read'}, ...]".

    Returns:
        The 'name' value of every dict entry that has one, in original order.
        An empty list is returned when the value cannot be parsed as a literal
        (e.g. NaN or truncated text) or when it does not evaluate to a list,
        so one bad row cannot abort the counting loop.
    """
    try:
        shelves_list = ast.literal_eval(shelves_str)  # convert the string to a list of dicts
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []

    if not isinstance(shelves_list, list):
        return []

    # skip entries that are not dicts instead of failing on `'name' in <non-dict>`
    return [shelf['name'] for shelf in shelves_list
            if isinstance(shelf, dict) and 'name' in shelf]

shelf_counter = Counter()

In [11]:
#NOTE: the "about 100 minutes" estimate was written for the run over the full df_merged; this loop only covers the 10,000-row sample_1star
for row in sample_1star['popular_shelves'].dropna():
    shelf_counter.update(shelf_names(row))

print(shelf_counter.most_common(60))

[('to-read', 9929), ('currently-reading', 9297), ('owned', 8457), ('fiction', 8311), ('favorites', 8263), ('books-i-own', 7961), ('kindle', 7382), ('ebook', 7213), ('library', 7085), ('owned-books', 6950), ('to-buy', 6593), ('ebooks', 6318), ('wish-list', 5941), ('default', 5726), ('contemporary', 5486), ('my-books', 5381), ('audiobook', 5368), ('adult', 5238), ('romance', 5224), ('audiobooks', 5077), ('i-own', 4903), ('my-library', 4853), ('did-not-finish', 4748), ('dnf', 4737), ('audio', 4647), ('abandoned', 4567), ('favourites', 4412), ('e-book', 4404), ('series', 4208), ('novels', 4206), ('read-in-2015', 4022), ('own-it', 3976), ('books', 3873), ('book-club', 3829), ('fantasy', 3827), ('e-books', 3779), ('read-in-2016', 3764), ('read-in-2014', 3740), ('adult-fiction', 3735), ('maybe', 3717), ('young-adult', 3559), ('read-in-2013', 3381), ('read-in-2017', 3149), ('mystery', 3113), ('have', 3004), ('novel', 2992), ('reviewed', 2966), ('borrowed', 2906), ('ya', 2884), ('audible', 2854

In [None]:
import random

unique_shelves = list(shelf_counter.keys())
print(f"unique names: {len(unique_shelves)}")

In [13]:
import ast

# Substrings that mark a shelf as non-genre (reading status, ownership, opinions,
# formats/promotions, people's names, misc. bookkeeping), grouped by theme.
blacklist = {
    'reading_status': [
        'read', 'currently reading', 'dnf', 'unread',
        'tbr', 'reread', 'finished', 'finish',
    ],
    'ownership': [
        'owned', 'own', 'buy', 'bought', 'borrow', 'library', 'kindle', 'ebook',
        'epub', 'paperback', 'nook', 'hardcover', 'download', 'ibooks', 'kobo',
        'scribd',
    ],
    'rating_review': [
        'star', 'favorite', 'favourite', 'review', 'recommend',
        'amazing', 'must', 'best', 'loved', 'meh',
    ],
    'promotion_format': [
        'audiobook', 'audio', 'netgalley', 'gift', 'challenge', 'award',
        'edition', 'collection', 'release', 'published', 'sequel', 'shelve',
        'scan', 'pdf', 'giveaway',
    ],
    'proper_nouns': [
        'neal', 'stephenson', 'amy', 'kate', 'robert', 'emily',
        'veronica', 'june', 'sophia', 'palmer', 'sarah',
    ],
    'misc': [
        'storage', 'location', 'page', 'purchase', 'bore', 'new', 'hold',
        'mine', 'drop', 'theme', 'funny', 'didnt', 'purchased', 'print',
        'amazon', 'first',
    ],
}

# Flat, lower-cased set of every blacklist entry across all groups
blacklist_set = {
    entry.lower()
    for entries in blacklist.values()
    for entry in entries
}

In [15]:
import ast

def clean_name(name):
    """Lower-case *name*, turn hyphens and underscores into spaces, and trim the ends."""
    lowered = name.lower()
    # one translation pass maps both '-' and '_' to a space
    spaced = lowered.translate(str.maketrans('-_', '  '))
    return spaced.strip()

def extract_shelves(shelves_str):
    """Parse a stringified list of shelf dicts into (clean name, count) pairs.

    Args:
        shelves_str: text of a Python literal such as
            "[{'count': '3', 'name': 'to-read'}, ...]".

    Returns:
        A list of (name, count) tuples, one per dict entry that has a 'name'
        key. The name is passed through clean_name and the count is converted
        with int() (0 when the entry has no 'count'). An empty list is returned
        when the value cannot be parsed or is not a list.
    """
    try:
        shelves_list = ast.literal_eval(shelves_str)
    # Only the errors literal_eval raises for bad input; a bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []

    if not isinstance(shelves_list, list):
        return []

    # skip entries that are not dicts instead of failing on `'name' in <non-dict>`
    return [(clean_name(shelf['name']), int(shelf.get('count', 0)))
            for shelf in shelves_list
            if isinstance(shelf, dict) and 'name' in shelf]

In [19]:
# Maps a cleaned shelf name (lower-case, spaces instead of '-'/'_') to the
# canonical genre label it should be counted under. Names that are not listed
# are left unchanged by the lookups that use this table.
# Each key appears exactly once ('ya' used to be listed twice).
genre_mapping = {
    'cowboys': 'cowboy',
    'chick lit': 'chick lit',
    'adult fiction': 'adult fiction',
    'cowboy western': 'cowboy western',
    'genre western': 'western',
    'romantic suspense': 'romantic suspense',
    'action': 'action',
    'series romance': 'romance',
    'genre romance': 'romance',
    'romance modern': 'modern romance',
    'science fiction': 'science fiction',
    'sci fi': 'science fiction',
    'scifi': 'science fiction',
    'post apocalyptic': 'post apocalyptic',
    'sf': 'science fiction',
    'sci fi fantasy': 'science fiction fantasy',
    'dystopia': 'dystopian',
    'apocalyptic': 'apocalyptic',
    'science': 'science',
    'speculative fiction': 'speculative fiction',
    'fantasy sci fi': 'science fiction fantasy',
    'apocalypse': 'apocalyptic',
    'space opera': 'space opera',
    'science fiction fantasy': 'science fiction fantasy',
    'hard sci fi': 'hard science fiction',
    'sff': 'science fiction fantasy',
    'post apocalypse': 'post apocalyptic',
    'sf fantasy': 'science fiction fantasy',
    'sci fi and fantasy': 'science fiction fantasy',
    'hard scifi': 'hard science fiction',
    'sciencefiction': 'science fiction',
    'regency romance': 'regency romance',
    'romance historical': 'historical romance',
    'mf': 'm f',
    'historical romances': 'historical romance',
    'historicals': 'historical',
    'humorous': 'humor',
    'humour': 'humor',
    'humour comedy': 'humor',
    'young adult': 'young adult',
    'ya': 'young adult',
    'fairies': 'fairies',
    'faeries': 'fairies',
    'faerie': 'fairies',
    'fey': 'fae',
    'ya fantasy': 'young adult fantasy',
    'paranormal romance': 'paranormal romance',
    'historical fantasy': 'historical fantasy',
    'historical fic': 'historical fiction',
    'supernatural': 'supernatural',
    'faries': 'fairies',
    'classic lit': 'classic literature',
    'british lit': 'british literature',
    'brit lit': 'british literature',
    'english lit': 'english literature',
    'lit': 'literature',
    'feminist': 'feminism',
    'ya books': 'young adult books',
    'ya fiction': 'young adult fiction',
    'non fiction': 'nonfiction',
    'non fic': 'nonfiction',
    'memoirs': 'memoir',
    'distopian': 'dystopian',
    'ya dystopian': 'young adult dystopian',
    'ya lit': 'young adult literature',
}

In [21]:
from collections import defaultdict

def map_genres(tag_list, genre_mapping):
    """Collapse (tag, count) pairs onto canonical genres and total their counts.

    Args:
        tag_list: iterable of (tag, count) pairs.
        genre_mapping: dict from lower-case tag to canonical genre; tags that
            are not in the dict keep their lower-cased form.

    Returns:
        List of (genre, total_count) tuples ordered by total count, largest first.
    """
    totals = {}
    for raw_tag, amount in tag_list:
        key = raw_tag.lower()
        canonical = genre_mapping.get(key, key)
        totals[canonical] = totals.get(canonical, 0) + amount

    # highest totals first
    return sorted(totals.items(), key=lambda pair: pair[1], reverse=True)

In [31]:
def apply_cleaning_pipeline(shelves_str, genre_mapping, blacklist_words):
    """Turn one raw popular_shelves string into aggregated, filtered genre counts.

    Steps: parse the shelves with extract_shelves, normalise each name with
    clean_name, map it through genre_mapping, drop blacklisted tags, then sum
    the counts of tags that ended up with the same name.

    Args:
        shelves_str: stringified list of shelf dicts.
        genre_mapping: dict from cleaned shelf name to canonical genre; names
            that are not in the dict are kept as they are.
        blacklist_words: iterable of strings. A tag is dropped when it
            CONTAINS any of them as a substring (so 'own' also removes
            'owned books' and would remove 'town').

    Returns:
        List of (tag, total_count) tuples ordered by total count, largest first.
    """
    from collections import defaultdict

    # Materialise once so a one-shot iterable (e.g. a generator) is not
    # exhausted after the first tag has been checked.
    banned = tuple(blacklist_words)

    totals = defaultdict(int)
    for raw_tag, count in extract_shelves(shelves_str):
        tag = clean_name(raw_tag)
        tag = genre_mapping.get(tag, tag)

        # skip the shelf if the mapped tag contains ANY blacklist word
        if any(bad_word in tag for bad_word in banned):
            continue

        totals[tag] += count

    return sorted(totals.items(), key=lambda item: item[1], reverse=True)

In [33]:
# Pass the flattened blacklist_set, not the grouped `blacklist` dict: iterating
# the dict yields its category names (e.g. 'misc', 'ownership') rather than the
# banned words, so the blacklist filter would not remove the intended shelves.
sample_1star['cleaned_shelves'] = sample_1star['popular_shelves'].apply(
    apply_cleaning_pipeline,
    args=(genre_mapping, blacklist_set))

UnboundLocalError: cannot access local variable 'filtered' where it is not associated with a value

In [None]:
sample_1star[['cleaned_shelves']].head(20)

In [None]:
#attempting to clean the html first
