# Goodreads Book Reviews Analysis - Sample 10,000

## Purpose:
This notebook contains the core thematic analysis pipeline applied to a 10,000-record sample of 1-star reviews from the UCSD Goodreads dataset. It was developed to enable scalable natural language processing and topic extraction on a manageable sample before the pipeline is applied to the full dataset.

## Focus:
- Clean and normalize review text
- Apply rule-based theme tagging with keyword dictionaries
- Use Non-negative Matrix Factorization (NMF) to extract themes from uncategorized reviews
- Re-assign updated themes and simplify them into primary categories (`top_theme`)
- Generate visualizations to explore patterns in dissatisfaction

## Outcome:
This notebook is the analytical heart of the project and showcases how user complaints can be interpreted, structured, and visualized using scalable NLP workflows.

## Adding dataset with text reviews

In [None]:
import pandas as pd
import json
import gzip
import ast
from collections import Counter

In [None]:
pip install "numpy<2"

In [None]:
import os 

In [None]:
df_1star = pd.read_csv("../Data/1star_reviews.csv")
df_1star.info()

In [None]:
# taking a sample of the smallest rating dataset to test for cleaning
sample_1star= df_1star.sample(10000, random_state=42)

In [None]:
sample_1star.head(40)

In [None]:
# Step 1: Keep rows whose language code is missing or looks English (en, eng, en-US, ...).
# Rows with a missing code are kept on purpose; their language is decided from the text itself.
# .copy() makes the result an independent frame, so columns added to it later
# do not trigger pandas' SettingWithCopyWarning.
language_code = sample_1star['language_code']
has_english_code = language_code.str.lower().str.contains(r'\ben\b|\beng\b|en-', na=False)
sample_1star = sample_1star[language_code.isna() | has_english_code].copy()

In [None]:
sample_1star['language_code'].value_counts(dropna=False)

In [None]:
!pip install langdetect

In [None]:
from langdetect import detect, DetectorFactory
import langdetect

# langdetect is non-deterministic by default; fixing the seed before the first
# detection makes the English filter below reproducible across runs.
DetectorFactory.seed = 0

def detect_language(text):
    """Return langdetect's language code for `text`, or "unknown" if none can be detected.

    Non-string values (e.g. NaN for a missing review) and blank strings are
    reported as "unknown" instead of being handed to langdetect.
    """
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        return "unknown"

# Add 'lang' column to sample_1star
sample_1star['lang'] = sample_1star['review_text'].apply(detect_language)

# Keep only reviews detected as English; the filtered frame replaces sample_1star
sample_1star = sample_1star[sample_1star['lang'] == 'en'].reset_index(drop=True)

In [None]:
import re

def clean_text(text):
    """Normalize a raw review/description string for keyword matching and vectorization.

    Steps: lowercase, replace HTML tags and URLs with a space, drop every
    character other than a-z, 0-9, whitespace and . , ! ? ' " -, then collapse
    whitespace runs and trim.

    Args:
        text: Raw text. Missing values (None/NaN) are accepted.

    Returns:
        The cleaned string; "" for missing values.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()
    # Tags and URLs become a space (not "") so words separated only by markup,
    # e.g. "end.<br/>Next", are not glued together.
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^a-z0-9\s.,!?\'"-]', '', text)  # Remove special characters except common punctuation
    # Whitespace is normalized last because the removals above can leave runs of spaces behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# Add a normalized copy of the review text and of the book description
for raw_col, clean_col in [('review_text', 'review_clean'), ('description', 'description_clean')]:
    sample_1star[clean_col] = sample_1star[raw_col].apply(clean_text)

In [None]:
sample_1star.sample(40)

In [None]:
sample_1star.columns

In [None]:
pip install textblob

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob

In [None]:
pip install wordcloud

# A look into review text with and without NLP

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Concatenate every cleaned review into a single corpus string
all_reviews = " ".join(sample_1star['review_clean'].dropna())

# wordcloud's built-in stopwords plus filler words that dominate book reviews
custom_stopwords = set(STOPWORDS) | {
    'book', 'books', 'read', 'reading', 'readers', 'review', 'page',
    'one', 'really', 'even', 'get', 'know', 'make', 'thing', 'think', 'way',
    'time', 'would', 'could', 'like', 'well', 'just',
    'say', 'said', 'thought', 'felt', 'want', 'back', 'see', 'go', 'going',
    'take', 'something', 'much', 'still', 'good', 'bad', 'end', 'start',
    'main', 'people', 'done', 'lot', 'actually', 'put', 'will', 'first', 'use',
    'maybe', 'find', 'try', 'trying',
}

# Build the cloud from the 200 most frequent remaining words
wordcloud = WordCloud(
    width=1000,
    height=600,
    background_color='white',
    stopwords=custom_stopwords,
    max_words=200,
).generate(all_reviews)

# Render the cloud as an image without axes
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Most Frequent Words in 1-Star Goodreads Reviews", fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count 3- and 4-word phrases, keeping the 100 most frequent (English stop words removed)
vectorizer = CountVectorizer(ngram_range=(3, 4), stop_words='english', max_features=100)
X = vectorizer.fit_transform(sample_1star['review_clean'].dropna())

# Total occurrences of each phrase across all reviews, most frequent first
sum_words = X.sum(axis=0)
phrases_freq = sorted(
    ((phrase, sum_words[0, idx]) for phrase, idx in vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
ngram_df = pd.DataFrame(phrases_freq, columns=['Phrase', 'Frequency'])

# Horizontal bars for the 20 most frequent phrases; reversed so the largest is drawn at the top
top20 = ngram_df.head(20).iloc[::-1]
plt.figure(figsize=(12, 6))
plt.barh(top20['Phrase'], top20['Frequency'], color='darkred')
plt.title('Most Frequent 3-4 Word Phrases in 1-Star Reviews')
plt.xlabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count 4- and 5-word phrases, keeping the 100 most frequent (English stop words removed)
vectorizer2 = CountVectorizer(ngram_range=(4, 5), stop_words='english', max_features=100)
X_2 = vectorizer2.fit_transform(sample_1star['review_clean'].dropna())

# Sum the frequencies.
# The lookup must use sum_words2 (this matrix's column totals): the indices in
# vectorizer2.vocabulary_ refer to columns of X_2, not to the 3-4 gram matrix
# that sum_words was computed from.
sum_words2 = X_2.sum(axis=0)
phrases_freq2 = [(phrase, sum_words2[0, idx]) for phrase, idx in vectorizer2.vocabulary_.items()]
phrases_freq2 = sorted(phrases_freq2, key=lambda x: x[1], reverse=True)

# Convert to DataFrame
ngram2_df = pd.DataFrame(phrases_freq2, columns=['Phrase', 'Frequency'])

# Display top results (top 20, reversed so the most frequent phrase is drawn at the top)
plt.figure(figsize=(12,6))
plt.barh(ngram2_df['Phrase'][:20][::-1], ngram2_df['Frequency'][:20][::-1], color='darkred')
plt.title('Most Frequent 4-5 Word Phrases in 1-Star Reviews')
plt.xlabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
ngram2_df.head(20)

In [None]:
# Theme mapping dictionary: theme → list of indicative phrases/keywords
# Each keyword is matched as a plain substring of the cleaned review text (see assign_themes),
# so short keywords such as 'plot' or 'slow' also match inside longer words.
# Several phrases appear under more than one theme family (e.g. 'feel like' / 'felt like'),
# so one review can be tagged with multiple themes.
# NOTE(review): some entries look like they can never match text produced by clean_text:
#   - clean_text strips '[' / ']' and non-ASCII letters ('too much [genre element]',
#     'this isn\'t [genre]', 'clichés');
#   - clean_text keeps apostrophes, so apostrophe-less forms such as 'didn make sense' or
#     'couldn finish' do not match "didn't make sense" / "couldn't finish".
# Confirm and prune or rewrite these.
complaint_themes = {
    'Character Issues': [
        'main character', 'character development', 'year old', 'immature', 'annoying', 'love interest',
        'unlikable', 'flat character', 'cardboard cutout', 'inconsistent character',
        'poorly developed', 'shallow', 'unlikeable', 'irritating', 'stupid character',
        'weak character', 'flawed character', 'sympathy for', 'no connection with',
        'motivations unclear', 'acted out of character', 'forced relationship',
        'insta-love', 'toxic relationship', 'protagonist', 'antagonist',
        'side character', 'underdeveloped characters', 'one-dimensional',
        'contrived motivations', 'unbelievable actions', 'lack of depth',
        'superficial', 'self-absorbed', 'whiny', 'passive', 'aggressive',
        'jerk', 'bitch', 'mary sue', 'gary stu', 'author insert',
        'wish fulfillment character', 'unearned development',
        'rushed character arc', 'stagnant character', 'rely on stereotypes'
    ],
    'Plot/Structure': [
        'storyline', 'plot', 'spoiler alert', 'twist', 'didn make sense', 'nothing happened',
        'predictable', 'rushed ending', 'slow burn', 'pacing issues', 'convoluted plot',
        'plot holes', 'deus ex machina', 'uneven pacing', 'anticlimactic', 'pointless subplot',
        'disjointed', 'rambling', 'contrived', 'repetitive plot points', 'no resolution',
        'weak plot', 'thin plot', 'overly complicated', 'underdeveloped plot',
        'too much exposition', 'info dump', 'lack of focus', 'meandering',
        'circular narrative', 'incoherent', 'illogical', 'unrealistic plot',
        'convenient coincidences', 'forced conflict', 'lack of stakes',
        'unnecessary scenes', 'padding', 'false climax', 'unsatisfying ending',
        'cliffhanger with no payoff', 'sequel baiting', 'felt incomplete'
    ],
    'Writing Style': [
        'writing style', 'bad writing', 'purple prose', 'repetitive', 'boring', 'poorly written',
        'clunky prose', 'awkward phrasing', 'stilted dialogue', 'unnatural dialogue',
        'telling not showing', 'over descriptive', 'underdeveloped', 'simplistic writing',
        'pretentious writing', 'amateurish', 'grammatical errors', 'typos', 'editing issues',
        'poor sentence structure', 'weak vocabulary', 'monotonous', 'flow issues',
        'choppy', 'dense writing', 'impenetrable', 'overuse of adjectives',
        'clichés', 'hackneyed', 'trite', 'melodramatic', 'overwrought',
        'infodumping through dialogue', 'dialogue felt forced',
        'internal monologue overuse', 'head hopping', 'inconsistent tense'
    ],
    'Engagement': [
        'couldn finish', 'waste time', 'feel like', 'just didn', 'didn like', 'slow', 'dragged',
        'lost interest', 'tedious', 'struggled to get through', 'hard to follow',
        'unengaging', 'dull', 'plodding', 'sleep-inducing', 'wish i hadn read',
        'skimming', 'couldn\'t connect', 'no emotional impact', 'didn care about',
        'wanted it to end', 'a chore to read', 'painful to read', 'eyes glazed over',
        'mind wandered', 'checked page count constantly', 'felt like a slog',
        'momentum stalled', 'pacing was off', 'never invested', 'no suspense',
        'lacked excitement', 'failed to captivate'
    ],
    'Expectations vs Reality': [
        'like book', 'expected', 'thought would', 'overhyped',
        'not what i expected', 'misleading description', 'different from summary',
        'disappointed', 'underwhelming', 'fell flat', 'not as good as', 'wasted potential',
        'false advertising', 'bait and switch', 'promised more than delivered',
        'didn live up to the hype', 'genre wasn\'t what i thought',
        'cover was misleading', 'title was misleading', 'blurb was inaccurate',
        'reviews were misleading', 'fanbase is delusional'
    ],
    'Offensive Content': [
        'offensive', 'problematic', 'sexist', 'racist', 'abuse', 'trigger',
        'misogynistic', 'homophobic', 'ableist', 'culturally insensitive',
        'gory', 'disturbing', 'gratuitous violence', 'sexual assault',
        'animal cruelty', 'hate speech', 'stereotypes', 'colorism', 'fatphobia',
        'victim blaming', 'glorification of violence', 'romanticizing abuse',
        'toxic masculinity', 'white savior trope', 'bury your gays trope',
        'fridging', 'rape as plot device', 'unnecessary graphic detail'
    ],
    'Genre Mismatch': [
        'not romance', 'not fantasy', 'genre', 'more thriller than',
        'felt like', 'marketed as', 'supposed to be', 'wrong genre',
        'elements of', 'blended genres poorly', 'not enough',
        'too much [genre element]', 'this isn\'t [genre]', 'where\'s the',
        'misleading genre tag', 'didn fit the category', 'cross-genre failure',
        'felt like a different genre entirely', 'no present'
    ]
}

In [None]:
def assign_themes(review_text, theme_map):
    """Return the themes whose keywords occur in `review_text`.

    Args:
        review_text: Review text; lower-cased before matching. Anything that is
            not a string (e.g. None or NaN for a missing review) matches nothing.
        theme_map: Mapping of theme name -> iterable of keywords/phrases.

    Returns:
        List of matching theme names in `theme_map` order, or ['Uncategorized']
        when no theme matches.

    Matching is a plain substring test, so a keyword also matches inside a
    longer word (e.g. 'plot' matches 'plots').
    """
    if not isinstance(review_text, str):
        return ['Uncategorized']

    review_text = review_text.lower()
    matched_themes = [
        theme
        for theme, keywords in theme_map.items()
        if any(kw in review_text for kw in keywords)
    ]
    return matched_themes or ['Uncategorized']

In [None]:
# Tag every cleaned review with the rule-based complaint themes it matches
sample_1star['complaint_themes'] = [
    assign_themes(review, complaint_themes)
    for review in sample_1star['review_clean']
]

In [None]:
from collections import Counter
from itertools import chain

# Tally how often each theme was assigned (a review with several themes counts once per theme)
theme_counts = Counter()
for review_themes in sample_1star['complaint_themes']:
    theme_counts.update(review_themes)
print(theme_counts.most_common())

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import STOPWORDS
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk.stem import WordNetLemmatizer

# Reviews that matched no rule-based theme; NMF is used to surface what they talk about.
# Rows without review text are excluded from the frame itself (rather than only from the
# list of texts) so that the frame and the list stay row-aligned.
is_uncategorized = sample_1star['complaint_themes'].apply(lambda themes: 'Uncategorized' in themes)
uncategorized_df = sample_1star[is_uncategorized & sample_1star['review_clean'].notna()].copy()
uncategorized_reviews = uncategorized_df['review_clean'].tolist()

# 1. Define Stopwords (NLTK + wordcloud + project-specific filler words)
# Requires the NLTK 'stopwords', 'punkt' and 'wordnet' resources to be available locally.
nltk_stopwords = set(stopwords.words('english'))
wordcloud_stopwords = set(STOPWORDS)
custom_stopwords = set([
    'book', 'read', 'one', 'really', 'even', 'get', 'know', 'make', 'thing',
    'think', 'way', 'page', 'time', 'would', 'could', 'like', 'well', 'just', 'books',
    'say', 'thought', 'felt', 'want', 'back', 'reading', 'see', 'go', 'going',
    'take', 'something', 'much', 'still', 'good', 'bad', 'end', 'start',
    'main', 'people', 'done', 'felt', 'lot', 'actually', 'put', 'will', 'first', 'use',
    'maybe', 'find', 'say', 'said', 'try', 'trying', 'readers', 'review'
])
combined_stopwords_set = nltk_stopwords.union(wordcloud_stopwords).union(custom_stopwords)

# 2. Stopword Removal and Lemmatization for Uncategorized Reviews
# Stop words are removed BEFORE lemmatizing as well as after: the default (noun)
# lemmatization turns "was" into "wa", which is not in any stop list and would
# otherwise surface as a top topic term. The second pass catches lemmas that are
# themselves stop words (e.g. "books" -> "book").
lemmatizer = WordNetLemmatizer()
processed_uncategorized_reviews = []
for review in uncategorized_reviews:
    tokens = [token for token in word_tokenize(review.lower()) if token not in combined_stopwords_set]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    processed_tokens = [token for token in lemmatized_tokens if token not in combined_stopwords_set]
    processed_uncategorized_reviews.append(" ".join(processed_tokens))

# 3. Feature Extraction with TF-IDF for Uncategorized Reviews
# (terms must appear in at least 2 reviews and in at most 90% of them)
tfidf_vectorizer_uncat = TfidfVectorizer(stop_words=sorted(combined_stopwords_set),
                                        ngram_range=(1, 3),
                                        max_df=0.90,
                                        min_df=2)
tfidf_matrix_uncat = tfidf_vectorizer_uncat.fit_transform(processed_uncategorized_reviews)
feature_names_uncat = tfidf_vectorizer_uncat.get_feature_names_out()

# 4. Train the NMF Model for Uncategorized Reviews
num_topics_uncat = 7  # You can experiment with the number of topics for the uncategorized data
nmf_model_uncat = NMF(n_components=num_topics_uncat, random_state=42, max_iter=300)
nmf_model_uncat.fit(tfidf_matrix_uncat)

# 5. Analyze the Topics for Uncategorized Reviews (20 highest-weighted terms per topic)
print("\nNMF Topics for Uncategorized Reviews:")
for topic_idx, topic in enumerate(nmf_model_uncat.components_):
    top_words_indices = topic.argsort()[:-21:-1]
    top_words = [feature_names_uncat[i] for i in top_words_indices]
    print(f"Uncategorized Topic {topic_idx + 1}: {' '.join(top_words)}")

# 6. Get Topic Assignments for Uncategorized Reviews (1-based index of the strongest topic)
doc_topic_matrix_uncat = nmf_model_uncat.transform(tfidf_matrix_uncat)
dominant_topics_uncat = (doc_topic_matrix_uncat.argmax(axis=1) + 1).tolist()

# 7. Add NMF Topic Assignments to the Uncategorized DataFrame
# One topic per row, in row order (guaranteed by the filtering at the top of the cell).
uncategorized_df['nmf_topic_uncat'] = dominant_topics_uncat

# Write the topic labels back by index alignment; reviews that were categorized get NaN.
# Unlike pd.merge on review_id this keeps the cell re-runnable (no nmf_topic_uncat_x/_y
# columns on a second run) and cannot duplicate rows if a review_id is repeated.
sample_1star['nmf_topic_uncat'] = uncategorized_df['nmf_topic_uncat']

print("\nDataFrame with NMF Topic Labels for Uncategorized (First 10):")
print(sample_1star[['review_clean', 'complaint_themes', 'nmf_topic_uncat']].head(10))

In [None]:
# Updated theme dictionary: the base complaint_themes keywords plus terms surfaced by the
# NMF topics of previously uncategorized reviews. It is derived from complaint_themes so
# the two dictionaries cannot drift apart (the base lists were previously duplicated by hand).
nmf_keyword_additions = {
    'Character Issues': ['stupid', 'hate'],  # Added from Strong Negative Sentiment
    'Plot/Structure': ['story'],             # Added from Uncategorized Topic 1 and 4
    'Writing Style': ['written'],            # Added from Uncategorized Topic 4
    'Engagement': [                          # Added from NMF Topics
        'couldn\'t continue', 'gave up', 'stopped reading', 'dnf', 'did not finish',
        'bored', 'finish', 'disappointment'
        # 'wa' is deliberately NOT included: it is the lemmatizer's output for "was",
        # and as a substring keyword it matches "was", "want", "way", ... - i.e. it
        # would tag nearly every review as an Engagement complaint.
    ],
    'Expectations vs Reality': [             # Added from NMF Topics
        'waste of time', 'waste of money', 'didn\'t feel worth it', 'awful', 'terrible'
    ],
}

# Same themes, same order as complaint_themes; themes without additions are copied unchanged.
complaint_themes_updated = {
    theme: list(keywords) + nmf_keyword_additions.get(theme, [])
    for theme, keywords in complaint_themes.items()
}

In [None]:
def assign_themes_updated(review_text, theme_map):
    """Return the names of all themes in `theme_map` that have a keyword in `review_text`.

    Behaves exactly like assign_themes; it is kept as a separate name for the
    second tagging pass with the updated keyword dictionary.

    Args:
        review_text: Review text; lower-cased before matching. Non-string values
            (e.g. None or NaN for a missing review) are treated as matching nothing.
        theme_map: Mapping of theme name -> iterable of keywords/phrases.

    Returns:
        Matching theme names in `theme_map` order; ['Uncategorized'] if none match.
        Keywords are matched as plain substrings of the lower-cased text.
    """
    if not isinstance(review_text, str):
        return ['Uncategorized']

    lowered = review_text.lower()
    matched_themes = []
    for theme, keywords in theme_map.items():
        for kw in keywords:
            if kw in lowered:
                matched_themes.append(theme)
                break  # one hit is enough to assign the theme

    if not matched_themes:
        return ['Uncategorized']
    return matched_themes

In [None]:
# Second tagging pass: re-tag every review with the updated keyword dictionary
sample_1star['complaint_themes_updated'] = [
    assign_themes_updated(review, complaint_themes_updated)
    for review in sample_1star['review_clean']
]

from collections import Counter
from itertools import chain

# Tally the updated themes (a review with several themes counts once per theme)
theme_counts_updated = Counter()
for review_themes in sample_1star['complaint_themes_updated']:
    theme_counts_updated.update(review_themes)
print(theme_counts_updated.most_common())

# Side-by-side look at the first and second tagging pass
print("\nDataFrame with Updated Complaint Themes (First 10):")
comparison_cols = ['review_clean', 'complaint_themes', 'complaint_themes_updated']
print(sample_1star[comparison_cols].head(10))

In [None]:
# Fixed theme -> colour mapping shared by all theme charts, so a theme keeps the same colour
# in every figure. The hex values match ColorBrewer's 8-class "Dark2" qualitative scheme.
theme_palette = { # chosen by the author to stay distinct under colorblind simulations
    'Engagement': '#1b9e77',          # dark teal
    'Plot/Structure': '#d95f02',      # orange
    'Character Issues': '#7570b3',    # purple-blue
    'Writing Style': '#e7298a',       # reddish pink
    'Expectations vs Reality': '#66a61e',  # olive green
    'Genre Mismatch': '#e6ab02',      # yellow-brown
    'Offensive Content': '#a6761d',   # brown
    'Uncategorized': '#666666'        # dark gray (if ever shown)
}

In [None]:
# Rank the assigned themes by frequency, leaving out the 'Uncategorized' bucket
filtered_theme_counts = {
    theme: n
    for theme, n in theme_counts_updated.items()
    if theme.lower() != 'uncategorized'
}
top_themes = sorted(filtered_theme_counts.items(), key=lambda item: item[1], reverse=True)
labels, counts = zip(*top_themes)

# One fixed, colorblind-accessible colour per theme
colors = [theme_palette[label] for label in labels]

# Horizontal bar per theme
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(labels, counts, color=colors)
ax.set_title("What Are the Most Common Complaint Themes in 1-Star Reviews?", fontsize=16, weight='bold')
ax.set_xlabel("Number of Reviews", fontsize=12)

# Annotate each bar with its count and share.
# NOTE(review): `total` is the number of theme assignments, not the number of reviews
# (a review can carry several themes), so the percentages are shares of all assignments.
total = sum(counts)
for bar, count in zip(bars, counts):
    percent = (count / total) * 100
    ax.text(
        bar.get_width() + 5,
        bar.get_y() + bar.get_height() / 2,
        f"{count} ({percent:.1f}%)",
        va='center', fontsize=10
    )

# Largest theme on top, no grid, footnote below the axes, 15% head-room for the labels
ax.invert_yaxis()
ax.grid(False)
fig.text(0.5, -0.05,
         "Themes extracted from rule-based keyword matching in review text.",
         wrap=True, horizontalalignment='center', fontsize=10)
ax.set_xlim(0, max(counts) * 1.15)
fig.tight_layout()

# Save first, then display
fig.savefig('themes_1star_reviews.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
pip install adjustText

In [None]:
# Seaborn style must be set before the figure is created
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 7))

# Every review's book: average rating vs. total number of ratings (log scale)
sns.scatterplot(
    data=sample_1star,
    x='average_rating',
    y='ratings_count',
    alpha=0.3,
    edgecolor=None,
    ax=ax
)
ax.set_yscale('log')

# The three distinct titles with the highest ratings_count
top_books = (
    sample_1star.sort_values(by='ratings_count', ascending=False)
    .drop_duplicates(subset='title')
    .head(3)
)

# Mark each of them with an X; the title becomes its legend entry
for rating, n_ratings, title in zip(
    top_books['average_rating'], top_books['ratings_count'], top_books['title']
):
    ax.scatter(rating, n_ratings, s=100, marker='X', label=title, zorder=5)

ax.set_title("Do Low-Rated Books Still Get High Average Ratings?", fontsize=16, fontweight='bold')
ax.set_xlabel("Average Goodreads Rating", fontsize=12)
ax.set_ylabel("Total Ratings Count (log scale)", fontsize=12)

# NOTE(review): the highlighted books are selected by total ratings_count, not by
# average rating or by number of 1-star reviews - confirm the legend title says what is meant.
ax.legend(title='Top Rated Books with the Most 1-Star Reviewers', bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
fig.savefig("low_rated_books_high_rating_legend.jpg", format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Primary theme per review: the first entry of its updated theme list, or 'Uncategorized'
# when the value is empty or not a list. "First" follows the key order of the theme
# dictionary; it is not a measure of which theme matched most strongly.
sample_1star['top_theme'] = [
    themes[0] if isinstance(themes, list) and themes else 'Uncategorized'
    for themes in sample_1star['complaint_themes_updated']
]

In [None]:
# Remove duplicate titles to avoid overplotting books reviewed multiple times
unique_books = sample_1star.drop_duplicates(subset='title')

In [None]:
import matplotlib.patches as mpatches

# Drop duplicate book titles to avoid overplotting
unique_books = sample_1star.drop_duplicates(subset='title')

# Most-rated book (highest ratings_count) per complaint theme.
# drop_duplicates keeps the whole first ROW of each theme. groupby(...).first() is avoided
# because it returns the first non-null value of each COLUMN separately and can therefore
# combine the title of one book with the rating of another when values are missing.
top_books_by_theme = (
    unique_books
    .sort_values(by='ratings_count', ascending=False)
    .drop_duplicates(subset='top_theme')
    .sort_values(by='top_theme')
    .reset_index(drop=True)
)

# Plot setup
plt.figure(figsize=(14, 8))
sns.set(style="whitegrid")

# Base scatterplot: one point per book, coloured by its primary complaint theme
sns.scatterplot(
    data=unique_books,
    x='average_rating',
    y='ratings_count',
    hue='top_theme',
    palette=theme_palette,
    alpha=0.4,
    edgecolor=None
)

# Highlight top books per theme with larger X markers
legend_handles = []
for _, row in top_books_by_theme.iterrows():
    theme_color = theme_palette.get(row['top_theme'], '#000000')  # black for unknown themes
    plt.scatter(
        row['average_rating'],
        row['ratings_count'],
        s=120,
        marker='X',
        color=theme_color,
        edgecolor='black',
        linewidth=1.0,
        zorder=5
    )
    legend_handles.append(mpatches.Patch(
        color=theme_color,
        label=f"{row['top_theme']}: {row['title']}"
    ))

# Axis settings
plt.yscale('log')
plt.xlabel("Average Goodreads Rating", fontsize=12)
plt.ylabel("Total Ratings Count (log scale)", fontsize=12)
plt.title("Do Low-Rated Books Still Get High Average Ratings (w/Complaint Themes)?", fontsize=16, fontweight='bold')

# This legend replaces seaborn's hue legend: one patch per theme, labelled "theme: top book",
# so the patch colours double as the theme key.
plt.legend(handles=legend_handles, title="Top Book per Theme", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)

plt.tight_layout()
plt.savefig('low_rated_books_by_theme_highlights.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Median average rating per theme, ascending; this fixes the left-to-right order of the boxes
medians = (
    sample_1star.groupby('top_theme')['average_rating']
    .median()
    .sort_values()
)
theme_order = medians.index

plt.figure(figsize=(14, 8))
ax = sns.boxplot(
    x='top_theme',
    y='average_rating',
    data=sample_1star,
    order=theme_order,
    palette=theme_palette
)

# Print each median slightly above its median line
for i, median in enumerate(medians):
    ax.text(i, median + 0.03, f"{median:.2f}", ha='center', fontsize=10, weight='bold')

# Titles, tick formatting and a light horizontal grid
ax.set_title("Distribution of Average Goodreads Rating by Top Complaint Theme", fontsize=16, fontweight='bold')
ax.set_xlabel("Top Complaint Theme", fontsize=12)
ax.set_ylabel("Average Goodreads Rating", fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=11)
plt.yticks(fontsize=11)
ax.grid(axis='y', linestyle='--', alpha=0.6)
sns.despine()

# Reading aid placed below the axes
plt.figtext(0.5, -0.08,
            "Box represents IQR; whiskers extend to 1.5×IQR. Points outside are outliers. Median shown above each box.",
            wrap=True, horizontalalignment='center', fontsize=10)

plt.tight_layout(rect=[0, 0.05, 1, 0.95])
plt.savefig('avg_goodreads_rating_by_theme_boxplot.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Spread of each book's total ratings count per primary complaint theme (log-scaled y axis)
fig, ax = plt.subplots(figsize=(16, 9))
sns.boxplot(x='top_theme', y='ratings_count', data=sample_1star, palette=theme_palette, ax=ax)
ax.set_yscale('log')

ax.set_title("Distribution of Total Ratings Count (Log Scale) by Top Complaint Theme", fontsize=18, fontweight='bold')
ax.set_xlabel("Top Complaint Theme", fontsize=14)
ax.set_ylabel("Total Ratings Count (Log Scale)", fontsize=14)

plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)

ax.grid(axis='y', linestyle='--', alpha=0.7)
sns.despine()

# Explanatory footnote below the axes
fig.text(0.5, -0.15,
         "Note: This box plot shows the distribution of the total number of ratings (on a logarithmic scale) for books that received 1-star reviews categorized under each theme. The log scale is used to handle the skewed distribution of ratings counts. The box represents the IQR, the line inside is the median, and the whiskers extend to 1.5 times the IQR. Points outside the whiskers are outliers.",
         wrap=True, horizontalalignment='center', fontsize=10)

fig.tight_layout(rect=[0, 0.05, 1, 0.95])
fig.savefig('distribution_total_ratings_logscale_by_complaint_theme.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Publication years shown on the chart (inclusive on both ends)
first_year, last_year = 1990, 2018
sample_1star_clean = sample_1star[sample_1star['publication_year'].between(first_year, last_year)]

# Complete list of years so that years without reviews still get a (zero-height) bar.
# range() excludes its stop value, hence last_year + 1 - otherwise the 2018 reviews
# kept by the filter above would be dropped by the reindex below.
year_range = list(range(first_year, last_year + 1))

# Count reviews per publication year and reindex to include all years
year_counts = sample_1star_clean['publication_year'].value_counts().sort_index()
year_counts = year_counts.reindex(year_range, fill_value=0)

# --- STEP 2: Plot the Data ---
highlight_years = [2011, 2012, 2013]
highlight_year = 2013  # peak year: gets a bold tick label and a bold value label
colors = ['orange' if year in highlight_years else 'gray' for year in year_counts.index]

fig, ax = plt.subplots(figsize=(14, 6))

bars = ax.bar(year_counts.index, year_counts.values, color=colors)

# Add exact labels just above the highlighted bars (except the peak year, labelled in bold below).
# Bars are paired with their year directly; recovering the year from the bar's float
# centre via int() can truncate to the previous year.
for year, bar in zip(year_counts.index, bars):
    if year in highlight_years and year != highlight_year:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height + 5,
                f'{int(height)}', ha='center', va='bottom', fontsize=10)

# Axis settings
ax.set_title("When Were the Most 1-Star Rated Books Published?", fontsize=16)
ax.set_xlabel("Publication Year")
ax.set_ylabel("Number of 1-Star Reviews")
ax.set_xticks(year_range)

# Bold the tick label of the peak year (mathtext \bf), plain text for the rest
xtick_labels = [
    f'$\\bf{{{year}}}$' if year == highlight_year else str(year)
    for year in year_range
]
ax.set_xticklabels(xtick_labels, rotation=45, fontsize=9)

# Horizontal gridlines only
ax.grid(axis='y', linestyle='--', alpha=0.5)
ax.grid(axis='x', visible=False)

# Peak-year value in bold above its bar
highlight_value = year_counts[highlight_year]
ax.annotate(f'{highlight_value}',
            xy=(highlight_year, highlight_value),
            xytext=(0, 5),
            textcoords='offset points',
            ha='center',
            va='bottom',
            fontsize=10,
            fontweight='bold')

# Add padding above the tallest bar so the value labels fit
ax.set_ylim(0, year_counts.max() + 80)

# Add a footnote
plt.figtext(0.5, -0.05,
            "Note: Spike between 2011–2013 may reflect changes in Goodreads review activity or publishing trends.",
            wrap=True, horizontalalignment='center', fontsize=10)

plt.tight_layout()
plt.savefig('when_were_most_1star_reviews_published.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Reviews per (publication year, primary theme) for books published 2011-2014.
# NOTE(review): no earlier cell in this notebook defines theme_counts_by_year, so this
# cell failed with a NameError on a fresh kernel. The table below is reconstructed from
# the chart's title and axis labels - confirm that it matches the original intent.
peak_years = sample_1star[sample_1star['publication_year'].between(2011, 2014)]
theme_counts_by_year = (
    peak_years
    .groupby([peak_years['publication_year'].astype(int), 'top_theme'])
    .size()
    .unstack(fill_value=0)
    # Palette order for the columns; themes absent from these years become all-zero
    # columns instead of raising a KeyError.
    .reindex(columns=list(theme_palette), fill_value=0)
)

# Plot: one group of bars per year, one bar per theme
ax = theme_counts_by_year.plot(
    kind='bar',
    figsize=(14, 8),
    color=[theme_palette[col] for col in theme_counts_by_year.columns]
)

# Formatting
ax.set_title("Number of 1-Star Reviews by Top Complaint Theme (2011–2014)", fontsize=18, fontweight='bold')
ax.set_xlabel("Publication Year", fontsize=14)
ax.set_ylabel("Number of 1-Star Reviews", fontsize=14)
ax.tick_params(axis='x', labelrotation=0, labelsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.legend(title='Top Complaint Theme', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
sns.despine()

plt.tight_layout()
plt.savefig('1star_reviews_by_theme_2011_2014.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Left-closed 5-year bins: [1990, 1995), [1995, 2000), ..., [2015, 2020)
bins = list(range(1990, 2021, 5))
labels = [f'{i}-{i+4}' for i in bins[:-1]]
sample_1star['year_bin'] = pd.cut(sample_1star['publication_year'], bins=bins, labels=labels, right=False)

# Share of each primary theme within every bin (rows = bins, columns = themes)
theme_by_year = (
    sample_1star.groupby('year_bin')['top_theme']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
)

# Stacked bars, one per bin, coloured with the shared theme palette (light gray fallback)
fig, ax = plt.subplots(figsize=(16, 9))
segment_colors = [theme_palette.get(col, '#cccccc') for col in theme_by_year.columns]
ax = theme_by_year.plot(kind='bar', stacked=True, color=segment_colors, ax=ax)

ax.set_title("Proportion of Top Complaint Themes by Publication Year (5-Year Bins)", fontsize=18, fontweight='bold')
ax.set_xlabel("Publication Year Bin", fontsize=14)
ax.set_ylabel("Proportion of Reviews", fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)

ax.legend(title='Top Complaint Theme', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout(rect=[0, 0, 0.9, 1])

# Percentage label in the middle of every segment taller than 10%;
# white text on segments above 20%, black on the smaller ones
for i, (_, shares) in enumerate(theme_by_year.iterrows()):
    y_offset = 0
    for height in shares:
        if height > 0.1:
            ax.text(
                i,
                y_offset + height / 2,
                f'{(height * 100):.0f}%',
                ha='center',
                va='center',
                fontsize=9,
                color='white' if height > 0.2 else 'black'
            )
        y_offset += height

# Caption below the axes
fig.text(
    0.5, -0.05,
    "Note: This stacked bar chart shows the proportion of each top complaint theme within 5-year publication year bins. "
    "The height of each colored segment represents the proportion of reviews belonging to that theme within that time period.",
    wrap=True, horizontalalignment='center', fontsize=10
)

# Save and show
fig.savefig('proportion_complaint_themes_by_year_bin.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

# The bin column was only needed for this chart; remove it again
sample_1star.drop(columns=['year_bin'], inplace=True)

# A look into shelves - the conundrum of genres

In [None]:
sample_1star.head()

In [None]:
#cleaning popular shelves column
print(sample_1star['popular_shelves'].iloc[0])

In [None]:
# See which shelves have the highest counts.
def shelf_names(shelves_str):
    """Extract shelf names from a stringified list of shelf dictionaries.

    Parameters
    ----------
    shelves_str : str
        Text such as "[{'count': '3', 'name': 'to-read'}, ...]".

    Returns
    -------
    list
        The 'name' value of every dict in the parsed list that has one.
        An empty list is returned when the value cannot be parsed (malformed
        text, NaN, None, ...) or does not parse to a list.
    """
    try:
        shelves_list = ast.literal_eval(shelves_str)  # convert the string to a list of dicts
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # One malformed row should not abort a pass over the whole column.
        return []
    if not isinstance(shelves_list, list):
        return []
    # Skip anything that is not a dict: `'name' in shelf` would raise on numbers
    # and do a substring test on plain strings.
    return [shelf['name'] for shelf in shelves_list
            if isinstance(shelf, dict) and 'name' in shelf]


shelf_counter = Counter()

In [None]:
# Count how often each shelf name occurs across the sample.
# Very large operation (the author measured about 100 minutes to run).
# The counter is rebuilt from scratch here so that re-running this cell does not
# add a second copy of every count on top of a previous run.
shelf_counter = Counter()
for row in sample_1star['popular_shelves'].dropna():
    shelf_counter.update(shelf_names(row))

print(shelf_counter.most_common(60))

In [None]:
import random

# Distinct shelf names seen in the sample (iterating a Counter yields its keys).
unique_shelves = [shelf for shelf in shelf_counter]
print(f"unique names: {len(unique_shelves)}")

In [None]:
import ast

# Shelf names that describe the reader's relationship to the book (reading
# status, ownership, format, rating, challenges, personal bookkeeping) rather
# than the book's genre or content.
# Entries are written in hyphenated form (e.g. 'to-read').
# Several entries appear more than once (e.g. 'owned-not-read', 'epub',
# 'wishlist'); this is harmless because the list is collapsed into a set below.
blacklist = [
    # 1. Reading status
    'to-read', 'read-in-2016', 'currently-reading', 'tbr', 'read-2016', 'read-2015', 'read-2014',
    'read-2013', 'read-2012', 'read-2011', 'read-2010', 're-read', 'reread', 'to-reread', 'already-read',
    'read-again', 'read-aloud', 'read-alouds', 'read-fiction', 'read-in-school', 'read-as-a-kid',
    'read-comics', 'read-next', 'read-2009', 'read-in-english',

    # 2. Ownership/location
    'owned', 'my-books', 'library', 'kindle', 'ebooks', 'books-i-own', 'to-buy', 'owned-books',
    'i-own', 'my-library', 'own-it', 'borrowed', 'on-my-shelf', 'books-i-have', 'bookshelf',
    'home-library', 'my-bookshelf', 'own-to-read', 'own-ebook', 'my-ebooks', 'owned-tbr',
    'ebooks-i-own', 'own-kindle', 'owned-not-read', 'owned-but-not-read', 'own-on-kindle',
    'kindle-owned', 'own-unread', 'owned-unread', 'owned-ebook', 'own-a-copy', 'my-kindle-books',
    'owned-kindle', 'own-tbr', 'kindle-library', 'books-owned', 'kindle-to-read', 'not-owned',
    'do-not-own', 'owned-not-read', 'owned-but-not-read', 'own-unread', 'own-a-copy',

    # 3. Format or device
    'audiobook', 'ebook', 'paperback', 'hardcover', 'audiobooks', 'audio', 'e-book', 'e-books',
    'audio-books', 'audio-book', 'audible', 'library-book', 'kindle-books', 'nook', 'on-kindle',
    'netgalley', 'overdrive', 'pdf', 'epub', 'ibooks', 'kobo', 'kindle-unlimited', 'digital',
    'graphic', 'ebook-owned', 'epub', 'e-reader', 'calibre', 'ibooks', 'kobo', 'downloaded',
    'kindle-book', 'ebooks-i-own',

    # 4. Rating/review-based
    '5-stars', 'favorites', 'favourites', 'favorite', 'favorite-books', 'favorite-series',
    'my-favorites', 'favorite-authors', 'favorite-author', 'faves', 'favourite', 'fav',
    'gave-up-on', 'gave-up', 'did-not-finish', 'dnf', 'abandoned', 'unfinished', 'didn-t-finish',
    'couldn-t-finish', 'could-not-finish', 'not-finished', 'not-read', 'never-finished', 'paused',
    'stopped-reading', 'not-for-me', 'nope', 'meh', 'dnf', 'review', 'reviewed',

    # 5. Challenge or year-based
    '2016-reading-challenge', '2017-reading-challenge', '2015-reading-challenge', '2014-read',
    '2015-reads', '2014-reads', '2013-reads', '2012-reads', '2011-reads', '2010-reads', '2016-reads',
    '2015-books', '2014-books', '2013-books', '2012-books', '2017-reads', '2017-books',
    '2017-read', '2017-release', '2017-reading-list', '2017-books-read', 'books-read-in-2016',
    'books-read-in-2015', 'books-read-in-2014', 'books-read-in-2013', 'books-read-in-2012',
    'books-read-in-2017', 'books-read-in-2011', 'books-read-in-2010', 'read-in-2015',
    'read-in-2014', 'read-in-2013', 'read-in-2012', 'read-in-2011', 'read-in-2010', '2016-books',
    '2016-read', '2017-books-read', '2017-books-read', 'books-read-in-2016', '2016-books-read',

    # 6. Meta or personal tags
    'wishlist', 'wish-list', 'book-club', 'bookclub', 'book-club-books', 'book-club-reads',
    'book-group', 'to-re-read', 're-read', 'reread', 'to-reread', 'to-read-owned', 'to-read-own',
    'to-read-fiction', 'to-read-non-fiction', 'to-read-nonfiction', 'to-read-ya', 'to-read-series',
    'to-read-classics', 'to-read-fantasy', 'to-review', 'to-purchase', 'to-be-read', 'to-be-released',
    'to-read-soon', 'to-get', 'want-to-read', 'want-to-buy', 'want', 'need', 'need-to-buy',
    'need-to-get', 'not-interested', 'maybe', 'maybe-read', 'recommendations', 'next', 'next-to-read',
    'next-in-series', 'done', 'shelved', 'my-shelf', 'own-a-copy', 'own-tbr', 'top-tbr', 'not-for-me',
    'considering', 'owned-but-not-read', 'wishlist', 'wishlist', 'owned-to-read'
]


# Lower-cased, de-duplicated view of the list; hyphens are kept as written.
blacklist_words = set(word.lower() for word in blacklist)

In [None]:
# Maps a normalized shelf name (lowercase, words separated by spaces) to its
# canonical spelling, so spelling variants and abbreviations are counted as one
# shelf. Entries that map a name to itself are no-ops kept for readability.
# The key 'ya' used to be listed twice (a later duplicate silently overrides the
# earlier one); it now appears once.
shelf_cleaning = {
    'cowboys': 'cowboy',
    'chick lit': 'chick lit',
    'adult fiction': 'adult fiction',
    'cowboy western': 'cowboy western',
    'genre western': 'western',
    'romantic suspense': 'romantic suspense',
    'action': 'action',
    'series romance': 'romance',
    'genre romance': 'romance',
    'romance modern': 'modern romance',
    'science fiction': 'science fiction',
    'sci fi': 'science fiction',
    'scifi': 'science fiction',
    'post apocalyptic': 'post apocalyptic',
    'sf': 'science fiction',
    'sci fi fantasy': 'science fiction fantasy',
    'dystopia': 'dystopian',
    'apocalyptic': 'apocalyptic',
    'science': 'science',
    'speculative fiction': 'speculative fiction',
    'fantasy sci fi': 'science fiction fantasy',
    'apocalypse': 'apocalyptic',
    'space opera': 'space opera',
    'science fiction fantasy': 'science fiction fantasy',
    'hard sci fi': 'hard science fiction',
    'sff': 'science fiction fantasy',
    'post apocalypse': 'post apocalyptic',
    'sf fantasy': 'science fiction fantasy',
    'sci fi and fantasy': 'science fiction fantasy',
    'hard scifi': 'hard science fiction',
    'sciencefiction': 'science fiction',
    'regency romance': 'regency romance',
    'romance historical': 'historical romance',
    'mf': 'm f',
    'historical romances': 'historical romance',
    'historicals': 'historical',
    'humorous': 'humor',
    'humour': 'humor',
    'humour comedy': 'humor',
    'young adult': 'young adult',
    'ya': 'young adult',
    'fairies': 'fairies',
    'faeries': 'fairies',
    'faerie': 'fairies',
    'fey': 'fae',
    'ya fantasy': 'young adult fantasy',
    'paranormal romance': 'paranormal romance',
    'historical fantasy': 'historical fantasy',
    'historical fic': 'historical fiction',
    'supernatural': 'supernatural',
    'faries': 'fairies',
    'classic lit': 'classic literature',
    'british lit': 'british literature',
    'brit lit': 'british literature',
    'english lit': 'english literature',
    'lit': 'literature',
    'feminist': 'feminism',
    'ya books': 'young adult books',
    'ya fiction': 'young adult fiction',
    'non fiction': 'nonfiction',
    'non fic': 'nonfiction',
    'distopian': 'dystopian',
    'ya dystopian': 'young adult dystopian',
    'ya lit': 'young adult literature'
}

In [None]:
# Extra spelling variants, grouped by the canonical shelf they collapse into.
shelf_cleaning.update(dict.fromkeys(
    ['women s fiction', 'womens fiction'],
    'womens fiction',
))
shelf_cleaning.update(dict.fromkeys(
    [
        'children s',
        'childrens books',
        'children s books',
        'children',
        'childrens',
        'kids books',
        'kid books',
    ],
    'children books',
))
shelf_cleaning.update({'general fiction': 'fiction', 'novels': 'novel'})

In [None]:
import pandas as pd
from lcgft_config import lcgft_mapping
from collections import defaultdict
import ast

# Requires shelf_cleaning and blacklist_words from the cells above.

def clean_name(name):
    """Normalize a shelf name: lowercase, '-' and '_' become spaces, ends stripped."""
    return name.lower().replace('-', ' ').replace('_', ' ').strip()

def extract_shelves(shelves_str):
    """Parse a stringified list of shelf dicts into (normalized name, count) tuples.

    Returns an empty list when the value cannot be parsed (malformed text, NaN,
    None, ...) or does not parse to a list. Items that are not dicts or have no
    'name' key are skipped; a missing 'count' is treated as 0.

    Raises ValueError/TypeError if a present 'count' cannot be converted to int.
    """
    try:
        shelves_list = ast.literal_eval(shelves_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        return []
    if isinstance(shelves_list, list):
        return [(clean_name(shelf['name']), int(shelf.get('count', 0)))
                for shelf in shelves_list if isinstance(shelf, dict) and 'name' in shelf]
    return []

def _pad_words(text):
    """Collapse runs of whitespace and pad with one space on each side.

    With both sides padded, `needle in haystack` only succeeds on whole words.
    """
    return f" {' '.join(text.split())} "

def apply_cleaning_pipeline(shelves_str, shelf_cleaning, blacklist_words):
    """Clean one row of shelves and return [(tag, total_count), ...].

    Each shelf name is normalized, mapped through `shelf_cleaning`, dropped if
    it is blacklisted, and the counts of shelves that collapse to the same tag
    are summed. The result is sorted by count, highest first.

    A tag is blacklisted when a blacklist entry occurs in it as a whole word or
    whole phrase. Blacklist entries are normalized with clean_name() first, so
    hyphenated entries such as 'to-read' match the normalized tag 'to read'
    (a plain substring test never matched them), and short entries no longer
    remove unrelated tags that merely contain them ('epub' inside 'republic',
    'graphic' inside 'biographical').
    """
    padded_blacklist = set()
    for word in blacklist_words:
        normalized = clean_name(word)
        if normalized:  # an empty entry must not be allowed to match everything
            padded_blacklist.add(_pad_words(normalized))

    cleaned_tags_with_counts = defaultdict(int)
    for tag, count in extract_shelves(shelves_str):
        mapped_tag = shelf_cleaning.get(tag, tag)
        padded_tag = _pad_words(mapped_tag)
        if not any(entry in padded_tag for entry in padded_blacklist):
            cleaned_tags_with_counts[mapped_tag] += count
    return sorted(cleaned_tags_with_counts.items(), key=lambda x: x[1], reverse=True)

def extract_lcgft_and_tropes_from_cleaned(cleaned_shelves_series):
    """Split each row's cleaned shelves into LCGFT genres and trope keywords.

    Parameters
    ----------
    cleaned_shelves_series : iterable
        One entry per row; each entry is a sequence of (tag, count) tuples.

    Returns
    -------
    list of dict
        One dict per row with keys:
        'shelf'          - the tag names in their incoming order,
        'lcgft_genres'   - lcgft_mapping values of the tags found in the mapping,
        'trope_keywords' - the tags that are not in lcgft_mapping.
        Both derived lists are de-duplicated while keeping first-seen order, so
        the output is the same on every run (converting through a set made the
        order depend on string hashing, which varies between kernel sessions).
    """
    extracted_data = []
    for shelf_tuples in cleaned_shelves_series:
        tag_names = [item[0] for item in shelf_tuples]
        lcgft_genres = [lcgft_mapping[tag] for tag in tag_names if tag in lcgft_mapping]
        trope_keywords = [tag for tag in tag_names if tag not in lcgft_mapping]
        extracted_data.append({
            'shelf': tag_names,
            # dict.fromkeys de-duplicates and preserves insertion order
            'lcgft_genres': list(dict.fromkeys(lcgft_genres)),
            'trope_keywords': list(dict.fromkeys(trope_keywords)),
        })
    return extracted_data

# Clean every row's shelves, then split the cleaned tags into genres and tropes.
raw_shelves_column = sample_1star['popular_shelves']
sample_1star['cleaned_shelves'] = raw_shelves_column.apply(
    lambda raw_shelves: apply_cleaning_pipeline(raw_shelves, shelf_cleaning, blacklist_words)
)

shelf_results = extract_lcgft_and_tropes_from_cleaned(sample_1star['cleaned_shelves'])
print(shelf_results[:5])  # preview of the first five rows

In [None]:
import pandas as pd
from lcgft_config import lcgft_mapping
from collections import defaultdict
import ast

# Requires shelf_cleaning, blacklist_words and lcgft_mapping to be defined.
# The helpers are defined in this cell so it can be run on its own.

def clean_name(name):
    """Normalize a shelf name: lowercase, '-' and '_' become spaces, ends stripped."""
    return name.lower().replace('-', ' ').replace('_', ' ').strip()

def extract_shelves(shelves_str):
    """Parse a stringified list of shelf dicts into (normalized name, count) tuples.

    Returns an empty list when the value cannot be parsed (malformed text, NaN,
    None, ...) or does not parse to a list. Items that are not dicts or have no
    'name' key are skipped; a missing 'count' is treated as 0.

    Raises ValueError/TypeError if a present 'count' cannot be converted to int.
    """
    try:
        shelves_list = ast.literal_eval(shelves_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        return []
    if isinstance(shelves_list, list):
        return [(clean_name(shelf['name']), int(shelf.get('count', 0)))
                for shelf in shelves_list if isinstance(shelf, dict) and 'name' in shelf]
    return []

def _pad_words(text):
    """Collapse runs of whitespace and pad with one space on each side.

    With both sides padded, `needle in haystack` only succeeds on whole words.
    """
    return f" {' '.join(text.split())} "

def apply_cleaning_pipeline(shelves_str, shelf_cleaning, blacklist_words):
    """Clean one row of shelves and return [(tag, total_count), ...].

    Each shelf name is normalized, mapped through `shelf_cleaning`, dropped if
    it is blacklisted, and the counts of shelves that collapse to the same tag
    are summed. The result is sorted by count, highest first.

    A tag is blacklisted when a blacklist entry occurs in it as a whole word or
    whole phrase. Blacklist entries are normalized with clean_name() first, so
    hyphenated entries such as 'to-read' match the normalized tag 'to read'
    (a plain substring test never matched them), and short entries no longer
    remove unrelated tags that merely contain them ('epub' inside 'republic',
    'graphic' inside 'biographical').
    """
    padded_blacklist = set()
    for word in blacklist_words:
        normalized = clean_name(word)
        if normalized:  # an empty entry must not be allowed to match everything
            padded_blacklist.add(_pad_words(normalized))

    cleaned_tags_with_counts = defaultdict(int)
    for tag, count in extract_shelves(shelves_str):
        mapped_tag = shelf_cleaning.get(tag, tag)
        padded_tag = _pad_words(mapped_tag)
        if not any(entry in padded_tag for entry in padded_blacklist):
            cleaned_tags_with_counts[mapped_tag] += count
    return sorted(cleaned_tags_with_counts.items(), key=lambda x: x[1], reverse=True)

def extract_lcgft_and_tropes(shelves_str, blacklist): # Pass blacklist as argument
    """Clean one row's shelves and split them into LCGFT genres and tropes.

    Parameters
    ----------
    shelves_str : str
        Stringified list of shelf dicts for one row.
    blacklist : iterable of str
        Shelf names to discard; forwarded to apply_cleaning_pipeline.

    Returns
    -------
    pandas.Series
        'original_shelf' - cleaned tag names, highest count first,
        'lcgft_genres'   - lcgft_mapping values of the tags found in the mapping,
        'trope_keywords' - the remaining cleaned tags.
        Lists keep the count-sorted order of the cleaned shelves, so the result
        is the same on every run (going through a set made the order depend on
        string hashing, which varies between kernel sessions).
    """
    # Uses the module-level shelf_cleaning mapping.
    cleaned_shelves = apply_cleaning_pipeline(shelves_str, shelf_cleaning, blacklist)
    original_shelf = [item[0] for item in cleaned_shelves]
    # dict.fromkeys de-duplicates genres (several tags can map to one genre)
    # while preserving first-seen order.
    lcgft_genres = list(dict.fromkeys(
        lcgft_mapping[tag_name] for tag_name in original_shelf if tag_name in lcgft_mapping
    ))
    # Blacklisted tags were already removed by apply_cleaning_pipeline, so no
    # second blacklist pass is needed here; tag names are unique at this point.
    trope_keywords = [tag_name for tag_name in original_shelf if tag_name not in lcgft_mapping]
    return pd.Series({'original_shelf': list(original_shelf), 'lcgft_genres': lcgft_genres, 'trope_keywords': trope_keywords})

# Run the extraction on every row; because the function returns a Series, the
# result is a DataFrame with one column per returned key.
shelf_results_df = sample_1star['popular_shelves'].apply(
    extract_lcgft_and_tropes,
    args=(blacklist_words,),
)

# Copy the three result columns onto sample_1star.
result_columns = ['original_shelf', 'lcgft_genres', 'trope_keywords']
for result_column in result_columns:
    sample_1star[result_column] = shelf_results_df[result_column]

# Preview the raw shelves next to the derived columns.
print(sample_1star[['popular_shelves'] + result_columns].head())

In [None]:
def clean_trope_list(trope_list): #double cleaning
    """Lower-case, strip and de-duplicate a list of trope keywords.

    Parameters
    ----------
    trope_list : list
        Trope keywords for one row. Any other type yields an empty list.

    Returns
    -------
    list of str
        Cleaned keywords in first-seen order, without empty strings, without
        the stop words below and without duplicates. Non-string items are
        skipped instead of raising. First-seen order keeps the output identical
        between runs (a set-based de-duplication does not).
    """
    if not isinstance(trope_list, list):
        return []
    stop_words = {'a', 'the', 'of', 'and', 'in'}  # Example stop words
    cleaned_tropes = (
        trope.lower().strip() for trope in trope_list if isinstance(trope, str)
    )
    # dict.fromkeys removes duplicates while preserving order
    return list(dict.fromkeys(
        trope for trope in cleaned_tropes if trope and trope not in stop_words
    ))

# Second cleaning pass over the trope keywords, with a before/after preview.
trope_preview_columns = ['trope_keywords', 'cleaned_tropes']
sample_1star['cleaned_tropes'] = sample_1star['trope_keywords'].apply(
    lambda tropes: clean_trope_list(tropes)
)
print(sample_1star[trope_preview_columns].head())

In [None]:
print(sample_1star['popular_shelves'].head())

In [None]:
sample_1star.head()

In [None]:
# Flatten and count: for each cleaned shelf, the number of rows (reviews) whose
# book carries it. A plain generator is used so the cell does not depend on
# itertools.chain having been imported.
shelf_counts = Counter(
    shelf for row in sample_1star['cleaned_shelves'] for shelf, _ in row
)

# Get top N shelves
top_shelves = shelf_counts.most_common(20)
# Deliberately not named `shelf_names`: that would overwrite the shelf_names()
# helper defined earlier in the notebook and break a re-run of the cells using it.
top_shelf_labels, counts = zip(*top_shelves)

# Plot
plt.figure(figsize=(12, 6))
bars = plt.barh(top_shelf_labels, counts, color='mediumseagreen')
plt.xlabel("Number of Reviews")
plt.title("Most Common Shelves in 1-Star Reviews")
plt.gca().invert_yaxis()  # largest bar at the top

# Add labels
for bar, count in zip(bars, counts):
    plt.text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
             str(count), va='center', fontsize=9)

plt.tight_layout()
# Save BEFORE show(): in a notebook show() closes the figure, so a later
# savefig() would write an empty image. The file name now carries its extension.
plt.savefig('Most Common Shelves in 1-Star Reviews.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

## Sentiment analysis - currently iterating

In [None]:
# Combine all non-missing cleaned descriptions into one text.
all_descriptions = ' '.join(sample_1star['description_clean'].dropna())

# Generate word cloud (at most 100 words)
wordcloud = WordCloud(width=1000, height=500, background_color='white',
                      max_words=100, colormap='viridis').generate(all_descriptions)

# Display it
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Common Words in Book Descriptions', fontsize=16)
# Save BEFORE show(): in a notebook show() closes the figure, so a later
# savefig() would write an empty image. The file name now carries its extension.
plt.savefig('Common Words in Book Descriptions.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
pip install textblob

In [None]:
from textblob import TextBlob

In [None]:
def get_sentiment(text):
    """Return the TextBlob polarity of `text`, or None when there is no text.

    Parameters
    ----------
    text : object
        Usually a string; anything else (None, NaN, lists, numbers) and blank
        strings yield None.

    Returns
    -------
    float or None
        Polarity from -1 (negative) to 1 (positive).
    """
    # The type check comes first: it already covers None/NaN, and it avoids
    # calling pd.isnull on list-like values, whose array result cannot be used
    # in a boolean expression.
    if not isinstance(text, str) or not text.strip():
        return None
    return TextBlob(text).sentiment.polarity

In [None]:
# Score the cleaned description and the cleaned review text of every row.
sentiment_sources = (
    ('description_clean', 'desc_sentiment'),
    ('review_clean', 'review_sentiment'),
)
for text_column, score_column in sentiment_sources:
    sample_1star[score_column] = sample_1star[text_column].apply(get_sentiment)

In [None]:
# Histogram (with KDE) of review sentiment scores; the dashed line marks 0.
plt.figure(figsize=(10, 5))
sns.histplot(sample_1star['review_sentiment'], bins=30, kde=True, color='tomato')
plt.title("Sentiment Distribution of 1-Star Reviews")
plt.xlabel("Review Sentiment Score")
plt.ylabel("Frequency")
plt.axvline(0, linestyle='--', color='gray', alpha=0.7)
plt.tight_layout()
# Save BEFORE show(): in a notebook show() closes the figure, so a later
# savefig() would write an empty image. The file name now carries its extension.
plt.savefig('Sentiment Distribution of 1-Star Reviews.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Histogram (with KDE) of description sentiment scores; the dashed line marks 0.
plt.figure(figsize=(10, 5))
sns.histplot(sample_1star['desc_sentiment'], bins=30, kde=True, color='steelblue')
plt.title("Sentiment Distribution of Book Descriptions")
plt.xlabel("Description Sentiment Score")
plt.ylabel("Frequency")
plt.axvline(0, linestyle='--', color='gray', alpha=0.7)
plt.tight_layout()
# Save BEFORE show(): in a notebook show() closes the figure, so a later
# savefig() would write an empty image. The file name now carries its extension.
plt.savefig('Sentiment Distribution of Book Descriptions.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Define sentiment agreement/contradiction
def sentiment_relationship(row):
    """Classify how a row's description sentiment relates to its review sentiment.

    Parameters
    ----------
    row : mapping
        Must provide 'desc_sentiment' and 'review_sentiment'.

    Returns
    -------
    str or None
        One of 'Positive Blurb / Negative Review',
        'Negative Blurb / Positive Review', 'Both Positive', 'Both Negative'.
        A score of exactly 0 counts as positive.
        None when either score is missing: every comparison with NaN is False,
        so without this guard such rows would be reported as 'Both Negative'.
    """
    desc_score = row['desc_sentiment']
    review_score = row['review_sentiment']
    if pd.isna(desc_score) or pd.isna(review_score):
        return None

    if desc_score >= 0:
        return 'Positive Blurb / Negative Review' if review_score < 0 else 'Both Positive'
    return 'Negative Blurb / Positive Review' if review_score >= 0 else 'Both Negative'

# Apply this relationship to the DataFrame
sample_1star['sentiment_relation'] = sample_1star.apply(sentiment_relationship, axis=1)

# Set up the plot: one point per row, colored by its sentiment relationship.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=sample_1star,
    x='desc_sentiment',
    y='review_sentiment',
    hue='sentiment_relation',
    alpha=0.5,
    palette={
        'Positive Blurb / Negative Review': '#e74c3c',
        'Negative Blurb / Positive Review': '#9b59b6',
        'Both Positive': '#27ae60',
        'Both Negative': '#3498db'
    }
)

# Add vertical and horizontal lines at zero
plt.axhline(0, color='gray', linestyle='--')
plt.axvline(0, color='gray', linestyle='--')

# Annotations for quadrants (colors match the palette above)
plt.text(0.55, -0.9, 'Positive Description\nNegative Review', color='#e74c3c', fontsize=10, weight='bold')
plt.text(-0.9, 0.8, 'Negative Description\nPositive Review', color='#9b59b6', fontsize=10, weight='bold')
plt.text(0.5, 0.75, 'Both Positive', color='#27ae60', fontsize=10, weight='bold')
plt.text(-0.8, -0.8, 'Both Negative', color='#3498db', fontsize=10, weight='bold')

# Labels and title
plt.xlabel("Description Sentiment", fontsize=12)
plt.ylabel("Review Sentiment", fontsize=12)
plt.title("Do 1-Star Reviews Contradict Book Descriptions?", fontsize=14, weight='bold')
plt.legend(title='Sentiment Match', loc='upper left', bbox_to_anchor=(1, 1))
plt.grid(True)
plt.tight_layout()
# Save BEFORE show(): in a notebook show() closes the figure, so a later
# savefig() would write an empty image. The '?' of the title is left out of the
# file name because it is not a valid file-name character on Windows.
plt.savefig('Do 1-Star Reviews Contradict Book Descriptions.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# A row is a contradiction when the description is clearly positive (> 0.2)
# while the review is clearly negative (< -0.2). Comparisons with a missing
# score are False, so such rows are never flagged.
positive_blurb = sample_1star['desc_sentiment'] > 0.2
negative_review = sample_1star['review_sentiment'] < -0.2
sample_1star['contradiction'] = positive_blurb & negative_review

# Share of flagged rows among all rows of the sample.
contradict_pct = sample_1star['contradiction'].mean() * 100
print(f"{contradict_pct:.2f}% of 1-star reviews contradict the description's positive tone")

# Preview the first few flagged rows.
sample_1star.loc[
    sample_1star['contradiction'],
    ['title', 'desc_sentiment', 'review_sentiment']
].head()

In [None]:
# Count the number of rows per sentiment relationship (most frequent first).
sentiment_counts = sample_1star['sentiment_relation'].value_counts().reset_index()
sentiment_counts.columns = ['Sentiment Relationship', 'Count']

# Color each bar by its category, using the same colors as the scatter plot.
# A fixed color list would follow the count ranking instead, so a category's
# color would change whenever the ranking does.
relation_palette = {
    'Positive Blurb / Negative Review': '#e74c3c',
    'Negative Blurb / Positive Review': '#9b59b6',
    'Both Positive': '#27ae60',
    'Both Negative': '#3498db'
}
bar_colors = [
    relation_palette.get(label, '#cccccc')  # grey for any unexpected label
    for label in sentiment_counts['Sentiment Relationship']
]

# Plotting the bar chart
plt.figure(figsize=(8, 5))
bars = plt.barh(
    sentiment_counts['Sentiment Relationship'],
    sentiment_counts['Count'],
    color=bar_colors
 )

# Add count labels next to bars; the gap scales with the data instead of
# assuming counts in the thousands.
label_offset = sentiment_counts['Count'].max() * 0.02
for bar in bars:
    plt.text(
        bar.get_width() + label_offset,
        bar.get_y() + bar.get_height() / 2,
        f"{int(bar.get_width())}",
        va='center',
        fontsize=10
    )

# Title and labels
plt.xlabel("Number of Books")
plt.title("How Often Do Review & Description Sentiments Match?", fontsize=13, weight='bold')
plt.tight_layout()
# Save BEFORE show(): in a notebook show() closes the figure, so a later
# savefig() would write an empty image. The '?' of the title is left out of the
# file name because it is not a valid file-name character on Windows.
plt.savefig('How Often Do Review & Description Sentiments Match.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
sample_1star.columns