In [None]:
import pandas as pd
import numpy as np
import re
import ast

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans



# Read the books CSV into a DataFrame; the path is relative to the notebook's working
# directory (presumably a Goodreads sample, judging by the file name).
path = "../raw_data/goodreads_books_50k.csv"
df = pd.read_csv(path)

# Quick sanity check: print (rows, columns) and preview the first rows.
print("shape：", df.shape)
df.head()


In [None]:
# Keep English-language rows plus rows whose language code is missing.
# pd.read_csv turns blank cells (and the literal text "NaN") into real NaN values, and a
# real NaN never equals the string "NaN" inside isin(), so missing codes are matched
# explicitly with isna(). The "NaN" string stays in the list for frames that were
# loaded with NA parsing disabled.
english_codes = ["eng", "en-US", "en-GB", "en-CA", "NaN"]
df = df[
    df["language_code"].isin(english_codes) | df["language_code"].isna()
].copy()

In [None]:
# Keep a row when it has a title or a description; rows missing both are dropped
# and the index is renumbered from zero.
has_title_or_description = df[["title", "description"]].notna().any(axis=1)
df = df[has_title_or_description].reset_index(drop=True)

In [None]:
# (rows, columns) after the language filter and the missing-text filter.
df.shape

In [None]:
# Columns that are not used by the rest of this notebook.
cols_to_drop = [
    "edition_information", "asin", "kindle_asin",
    "publication_day", "publication_month",
    "format", "publisher", "isbn",
]

# Only drop the columns that are actually present, so a missing one is not an error.
df = df.drop(columns=[name for name in cols_to_drop if name in df.columns])

In [None]:
# (rows, columns) after removing the unused columns.
df.shape

In [None]:
# Convert the rating/count columns to numbers; values that cannot be parsed become NaN.
# The same "is the column present?" filter is used for both the conversion and the
# summary, so a missing column no longer raises KeyError in describe().
rating_cols = [
    col
    for col in ["average_rating", "ratings_count", "text_reviews_count"]
    if col in df.columns
]
for col in rating_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

df[rating_cols].describe()

In [None]:
def clean_text(s):
    """Normalise free text for vectorisation.

    Lower-cases the text, replaces HTML-like tags with a space and collapses every run
    of whitespace (spaces, tabs, newlines) into a single space.

    Parameters
    ----------
    s : object
        The raw text. Anything that is not a ``str`` (e.g. NaN, None) yields "".

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    if not isinstance(s, str):
        return ""
    s = s.lower()
    # DOTALL lets "." cross line breaks, so a tag split over several lines is removed too.
    s = re.sub(r"<.*?>", " ", s, flags=re.DOTALL)
    # \s+ already covers \n, \r and \t, so no separate replace() calls are needed.
    return re.sub(r"\s+", " ", s).strip()

In [None]:
# Prefer the series-free title, fall back to the full title, then to an empty string.
preferred_title = df["title_without_series"].fillna(df["title"]).fillna("")
df["title_clean"] = preferred_title.astype(str).str.lower()

In [None]:
# Missing descriptions become "", every value is cast to text, then cleaned.
description_as_text = df["description"].fillna("").astype(str)
df["description_clean"] = description_as_text.map(clean_text)

df[["title_clean", "description_clean"]].head()

In [None]:
def parse_shelves(s):
    """Extract shelf names from a stringified list of shelf dicts.

    Parameters
    ----------
    s : object
        Python-literal text such as ``"[{'count': '5', 'name': 'fantasy'}]"``.

    Returns
    -------
    list of str
        The ``"name"`` of every dict entry, in order. Entries that are not dicts or
        have no string name are skipped, so callers never receive ``None``.
        Returns ``[]`` when ``s`` is not a string, cannot be parsed, or does not
        evaluate to a list/tuple.
    """
    if not isinstance(s, str):
        return []
    try:
        parsed = ast.literal_eval(s)
    # Only the errors literal_eval documents; KeyboardInterrupt etc. still propagate.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    if not isinstance(parsed, (list, tuple)):
        return []
    return [
        d["name"]
        for d in parsed
        if isinstance(d, dict) and isinstance(d.get("name"), str)
    ]

# One list of shelf names per book, parsed from the raw popular_shelves text.
df["shelf_names"] = df["popular_shelves"].map(parse_shelves)

In [None]:
# Shelf names that are filtered out by clean_shelves.
NOISE_SHELVES = {
    "to-read",
    "currently-reading",
    "read",
    "owned",
    "my-books",
    "books-i-own",
    "books",
    "library",
    "kindle",
    "kindle-free",
    "free",
    "freebie",
    "default",
    "favorites",
    "favorite-authors",
    "have",
    "i-own",
    "on-my-shelf",
    "purchased",
    "unfinished",
    "did-not-finish",
}

In [None]:
# Alias -> canonical shelf name, built from a canonical -> aliases grouping.
SYNONYMS = {
    alias: canonical
    for canonical, aliases in {
        "lgbt": ("m-m", "m/m", "mm"),
        "science-fiction": ("sci-fi",),
        "young-adult": ("ya", "ya-fiction"),
        "children": ("childrens",),
        "romance": ("harlequin",),
        "cozy-mystery": ("cozy",),
        "mystery": ("mysteries",),
    }.items()
    for alias in aliases
}

In [None]:
def normalize_shelf(s):
    """Return the canonical form of a shelf name.

    The name is lower-cased and stripped; if the result is a key of ``SYNONYMS`` its
    mapped value is returned, otherwise the lower-cased, stripped name itself.
    """
    key = s.lower().strip()
    if key in SYNONYMS:
        return SYNONYMS[key]
    return key

def clean_shelves(shelves):
    """Drop noise shelves and map the remaining names to their canonical form.

    Parameters
    ----------
    shelves : list
        Raw shelf names. Anything that is not a list/tuple yields ``[]``.

    Returns
    -------
    list of str
        Normalised shelf names in their original order. Entries that are not strings
        (e.g. ``None`` from a shelf dict without a name), blank names and names in
        ``NOISE_SHELVES`` are skipped.
    """
    if not isinstance(shelves, (list, tuple)):
        return []
    cleaned = []
    for raw in shelves:
        if not isinstance(raw, str):
            continue  # would otherwise fail on .lower()
        name = raw.lower().strip()
        if not name or name in NOISE_SHELVES:
            continue
        cleaned.append(normalize_shelf(name))
    return cleaned

In [None]:
# Noise-free, synonym-normalised shelf list for every book.
df["clean_shelves"] = df["shelf_names"].map(clean_shelves)


In [None]:
# Flatten each cleaned shelf list into one space-separated string ("" for non-lists).
df["shelf_text"] = df["clean_shelves"].map(
    lambda names: "" if not isinstance(names, list) else " ".join(names)
)

df[["shelf_names", "clean_shelves", "shelf_text"]].head()

In [None]:
def parse_authors(s):
    """Extract author ids from a stringified list of author dicts.

    Parameters
    ----------
    s : object
        Python-literal text such as ``"[{'author_id': '12', 'role': ''}]"``.

    Returns
    -------
    list
        The ``"author_id"`` of every dict entry, in order. Entries that are not dicts
        or carry no id are skipped, so no ``None`` ends up as an "author_None" token.
        Returns ``[]`` when ``s`` is not a string, cannot be parsed, or does not
        evaluate to a list/tuple.
    """
    if not isinstance(s, str):
        return []
    try:
        parsed = ast.literal_eval(s)
    # Only the errors literal_eval documents; KeyboardInterrupt etc. still propagate.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    if not isinstance(parsed, (list, tuple)):
        return []
    return [
        d["author_id"]
        for d in parsed
        if isinstance(d, dict) and d.get("author_id") is not None
    ]

# One list of author ids per book, parsed from the raw authors text.
df["author_ids"] = df["authors"].map(parse_authors)

In [None]:
def parse_similar(s):
    """Parse a stringified list of similar-book ids.

    Parameters
    ----------
    s : object
        Python-literal text such as ``"['111', '222']"``.

    Returns
    -------
    list
        The parsed ids. Always a list: ``[]`` is returned when ``s`` is not a string,
        cannot be parsed, or evaluates to something other than a list/tuple
        (e.g. a bare number).
    """
    if not isinstance(s, str):
        return []
    try:
        parsed = ast.literal_eval(s)
    # Only the errors literal_eval documents; KeyboardInterrupt etc. still propagate.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    return list(parsed) if isinstance(parsed, (list, tuple)) else []

# One list of similar-book ids per book, parsed from the raw similar_books text.
df["similar_books_list"] = df["similar_books"].map(parse_similar)


In [None]:
# shelves → "fantasy magic young-adult"
# Build the text from clean_shelves, not the raw shelf_names: using the raw names here
# would overwrite shelf_text and put the noise shelves ("to-read", "owned", ...) and
# un-normalised synonyms back into the features.
df["shelf_text"] = df["clean_shelves"].apply(
    lambda lst: " ".join(lst) if isinstance(lst, list) else ""
)

In [None]:
# author ids → "author_12 author_7" ("" for non-lists)
df["author_text"] = df["author_ids"].map(
    lambda ids: " ".join(f"author_{author_id!s}" for author_id in ids)
    if isinstance(ids, list)
    else ""
)

In [None]:
# similar_books → "sim_111 sim_222 sim_333" ("" for non-lists)
df["sim_text"] = df["similar_books_list"].map(
    lambda book_ids: " ".join(f"sim_{book_id!s}" for book_id in book_ids)
    if isinstance(book_ids, list)
    else ""
)


In [None]:
# Join the five text features, in this order, with single spaces between them.
text_feature_cols = ["title_clean", "description_clean", "shelf_text", "author_text", "sim_text"]

joined_text = df[text_feature_cols[0]].fillna("")
for feature_col in text_feature_cols[1:]:
    joined_text = joined_text + " " + df[feature_col].fillna("")

# Trim the padding left behind when the first or last feature is empty.
df["combined_text"] = joined_text.str.strip()


In [None]:
# TF-IDF settings: English stop words removed, vocabulary capped at 30000 terms,
# terms in fewer than 5 documents or in more than 80% of documents ignored.
tfidf_params = {
    "stop_words": "english",
    "max_features": 30000,
    "min_df": 5,
    "max_df": 0.8,
}
vectorizer = TfidfVectorizer(**tfidf_params)

In [None]:
# Learn the vocabulary from combined_text and turn every row into a TF-IDF vector.
tfidf_matrix = vectorizer.fit_transform(df["combined_text"])

# (number of rows, number of vocabulary terms)
tfidf_matrix.shape

Much of the information in "edition_information" is missing, and the column is not helpful anyway. The same goes for "asin" (Amazon ID), "kindle_asin" (Kindle Amazon ID), "publication_day", "publication_month", "format", "publisher" and "isbn". These columns can be removed.

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Number of clusters to fit.
n_clusters = 30

# Mini-batch k-means; the fixed random_state keeps the run repeatable.
kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=2000, random_state=42)

# Fit on the TF-IDF matrix and store each row's cluster id on the frame.
cluster_labels = kmeans.fit_predict(tfidf_matrix)
df["cluster"] = cluster_labels

# Number of books per cluster, ordered by cluster id.
df["cluster"].value_counts().sort_index()

In [None]:
def show_cluster(c, n=15):
    """Return title, average rating and ratings count for the first ``n`` rows of
    ``df`` whose ``cluster`` equals ``c``."""
    preview_cols = ["title", "average_rating", "ratings_count"]
    return df.loc[df["cluster"] == c, preview_cols].head(n)


In [None]:
# Preview the first books assigned to cluster 0.
show_cluster(0)


In [None]:
from collections import Counter

def cluster_top_shelves(cluster_id, top_n=10):
    """Return the most frequent shelf names among the books of one cluster.

    Counts come from the ``clean_shelves`` column rather than the raw ``shelf_names``,
    so noise shelves such as "to-read" or "owned" do not dominate every cluster and
    synonyms are counted under their canonical name.

    Parameters
    ----------
    cluster_id : int
        Value of ``df["cluster"]`` to select.
    top_n : int, default 10
        Maximum number of shelves to return.

    Returns
    -------
    list of (str, int)
        ``(shelf_name, occurrences)`` pairs, most common first.
    """
    shelves_series = df.loc[df["cluster"] == cluster_id, "clean_shelves"]
    counts = Counter(
        name
        for names in shelves_series
        if isinstance(names, list)  # skip rows without a parsed shelf list
        for name in names
    )
    return counts.most_common(top_n)


In [None]:
# Ten most common shelves among the books in cluster 0.
cluster_top_shelves(0, top_n=10)


In [None]:
import numpy as np

# Vocabulary terms of the fitted vectorizer.
terms = vectorizer.get_feature_names_out()
# For every cluster centre: feature indices ordered from largest to smallest weight.
order_centroids = np.argsort(kmeans.cluster_centers_, axis=-1)[:, ::-1]

def cluster_keywords(cluster_id, top_n=10):
    """Return the ``top_n`` terms with the largest weight in the centre of
    ``cluster_id``, strongest first."""
    top_indices = order_centroids[cluster_id, :top_n]
    keywords = []
    for term_index in top_indices:
        keywords.append(terms[term_index])
    return keywords


# Ten highest-weighted terms of cluster 0's centre.
cluster_keywords(0, top_n=10)

In [None]:
def auto_cluster_label(cluster_id):
    """Build a readable label for a cluster.

    The label is the comma-separated concatenation of the cluster's three most common
    shelves followed by its three top keywords. If that string is empty,
    ``"Cluster <id>"`` is returned instead.
    """
    shelf_part = [name for name, _count in cluster_top_shelves(cluster_id, top_n=5)][:3]
    keyword_part = cluster_keywords(cluster_id, top_n=5)[:3]
    label = ", ".join(shelf_part + keyword_part)
    if label:
        return label
    return f"Cluster {cluster_id}"


In [None]:
# One "<cluster id> → <label>" line per cluster.
for c in range(n_clusters):
    print(f"{c} → {auto_cluster_label(c)}")

In [None]:
# cluster id → generated label, then attach the label to every book via its cluster id.
cluster_label_map = dict(
    (cluster_id, auto_cluster_label(cluster_id)) for cluster_id in range(n_clusters)
)
df["cluster_label"] = df["cluster"].map(cluster_label_map)

df[["title", "cluster", "cluster_label"]].head()