In [3]:
import pandas as pd
import numpy as np
import re
import ast

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans



path = "../raw_data/goodreads_books_50k.csv"
df = pd.read_csv(path)

print("shape：", df.shape)
df.head()


shape： (50000, 29)


Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,312853122.0,1.0,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",,False,4.0,,...,9.0,,1984.0,https://www.goodreads.com/book/show/5333265-w-...,https://images.gr-assets.com/books/1310220028m...,5333265,3.0,5400751.0,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film
1,743509986.0,6.0,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",,False,3.23,B000FC0PBC,...,10.0,Abridged,2001.0,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,1333909,10.0,1323437.0,Good Harbor,Good Harbor
2,,7.0,['189911'],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,,...,,Book Club Edition,1987.0,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140.0,8948723.0,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."
3,743294297.0,3282.0,[],US,eng,"[{'count': '7615', 'name': 'to-read'}, {'count...",,False,3.49,B002ENBLOK,...,7.0,,2009.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184.0,6243154.0,Best Friends Forever,Best Friends Forever
4,850308712.0,5.0,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,3.4,,...,,,,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,287140,15.0,278577.0,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...


In [4]:
english_codes = ["eng", "en-US", "en-GB", "en-CA","NaN"]
df = df[df["language_code"].isin(english_codes)].copy()

In [5]:
df = df.dropna(subset=["title", "description"], how="all").reset_index(drop=True)
#drop rows where both title and description are NaN

In [6]:
df.shape

(18621, 29)

In [7]:
cols_to_drop = [
    "edition_information",
    "asin",
    "kindle_asin",
    "publication_day",
    "publication_month",
    "format",
    "publisher",
    "isbn"
]

df = df.drop(columns=cols_to_drop, errors="ignore")

In [8]:
df.shape

(18621, 21)

In [9]:
for col in ["average_rating", "ratings_count", "text_reviews_count"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

df[["average_rating", "ratings_count", "text_reviews_count"]].describe()

Unnamed: 0,average_rating,ratings_count,text_reviews_count
count,18621.0,18621.0,18621.0
mean,3.923501,867.937866,61.632028
std,0.473281,9684.768052,556.502131
min,0.0,0.0,0.0
25%,3.7,9.0,2.0
50%,3.95,38.0,7.0
75%,4.19,174.0,23.0
max,5.0,575163.0,38878.0


In [10]:
def clean_text(s):
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = re.sub(r"<.*?>", " ", s)                    # remove HTML tags
    s = s.replace("\n", " ").replace("\r", " ").replace("\t", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [11]:
df["title_clean"] = (
    df["title_without_series"]
    .fillna(df["title"])
    .fillna("")
    .astype(str)
    .str.lower()
)

In [12]:
df["description_clean"] = df["description"].fillna("").astype(str).apply(clean_text)

df[["title_clean", "description_clean"]].head()

Unnamed: 0,title_clean,description_clean
0,"the unschooled wizard (sun wolf and starhawk, ...",omnibus book club edition containing the ladie...
1,best friends forever,addie downs and valerie adler were eight when ...
2,the house of memory (pluto's snitch #2),
3,the bonfire of the vanities,
4,heaven,what is heaven really going to be like? what w...


In [13]:
def parse_shelves(s):
    try:
        lst = ast.literal_eval(s)
        return [d.get("name") for d in lst if isinstance(d, dict)]
    except:
        return []

df["shelf_names"] = df["popular_shelves"].apply(parse_shelves)

In [14]:
NOISE_SHELVES = set([
    "to-read", "currently-reading", "read", "owned", "my-books", "books-i-own",
    "books", "library", "kindle", "kindle-free", "free", "freebie", "default",
    "favorites", "favorite-authors", "have", "i-own", "on-my-shelf", "purchased",
    "unfinished", "did-not-finish"
])

In [15]:
SYNONYMS = {
    "m-m": "lgbt",
    "m/m": "lgbt",
    "mm": "lgbt",
    "sci-fi": "science-fiction",
    "ya": "young-adult",
    "ya-fiction": "young-adult",
    "childrens": "children",
    "harlequin": "romance",
    "cozy": "cozy-mystery",
    "mysteries": "mystery"
}

In [16]:
def normalize_shelf(s):

    s = s.lower().strip()
    return SYNONYMS.get(s, s)

def clean_shelves(shelves):

    cleaned = []
    for s in shelves:
        s = s.lower().strip()
        if s in NOISE_SHELVES:
            continue
        cleaned.append(normalize_shelf(s))
    return cleaned

In [17]:
df["clean_shelves"] = df["shelf_names"].apply(clean_shelves)


In [18]:
df["shelf_text"] = df["clean_shelves"].apply(
    lambda lst: " ".join(lst) if isinstance(lst, list) else ""
)

df[["shelf_names", "clean_shelves", "shelf_text"]].head()

Unnamed: 0,shelf_names,clean_shelves,shelf_text
0,"[to-read, fantasy, fiction, owned, hardcover, ...","[fantasy, fiction, hardcover, shelfari-favorit...",fantasy fiction hardcover shelfari-favorites s...
1,"[to-read, chick-lit, currently-reading, fictio...","[chick-lit, fiction, jennifer-weiner, chicklit...",chick-lit fiction jennifer-weiner chicklit boo...
2,"[currently-reading, netgalley, kindle, read-20...","[netgalley, read-2017, read-in-2017, paranorma...",netgalley read-2017 read-in-2017 paranormal gh...
3,"[to-read, fiction, favorites, classics, curren...","[fiction, classics, 1001-books, literature, no...",fiction classics 1001-books literature novels ...
4,"[to-read, currently-reading, theology, christi...","[theology, christian-life, christian, non-fict...",theology christian-life christian non-fiction ...


In [19]:
def parse_authors(s):
    try:
        lst = ast.literal_eval(s)
        return [d.get("author_id") for d in lst if isinstance(d, dict)]
    except:
        return []

df["author_ids"] = df["authors"].apply(parse_authors)

In [20]:
def parse_similar(s):
    try:
        return ast.literal_eval(s)
    except:
        return []

df["similar_books_list"] = df["similar_books"].apply(parse_similar)


In [21]:
# shelves → "fantasy magic young-adult"
df["shelf_text"] = df["shelf_names"].apply(
    lambda lst: " ".join(lst) if isinstance(lst, list) else ""
)

In [22]:
df["author_text"] = df["author_ids"].apply(
    lambda lst: " ".join("author_" + str(a) for a in lst) if isinstance(lst, list) else ""
)

In [23]:
# similar_books → "sim_111 sim_222 sim_333"
df["sim_text"] = df["similar_books_list"].apply(
    lambda lst: " ".join("sim_" + str(b) for b in lst) if isinstance(lst, list) else ""
)


In [24]:
df["combined_text"] = (
    df["title_clean"].fillna("") + " " +
    df["description_clean"].fillna("") + " " +
    df["shelf_text"].fillna("") + " " +
    df["author_text"].fillna("") + " " +
    df["sim_text"].fillna("")
).str.strip()


In [25]:
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=30000,
    min_df=5,
    max_df=0.8
)

In [26]:
tfidf_matrix = vectorizer.fit_transform(df["combined_text"])

tfidf_matrix.shape

(18621, 29526)

many information is missing in "edition_information", also not helpful, "asin" (Amazon ID, can delete), "kindle_asin" (Kindle Amazin ID), "publication_day", "publication_month", "format", "publisher", "isbn". Could be removed.

In [27]:
from sklearn.cluster import MiniBatchKMeans

n_clusters = 30

kmeans = MiniBatchKMeans(
    n_clusters=n_clusters,
    random_state=42,
    batch_size=2000
)

cluster_labels = kmeans.fit_predict(tfidf_matrix)

df["cluster"] = cluster_labels

df["cluster"].value_counts().sort_index()

cluster
0       45
1      907
2     1055
3      451
4      780
5      151
6      189
7      589
8        2
9      362
10     839
11     420
12     215
13      62
14    5173
15     266
16     426
17     179
18     958
19     606
20     466
21     680
22      68
23     278
24     231
25     556
26     824
27    1277
28      84
29     482
Name: count, dtype: int64

In [28]:
def show_cluster(c, n=15):
    subset = df[df["cluster"] == c]
    return subset[["title", "average_rating", "ratings_count"]].head(n)


In [29]:
show_cluster(0)


Unnamed: 0,title,average_rating,ratings_count
215,Doctor Who: The Child,3.14,29.0
1359,Torchwood: In the Shadows,3.98,211.0
1474,Doctor Who: Short Trips - Volume 1,3.59,68.0
2020,Torchwood: Exodus Code,3.87,96.0
2279,"Torchwood: Corpse Day (Big Finish Torchwood, #...",3.82,38.0
2311,The Case of The Disappearing Doctor,3.17,12.0
2673,Doctor Who: Scaredy Cat,2.62,144.0
4126,Doctor Who: Survival,3.58,89.0
4572,Doctor Who: The Forsaken,3.83,36.0
4684,The Tunnel At The End Of The Light,2.83,20.0


In [30]:
from collections import Counter

def cluster_top_shelves(cluster_id, top_n=10):
    shelves_series = df[df["cluster"] == cluster_id]["clean_shelves"]
    flat = [s for lst in shelves_series for s in lst]
    c = Counter(flat)
    return c.most_common(top_n)


In [31]:
cluster_top_shelves(0, top_n=10)


[('science-fiction', 74),
 ('doctor-who', 41),
 ('fiction', 32),
 ('audiobooks', 30),
 ('dr-who', 30),
 ('audio', 28),
 ('audiobook', 24),
 ('sf', 22),
 ('fantasy', 21),
 ('time-travel', 21)]

In [32]:
import numpy as np

terms = vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

def cluster_keywords(cluster_id, top_n=10):
    return [terms[i] for i in order_centroids[cluster_id, :top_n]]


cluster_keywords(0, top_n=10)

['doctor',
 'tv',
 'dw',
 'dr',
 'audio',
 'big',
 'audios',
 'fiction',
 'tie',
 'torchwood']

In [33]:
def auto_cluster_label(cluster_id):
    top_shelves = [s for s, _ in cluster_top_shelves(cluster_id, top_n=5)]
    top_words = cluster_keywords(cluster_id, top_n=5)
    combined = top_shelves[:3] + top_words[:3]
    label = ", ".join(combined)
    return label or f"Cluster {cluster_id}"


In [34]:
for c in range(n_clusters):
    print(c, "→", auto_cluster_label(c))

0 → science-fiction, doctor-who, fiction, doctor, tv, dw
1 → science-fiction, fiction, fantasy, fantasy, sci, fi
2 → fiction, novels, owned-books, fiction, literature, 1001
3 → young-adult, children, fiction, children, childhood, childrens
4 → romance, ebook, freebies, kindle, free, freebie
5 → romance, contemporary-romance, contemporary, harlequin, romance, hero
6 → religion, non-fiction, nonfiction, religion, spirituality, philosophy
7 → lgbt, romance, m-m-romance, mm, romance, gay
8 → historical-fiction, fiction, native-american, gear, lakes, kathleen
9 → mystery, cozy-mystery, series, mystery, cozy, mysteries
10 → paranormal, romance, fantasy, paranormal, fantasy, series
11 → romance, historical, historical-romance, romance, historical, regency
12 → poetry, fiction, poems, poetry, poems, plays
13 → science, non-fiction, nonfiction, science, physics, math
14 → fiction, fantasy, science-fiction, life, fiction, world
15 → mystery, fiction, ebook, kindle, mystery, free
16 → children, p

In [35]:
cluster_label_map = {c: auto_cluster_label(c) for c in range(n_clusters)}
df["cluster_label"] = df["cluster"].map(cluster_label_map)

df[["title", "cluster", "cluster_label"]].head()

Unnamed: 0,title,cluster,cluster_label
0,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",1,"science-fiction, fiction, fantasy, fantasy, sc..."
1,Best Friends Forever,2,"fiction, novels, owned-books, fiction, literat..."
2,The House of Memory (Pluto's Snitch #2),9,"mystery, cozy-mystery, series, mystery, cozy, ..."
3,The Bonfire of the Vanities,2,"fiction, novels, owned-books, fiction, literat..."
4,Heaven,23,"christian, christian-fiction, fiction, christi..."
