# In [None]:
import ast
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# =========================
# Step 1. Load the CSV
# =========================
# Input file and row cap for this run.
# MAX_ROWS = None reads the whole file; an integer reads only the first
# MAX_ROWS data rows, which is handy for quick experiments.
CSV_PATH = "goodreads_books_clean.csv"
MAX_ROWS = 10000

t0 = time.time()
# Report the scope that is actually being read so the log matches reality.
read_scope = "全量行数" if MAX_ROWS is None else f"前 {MAX_ROWS} 行"
print(f"Step 1: 开始读取 CSV（{read_scope}）...")

df = pd.read_csv(CSV_PATH, nrows=MAX_ROWS)

print(f"Step 1 完成：读取到 {len(df)} 行，用时 {time.time() - t0:.1f} 秒")

# =========================
# Step 2. Convert stringified list columns back to Python lists
# =========================
def parse_list(x):
    """Turn one CSV cell holding a Python-literal sequence into a list.

    Args:
        x: The cell value. Usually a string such as ``"['a', 'b']"``; may
            also be a missing value (NaN / None) or an already-parsed
            list, tuple or set.

    Returns:
        A list. Missing values, empty or whitespace-only strings, text that
        is not a valid Python literal, and literals that are not a list,
        tuple or set all yield ``[]``.
    """
    # Already parsed, e.g. when this step is re-run on the same DataFrame.
    # This is checked before anything else because pd.isna() on a list
    # returns an array, whose truth value cannot be used in an ``if``.
    if isinstance(x, list):
        return x
    if isinstance(x, (tuple, set, frozenset)):
        return list(x)

    # NaN, None and any other non-string scalar carry no list content.
    if not isinstance(x, str) or not x.strip():
        return []

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the exceptions literal_eval is documented to raise on
        # malformed input; treat such cells as "no data".
        return []

    # Always hand back a list so downstream code can rely on the type.
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return []

t1 = time.time()
print("Step 2: 解析 authors / popular_shelves / similar_books ...")

# Parse the three list-like columns one after another and time each one.
# The first timer is t1 itself, so the "authors" figure is measured from the
# start of the step; later timers restart right after the previous report.
column_started = t1
for column_name in ("authors", "popular_shelves", "similar_books"):
    df[column_name] = df[column_name].apply(parse_list)
    print(f"  {column_name} 解析完成，用时 {time.time() - column_started:.1f} 秒")
    column_started = time.time()

print(f"Step 2 总耗时：{time.time() - t1:.1f} 秒")

# =========================
# Step 3. Basic filtering
# =========================
t4 = time.time()
print("Step 3: 做基本过滤（评分人数 / 语言）...")

# Keep a row only when it has at least 50 ratings AND its language_code is
# exactly "eng". Rows whose language_code is missing are dropped as well,
# because a missing value never compares equal to "eng".
has_enough_ratings = df["ratings_count"] >= 50
is_english = df["language_code"] == "eng"
df = df[has_enough_ratings & is_english]

print(f"过滤后的行数：{len(df)}，用时 {time.time() - t4:.1f} 秒")

# =========================
# Step 4. Build the shelves_text column
# =========================
print("Step 4: 准备 shelves_text 文本...")

# Collapse each book's shelf list into one space-separated string.
df["shelves_text"] = df["popular_shelves"].apply(" ".join)

# t4 was started at the top of Step 3, so this figure covers both steps.
print(f"Step 4 完成，用时 {time.time() - t4:.1f} 秒（包含过滤 + 文本拼接）")

# =========================
# Step 5. Compute the TF-IDF matrix
# =========================
print("Step 5: 开始计算 TF-IDF 向量")
t5 = time.time()

# Vectorizer settings, collected in one place:
#   max_features  - vocabulary size cap; lower it (e.g. 2000) if memory is tight
#   token_pattern - treats every run of word characters as a token
tfidf_options = {
    "max_features": 5000,
    "token_pattern": r"(?u)\b\w+\b",
}
vectorizer = TfidfVectorizer(**tfidf_options)

# One row per row of df, in the same order as df["shelves_text"].
X_shelves = vectorizer.fit_transform(df["shelves_text"])

print(f"TF-IDF 完成，矩阵维度：{X_shelves.shape}，用时 {time.time() - t5:.1f} 秒")

# =========================
# Step 6. Similar-book recommendation helpers
# =========================
def get_index_by_title(title: str):
    """Return the row position in ``df`` of the first book titled ``title``.

    The comparison is case-insensitive but otherwise exact. The result is a
    0-based row *position*, not a pandas index label: ``df`` keeps its
    original labels after filtering, so a label would not line up with
    positional lookups such as ``df.iloc[...]`` or rows of a matrix built
    from ``df``.

    Args:
        title: Book title to look up.

    Returns:
        The integer position of the first matching row, or ``None`` (after
        printing a message) when no row matches.
    """
    wanted = title.lower()
    # Missing titles compare as "not equal"; fillna guards against nullable
    # dtypes where the comparison itself would yield a missing value.
    is_match = df["title"].str.lower().eq(wanted).fillna(False).to_numpy(dtype=bool)
    positions = np.flatnonzero(is_match)
    if positions.size == 0:
        print(f"没有找到书名：{title}")
        return None
    return int(positions[0])

def recommend_similar_books(title: str, top_n: int = 5):
    """Print the ``top_n`` books most similar to ``title``.

    Similarity is the cosine similarity between the book's row of
    ``X_shelves`` and every other row. The value returned by
    ``get_index_by_title`` is used as a row position, both into
    ``X_shelves`` and into ``df.iloc``.

    Args:
        title: Title of the query book.
        top_n: Maximum number of similar books to print.

    Returns:
        None. Results are printed; nothing is printed here when the title
        lookup returns ``None``.
    """
    query_row = get_index_by_title(title)
    if query_row is None:
        return

    # Similarity of the query book against every row of the matrix.
    scores = cosine_similarity(X_shelves[query_row], X_shelves).flatten()

    # Highest score first, drop the query book itself, keep the first top_n.
    ranked = np.argsort(-scores)
    neighbours = ranked[ranked != query_row][:top_n]

    display_columns = ["title", "average_rating", "ratings_count", "image_url"]
    table = df.iloc[neighbours][display_columns]

    print(f"\n和《{title}》最相似的 {top_n} 本书：\n")
    print(table.to_string(index=False))

# =========================
# Step 7. Run one quick test when the file is executed directly
# =========================
if __name__ == "__main__":
    # Put a title here that you know exists in the CSV; the lookup ignores
    # case but otherwise requires the text to match exactly.
    test_title = "Best Friends Forever"
    recommend_similar_books(test_title, top_n=5)