In [None]:
import os
import requests
import time
import json
import random

headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json",
    "Referer": "https://www.reddit.com/r/KenduInu_Ecosystem/",
}

# === 配置路径 ===
json_dir = "Kendu_JSON"
post_log_file = os.path.join(json_dir, "saved_post_ids.txt")
after_checkpoint_file = os.path.join(json_dir, "after_checkpoint.txt")
page_index_file = os.path.join(json_dir, "page_index.txt")

TOTAL_PAGES = 200000
os.makedirs(json_dir, exist_ok=True)

# === 加载已保存的 post_ids ===
saved_post_ids = set()
if os.path.exists(post_log_file):
    with open(post_log_file, "r") as f:
        saved_post_ids = set(line.strip() for line in f)

# === 加载分页参数 ===
after = None
if os.path.exists(after_checkpoint_file):
    with open(after_checkpoint_file, "r") as f:
        after = f.read().strip()

page = 0
if os.path.exists(page_index_file):
    with open(page_index_file, "r") as f:
        page = int(f.read().strip())

print(f"📌 Resuming from page {page} with after = {after}")

# === 开始抓取页面 ===
for _ in range(page, TOTAL_PAGES):
    url = "https://www.reddit.com/r/KenduInu_Ecosystem/.json"
    if after:
        url += f"?after={after}"

    try:
        res = requests.get(url, headers=headers)
        data = res.json()
    except Exception as e:
        print(f"❌ Error fetching page {page}: {e}")
        time.sleep(5)
        continue

    children = data.get("data", {}).get("children", [])
    after = data.get("data", {}).get("after")

    new_posts = []
    for child in children:
        post_id = child["data"].get("id")
        if post_id and post_id not in saved_post_ids:
            new_posts.append(child)
            saved_post_ids.add(post_id)

    if not new_posts:
        print(f"⚠️ Page {page} contains only duplicate posts. Skipping.")
    else:
        json_filename = f"page_{page}.json"
        save_data = {"data": {"children": new_posts}}
        with open(os.path.join(json_dir, json_filename), "w", encoding="utf-8") as f:
            json.dump(save_data, f, ensure_ascii=False, indent=2)

        # 更新日志和断点
        with open(post_log_file, "a") as f:
            for post in new_posts:
                f.write(post["data"]["id"] + "\n")

        print(f"✅ Saved {json_filename} with {len(new_posts)} new posts.")

    with open(after_checkpoint_file, "w") as f:
        f.write(after if after else "")

    page += 1
    with open(page_index_file, "w") as f:
        f.write(str(page))

    if not after:
        print("✅ No more pages. Stopping.")
        break

    time.sleep(random.uniform(10, 20))


📌 Resuming from page 0 with after = None
✅ Saved page_0.json with 26 new posts.
✅ Saved page_1.json with 25 new posts.
✅ Saved page_2.json with 25 new posts.
✅ Saved page_3.json with 25 new posts.
✅ Saved page_4.json with 25 new posts.
✅ Saved page_5.json with 25 new posts.
✅ Saved page_6.json with 25 new posts.
✅ Saved page_7.json with 25 new posts.
✅ Saved page_8.json with 25 new posts.
✅ Saved page_9.json with 25 new posts.
✅ Saved page_10.json with 25 new posts.
✅ Saved page_11.json with 25 new posts.
✅ Saved page_12.json with 25 new posts.
✅ Saved page_13.json with 25 new posts.
✅ Saved page_14.json with 25 new posts.
✅ Saved page_15.json with 25 new posts.
✅ Saved page_16.json with 25 new posts.
✅ Saved page_17.json with 25 new posts.
✅ Saved page_18.json with 25 new posts.
✅ Saved page_19.json with 25 new posts.
✅ Saved page_20.json with 25 new posts.
✅ Saved page_21.json with 25 new posts.
✅ Saved page_22.json with 25 new posts.
✅ Saved page_23.json with 25 new posts.
✅ Saved p

In [None]:
import os
import json
import csv
import requests
from datetime import datetime
import time
import random

# === 配置路径 ===
json_dir = "Kendu_JSON"
output_dir = "Kendu_CSV"
processed_file = os.path.join(output_dir, "processed_pages.txt")
error_dir = os.path.join(output_dir, "errors")

# === 创建错误保存目录（如不存在） ===
os.makedirs(error_dir, exist_ok=True)

# === 初始化已处理页面 ===
processed_pages = set()
if os.path.exists(processed_file):
    with open(processed_file, "r") as f:
        processed_pages = set(line.strip() for line in f)

# === Headers 设置 ===
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "application/json",
    "Referer": "https://www.reddit.com/r/KenduInu_Ecosystem/",
}

# === 递归函数：展开所有层级评论 ===
def extract_comments(comments_list, post_id, all_comments):
    for c in comments_list:
        if c.get("kind") != "t1":
            continue
        data = c["data"]
        comment = {
            "post_id": post_id,
            "author": data.get("author"),
            "body": data.get("body", "").replace("\n", " "),
            "score": data.get("score"),
            "created_utc": data.get("created_utc"),
            "date": datetime.utcfromtimestamp(data.get("created_utc")).strftime('%Y-%m-%d %H:%M:%S') if data.get("created_utc") else ""
        }
        all_comments.append(comment)
        # 递归抓取子评论
        if data.get("replies") and isinstance(data["replies"], dict):
            replies = data["replies"]["data"]["children"]
            extract_comments(replies, post_id, all_comments)

# === 遍历所有 JSON 文件 ===
for filename in os.listdir(json_dir):
    if not filename.endswith(".json") or filename in processed_pages:
        continue

    json_path = os.path.join(json_dir, filename)
    with open(json_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except Exception as e:
            print(f"❌ Failed to load {filename}: {e}")
            continue

    comments_data = []
    for child in data.get("data", {}).get("children", []):
        post_id = child["data"].get("id")
        if not post_id:
            continue

        comment_url = f"https://www.reddit.com/comments/{post_id}.json"

        try:
            res = requests.get(comment_url, headers=headers)
            if res.status_code == 429:
                wait = int(res.headers.get("Retry-After", 10))
                print(f"⏳ Rate limited on post {post_id}, waiting {wait} seconds...")
                time.sleep(wait)
                continue

            if res.status_code != 200:
                print(f"⚠️ Skipping post {post_id}, status code {res.status_code}")
                continue

            if not res.text.strip():
                print(f"⚠️ Empty response for post {post_id}, possible rate limit or server error")
                continue

            try:
                thread_data = res.json()
            except json.JSONDecodeError:
                error_path = os.path.join(error_dir, f"error_{post_id}.html")
                with open(error_path, "w", encoding="utf-8") as ef:
                    ef.write(res.text)
                print(f"❌ Error parsing JSON for post {post_id}, saved raw response to {error_path}")
                continue

            if not isinstance(thread_data, list) or len(thread_data) < 2:
                print(f"⚠️ Invalid structure for post {post_id}, skipping")
                continue

            comments_list = thread_data[1]["data"]["children"]
            extract_comments(comments_list, post_id, comments_data)
            print(f"✅ {filename} - Post {post_id} -> {len(comments_data)} comments collected so far")
        except Exception as e:
            print(f"❌ Error fetching comments for post {post_id}: {e}")

        time.sleep(random.uniform(1, 2))  # 防止限速

    # 保存为对应 CSV 文件
    csv_filename = filename.replace(".json", ".csv")
    csv_path = os.path.join(output_dir, csv_filename)
    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["post_id", "author", "body", "score", "created_utc", "date"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in comments_data:
            writer.writerow(row)

    # 标记为已处理
    with open(processed_file, "a") as f:
        f.write(f"{filename}\n")

    print(f"📁 Finished {filename} -> Saved to {csv_filename}")



  "date": datetime.utcfromtimestamp(data.get("created_utc")).strftime('%Y-%m-%d %H:%M:%S') if data.get("created_utc") else ""


✅ page_4.json - Post 1htwf97 -> 7 comments collected so far
✅ page_4.json - Post 1htq1j6 -> 31 comments collected so far
✅ page_4.json - Post 1htebwn -> 31 comments collected so far
✅ page_4.json - Post 1ht7xza -> 93 comments collected so far
✅ page_4.json - Post 1htcufp -> 93 comments collected so far
✅ page_4.json - Post 1hschww -> 118 comments collected so far
✅ page_4.json - Post 1hsa13f -> 124 comments collected so far
✅ page_4.json - Post 1hsdx5k -> 128 comments collected so far
✅ page_4.json - Post 1hod5wp -> 159 comments collected so far
✅ page_4.json - Post 1hofixi -> 189 comments collected so far
✅ page_4.json - Post 1hodket -> 198 comments collected so far
✅ page_4.json - Post 1hn0slx -> 238 comments collected so far
✅ page_4.json - Post 1hl4zli -> 265 comments collected so far
✅ page_4.json - Post 1hl4vw9 -> 289 comments collected so far
✅ page_4.json - Post 1hkur8k -> 314 comments collected so far
✅ page_4.json - Post 1hkl4vg -> 337 comments collected so far
✅ page_4.json 

In [None]:
import os
import pandas as pd

# 设置 CSV 文件夹路径
csv_dir = "Kendu_CSV"

# 收集所有 CSV 文件路径
csv_files = [os.path.join(csv_dir, f) for f in os.listdir(csv_dir) if f.endswith(".csv")]

# 合并所有 CSV 文件的数据
all_data = pd.concat([pd.read_csv(f, usecols=["body"]) for f in csv_files], ignore_index=True)

# 统计
total_rows = len(all_data)
unique_bodies = all_data["body"].drop_duplicates()
unique_rows = len(unique_bodies)
duplicate_rows = total_rows - unique_rows

# 输出结果
print(f"📄 Total comment rows: {total_rows}")
print(f"🔁 Duplicate bodies: {duplicate_rows}")
print(f"✅ Unique bodies: {unique_rows}")


📄 Total comment rows: 11220
🔁 Duplicate bodies: 772
✅ Unique bodies: 10448


In [None]:
import os
import pandas as pd

# 输入和输出路径
input_dir = "Kendu_CSV"
output_file = os.path.join(input_dir, "may_7_merged_comments.csv")

# 合并所有 CSV 文件
all_dfs = []
for filename in os.listdir(input_dir):
    if filename.endswith(".csv") and filename != "may_7_merged_comments.csv":
        file_path = os.path.join(input_dir, filename)
        try:
            df = pd.read_csv(file_path)
            all_dfs.append(df)
            print(f"✅ Loaded {filename} with {len(df)} rows.")
        except Exception as e:
            print(f"❌ Failed to load {filename}: {e}")

# 合并并去重（可选）
if all_dfs:
    merged_df = pd.concat(all_dfs, ignore_index=True)
    print(f"📊 Total merged rows (before deduplication): {len(merged_df)}")

    # 可选：基于 comment_id 去重（或改为 'body'）
    merged_df.drop_duplicates(subset="body", inplace=True)
    print(f"🧹 Rows after deduplication: {len(merged_df)}")

    # 保存合并后的结果
    merged_df.to_csv(output_file, index=False)
    print(f"📁 Merged file saved to: {output_file}")
else:
    print("⚠️ No CSV files found or all failed to load.")


✅ Loaded page_14.csv with 71 rows.
✅ Loaded page_11.csv with 447 rows.
✅ Loaded page_30.csv with 1698 rows.
✅ Loaded page_8.csv with 496 rows.
✅ Loaded page_16.csv with 32 rows.
✅ Loaded page_5.csv with 599 rows.
✅ Loaded page_6.csv with 469 rows.
✅ Loaded page_10.csv with 430 rows.
✅ Loaded page_20.csv with 436 rows.
✅ Loaded page_25.csv with 331 rows.
✅ Loaded page_21.csv with 223 rows.
✅ Loaded page_3.csv with 332 rows.
✅ Loaded page_15.csv with 50 rows.
✅ Loaded page_2.csv with 243 rows.
✅ Loaded page_18.csv with 144 rows.
✅ Loaded page_7.csv with 405 rows.
✅ Loaded page_13.csv with 38 rows.
✅ Loaded page_19.csv with 202 rows.
✅ Loaded page_24.csv with 66 rows.
✅ Loaded page_22.csv with 198 rows.
✅ Loaded page_1.csv with 103 rows.
✅ Loaded page_28.csv with 281 rows.
✅ Loaded page_23.csv with 215 rows.
✅ Loaded page_9.csv with 854 rows.
✅ Loaded page_29.csv with 1137 rows.
✅ Loaded page_26.csv with 170 rows.
✅ Loaded page_12.csv with 483 rows.
✅ Loaded page_4.csv with 502 rows.
✅ Lo

In [1]:
import pandas as pd

# 读取合并后的 CSV 文件
df = pd.read_csv("Kendu_CSV/may_7_merged_comments.csv")

# 确保 'date' 列为 datetime 类型
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# 删除无法解析的日期
df = df.dropna(subset=['date'])

# 提取年份
df['year'] = df['date'].dt.year

# 按年份统计频率
year_counts = df['year'].value_counts().sort_index()

print("📅 每年评论数量分布：")
print(year_counts)

📅 每年评论数量分布：
year
2025    7941
Name: count, dtype: int64


In [5]:
# 提取月份
df['month'] = df['date'].dt.month

# 按年份和月份统计频率
year_month_counts = df.groupby(['year', 'month']).size().unstack(fill_value=0)

print("\n📅 每年每月评论数量分布：")
print(year_month_counts)



📅 每年每月评论数量分布：
month     1     2     3     4    5
year                              
2025   2991  2358  1204  1168  220


In [7]:
# 避免显示被省略
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', None)


# 解析日期并删除无法解析的项
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])

# 提取年、月、日
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# 筛选出年份为 2025 的数据
df_2025 = df[df['year'] == 2025]

# 分组统计每年每月每天的评论数量
daily_counts = df_2025.groupby(['year', 'month', 'day']).size().reset_index(name='count')

# 构造透视表：年+月 为行，日为列
pivot_table = daily_counts.pivot_table(
    index=['year', 'month'], columns='day', values='count', fill_value=0
)

# 输出整洁表格
print("📅 2025 年每月每天的评论数量分布：")
print(pivot_table)

  


📅 2025 年每月每天的评论数量分布：
day           1     2     3     4      5     6      7      8     9     10    11     12     13     14    15    16     17     18     19     20     21     22     23     24     25     26     27     28  \
year month                                                                                                                                                                                             
2025 1       0.0   0.0   0.0   0.0    0.0   0.0    0.0    0.0   0.0   0.0   0.0    0.0    0.0    0.0   0.0   0.0    0.0  217.0  245.0  307.0  476.0  354.0  286.0  244.0  187.0  105.0  135.0  117.0   
     2      91.0  63.0  90.0  75.0  104.0  82.0   96.0  108.0  53.0  88.0  73.0  123.0  112.0  117.0  22.0  81.0  118.0  105.0  170.0   42.0  125.0   31.0   38.0   93.0   48.0   67.0   74.0   69.0   
     3      53.0  47.0  34.0  19.0   23.0  15.0  104.0   43.0  51.0  40.0  37.0   21.0   20.0   50.0  44.0  27.0   22.0   26.0   22.0   22.0   32.0   34.0   38.0   64.0   62.0   8