In [None]:
import os
import time
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw

# Reddit API credentials are read from the environment so that secrets never
# live in the notebook or in version control. Set REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before running; a missing variable raises KeyError here.
# REDDIT_USER_AGENT is optional and falls back to the original user agent.
# NOTE(review): the key pair that used to be hardcoded in this cell must be
# treated as leaked - revoke it in the Reddit app settings and issue a new one.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=os.environ.get("REDDIT_USER_AGENT", "reddit-crawler by /u/alpaca_1"),
)

# === Parameters ===
subreddit_name = "dogecoin"
query = "dogecoin"
# Day to collect, pinned to UTC. The window below is compared against
# `created_utc` values; a naive datetime's .timestamp() is interpreted in the
# machine's local time zone, which would shift the window by the local UTC
# offset on any machine that is not running in UTC.
target_date = datetime(2025, 4, 23, tzinfo=timezone.utc)  # replace with the date you want
next_date = target_date + timedelta(days=1)

# Half-open window [start_timestamp, end_timestamp) in UTC epoch seconds
start_timestamp = int(target_date.timestamp())
end_timestamp = int(next_date.timestamp())

# === Collect post IDs ===
subreddit = reddit.subreddit(subreddit_name)

# Run a subreddit search (sort="new", time_filter="all", no result limit) and
# keep the IDs of submissions whose creation time falls inside the half-open
# window [start_timestamp, end_timestamp).
post_ids = [
    submission.id
    for submission in subreddit.search(query, sort="new", time_filter="all", limit=None)
    if start_timestamp <= submission.created_utc < end_timestamp
]

print(f"🔍 找到 {len(post_ids)} 篇 {target_date.date()} 当天的包含 'dogecoin' 的帖子")

# === Fetch comments and collect them as row dicts ===
rows = []

for pid in post_ids:
    try:
        submission = reddit.submission(id=pid)
        # limit=0 removes the "load more comments" placeholders instead of
        # fetching them, so only the comments already loaded are kept.
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            # Timezone-aware conversion: datetime.utcfromtimestamp() is
            # deprecated since Python 3.12; the formatted text is unchanged.
            created = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
            rows.append({
                "post_id": pid,
                # A falsy author is recorded as the "N/A" placeholder
                "author": str(comment.author) if comment.author else "N/A",
                # Flatten newlines so each comment body stays on one line
                "body": comment.body.replace("\n", " "),
                "score": comment.score,
                "created_utc": comment.created_utc,
                "date": created.strftime('%Y-%m-%d %H:%M:%S'),
            })
    except Exception as e:
        # Best effort: report the failing post and move on to the next one.
        print(f"⚠️ Error fetching comments for post {pid}: {e}")
    finally:
        # Throttle after every post, including failed ones, so an error does
        # not make the loop fire the next request immediately.
        time.sleep(1)

# === Save as CSV ===
# The file name embeds the target day; `columns=` fixes the column order.
output_filename = f"dogecoin_comments_{target_date.date()}.csv"
csv_columns = ["post_id", "author", "body", "score", "created_utc", "date"]
df = pd.DataFrame(rows, columns=csv_columns)
df.to_csv(output_filename, index=False, encoding="utf-8")
print(f"✅ 已保存评论至 {output_filename}，共 {len(df)} 条")


In [None]:
from datasets import load_dataset
import pandas as pd
import os
from datetime import datetime

# Destination CSV; its parent folder is created first if it does not exist
output_path = "Dogecoin_Comments_HF/dogecoin_comments_pushshift.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Open the Pushshift Reddit comments dataset from Hugging Face as a stream
print("🔄 开始加载 Hugging Face 上的 Reddit 评论数据（pushshift）...")

dataset = load_dataset("fddemarco/pushshift-reddit-comments", split="train", streaming=True)


# Scan the stream and keep r/dogecoin comments until max_comments are collected
comments = []
max_comments = 50000  # raise to 100000 or more if needed
print(f"🔍 正在筛选 r/dogecoin 的前 {max_comments} 条评论...")

for i, comment in enumerate(dataset):
    # `or ""` keeps the comparison safe when the field is present but None
    if (comment.get("subreddit") or "").lower() == "dogecoin":
        try:
            created_utc = comment.get("created_utc")
            comments.append({
                # Keep the part after the last "_" (presumably strips a type
                # prefix such as "t3_" - confirm against the dataset schema)
                "post_id": comment.get("link_id", "").split("_")[-1],
                "author": comment.get("author", "N/A"),
                # Flatten newlines so each comment body stays on one line
                "body": comment.get("body", "").replace("\n", " "),
                "score": comment.get("score", 0),
                "created_utc": created_utc,
                # Epoch seconds -> UTC text, same conversion as the sample cell
                "date": pd.to_datetime(created_utc, unit="s", utc=True).strftime("%Y-%m-%d %H:%M:%S"),
            })
        except Exception as e:
            # A single malformed record (missing or None field) must not abort
            # the scan and throw away everything collected so far.
            print(f"⚠️ 跳过错误数据：{e}")
    # Progress is reported per scanned record, not per matching comment
    if i % 5000 == 0 and i > 0:
        print(f"📥 已抓取 {i} 条，dogecoin 评论数量：{len(comments)}")
    if len(comments) >= max_comments:
        break

# Build the DataFrame with a fixed column order, then write it to CSV
column_order = ["post_id", "author", "body", "score", "created_utc", "date"]
df = pd.DataFrame(comments, columns=column_order)
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"✅ 成功保存 {len(df)} 条评论至 {output_path}")


In [None]:
from datasets import load_dataset
import pandas as pd
import os

# Configuration
SUBREDDIT_NAME = "dogecoin"
MAX_COMMENTS = 10000  # adjust to available memory and needs
OUTPUT_DIR = "Dogecoin_Comments_HF"

# Make sure the output directory exists, then derive the CSV path inside it
os.makedirs(OUTPUT_DIR, exist_ok=True)
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "{}_comments_sample.csv".format(SUBREDDIT_NAME))

# Open the Pushshift Reddit comments dataset (fddemarco version) as a stream
print("🔄 正在加载数据集（Hugging Face - fddemarco/pushshift-reddit-comments）...")
dataset = load_dataset("fddemarco/pushshift-reddit-comments", split="train", streaming=True)

# Keep only comments from r/<SUBREDDIT_NAME>. The match is case-insensitive and
# tolerates a missing or None subreddit field, like the 50k-comment cell.
print(f"🔍 正在筛选 r/{SUBREDDIT_NAME} 的评论...")
target_subreddit = SUBREDDIT_NAME.lower()
filtered = dataset.filter(lambda x: (x.get("subreddit") or "").lower() == target_subreddit)

# Collect rows
comments = []
for row in filtered:
    try:
        comments.append({
            # Keep the part after the last "_" (presumably strips a type prefix
            # such as "t3_" - confirm against the dataset schema)
            "post_id": row.get("link_id", "").split("_")[-1],
            "author": row.get("author", "N/A"),
            # Flatten newlines so each comment body stays on one line
            "body": row.get("body", "").replace("\n", " "),
            "score": row.get("score", 0),
            "created_utc": row.get("created_utc"),
            "date": pd.to_datetime(row.get("created_utc"), unit='s', utc=True).strftime("%Y-%m-%d %H:%M:%S"),
        })
    except Exception as e:
        # Skip malformed records instead of aborting the whole scan
        print(f"⚠️ 跳过错误数据：{e}")
        continue

    # Count rows actually collected, so skipped records neither shrink the
    # sample below MAX_COMMENTS nor distort the progress numbers.
    collected = len(comments)
    if collected % 1000 == 0:
        print(f"📥 已抓取 {collected} 条评论...")
    if collected >= MAX_COMMENTS:
        break

# Save as CSV with a fixed column order
sample_columns = ["post_id", "author", "body", "score", "created_utc", "date"]
df = pd.DataFrame(comments, columns=sample_columns)
df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
print(f"✅ 已保存 {len(df)} 条评论至 {OUTPUT_FILE}")


  from .autonotebook import tqdm as notebook_tqdm


🔄 正在加载数据集（Hugging Face - fddemarco/pushshift-reddit-comments）...
🔍 正在筛选 r/dogecoin 的评论...
