In [None]:
import pandas as pd

# Location of the raw Reddit comment export.
file_path = "Dogecoin_CSVs/Dogecoin_Reddit_2023-01-01_to_2024-12-31.csv"

# Read the whole export into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

# Report how many rows (one per comment) were loaded.
comment_count = len(df)
print(f"📊 总评论数（行数）：{comment_count}")

In [None]:
from pathlib import Path

import pandas as pd

input_path = "Dogecoin_CSVs/Dogecoin_Reddit_2023-01-01_to_2024-12-31.csv"
output_path = "Dogecoin_Processed/Dogecoin_Reddit_llm_trackers.csv"

# Load the raw export and prepend a unique row id. The id is the 1-based row
# position, and the classification cell uses it to decide which rows are done.
df = pd.read_csv(input_path)
df.insert(0, "llm_tracker", range(1, len(df) + 1))

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("✅ 已成功添加 llm_tracker 列。")


In [None]:
import csv
import requests
import time
import os

# Model settings: which Ollama model to query and where the local API listens.
MODEL_NAME = "llama3.3:70b-instruct-q3_K_S"
OLLAMA_API_URL = "http://localhost:11434/api/generate"

# File paths: the id-tagged input, the labelled output, and the error log.
input_csv = "Dogecoin_Processed/Dogecoin_Reddit_llm_trackers.csv"
output_csv = "Dogecoin_Processed/Dogecoin_Reddit_Processed.csv"
error_log_csv = "Dogecoin_Processed/error_log.csv"

# Prompt template. One list entry per prompt line (empty entries are blank
# lines); "{text}" is the slot filled in with the comment via str.format.
PROMPT_TEMPLATE = "\n".join([
    "You are a financial language analysis assistant.",
    "Your task is to analyze the following Reddit comment and classify it with financial relevance and sentiment.",
    "",
    "If the comment is NOT related to dogecoin markets, simply return:",
    "relevance: false",
    "",
    "If it IS related to markets, return in the following structured format:",
    "relevance: true, sentiment: [positive/neutral/negative], emotion_type: [euphoria, fear, anger, FOMO, uncertainty, etc.], "
    "volatility_signal: [low/medium/high], stance: [bullish/bearish/neutral]",
    "",
    "Be concise and always output in a single line.",
    "",
    'Comment: "{text}"',
])

def classify_comment(text, timeout=600):
    """Classify one Reddit comment with the configured Ollama model.

    The comment is inserted into PROMPT_TEMPLATE, sent to OLLAMA_API_URL as a
    non-streaming generate request, and the model's single-line
    "key: value, key: value" reply is parsed into a flat dict.

    Args:
        text: Raw comment body. Surrounding whitespace is stripped and
            newlines are replaced with spaces before it goes into the prompt.
        timeout: Seconds to wait for the HTTP request before giving up, so
            that one stalled generation cannot hang a whole batch run.

    Returns:
        dict with the keys "sentiment", "relevance", "emotion_type",
        "volatility_signal" and "stance". When the model reports the comment
        as not relevant, "relevance" is "false" and every other value is
        "none". Fields missing from a relevant reply default to "none".

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
        KeyError: if the JSON reply has no "response" field.
    """
    prompt = PROMPT_TEMPLATE.format(text=text.strip().replace("\n", " "))
    response = requests.post(
        OLLAMA_API_URL,
        json={"model": MODEL_NAME, "prompt": prompt, "stream": False},
        timeout=timeout,
    )
    # Surface HTTP errors directly instead of as an opaque KeyError below.
    response.raise_for_status()
    result = response.json()["response"]

    not_relevant = {
        "sentiment": "none",
        "relevance": "false",
        "emotion_type": "none",
        "volatility_signal": "none",
        "stance": "none"
    }
    if "relevance: false" in result.lower():
        return not_relevant

    # Split each comma-separated chunk on its FIRST colon only, so a value
    # that itself contains ":" no longer breaks parsing. Keys are lower-cased
    # because the model does not always keep the prompt's exact casing.
    parts = {}
    for pair in result.split(","):
        key, sep, value = pair.partition(":")
        if sep:
            parts[key.strip().lower()] = value.strip()

    # Also catches spacing variants such as "relevance:false".
    if parts.get("relevance", "").lower() == "false":
        return not_relevant

    return {
        "sentiment": parts.get("sentiment", "none"),
        "relevance": parts.get("relevance", "true"),
        "emotion_type": parts.get("emotion_type", "none"),
        "volatility_signal": parts.get("volatility_signal", "none"),
        "stance": parts.get("stance", "none")
    }

# --------- Resume support ---------
# Collect the llm_tracker ids already present in the output file so that a
# restarted run skips them instead of classifying them again.
processed_ids = set()
if os.path.exists(output_csv):
    with open(output_csv, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        processed_ids = {row["llm_tracker"] for row in reader if "llm_tracker" in row}

# --------- Main logic ---------
# Stream the input row by row, append each classified row to the output, and
# record failures in the error log. Both files are opened in append mode so
# earlier results survive a restart.
error_count = 0
with open(input_csv, newline='', encoding='utf-8') as infile, \
     open(output_csv, 'a', newline='', encoding='utf-8') as outfile, \
     open(error_log_csv, 'a', newline='', encoding='utf-8') as errorfile:

    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames + ["sentiment", "relevance", "emotion_type", "volatility_signal", "stance"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    error_writer = csv.DictWriter(errorfile, fieldnames=["llm_tracker", "error_message"])

    # Write headers only when the files are empty (i.e. on the first run).
    if os.stat(output_csv).st_size == 0:
        writer.writeheader()
    if os.stat(error_log_csv).st_size == 0:
        error_writer.writeheader()

    for idx, row in enumerate(reader, start=1):
        comment_id = row.get("llm_tracker")
        if comment_id in processed_ids:
            continue

        # DictReader fills fields missing from a short row with None, so
        # normalise to "" before calling .strip(); otherwise one malformed
        # row would raise outside the try block and abort the whole run.
        comment = row.get("body") or ""
        if not comment.strip():
            continue

        try:
            print(f"[{idx}] Processing comment (llm_tracker {comment_id}): {comment[:50]}...")
            tags = classify_comment(comment)
            row.update(tags)
            writer.writerow(row)
            # Flush per row so an interrupted run loses at most the current row.
            outfile.flush()
        except Exception as e:
            # Best effort: log the failure and keep going. The row is not
            # written to the output, so a later run will retry it.
            error_count += 1
            print(f"❌ Error at comment {comment_id}: {e}")
            error_writer.writerow({"llm_tracker": comment_id, "error_message": str(e)})
            errorfile.flush()
        # Short pause between requests.
        time.sleep(0.5)

print("✅ 所有评论处理完毕，结构化标签已写入输出文件。")
# Only point at the error log when this run actually recorded a failure.
if error_count:
    print("⚠️ 错误评论已写入 error_log.csv。")


[7228] Processing comment (llm_tracker 7228): What’s about to happen?...
[7229] Processing comment (llm_tracker 7229): Thanks this just reminded me I need to buy more...
[7230] Processing comment (llm_tracker 7230): #JDK Just don’t know...
[7231] Processing comment (llm_tracker 7231): Just bought some too. Let's go!...
[7232] Processing comment (llm_tracker 7232): Likely going to see a big dump from the pumpers...
[7233] Processing comment (llm_tracker 7233): How did you call that one. Nice job!🤙...
[7234] Processing comment (llm_tracker 7234): I will trust this bro haha...
[7235] Processing comment (llm_tracker 7235): 1. W...
[7236] Processing comment (llm_tracker 7236): Good Trust Me Bro TA Right there - ILL TAKE IT MOO...
[7237] Processing comment (llm_tracker 7237): Imthe rockets are lit bro.... The g-forces growing...
[7238] Processing comment (llm_tracker 7238): Picking up speed...
[7239] Processing comment (llm_tracker 7239): I had this over a year ago when I started my accou...