In [None]:
import pandas as pd

# Path (relative to the working directory) of the Dogecoin Reddit comment CSV
file_path = "Dogecoin_CSVs/Dogecoin_Reddit_2023-01-01_to_2024-12-31.csv"

# Load the whole CSV into a DataFrame
df = pd.read_csv(file_path)

# Print the number of rows; the message labels this as the total comment count
print(f"📊 总评论数（行数）：{len(df)}")

In [None]:
import csv
import requests
import time

# Ollama settings: model tag and the generate endpoint it is served from
MODEL_NAME = "llama3.3:70b-instruct-q3_K_S"
OLLAMA_API_URL = "http://localhost:11434/api/generate"

# Input / output CSV paths (both files live in the same directory)
_DATA_DIR = "/home/jesse/Projects/myprojs/MT_1/00_Testing/Dogecoin_Comments_HF"
input_csv = f"{_DATA_DIR}/test.csv"
output_csv = f"{_DATA_DIR}/labeled_test_2.csv"

# Prompt template, one list entry per output line; the `{text}` placeholder
# is filled in with the comment via str.format.
PROMPT_TEMPLATE = "\n".join([
    "You are a financial language analysis assistant.",
    "Your task is to analyze the following Reddit comment and classify it with financial relevance and sentiment.",
    "",
    "If the comment is NOT related to dogecoin markets, simply return:",
    "relevance: false",
    "",
    "If it IS related to markets, return in the following structured format:",
    "relevance: true, sentiment: [positive/neutral/negative], "
    "emotion_type: [euphoria, fear, anger, FOMO, uncertainty, etc.], "
    "volatility_signal: [low/medium/high], stance: [bullish/bearish/neutral]",
    "",
    "Be concise and always output in a single line.",
    "",
    'Comment: "{text}"',
])

def classify_comment(text):
    """Classify one Reddit comment with the local Ollama model.

    The comment is stripped, its newlines are replaced by spaces, and it is
    inserted into ``PROMPT_TEMPLATE``. The prompt is POSTed to
    ``OLLAMA_API_URL`` (non-streaming) and the ``response`` field of the JSON
    reply is parsed as a comma-separated list of ``key: value`` pairs.

    Args:
        text: The comment body as a string.

    Returns:
        A dict with the keys ``sentiment``, ``relevance``, ``emotion_type``,
        ``volatility_signal`` and ``stance``. If the reply contains
        ``relevance: false`` (case-insensitive), or the reply cannot be
        read (HTTP error status, invalid JSON, missing ``response`` field),
        every field is ``"none"`` and ``relevance`` is ``"false"``. Fields
        missing from an otherwise parseable reply default to ``"none"``
        (``relevance`` defaults to ``"true"``).

    Raises:
        requests.RequestException: If the POST itself fails (for example the
            server is unreachable); the request is made outside the
            best-effort handler.
    """
    def _unlabeled():
        # Build a fresh dict per call so callers can mutate the result freely.
        return {
            "sentiment": "none",
            "relevance": "false",
            "emotion_type": "none",
            "volatility_signal": "none",
            "stance": "none"
        }

    prompt = PROMPT_TEMPLATE.format(text=text.strip().replace("\n", " "))
    response = requests.post(OLLAMA_API_URL, json={
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False
    })

    try:
        # Surface HTTP errors with a descriptive message instead of a bare
        # KeyError on the missing "response" field.
        response.raise_for_status()
        result = response.json()["response"]

        if "relevance: false" in result.lower():
            return _unlabeled()

        # Parse "key: value" pairs. Splitting on the FIRST colon only keeps a
        # value that itself contains a colon from invalidating the whole
        # reply; keys are lower-cased so "Sentiment: ..." is still recognised.
        parts = {}
        for pair in result.split(","):
            key, sep, value = pair.partition(":")
            if sep:
                parts[key.strip().lower()] = value.strip()

        return {
            "sentiment": parts.get("sentiment", "none"),
            "relevance": parts.get("relevance", "true"),  # relevant by default
            "emotion_type": parts.get("emotion_type", "none"),
            "volatility_signal": parts.get("volatility_signal", "none"),
            "stance": parts.get("stance", "none")
        }

    except Exception as e:
        # Best-effort boundary: report the problem and label the comment as
        # unclassified rather than aborting the whole run.
        print(f"Error processing comment: {e}")
        return _unlabeled()

# Main processing: read the input CSV, label every non-empty comment with
# classify_comment, and write the original columns plus the five label
# columns to the output CSV.
with open(input_csv, newline='', encoding='utf-8') as infile, \
     open(output_csv, 'w', newline='', encoding='utf-8') as outfile:

    reader = csv.DictReader(infile)
    # reader.fieldnames is None when the input file is empty; fall back to an
    # empty column list instead of failing on `None + [...]`.
    fieldnames = list(reader.fieldnames or []) + ["sentiment", "relevance", "emotion_type", "volatility_signal", "stance"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    for idx, row in enumerate(reader):
        # DictReader fills fields missing from a short row with None, so
        # coerce to "" before calling .strip().
        comment = row.get("body") or ""
        if not comment.strip():
            continue

        print(f"[{idx+1}] Processing comment: {comment[:50]}...")
        tags = classify_comment(comment)
        row.update(tags)
        writer.writerow(row)
        time.sleep(0.5)  # pause between requests to avoid overloading the model; tune as needed

print("✅ 所有评论处理完毕，结构化标签已写入输出文件。")

# The CSV file would be processed by the local LLM and then these features would be added (I talked to ChatGPT in Chinese so humor me here)

# We'll work on the details later, but now I'll just work on the code.

特征	是否正确	补充
1. total_comments	✅ 正确	用 pandas groupby 同一小时 count() 即可
2. positive_ratio	✅ 正确	依赖模型返回 sentiment 标签（positive/neutral/negative）
3. bullish_ratio	✅ 正确	依赖模型返回 stance 标签（bullish/bearish/neutral）
4. fomo_ratio	✅ 正确	依赖模型返回 emotion_type 标签，筛选 fomo
5. relevance_ratio	✅ 正确	依赖模型判断 relevance: true/false 来做比率
6. avg_score	✅ 正确	直接统计 CSV 中 score 字段即可（mean）
7. sentiment_entropy	✅ 理解需补充	见下方说明
🔍 什么是 sentiment_entropy？（你提问得非常好）

sentiment_entropy 是衡量一个时间窗口内 情绪分布的“分歧程度”。其思想来源于信息熵：当大家观点高度一致（都正面或负面），熵低；当观点高度分歧，熵高。
✅ 计算方法（以情绪为例）：

设该小时内：

    positive 评论数量为 $p$

    neutral 评论数量为 $n$

    negative 评论数量为 $q$

计算各类概率：
$$P_{pos} = \frac{p}{p+n+q},\quad P_{neu} = \frac{n}{p+n+q},\quad P_{neg} = \frac{q}{p+n+q}$$

计算情绪熵：
$$H = -\sum_{i \in \{pos,\, neu,\, neg\}} P_i \cdot \log_2(P_i + \varepsilon)$$

    ⚠️ 加上 $\varepsilon$ 是为了避免 $\log(0)$ 的数学错误（例如取 $\varepsilon = 10^{-8}$）。

解读含义：

    $H \approx 0$：情绪高度一致

    $H \approx \log_2 3 \approx 1.585$（最大值）：情绪分布接近均匀，观点分歧严重



### 💡 关于 `emotion_type` 特征处理策略

建议保留原始标签（如 `fomo`, `fear`, `euphoria`, `frustration`, 等），在后处理阶段通过分组映射简化为更高层次的情绪类别（如 `fear_like`, `anger_like`）。

#### ✅ 原因：
- **更灵活**：先保留细粒度，再按需合并，避免信息丢失。
- **便于分析**：支持更清晰的可视化与论文解释。
- **符合业界惯例**：精细标签 → 合并标签是金融 NLP 的通用做法。

#### 🔧 推荐情绪分组映射：
```python
# Coarse emotion category -> the fine-grained labels that roll up into it
_labels_by_group = {
    'fear_like': ('fear', 'uncertainty'),
    'fomo': ('fomo',),
    'anger_like': ('anger', 'frustration'),
    'positive_like': ('euphoria',),
}

# Inverted lookup: fine-grained label -> coarse category
emotion_groups = {
    label: group
    for group, labels in _labels_by_group.items()
    for label in labels
}
