In [1]:
import pandas as pd

# Location of the raw Reddit comment export (one comment per row)
file_path = "Dogecoin_CSVs/Dogecoin_Reddit_2023-01-01_to_2024-12-31.csv"

# Read the complete export into a DataFrame
df = pd.read_csv(file_path)

# Report how many comments the export contains
print(f"📊 Total Comments (rows): {df.shape[0]}")

📊 Total Comments (rows): 498006


In [None]:
import pandas as pd

import os

input_path = "Dogecoin_CSVs/Dogecoin_Reddit_2023-01-01_to_2024-12-31.csv"
output_path = "Dogecoin_Processed/Dogecoin_Reddit_llm_trackers.csv"

# Add llm_tracker as a unique ID: a 1-based running number inserted as the
# first column, so every comment can be referenced by later processing steps.
df = pd.read_csv(input_path)
df.insert(0, "llm_tracker", range(1, len(df) + 1))  # Starting from 1

# to_csv does not create missing folders; make sure the target folder exists
# so this cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
df.to_csv(output_path, index=False)
print("✅ Added unique ID 'llm_tracker'!")


In [None]:
import csv
import requests
import time
import os

# Ollama Model Setup: generate endpoint of the local server and the model tag
OLLAMA_API_URL = "http://localhost:11434/api/generate"
MODEL_NAME = "llama3.3:70b-instruct-q3_K_S"

# File locations: tracker-tagged input, labelled output, and the failure log
input_csv = "Dogecoin_Processed/Dogecoin_Reddit_llm_trackers.csv"
output_csv = "Dogecoin_Processed/Dogecoin_Reddit_Processed.csv"
error_log_csv = "Dogecoin_Processed/error_log.csv"

# Prompt Template: paragraphs are separated by one blank line; the "{text}"
# placeholder is filled with the comment body through str.format.
PROMPT_TEMPLATE = "\n\n".join([
    (
        "You are a financial language analysis assistant.\n"
        "Your task is to analyze the following Reddit comment and classify it with financial relevance and sentiment."
    ),
    (
        "If the comment is NOT related to dogecoin markets, simply return:\n"
        "relevance: false"
    ),
    (
        "If it IS related to markets, return in the following structured format:\n"
        "relevance: true, sentiment: [positive/neutral/negative], emotion_type: [euphoria, fear, anger, FOMO, uncertainty, etc.], "
        "volatility_signal: [low/medium/high], stance: [bullish/bearish/neutral]"
    ),
    "Be concise and always output in a single line.",
    "Comment: \"{text}\"",
])

def classify_comment(text, timeout=600):
    """Classify one comment by sending it to the Ollama generate endpoint.

    The comment is stripped, its newlines are replaced by spaces, and it is
    inserted into PROMPT_TEMPLATE. The model's reply is expected to be a
    single line of comma-separated ``key: value`` pairs.

    Args:
        text: The comment body (a string).
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled server would block the run forever.

    Returns:
        A dict with the keys "sentiment", "relevance", "emotion_type",
        "volatility_signal" and "stance". When the reply contains
        "relevance: false" (case-insensitive), "relevance" is "false" and
        every other label is "none". Otherwise labels missing from the reply
        default to "none" ("relevance" defaults to "true").

    Raises:
        Exceptions raised by ``requests`` for connection failures, timeouts
        or non-success HTTP status codes, and KeyError when the JSON reply
        has no "response" field.
    """
    prompt = PROMPT_TEMPLATE.format(text=text.strip().replace("\n", " "))
    response = requests.post(
        OLLAMA_API_URL,
        json={
            "model": MODEL_NAME,
            "prompt": prompt,
            "stream": False
        },
        timeout=timeout,
    )
    # Surface HTTP errors directly instead of a misleading KeyError below.
    response.raise_for_status()
    result = response.json()["response"]

    if "relevance: false" in result.lower():
        return {
            "sentiment": "none",
            "relevance": "false",
            "emotion_type": "none",
            "volatility_signal": "none",
            "stance": "none"
        }

    parts = {}
    for pair in result.split(","):
        # Split on the first colon only, so a value that itself contains a
        # colon no longer breaks the key/value unpacking.
        key, sep, value = pair.partition(":")
        if not sep:
            continue
        # Keys are matched case-insensitively, like the relevance check above.
        parts[key.strip().lower()] = value.strip()

    return {
        "sentiment": parts.get("sentiment", "none"),
        "relevance": parts.get("relevance", "true"),
        "emotion_type": parts.get("emotion_type", "none"),
        "volatility_signal": parts.get("volatility_signal", "none"),
        "stance": parts.get("stance", "none")
    }

# --------- Resumable processing logic ---------
# Collect the llm_tracker values already present in the output CSV so that a
# restarted run skips comments that were labelled before.
processed_ids = set()
if os.path.exists(output_csv):
    with open(output_csv, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        processed_ids = {row["llm_tracker"] for row in reader if "llm_tracker" in row}

# --------- Main processing loop ---------
# Output and error log are opened in append mode so earlier results survive.
with open(input_csv, newline='', encoding='utf-8') as infile, \
     open(output_csv, 'a', newline='', encoding='utf-8') as outfile, \
     open(error_log_csv, 'a', newline='', encoding='utf-8') as errorfile:

    reader = csv.DictReader(infile)
    # Output rows carry every input column plus the five label columns.
    fieldnames = reader.fieldnames + ["sentiment", "relevance", "emotion_type", "volatility_signal", "stance"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    error_writer = csv.DictWriter(errorfile, fieldnames=["llm_tracker", "error_message"])

    # Write column headers to CSV file (only when the file is still empty,
    # i.e. on the first run rather than on a resume)
    if os.stat(output_csv).st_size == 0:
        writer.writeheader()
    if os.stat(error_log_csv).st_size == 0:
        error_writer.writeheader()

    for idx, row in enumerate(reader):
        comment_id = row.get("llm_tracker")
        if comment_id in processed_ids:
            continue

        # DictReader fills the missing fields of a short row with None, so
        # coerce to "" before calling .strip(); otherwise one malformed row
        # would raise outside the try block and abort the whole run.
        comment = row.get("body") or ""
        if not comment.strip():
            continue

        try:
            print(f"[{idx+1}] Processing comment (llm_tracker {comment_id}): {comment[:50]}...")
            tags = classify_comment(comment)
            row.update(tags)
            writer.writerow(row)
            # Flush after every row so an interruption loses at most one result.
            outfile.flush()
        except Exception as e:
            # Record the failure and move on; the comment is not added to the
            # output file, so it is attempted again on the next run.
            print(f"❌ Error at comment {comment_id}: {e}")
            error_writer.writerow({"llm_tracker": comment_id, "error_message": str(e)})
            errorfile.flush()
        time.sleep(0.05)

print("✅ All comments processed. Structured labels are written to output CSV files")
print("⚠️ Failed comments are written to error_log.csv。")


[289045] Processing comment (llm_tracker 289045): Its one of the biggest if this doesnt go well what...
[289046] Processing comment (llm_tracker 289046): Please learn about market cap to have a realistic ...
[289047] Processing comment (llm_tracker 289047): we all are and that's why we are here too...
[289048] Processing comment (llm_tracker 289048): Not a bad idea. 

The hashrate has been on a secul...
[289049] Processing comment (llm_tracker 289049): 💎🙌...
[289050] Processing comment (llm_tracker 289050): That’s irrelevant...
[289051] Processing comment (llm_tracker 289051): I like turtles!!...
[289052] Processing comment (llm_tracker 289052): [removed]...
[289053] Processing comment (llm_tracker 289053): [removed]...
[289054] Processing comment (llm_tracker 289054): ![gif](giphy|HW05UrUSfAzZu|downsized)...
[289055] Processing comment (llm_tracker 289055): I will keep the dogecoin I already have. But proba...
[289056] Processing comment (llm_tracker 289056): ![gif](emote|free_emotes_