In [1]:
import pandas as pd

# File Directory
file_path = "Canada_CSV/may_7_merged_comments.csv"

# Load Dataframe
df = pd.read_csv(file_path)

# Let's see how many rows of data are there
print(f"📊 Total Comments (rows): {len(df)}")

📊 Total Comments (rows): 315


In [2]:
import pandas as pd

input_path = "Canada_CSV/may_7_merged_comments.csv"
output_path = "Canada_CSV/may_7_llm_tracker.csv"

# Add llm_tracker as an unique ID
df = pd.read_csv(input_path)
df.insert(0, "llm_tracker", range(1, len(df) + 1))  # Starting from 1
df.to_csv(output_path, index=False)
print("✅ Added unique ID 'llm_tracker'!")

✅ Added unique ID 'llm_tracker'!


In [None]:
import csv
import requests
import time
import os

# Ollama Local Language HTTP Local Host Config
OLLAMA_API_URL = "http://localhost:11434/api/generate"
MODEL_NAME = "deepseek-r1:14b"

# File Directories
input_csv = "Canada_CSV/may_7_llm_tracker.csv"
output_csv = "Canada_CSV/may_7_Processed.csv"
error_log_csv = "Canada_CSV/may_7_Processed_error_log.csv"

# Prompt Template
PROMPT_TEMPLATE = (
    "You are a Reddit comment analysis assistant.\n"
    "Your task is to analyze the following Reddit comment and classify it with:\n"
    "1. Whether the comment is related to studying abroad.\n"
    "2. A concise summary of the main topic.\n"
    "3. The overall sentiment.\n\n"
    "Return your answer in this format (always on a single line):\n"
    "study_abroad: [true/false], topic: [short summary], sentiment: [positive/neutral/negative]\n\n"
    "Comment: \"{text}\""
)

def classify_comment(text):
    prompt = PROMPT_TEMPLATE.format(text=text.strip().replace("\n", " "))
    response = requests.post(OLLAMA_API_URL, json={
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False
    })

    result = response.json()["response"]

    # Extract fields (ensure correct format)
    parts = {k.strip(): v.strip() for k, v in [
        pair.split(":") for pair in result.split(",") if ":" in pair
    ]}

    return {
        "study_abroad": parts.get("study_abroad", "false"),
        "topic": parts.get("topic", "none"),
        "sentiment": parts.get("sentiment", "none")
    }

# --------- Checkpoint Resume ---------
processed_ids = set()
if os.path.exists(output_csv):
    with open(output_csv, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        processed_ids = {row["llm_tracker"] for row in reader if "llm_tracker" in row}

# --------- Main Logic ---------
with open(input_csv, newline='', encoding='utf-8') as infile, \
     open(output_csv, 'a', newline='', encoding='utf-8') as outfile, \
     open(error_log_csv, 'a', newline='', encoding='utf-8') as errorfile:

    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames + ["study_abroad", "topic", "sentiment"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    error_writer = csv.DictWriter(errorfile, fieldnames=["llm_tracker", "error_message"])

    # Write Headers
    if os.stat(output_csv).st_size == 0:
        writer.writeheader()
    if os.stat(error_log_csv).st_size == 0:
        error_writer.writeheader()

    for idx, row in enumerate(reader):
        comment_id = row.get("llm_tracker")
        if comment_id in processed_ids:
            continue

        comment = row.get("body", "")
        if not comment.strip():
            continue

        try:
            print(f"[{idx+1}] Processing comment (llm_tracker {comment_id}): {comment[:50]}...")
            tags = classify_comment(comment)
            row.update(tags)
            writer.writerow(row)
            outfile.flush()
        except Exception as e:
            print(f"❌ Error at comment {comment_id}: {e}")
            error_writer.writerow({"llm_tracker": comment_id, "error_message": str(e)})
            errorfile.flush()
        time.sleep(0.1)

print("✅ All comments processed. Structured labels are written to output CSV files")
print("⚠️ Failed comments are written to error_log.csv。")


[1] Processing comment (llm_tracker 1): 好像不太活跃啊这个，有群吗😁...
[2] Processing comment (llm_tracker 2): https://t.me/+NxH2ffR8-NtiMTMx...
❌ Error at comment 2: too many values to unpack (expected 2)
[3] Processing comment (llm_tracker 3): 人呢？...
❌ Error at comment 3: too many values to unpack (expected 2)
[4] Processing comment (llm_tracker 4): 在呢...
[5] Processing comment (llm_tracker 5): 嗯？...
[6] Processing comment (llm_tracker 6): ?...
[7] Processing comment (llm_tracker 7): 有...
[8] Processing comment (llm_tracker 8): 有人咩...
❌ Error at comment 8: too many values to unpack (expected 2)
[9] Processing comment (llm_tracker 9): 晚上好...
[10] Processing comment (llm_tracker 10): 分享下我对国家的计量标准。我对加拿大不熟悉，大家可以补充。  # 地缘政治安全程度  现在最惨的是乌...
