# NOT SELECTED DUE TOLONG EXECUTION TIME -> 24h plus 

In [2]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import json
from tqdm.auto import tqdm
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings

warnings.filterwarnings('ignore')
print("✓ Libraries imported")


✓ Libraries imported


In [3]:
# Configuration
BASE_DIR = Path('/Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment')
INPUT_DIR = BASE_DIR / 'data' / '01_corpus' / '03_qa' / 'reddit'
DATA_DIR = BASE_DIR / 'data' / '02_topics' / '03_gold' / 'reddit'
OUTPUT_DIR = DATA_DIR
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Ollama configuration
OLLAMA_BASE_URL = 'http://localhost:11434'
MODEL_NAME = 'gemma3:4b'  # or faster variant / quantized

# Processing configuration
MAX_WORKERS = 16               # increase parallelism
BATCH_DOCS_PER_CALL = 16       # docs per model call (tune)
MAX_TOKENS = 4                 # only return a small JSON integer/array
TEMPERATURE = 0.1

print("\nConfiguration:")
print(f" Input dir: {INPUT_DIR}")
print(f" Output dir: {OUTPUT_DIR}")
print(f" Model: {MODEL_NAME}")
print(f" Ollama URL: {OLLAMA_BASE_URL}")
print(f" Workers: {MAX_WORKERS}")
print(f" Docs per call: {BATCH_DOCS_PER_CALL}")



Configuration:
 Input dir: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit
 Output dir: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/02_topics/03_gold/reddit
 Model: gemma3:4b
 Ollama URL: http://localhost:11434
 Workers: 16
 Docs per call: 16


## Check Ollama Setup

In [4]:
# Check Ollama
def check_ollama():
    try:
        resp = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=5)
        if resp.status_code == 200:
            models = resp.json().get('models', [])
            model_names = [m['name'] for m in models]
            print("✓ Ollama is running")
            print(f"\nAvailable models: {model_names}")
            if any(MODEL_NAME in m for m in model_names):
                print(f"✓ Model '{MODEL_NAME}' is available")
                return True
            else:
                print(f"\n⚠️ Model '{MODEL_NAME}' not found")
                print(f"To install: ollama pull {MODEL_NAME}")
                return False
        else:
            print(f"❌ Ollama status code: {resp.status_code}")
            return False
    except requests.exceptions.ConnectionError:
        print("❌ Ollama is not running")
        print("\nStart Ollama and pull model:")
        print("  ollama serve")
        print(f"  ollama pull {MODEL_NAME}")
        return False
    except Exception as e:
        print(f"❌ Error checking Ollama: {e}")
        return False

ollama_ready = check_ollama()
if not ollama_ready:
    print("\n⚠️ Please set up Ollama before continuing")


✓ Ollama is running

Available models: ['gemma3:4b']
✓ Model 'gemma3:4b' is available


## Define Political Topic Taxonomy

20 political topics adapted from the Comparative Agendas Project (CAP) framework.

In [5]:
# Taxonomy
POLITICAL_TOPICS = [
    "Elections & Voting",
    "Presidential Politics",
    "Congress & Legislation",
    "Healthcare Policy",
    "Immigration & Borders",
    "Economy & Employment",
    "Budget & Taxation",
    "Education Policy",
    "Criminal Justice",
    "Gun Rights & Control",
    "Environment & Climate",
    "Energy Policy",
    "Foreign Policy & Diplomacy",
    "Defense & Military",
    "Trade Policy",
    "Social Issues",
    "Civil Rights & Discrimination",
    "Media & Free Speech",
    "Technology & Privacy",
    "Infrastructure",
]

topic_to_id = {t: i for i, t in enumerate(POLITICAL_TOPICS)}
id_to_topic = {i: t for i, t in enumerate(POLITICAL_TOPICS)}

print("\n=== Political Topic Taxonomy (20 Topics) ===\n")
for i, topic in enumerate(POLITICAL_TOPICS):
    print(f"{i:2d}. {topic}")



=== Political Topic Taxonomy (20 Topics) ===

 0. Elections & Voting
 1. Presidential Politics
 2. Congress & Legislation
 3. Healthcare Policy
 4. Immigration & Borders
 5. Economy & Employment
 6. Budget & Taxation
 7. Education Policy
 8. Criminal Justice
 9. Gun Rights & Control
10. Environment & Climate
11. Energy Policy
12. Foreign Policy & Diplomacy
13. Defense & Military
14. Trade Policy
15. Social Issues
16. Civil Rights & Discrimination
17. Media & Free Speech
18. Technology & Privacy
19. Infrastructure


## Create Classification Prompt

In [6]:
# Batched prompt helper

SYSTEM_PROMPT = (
    "You are a political discourse classifier. "
    "Given multiple short texts, classify each text into exactly ONE of these 20 topics, "
    "and return ONLY a JSON array of 20 integer topic IDs in the same order.\n\n"
    "Topics (ID: label):\n" +
    "\n".join([f"{i}: {label}" for i, label in enumerate(POLITICAL_TOPICS)]) +
    "\n\n"
    "Output format example for 3 texts: [0, 5, 12]\n"
    "No explanation, no extra text."
)

def build_batched_prompt(texts):
    lines = []
    for idx, t in enumerate(texts):
        # keep it short & robust
        t_clean = str(t).replace("\n", " ").strip()
        lines.append(f"{idx}: {t_clean}")
    joined = "\n".join(lines)
    user_prompt = (
        "Classify each of the following texts into one topic ID.\n\n"
        f"{joined}\n\n"
        "Return ONLY a JSON array of integers, e.g. [0, 5, 12]."
    )
    return SYSTEM_PROMPT, user_prompt


In [7]:
# Low-level Ollama call for a batch

def classify_batch_with_ollama(texts):
    """
    texts: list[str] of length <= BATCH_DOCS_PER_CALL
    returns: list[int] of topic IDs, same length
    """
    system_prompt, user_prompt = build_batched_prompt(texts)

    payload = {
        "model": MODEL_NAME,
        "prompt": f"{system_prompt}\n\nUSER:\n{user_prompt}",
        "stream": False,
        "options": {
            "temperature": TEMPERATURE,
            "num_predict": MAX_TOKENS,
        },
    }

    resp = requests.post(
        f"{OLLAMA_BASE_URL}/api/generate",
        json=payload,
        timeout=120,
    )
    resp.raise_for_status()
    data = resp.json()
    raw = data.get("response", "").strip()

    # Expect something like: [0, 5, 12]
    try:
        # Be defensive: sometimes model wraps in backticks or text
        start = raw.find("[")
        end = raw.rfind("]")
        if start == -1 or end == -1:
            raise ValueError(f"Invalid JSON array: {raw}")
        arr_str = raw[start:end+1]
        ids = json.loads(arr_str)
        # enforce list[int] of correct length
        if not isinstance(ids, list) or len(ids) != len(texts):
            raise ValueError(f"Length mismatch or invalid list: {ids}")
        clean_ids = []
        for v in ids:
            try:
                clean_ids.append(int(v))
            except Exception:
                clean_ids.append(0)
        return clean_ids
    except Exception as e:
        # Fallback: mark all as topic 0
        print(f"Parse error: {e} | raw: {raw[:200]}")
        return [0] * len(texts)


In [None]:
# High-level classification over all documents
def chunk_iterable(seq, size):
    for i in range(0, len(seq), size):
        yield i, seq[i:i+size]

def classify_all_documents(documents, max_workers=MAX_WORKERS, batch_docs=BATCH_DOCS_PER_CALL):
    """
    documents: list[str]
    returns: list[int] topic_ids aligned with documents
    """
    n = len(documents)
    topic_ids = [0] * n

    # Build batch index mapping
    batches = list(chunk_iterable(documents, batch_docs))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for batch_idx, (start, texts) in enumerate(batches):
            fut = executor.submit(classify_batch_with_ollama, texts)
            futures[fut] = (start, len(texts))

        for fut in tqdm(as_completed(futures), total=len(futures), desc="Classifying (batched)"):
            start, length = futures[fut]
            try:
                batch_ids = fut.result()
            except Exception as e:
                print(f"Worker error: {e}")
                batch_ids = [0] * length
            # write into main array
            topic_ids[start:start+length] = batch_ids

    return topic_ids

## Load Data

In [9]:
# Load your pseudodocs (adapt path/column name if needed)
if not ollama_ready:
    raise RuntimeError("Ollama not ready")

input_file = INPUT_DIR / "thread_pseudodocs.parquet"  # adjust to your file
thread_docs = pd.read_parquet(input_file)

# Make sure 'pseudodoc_text' exists, as in your original notebook
documents = thread_docs['pseudodoc_text'].astype(str).tolist()

print(f"\nDocuments to classify: {len(documents):,}")
print(f"Parallel workers: {MAX_WORKERS}")
print(f"Docs per model call: {BATCH_DOCS_PER_CALL}")


Documents to classify: 433,973
Parallel workers: 16
Docs per model call: 16


## Classify All Documents

Process documents in parallel for maximum speed. Estimated time: 1-2 hours for ~430k documents.

In [None]:
# Run classification
start_time = datetime.now()
topic_ids = classify_all_documents(documents)
end_time = datetime.now()

duration_min = (end_time - start_time).total_seconds() / 60
throughput = len(documents) / duration_min if duration_min > 0 else 0

print("\n✓ Classification complete!")
print(f" Duration: {duration_min:.2f} minutes")
print(f" Throughput: {throughput:.0f} docs/minute")
print(f" Avg time per doc: {duration_min * 60 / len(documents):.3f} s")

Classifying (batched):   0%|          | 0/27124 [00:00<?, ?it/s]

Parse error: Invalid JSON array: ```json | raw: ```json
Parse error: Invalid JSON array: [0, | raw: [0,
Parse error: Invalid JSON array: [1, | raw: [1,
Parse error: Invalid JSON array: [1, | raw: [1,
Parse error: Invalid JSON array: [0, | raw: [0,
Parse error: Invalid JSON array: [1, | raw: [1,
Parse error: Invalid JSON array: [0, | raw: [0,
Parse error: Invalid JSON array: [1, | raw: [1,
Parse error: Invalid JSON array: Here's | raw: Here's


## Analyze Results

In [None]:
if ollama_ready and 'topic_ids' in locals():
    # Add results to dataframe
    thread_docs['supervised_topic_id'] = topic_ids
    thread_docs['supervised_topic_label'] = [POLITICAL_TOPICS[tid] for tid in topic_ids]
    thread_docs['supervised_topic_confidence'] = confidences
    
    print("\n=== Classification Results ===")
    print(f"\nTotal documents classified: {len(thread_docs):,}")
    print(f"\nTopic distribution:")
    topic_dist = thread_docs['supervised_topic_label'].value_counts().sort_index()
    for label, count in topic_dist.items():
        pct = count / len(thread_docs) * 100
        print(f"  {label:30s}: {count:7,} ({pct:5.2f}%)")
    
    print(f"\nConfidence statistics:")
    print(f"  Mean confidence: {thread_docs['supervised_topic_confidence'].mean():.3f}")
    print(f"  Median confidence: {thread_docs['supervised_topic_confidence'].median():.3f}")
    print(f"  High confidence (1.0): {(thread_docs['supervised_topic_confidence'] == 1.0).sum():,} ({(thread_docs['supervised_topic_confidence'] == 1.0).sum()/len(thread_docs)*100:.2f}%)")
    print(f"  Low confidence (0.5): {(thread_docs['supervised_topic_confidence'] == 0.5).sum():,} ({(thread_docs['supervised_topic_confidence'] == 0.5).sum()/len(thread_docs)*100:.2f}%)")
else:
    print("\n⚠️  No results to analyze - run classification first")

## Visualize Topic Distribution

In [None]:
if ollama_ready and 'topic_ids' in locals():
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    sns.set_style('whitegrid')
    
    # Topic distribution
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))
    
    # 1. Topic counts
    topic_counts = thread_docs['supervised_topic_id'].value_counts().sort_index()
    ax = axes[0]
    topic_labels_plot = [POLITICAL_TOPICS[i] for i in topic_counts.index]
    ax.bar(range(len(topic_counts)), topic_counts.values, color='steelblue', alpha=0.7)
    ax.set_xticks(range(len(topic_counts)))
    ax.set_xticklabels(topic_labels_plot, rotation=45, ha='right')
    ax.set_ylabel('Document Count')
    ax.set_title('Document Distribution Across Political Topics (Local SLM)')
    ax.grid(axis='y', alpha=0.3)
    
    # 2. Confidence distribution by topic
    ax = axes[1]
    topic_conf = thread_docs.groupby('supervised_topic_id')['supervised_topic_confidence'].mean().sort_index()
    topic_conf_labels = [POLITICAL_TOPICS[i] for i in topic_conf.index]
    ax.bar(range(len(topic_conf)), topic_conf.values, color='coral', alpha=0.7)
    ax.set_xticks(range(len(topic_conf)))
    ax.set_xticklabels(topic_conf_labels, rotation=45, ha='right')
    ax.set_ylabel('Mean Confidence')
    ax.set_ylim(0, 1)
    ax.axhline(y=0.75, color='red', linestyle='--', alpha=0.5, label='High Confidence Threshold')
    ax.set_title('Mean Classification Confidence by Topic')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'supervised_llm_topic_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print("✓ Visualization saved")
else:
    print("\n⚠️  No results to visualize - run classification first")

## Temporal Analysis

In [None]:
if ollama_ready and 'topic_ids' in locals():
    # Temporal trends
    thread_docs['date'] = pd.to_datetime(thread_docs['date_temp'])
    temporal_df = thread_docs.groupby(['date', 'supervised_topic_label']).size().reset_index(name='count')
    
    # Top 5 topics over time
    top_topics = thread_docs['supervised_topic_label'].value_counts().head(5).index
    temporal_top = temporal_df[temporal_df['supervised_topic_label'].isin(top_topics)]
    
    fig, ax = plt.subplots(figsize=(14, 6))
    for topic in top_topics:
        topic_data = temporal_top[temporal_top['supervised_topic_label'] == topic]
        ax.plot(topic_data['date'], topic_data['count'], label=topic, marker='o', markersize=3, alpha=0.7)
    
    ax.set_xlabel('Date')
    ax.set_ylabel('Document Count')
    ax.set_title('Top 5 Topics Over Time (Sep-Oct 2016) - Local SLM Classification')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'supervised_llm_temporal_trends.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print("✓ Temporal analysis complete")
else:
    print("\n⚠️  No results for temporal analysis - run classification first")

## Save Results

In [None]:
# Attach topic labels & simple confidence placeholder

thread_docs['supervised_topic_id'] = topic_ids
thread_docs['supervised_topic_label'] = [id_to_topic.get(tid, "Unknown") for tid in topic_ids]

# If you still want a crude confidence, you can set all to 1.0 or 0.75
thread_docs['supervised_topic_confidence'] = 1.0

print("\n=== Classification Results (basic) ===")
print(f"Total documents: {len(thread_docs):,}")
print("\nTopic distribution:")
topic_dist = thread_docs['supervised_topic_label'].value_counts().sort_index()
for label, count in topic_dist.items():
    pct = count / len(thread_docs) * 100
    print(f" {label:30s}: {count:7,} ({pct:5.2f}%)")


## Summary

In [None]:
# Save results + metadata

output_file = OUTPUT_DIR / 'thread_pseudodocs_with_supervised_topics_llm_batched.parquet'
thread_docs.to_parquet(output_file, index=False)

duration = duration_min
metadata = {
    "method": "local_slm_classification_batched",
    "model": MODEL_NAME,
    "taxonomy": "Comparative Agendas Project (CAP)",
    "num_topics": 20,
    "topics": POLITICAL_TOPICS,
    "num_documents": len(thread_docs),
    "classification_date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    "duration_minutes": duration,
    "throughput_docs_per_minute": throughput,
    "workers": MAX_WORKERS,
    "docs_per_call": BATCH_DOCS_PER_CALL,
}

metadata_file = OUTPUT_DIR / 'supervised_llm_classification_metadata_batched.json'
with open(metadata_file, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"\n✓ Results saved to: {output_file}")
print(f"✓ Metadata saved to: {metadata_file}")