In [3]:
# Environment setup
import sys
from pathlib import Path

# Treat the current working directory as the project root (this assumes the
# kernel was started from the repository root).
workspace_root = Path.cwd()

# Make the in-repo `src` packages importable. The membership check keeps the
# cell idempotent: re-running it no longer prepends a duplicate entry to
# sys.path each time.
src_dir = str(workspace_root / 'src')
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

print(f"Project root: {workspace_root}")
print(f"Python version: {sys.version}")
print("✓ Environment configured")

Project root: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment
Python version: 3.12.0 (v3.12.0:0fb18b02c8, Oct  2 2023, 09:45:56) [Clang 13.0.0 (clang-1300.0.29.30)]
✓ Environment configured


In [4]:
# Imports
# Third-party / stdlib dependencies used throughout the notebook.
import pandas as pd
import numpy as np
from datetime import datetime
import json
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): with no category/module arguments this suppresses *every*
# warning for the rest of the kernel session, including pandas
# deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Thesis pipeline utilities
# Project-local parquet helpers; importable because the setup cell puts
# `<workspace_root>/src` on sys.path.
from thesis_pipeline.io.parquet import read_parquet, write_parquet

# Plotting setup
# Global seaborn theme and default matplotlib figure size (inches).
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✓ All imports successful")

✓ All imports successful


## 1. Load Filtered Data

Load comments and submissions with topic assignments from notebook 15b.

In [5]:
# Paths
# Inputs come from the topic-assignment stage; outputs go to the stance stage.
data_root = workspace_root / 'data'
topics_path = data_root / '02_topics' / 'reddit'
output_path = data_root / '03_stance' / 'reddit'
output_path.mkdir(parents=True, exist_ok=True)

print(f"Input: {topics_path}")
print(f"Output: {output_path}")

# Check files exist (reported only; a missing file is not treated as an error here)
required_inputs = {
    'Comments': 'comments_with_topics.parquet',
    'Submissions': 'submissions_with_topics.parquet',
    'Topic definitions': 'topic_definitions.json',
}
print("\nValidating input files:")
for input_label, input_name in required_inputs.items():
    print(f"  {input_label}: {(topics_path / input_name).exists()}")

Input: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/02_topics/reddit
Output: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/03_stance/reddit

Validating input files:
  Comments: True
  Submissions: True
  Topic definitions: True


In [None]:
# Load data
print("Loading filtered data...")

# Topic-annotated comments and submissions, read via the project parquet helper.
comments_file = topics_path / 'comments_with_topics.parquet'
submissions_file = topics_path / 'submissions_with_topics.parquet'
df_comments = read_parquet(comments_file)
df_submissions = read_parquet(submissions_file)

# Topic metadata; the code below indexes it by topic_id and reads 'label'.
topic_definitions = json.loads((topics_path / 'topic_definitions.json').read_text())

load_report = [
    f"\n✓ Loaded {len(df_comments):,} comments",
    f"✓ Loaded {len(df_submissions):,} submissions",
    f"✓ Loaded {len(topic_definitions)} topic definitions",
    f"\nComment columns: {df_comments.columns.tolist()}",
    "\nTopic distribution:",
]
print("\n".join(load_report))

# Comment counts per topic, most frequent first (value_counts order).
comments_per_topic = df_comments['topic_id'].value_counts()
for topic_id, n_comments in comments_per_topic.items():
    print(f"  {topic_definitions[topic_id]['label']}: {n_comments:,} comments")

## 2. Define Stance Categories

For each topic, specify the target position and what counts as a Pro, Against, or Neutral stance toward it.

In [None]:
# Define stance categories for each topic
#
# Structure: one entry per topic key, each holding
#   - 'topic_label':     human-readable topic name
#   - 'target_position': the proposition that stance is measured against
#   - 'pro' / 'against' / 'neutral': dicts with a display 'label', a
#     'description' of what qualifies, and a list of 'examples'
#
# "Pro" always means agreement with 'target_position' (e.g. for gun_control,
# pro = in favour of stricter regulation), not positive sentiment.
# NOTE(review): the top-level keys presumably match the `topic_id` values in
# the comments data / topic_definitions.json — confirm against notebook 15b.
stance_definitions = {
    # --- Climate change -----------------------------------------------------
    'climate_change': {
        'topic_label': 'Climate Change',
        'target_position': 'Climate change is real, human-caused, and requires action',
        'pro': {
            'label': 'Pro (Favor)',
            'description': 'Supports climate action, acknowledges human-caused climate change, advocates for environmental policies',
            'examples': [
                'We need to transition to renewable energy now',
                'The science is clear - humans are causing global warming',
                'Carbon emissions must be reduced immediately'
            ]
        },
        'against': {
            'label': 'Against (Oppose)',
            'description': 'Denies climate change, opposes climate action, questions climate science',
            'examples': [
                'Climate change is a hoax',
                'The earth naturally goes through temperature cycles',
                'Environmental regulations hurt the economy'
            ]
        },
        'neutral': {
            'label': 'Neutral',
            'description': 'Neither supports nor opposes, asks questions, presents both sides, unclear stance',
            'examples': [
                'What do experts say about this?',
                'I\'m not sure what to believe',
                'There are arguments on both sides'
            ]
        }
    },
    # --- Donald Trump -------------------------------------------------------
    'donald_trump': {
        'topic_label': 'Donald Trump',
        'target_position': 'Donald Trump as president and his policies',
        'pro': {
            'label': 'Pro (Support Trump)',
            'description': 'Supports Trump, defends his actions/policies, positive about his presidency',
            'examples': [
                'Trump is making America great again',
                'Finally a president who keeps his promises',
                'Best economy we\'ve had in decades under Trump'
            ]
        },
        'against': {
            'label': 'Against (Oppose Trump)',
            'description': 'Opposes Trump, criticizes his actions/policies, negative about his presidency',
            'examples': [
                'Trump is unfit for office',
                'His policies are hurting Americans',
                'He\'s an embarrassment to the country'
            ]
        },
        'neutral': {
            'label': 'Neutral',
            'description': 'Neither supports nor opposes, factual reporting, balanced view, unclear stance',
            'examples': [
                'Trump signed this executive order today',
                'Some policies I agree with, others I don\'t',
                'What are the details of this decision?'
            ]
        }
    },
    # --- Gun control --------------------------------------------------------
    'gun_control': {
        'topic_label': 'Gun Control',
        'target_position': 'Stricter gun control and regulations',
        'pro': {
            'label': 'Pro (Support Gun Control)',
            'description': 'Supports stricter gun laws, advocates for regulations, emphasizes public safety',
            'examples': [
                'We need universal background checks',
                'Assault weapons should be banned',
                'Common sense gun laws save lives'
            ]
        },
        'against': {
            'label': 'Against (Oppose Gun Control)',
            'description': 'Opposes gun restrictions, defends Second Amendment rights, pro-gun ownership',
            'examples': [
                'Shall not be infringed',
                'Gun control only hurts law-abiding citizens',
                'More guns means more safety'
            ]
        },
        'neutral': {
            'label': 'Neutral',
            'description': 'Neither strongly pro nor anti gun control, balanced view, unclear stance',
            'examples': [
                'Both sides have valid points',
                'What regulations are being proposed?',
                'I support some measures but not others'
            ]
        }
    },
    # --- Immigration --------------------------------------------------------
    'immigration': {
        'topic_label': 'Immigration',
        'target_position': 'More open/liberal immigration policies',
        'pro': {
            'label': 'Pro (Support Immigration)',
            'description': 'Supports immigrants, advocates for reform/pathways to citizenship, opposes harsh restrictions',
            'examples': [
                'We should welcome refugees',
                'Dreamers deserve a path to citizenship',
                'Immigration strengthens our economy'
            ]
        },
        'against': {
            'label': 'Against (Restrict Immigration)',
            'description': 'Opposes immigration, supports border wall/restrictions, anti-illegal immigration',
            'examples': [
                'Build the wall',
                'Illegal immigration is out of control',
                'We need to protect our borders'
            ]
        },
        'neutral': {
            'label': 'Neutral',
            'description': 'Neither strongly pro nor anti immigration, balanced view, unclear stance',
            'examples': [
                'We need legal immigration reform',
                'What are the economic impacts?',
                'There must be a middle ground'
            ]
        }
    },
    # --- Vaccination --------------------------------------------------------
    'vaccination': {
        'topic_label': 'Vaccination',
        'target_position': 'Vaccines are safe, effective, and should be widely used',
        'pro': {
            'label': 'Pro (Support Vaccination)',
            'description': 'Supports vaccines, advocates for vaccination, trusts medical science',
            'examples': [
                'Vaccines save lives',
                'The science is clear on vaccine safety',
                'Everyone should get vaccinated'
            ]
        },
        'against': {
            'label': 'Against (Anti-Vaccine)',
            'description': 'Opposes vaccines, questions vaccine safety, anti-mandate',
            'examples': [
                'Vaccines are dangerous',
                'Natural immunity is better',
                'No one should be forced to vaccinate'
            ]
        },
        'neutral': {
            'label': 'Neutral',
            'description': 'Neither strongly pro nor anti vaccine, questions/concerns, unclear stance',
            'examples': [
                'What are the long-term effects?',
                'I have some vaccines but not others',
                'More research is needed'
            ]
        }
    }
}

# Echo each topic's label and target position as a quick sanity check.
print("Defined stance categories for all topics:")
for stance_def in stance_definitions.values():
    overview_lines = [
        f"\n{stance_def['topic_label']}:",
        f"  Target: {stance_def['target_position']}",
        "  Categories: Pro / Against / Neutral",
    ]
    print("\n".join(overview_lines))

## 3. Sampling Strategy

Sample 100 comments per topic, stratified by comment-length quartile so that short and long comments are both represented.

In [None]:
# Sampling parameters
SAMPLES_PER_TOPIC = 100
RANDOM_SEED = 42

# Overall target assumes every defined topic yields a full sample.
total_target = SAMPLES_PER_TOPIC * len(topic_definitions)
print(
    "Sampling strategy:\n"
    f"  Target: {SAMPLES_PER_TOPIC} comments per topic\n"
    f"  Total target: {total_target} comments\n"
    f"  Random seed: {RANDOM_SEED}"
)

# Add text length for filtering (character count of the comment body;
# stored on df_comments itself because later cells read this column).
df_comments['text_length'] = df_comments['body'].str.len()

print("\nText length statistics:")
print(df_comments['text_length'].describe())

In [None]:
# Filter criteria for annotation quality
def _print_remaining(frame):
    """Print how many rows `frame` keeps, absolute and relative to df_comments."""
    print(f"  Remaining: {len(frame):,} comments ({len(frame)/len(df_comments)*100:.1f}%)")


print("Applying filters for annotation quality...\n")

# Filter 1: Minimum length (exclude very short comments)
MIN_LENGTH = 50
print(f"Filter 1: Minimum length = {MIN_LENGTH} characters")
is_long_enough = df_comments['text_length'] >= MIN_LENGTH
df_filtered = df_comments.loc[is_long_enough].copy()
_print_remaining(df_filtered)

# Filter 2: Exclude deleted/removed
print("\nFilter 2: Exclude [deleted] and [removed]")
placeholder_bodies = ['[deleted]', '[removed]']
is_placeholder = df_filtered['body'].isin(placeholder_bodies)
df_filtered = df_filtered.loc[~is_placeholder].copy()
_print_remaining(df_filtered)

# Filter 3: Exclude AutoModerator (only possible when an author column exists)
if 'author' in df_filtered.columns:
    print("\nFilter 3: Exclude AutoModerator")
    is_human_author = df_filtered['author'] != 'AutoModerator'
    df_filtered = df_filtered.loc[is_human_author].copy()
    _print_remaining(df_filtered)

print(
    "\nFiltered corpus:\n"
    f"  Original: {len(df_comments):,} comments\n"
    f"  Filtered: {len(df_filtered):,} comments\n"
    f"  Reduction: {(1 - len(df_filtered)/len(df_comments))*100:.1f}%"
)

print("\nComments per topic (after filtering):")
filtered_per_topic = df_filtered['topic_id'].value_counts()
for topic_id, n_comments in filtered_per_topic.items():
    print(f"  {topic_definitions[topic_id]['label']}: {n_comments:,} comments")

In [None]:
# Stratified sampling per topic
QUARTILE_LABELS = ['Q1', 'Q2', 'Q3', 'Q4']


def sample_by_length_quartile(topic_comments, n_samples, random_state):
    """Draw ``n_samples`` rows spread evenly over text-length quartiles.

    Parameters
    ----------
    topic_comments : pd.DataFrame
        Comments of a single topic; must contain a ``text_length`` column and
        is expected to hold at least ``n_samples`` rows.
    n_samples : int
        Total number of rows to return.
    random_state : int
        Seed passed to every ``DataFrame.sample`` call for reproducibility.

    Returns
    -------
    pd.DataFrame
        The sampled rows (original index preserved) with an extra
        ``length_quartile`` column.
    """
    pool = topic_comments.copy()

    # labels=False returns integer bin codes instead of named categories.
    # With duplicates='drop', tied lengths can collapse quantile edges and
    # yield fewer than four bins; passing a fixed list of four labels in that
    # situation makes qcut raise ValueError. Codes are mapped to names here.
    quartile_codes = pd.qcut(
        pool['text_length'],
        q=len(QUARTILE_LABELS),
        labels=False,
        duplicates='drop',
    )
    pool['length_quartile'] = quartile_codes.map(
        lambda code: QUARTILE_LABELS[int(code)], na_action='ignore'
    )

    # Equal share per quartile. Iterating the groups directly (instead of
    # groupby.apply) selects the same rows without relying on deprecated
    # apply-on-grouping-column behaviour.
    per_quartile = n_samples // len(QUARTILE_LABELS)
    quartile_picks = [
        group.sample(n=min(len(group), per_quartile), random_state=random_state)
        for _, group in pool.groupby('length_quartile')
    ]
    # No groups at all can happen when every length is identical.
    sample = pd.concat(quartile_picks) if quartile_picks else pool.iloc[:0]

    # Top up at random when the quartiles did not supply enough rows
    # (n_samples not divisible by 4, or collapsed/small bins).
    shortfall = n_samples - len(sample)
    if shortfall > 0:
        leftover = pool[~pool.index.isin(sample.index)]
        top_up = leftover.sample(n=min(len(leftover), shortfall), random_state=random_state)
        sample = pd.concat([sample, top_up])

    return sample


print(f"Sampling {SAMPLES_PER_TOPIC} comments per topic...\n")

sampled_dfs = []

for topic_id, topic_info in topic_definitions.items():
    topic_label = topic_info['label']

    # Get comments for this topic
    topic_comments = df_filtered[df_filtered['topic_id'] == topic_id].copy()

    print(f"{topic_label}:")
    print(f"  Available: {len(topic_comments):,} comments")

    if len(topic_comments) < SAMPLES_PER_TOPIC:
        # Too few comments to sample from: keep them all.
        print(f"  WARNING: Only {len(topic_comments)} comments available (< {SAMPLES_PER_TOPIC} target)")
        sample = topic_comments.copy()
    else:
        sample = sample_by_length_quartile(topic_comments, SAMPLES_PER_TOPIC, RANDOM_SEED)

    print(f"  Sampled: {len(sample)} comments")

    sampled_dfs.append(sample)
    print()

# Combine all samples
df_annotation_set = pd.concat(sampled_dfs, ignore_index=True)

print(f"\n✓ Total sampled: {len(df_annotation_set):,} comments")
print(f"\nSamples per topic:")
for topic_id, count in df_annotation_set['topic_id'].value_counts().items():
    topic_label = topic_definitions[topic_id]['label']
    print(f"  {topic_label}: {count} comments")

## 4. Prepare Annotation Dataset

Add submission context and prepare columns for annotation.

In [None]:
# Add submission titles for context
print("Adding submission context...")

# Create submission lookup: submission_id -> {'title': ..., 'subreddit': ...}.
# to_dict('index') raises ValueError on a non-unique index, so repeated
# submission_ids are collapsed to their first row beforehand.
# NOTE(review): duplicates would arise if a submission is listed once per
# assigned topic — confirm against the output of notebook 15b.
submission_lookup = (
    df_submissions
    .drop_duplicates(subset='submission_id', keep='first')
    .set_index('submission_id')[['title', 'subreddit']]
    .to_dict('index')
)

# Map each comment to its parent submission's fields. 'N/A' covers both an
# unknown submission_id and a null value in the submission row, so later
# string operations on these columns (e.g. slicing the title) stay safe.
for target_col, source_field in (('submission_title', 'title'), ('subreddit', 'subreddit')):
    df_annotation_set[target_col] = (
        df_annotation_set['submission_id']
        .map(lambda sid, field=source_field: submission_lookup.get(sid, {}).get(field, 'N/A'))
        .fillna('N/A')
    )

print(f"✓ Added submission context")

In [None]:
# Add annotation columns
print("Preparing annotation columns...")

# Empty fields for the annotators, created in this order.
blank_annotation_fields = [
    ('stance_label', None),            # To be filled: 'pro', 'against', 'neutral'
    ('annotation_confidence', None),   # Optional: 'high', 'medium', 'low'
    ('annotation_notes', ''),          # Optional: Any notes
    ('annotator', ''),                 # Annotator ID
    ('annotation_date', None),
]
for field_name, blank_value in blank_annotation_fields:
    df_annotation_set[field_name] = blank_value

# Add unique ID for tracking: ANN_00001, ANN_00002, ... in row order
n_rows = len(df_annotation_set)
df_annotation_set['annotation_id'] = ["ANN_%05d" % number for number in range(1, n_rows + 1)]

added_columns = ['annotation_id'] + [field_name for field_name, _ in blank_annotation_fields]
print("✓ Added annotation columns")
print(f"\nAnnotation columns: {added_columns}")

In [None]:
# Reorder columns for annotation workflow
# Order: ID and context first, then the fields annotators fill in, then metadata.
annotation_cols = [
    'annotation_id', 'topic_id', 'submission_title', 'body',
    'stance_label', 'annotation_confidence', 'annotation_notes',
    'annotator', 'annotation_date',
    'comment_id', 'submission_id', 'subreddit', 'text_length',
    'created_utc', 'score', 'author',
]

# Only include columns that exist (missing ones are skipped silently;
# any column not listed above is dropped)
available_cols = []
for wanted_col in annotation_cols:
    if wanted_col in df_annotation_set.columns:
        available_cols.append(wanted_col)

df_annotation_set = df_annotation_set.loc[:, available_cols].copy()

print("✓ Reordered columns for annotation")
print(f"\nFinal columns: {df_annotation_set.columns.tolist()}")

## 5. Save Annotation Dataset

Save the prepared dataset and create annotation guidelines.

In [None]:
# Save annotation dataset
annotation_file = output_path / 'stance_annotation_dataset.parquet'
write_parquet(df_annotation_set, annotation_file)

# Report row count and in-memory footprint (deep=True includes string contents).
n_saved = len(df_annotation_set)
size_mb = df_annotation_set.memory_usage(deep=True).sum() / 1e6
print(
    f"✓ Saved annotation dataset: {annotation_file}\n"
    f"  {n_saved:,} comments ready for annotation\n"
    f"  {size_mb:.1f} MB"
)

In [None]:
# Save stance definitions
# Persist the stance schema next to the dataset as indented JSON.
stance_def_file = output_path / 'stance_definitions.json'
stance_def_file.write_text(json.dumps(stance_definitions, indent=2))

print(f"✓ Saved stance definitions: {stance_def_file}")

In [None]:
# Create annotation guidelines document
# Header and general rules of the Markdown guidelines. Two fixes relative to
# the earlier text:
#   * the topic count is derived from the data instead of being hard-coded
#   * the sarcasm example is labelled Pro: a sarcastic "let's just ignore
#     science" mocks ignoring science, so the author favours climate action
#     (sarcasm implies the opposite of the literal wording)
guidelines = f"""# Stance Annotation Guidelines

**Date**: {datetime.now().strftime('%Y-%m-%d')}
**Dataset**: {len(df_annotation_set)} comments across {df_annotation_set['topic_id'].nunique()} topics
**Target**: Tri-class stance (Pro/Against/Neutral) per topic

---

## Overview

You will annotate comments for their **stance** (position) toward specific topics. For each comment, determine whether the author:
- **Supports/Favors** the topic (Pro)
- **Opposes** the topic (Against)  
- Takes a **Neutral** position or unclear stance

---

## General Guidelines

### 1. Focus on the Comment Text
- Base your judgment on what the comment says, not your personal opinion
- Consider the submission title for context, but label the **comment's** stance

### 2. Stance vs. Sentiment
- **Stance** = Position toward the topic (support/oppose)
- **Sentiment** = Emotional tone (positive/negative)
- These can differ! Example: "I hate to admit it but Trump is right" = Pro-Trump stance, negative sentiment

### 3. Explicit vs. Implicit Stance
- Look for **explicit** statements: "I support...", "I'm against..."
- Also consider **implicit** cues: Sarcasm, rhetorical questions, implications
- Example: "Sure, let's just ignore science..." (sarcastic = Pro climate action)

### 4. When to Choose Neutral
- Asks questions without revealing stance
- Presents both sides equally
- Off-topic or unclear
- Genuinely ambiguous (when in doubt, choose Neutral)

### 5. Confidence Levels (Optional)
- **High**: Clear, unambiguous stance
- **Medium**: Reasonably clear but some ambiguity
- **Low**: Difficult to determine, borderline case

---

## Topic-Specific Definitions

"""

# Add topic-specific guidelines
# One Markdown section per topic: heading, target position, then a
# description and bulleted examples for each of the three stances.
stance_headings = (
    ('pro', '**Pro (Support)**'),
    ('against', '**Against (Oppose)**'),
    ('neutral', '**Neutral**'),
)

for stance_def in stance_definitions.values():
    section_parts = [
        f"\n### {stance_def['topic_label']}\n\n",
        f"**Target Position**: {stance_def['target_position']}\n\n",
    ]
    for stance_key, heading in stance_headings:
        stance_entry = stance_def[stance_key]
        section_parts.append(f"{heading}: {stance_entry['description']}\nExamples:\n")
        section_parts.extend(f"- \"{example}\"\n" for example in stance_entry['examples'])
        # Blank line separating this stance from the next heading / the rule.
        section_parts.append("\n")
    section_parts.append("---\n")

    guidelines += "".join(section_parts)

# Add annotation workflow
# Static closing part of the guidelines: workflow steps, column conventions,
# worked examples, troubleshooting and hand-off instructions. Nothing is
# interpolated in this section (it contains no `{...}` placeholders). Several
# lines deliberately end in two spaces, which Markdown renders as a line
# break — keep that trailing whitespace when editing.
guidelines += f"""
## Annotation Workflow

### Setup
1. Load `stance_annotation_dataset.parquet` in a notebook or annotation tool
2. Read these guidelines thoroughly
3. Start with a few examples to calibrate

### For Each Comment:
1. **Read the submission title** (provides topic context)
2. **Read the comment body** (what you're labeling)
3. **Identify the topic** (shown in `topic_id` column)
4. **Determine stance**: Pro / Against / Neutral
5. **Record confidence** (optional): High / Medium / Low
6. **Add notes** (optional): Explain difficult cases

### Filling the Columns:
- `stance_label`: Must be one of: 'pro', 'against', 'neutral'
- `annotation_confidence`: Optional: 'high', 'medium', 'low'
- `annotation_notes`: Optional: Free text for notes
- `annotator`: Your initials or ID
- `annotation_date`: Today's date (YYYY-MM-DD)

### Quality Checks:
- Take breaks every 20-30 annotations
- If unsure, mark as 'neutral' with low confidence
- Note edge cases in `annotation_notes`
- Aim for consistency across similar comments

---

## Example Annotations

### Example 1: Clear Pro
**Topic**: Climate Change  
**Title**: "New climate report shows accelerating warming"  
**Comment**: "We need to act now. Renewable energy is our only hope."  
**Label**: `pro`  
**Confidence**: `high`

### Example 2: Clear Against
**Topic**: Gun Control  
**Title**: "Lawmakers propose assault weapon ban"  
**Comment**: "Shall not be infringed. This is unconstitutional."  
**Label**: `against`  
**Confidence**: `high`

### Example 3: Neutral - Question
**Topic**: Vaccination  
**Title**: "New vaccine guidelines released"  
**Comment**: "What do the long-term studies show? I'd like to see more data."  
**Label**: `neutral`  
**Confidence**: `high`

### Example 4: Implicit Stance (Sarcasm)
**Topic**: Donald Trump  
**Title**: "Trump claims election was rigged"  
**Comment**: "Sure, and I'm the Queen of England. Give me a break."  
**Label**: `against` (sarcastic = opposing Trump's claim)  
**Confidence**: `medium`

### Example 5: Neutral - Balanced View
**Topic**: Immigration  
**Title**: "Border wall debate continues"  
**Comment**: "Both sides have valid concerns. We need comprehensive reform that addresses security and compassion."  
**Label**: `neutral`  
**Confidence**: `high`

---

## Troubleshooting

### "The comment doesn't mention the topic"
→ Consider context. If it's clearly related (e.g., in a climate subreddit), stance may be implicit

### "I genuinely can't tell"
→ Label as `neutral` with `low` confidence. Add notes explaining why.

### "The comment is sarcastic"
→ Try to infer the actual stance. Sarcasm usually implies the opposite of what's said literally.

### "The comment has mixed stance"
→ Choose the **dominant** stance. If truly balanced, use `neutral`.

---

## After Annotation

When complete:
1. Save the annotated file: `stance_annotation_dataset_annotated.parquet`
2. Check for missing values in `stance_label` column
3. Review a random sample for consistency
4. Calculate inter-annotator agreement if multiple annotators

The annotated dataset will be used in notebook **16b** to train the stance detection model.

---

**Questions?** Document them in your annotation notes for discussion.
"""

# Save guidelines
# The guidelines text contains non-ASCII characters (e.g. the "→" arrows in
# the troubleshooting section). Writing with the platform default encoding
# raises UnicodeEncodeError on non-UTF-8 locales such as Windows cp1252, so
# the encoding is pinned to UTF-8.
guidelines_file = output_path / 'stance_annotation_guidelines.md'
with open(guidelines_file, 'w', encoding='utf-8') as f:
    f.write(guidelines)

print(f"✓ Saved annotation guidelines: {guidelines_file}")

## 6. Preview Annotation Dataset

Show examples from the annotation dataset.

In [None]:
# Show sample annotations per topic
heavy_rule = "=" * 80
light_rule = "-" * 80

print(heavy_rule)
print("ANNOTATION DATASET PREVIEW")
print(heavy_rule)

# Topics appear in the order they first occur in the annotation set.
for topic_id in df_annotation_set['topic_id'].unique():
    topic_label = topic_definitions[topic_id]['label']
    belongs_to_topic = df_annotation_set['topic_id'] == topic_id
    topic_samples = df_annotation_set.loc[belongs_to_topic].iloc[:3]

    print("\n" + heavy_rule)
    print(f"{topic_label.upper()} - Sample Annotations")
    print(heavy_rule)

    # Title and body are cut to 80 / 200 characters; "..." is always appended.
    for position, (_, row) in enumerate(topic_samples.iterrows(), start=1):
        preview_lines = [
            f"\n[{position}] ID: {row['annotation_id']}",
            f"    Submission: {row['submission_title'][:80]}...",
            f"    Subreddit: r/{row['subreddit']}",
            f"    Comment ({row['text_length']} chars):",
            f"    \"{row['body'][:200]}...\"",
            "    → Stance label: [TO BE ANNOTATED]",
            light_rule,
        ]
        print("\n".join(preview_lines))

In [None]:
# Show first few rows of dataset
print("\nFirst 5 rows of annotation dataset:")
# Trailing expression so the notebook renders the frame as a rich table.
preview_columns = ['annotation_id', 'topic_id', 'submission_title', 'body', 'stance_label']
df_annotation_set.loc[:, preview_columns].head(5)

## 7. Summary

Generate a summary of the annotation dataset and the sampling configuration.

In [None]:
# Create summary
# Collect the sampling configuration, dataset shape, length statistics and
# output locations into one JSON-serialisable record.
sampled_lengths = df_annotation_set['text_length']

sampling_info = {
    'target_per_topic': SAMPLES_PER_TOPIC,
    'random_seed': RANDOM_SEED,
    'min_text_length': MIN_LENGTH,
    'strategy': 'stratified by text length quartiles',
}
dataset_info = {
    'total_comments': len(df_annotation_set),
    'topics': len(topic_definitions),
    'samples_per_topic': df_annotation_set['topic_id'].value_counts().to_dict(),
}
# Explicit float()/int() casts turn numpy scalars into plain Python numbers.
length_info = {
    'mean_length': float(sampled_lengths.mean()),
    'median_length': float(sampled_lengths.median()),
    'min_length': int(sampled_lengths.min()),
    'max_length': int(sampled_lengths.max()),
}
output_info = {
    'annotation_dataset': str(annotation_file),
    'stance_definitions': str(stance_def_file),
    'guidelines': str(guidelines_file),
}

summary = {
    'notebook': '16a_stance_annotation_sampling',
    'timestamp': datetime.now().isoformat(),
    'sampling': sampling_info,
    'dataset': dataset_info,
    'text_statistics': length_info,
    'outputs': output_info,
}

# Save summary
summary_file = output_path / '16a_sampling_summary.json'
summary_file.write_text(json.dumps(summary, indent=2))

print(f"✓ Saved summary: {summary_file}")

In [None]:
# Print final summary
def _print_banner(title):
    """Print `title` framed by 80-character rules, preceded by a blank line."""
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)


_print_banner("ANNOTATION DATASET READY")

print(f"\n✓ Sampled {len(df_annotation_set):,} comments for annotation")
print("\nSamples per topic:")
final_counts = df_annotation_set['topic_id'].value_counts()
for topic_id, n_sampled in final_counts.items():
    print(f"  {topic_definitions[topic_id]['label']}: {n_sampled} comments")

print("\n✓ Created files:")
for created_file in (annotation_file, stance_def_file, guidelines_file, summary_file):
    print(f"  {created_file}")

_print_banner("NEXT STEPS")

# Follow-up instructions: heading -> bullet points, printed in order.
next_steps = {
    "1. MANUAL ANNOTATION:": [
        "Read the annotation guidelines thoroughly",
        "Load the annotation dataset",
        "Fill in 'stance_label' column: 'pro', 'against', or 'neutral'",
        "Optionally add confidence and notes",
        "Save as 'stance_annotation_dataset_annotated.parquet'",
    ],
    "2. QUALITY CHECKS:": [
        "Ensure all rows have stance_label filled",
        "Review a random sample for consistency",
        "If multiple annotators, calculate inter-annotator agreement",
    ],
    "3. PROCEED TO NOTEBOOK 16b:": [
        "Use annotated dataset to train stance detection model",
        "Apply model to all filtered comments",
        "Validate on held-out annotations",
    ],
}
for step_heading, step_items in next_steps.items():
    print("\n" + step_heading)
    for step_item in step_items:
        print("   - " + step_item)

print("\n" + "=" * 80)