In [1]:
# Imports: standard library first, then third-party, then project-local
# (the project imports must come after the `src` directory is put on sys.path).
import json
import sys
import warnings
from pathlib import Path

# NOTE(review): this hides *every* warning for the rest of the session,
# including deprecation and dtype warnings from pandas/torch. It is installed
# before the third-party imports so their import-time warnings stay hidden too.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from tqdm.auto import tqdm

# Make the project's `src` directory importable. The notebook is assumed to be
# started from the workspace root, because the path is derived from the
# current working directory. The membership check keeps re-running this cell
# from stacking duplicate entries on sys.path.
workspace_root = Path.cwd()
src_dir = str(workspace_root / 'src')
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Thesis pipeline utilities (resolved through the sys.path entry added above)
from thesis_pipeline.io.paths import get_data_path
from thesis_pipeline.io.parquet import read_parquet, write_parquet

# Use the MPS backend when torch reports it as available, otherwise the CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f"Using device: {device}")

# Plot styling
sns.set_style('whitegrid')

print("âœ“ All imports successful")

Using device: mps
âœ“ All imports successful


# 1. Load Files

In [2]:
# Resolve the Reddit data directories for the topic-model artefacts and the
# gold corpus through the project's path helper, then echo both for reference.
gold_path = get_data_path('gold', 'reddit')
topics_path = get_data_path('topics', 'reddit')

print(
    f"Topics data: {topics_path}",
    f"Gold data: {gold_path}",
    sep="\n",
)

Topics data: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/02_topics/reddit
Gold data: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/02_gold/reddit


In [3]:
# Load the multi-label supervised topic assignments (one row per thread pseudo-document).
embeddings_dir = topics_path / 'embeddings'
thread_docs = read_parquet(embeddings_dir / 'thread_pseudodocs_with_supervised_topics_multilabel.parquet')

# Load the classification metadata that carries the topic definitions.
# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's default encoding.
with open(embeddings_dir / 'supervised_multilabel_classification_metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Build {topic_id: {'id', 'label', 'description'}} from whatever topics the
# metadata defines, instead of assuming exactly 20 topics with ids 0..19.
# Keys in the JSON are strings, so they are converted to int and ordered
# numerically.
topics_info = metadata['topics']
topic_definitions = {
    int(tid): {
        'id': int(tid),
        'label': info['label'],
        'description': info['description']
    }
    for tid, info in sorted(topics_info.items(), key=lambda item: int(item[0]))
}

print(f"\nðŸ“Š Loaded supervised topics:")
print(f"  Thread documents: {len(thread_docs):,}")
print(f"  Date range: {thread_docs['created_utc'].min():.0f} to {thread_docs['created_utc'].max():.0f}")
print(f"  Topics defined: {len(topic_definitions)}")

# Show the first (lowest-id) definition as a sanity check; skipped when the
# metadata defines no topics at all.
if topic_definitions:
    sample_topic = next(iter(topic_definitions.values()))
    print(f"\n  Sample topic definition:")
    print(f"    ID: {sample_topic['id']}")
    print(f"    Label: {sample_topic['label']}")
    print(f"    Description: {sample_topic['description']}")


ðŸ“Š Loaded supervised topics:
  Thread documents: 433,973
  Date range: 1472688024 to 1477954796
  Topics defined: 20

  Sample topic definition:
    ID: 0
    Label: Elections & Voting
    Description: Electoral processes, political campaigns, voting rights, electoral reform, voter registration, election results, polling, ballots, primaries, caucuses, electoral college


In [4]:
# Glob pattern selecting the monthly gold files to load (all 2016 months present).
GOLD_FILE_PATTERN = '2016-*.parquet'


def _load_gold_files(files, is_submission, desc):
    """Read every parquet file in ``files`` and stack them into one DataFrame.

    Args:
        files: Iterable of parquet file paths, read in the given order.
        is_submission: Value written to the ``is_submission`` column of every row.
        desc: Label shown on the tqdm progress bar.

    Returns:
        The concatenation of all files with a fresh RangeIndex, or an empty
        DataFrame (no columns) when ``files`` is empty.
    """
    frames = []
    for file in tqdm(files, desc=desc):
        frame = read_parquet(file)
        frame['is_submission'] = is_submission
        frames.append(frame)
    # pd.concat raises on an empty list, so fall back to an empty frame.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


comments_path = gold_path / 'comments'
submissions_path = gold_path / 'submissions'

print(f"\nðŸ“¦ Loading gold data from:")
print(f"  Comments: {comments_path}")
print(f"  Submissions: {submissions_path}")

# Load comments (files sorted by name so the load order is deterministic)
comment_files = sorted(comments_path.glob(GOLD_FILE_PATTERN))
print(f"\n  Found {len(comment_files)} comment files: {[f.name for f in comment_files]}")
comments_df = _load_gold_files(comment_files, is_submission=False, desc="Loading comments")

# Load submissions
submission_files = sorted(submissions_path.glob(GOLD_FILE_PATTERN))
print(f"  Found {len(submission_files)} submission files: {[f.name for f in submission_files]}")
submissions_df = _load_gold_files(submission_files, is_submission=True, desc="Loading submissions")



ðŸ“¦ Loading gold data from:
  Comments: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/02_gold/reddit/comments
  Submissions: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/02_gold/reddit/submissions

  Found 2 comment files: ['2016-09.parquet', '2016-10.parquet']


Loading comments:   0%|          | 0/2 [00:00<?, ?it/s]

  Found 2 submission files: ['2016-09.parquet', '2016-10.parquet']


Loading submissions:   0%|          | 0/2 [00:00<?, ?it/s]

# 2. Map Topics

In [5]:
# Keep only the join key and the topic assignment columns from the thread documents.
submission_topics = thread_docs[['submission_id', 'supervised_topics', 'supervised_topic_labels']].copy()

# Attach thread-level topics to submissions (submission_id -> submission_id).
# The inner join drops submissions that have no topic assignment.
# validate='many_to_one' makes pandas raise a MergeError if `submission_topics`
# contains a submission_id more than once, which would otherwise silently
# duplicate rows in the result.
submissions_with_topics = submissions_df.merge(
    submission_topics,
    on='submission_id',
    how='inner',
    validate='many_to_one'
)

# Attach the same thread-level topics to every comment of the thread
# (submission_id -> submission_id), with the same uniqueness check.
comments_with_topics = comments_df.merge(
    submission_topics,
    on='submission_id',
    how='inner',
    validate='many_to_one'
)

print(f"\nâœ“ Merged topics with data:")
print(f"  Submissions with topics: {len(submissions_with_topics):,}")
print(f"    Unique submissions: {submissions_with_topics['submission_id'].nunique():,}")
print(f"    Average topics per submission: {submissions_with_topics['supervised_topics'].apply(len).mean():.2f}")
print(f"\n  Comments with topics: {len(comments_with_topics):,}")
print(f"    Unique comments: {comments_with_topics['comment_id'].nunique():,}")
print(f"    Average topics per comment: {comments_with_topics['supervised_topics'].apply(len).mean():.2f}")


âœ“ Merged topics with data:
  Submissions with topics: 433,973
    Unique submissions: 433,973
    Average topics per submission: 1.27

  Comments with topics: 8,624,040
    Unique comments: 8,624,040
    Average topics per comment: 1.36


In [6]:
def _expand_topic_pairs(frame, id_columns, text_column):
    """Expand a topic-annotated frame to one row per (text, topic) pair.

    Uses a vectorised multi-column ``DataFrame.explode`` (pandas >= 1.3)
    instead of iterating row by row, which is far cheaper on millions of rows.

    Args:
        frame: DataFrame holding ``id_columns``, ``created_utc``,
            ``text_column`` and the list-valued columns ``supervised_topics``
            and ``supervised_topic_labels``.
        id_columns: List of identifier columns copied onto every output row.
        text_column: Column whose value becomes the output ``text`` column.

    Returns:
        DataFrame with columns ``id_columns`` + ``created_utc``, ``text``,
        ``topic_id``, ``topic_label``, ``topic_description`` and a fresh
        RangeIndex. Rows with no topics produce no output rows.

    Raises:
        ValueError: If a row's topic and label lists differ in length
            (raised by ``explode``).
        KeyError: If a topic id has no entry in ``topic_definitions``.
    """
    output_columns = id_columns + ['created_utc', 'text', 'topic_id', 'topic_label', 'topic_description']

    pairs = (
        frame[id_columns + ['created_utc', text_column, 'supervised_topics', 'supervised_topic_labels']]
        .rename(columns={
            text_column: 'text',
            'supervised_topics': 'topic_id',
            'supervised_topic_labels': 'topic_label',
        })
        .explode(['topic_id', 'topic_label'], ignore_index=True)
        # explode() keeps rows whose lists are empty as a single NaN row;
        # those rows carry no (text, topic) pair, so drop them.
        .dropna(subset=['topic_id'])
        .reset_index(drop=True)
    )

    # explode() yields object dtype; restore integer topic ids.
    pairs['topic_id'] = pairs['topic_id'].astype('int64')

    # Fail loudly on ids without a definition rather than emitting NaN descriptions.
    unknown_ids = set(pairs['topic_id'].unique()) - set(topic_definitions)
    if unknown_ids:
        raise KeyError(f"Topic ids without a definition: {sorted(unknown_ids)}")

    descriptions = {tid: info['description'] for tid, info in topic_definitions.items()}
    pairs['topic_description'] = pairs['topic_id'].map(descriptions)

    return pairs[output_columns]


# Expand submissions to one row per (submission, topic) pair; the title is the text.
submissions_expanded_df = _expand_topic_pairs(
    submissions_with_topics,
    id_columns=['submission_id'],
    text_column='title',
)

# Expand comments to one row per (comment, topic) pair; the body is the text.
comments_expanded_df = _expand_topic_pairs(
    comments_with_topics,
    id_columns=['comment_id', 'submission_id'],
    text_column='body',
)

print(f"\nâœ“ Expanded to (text, topic) pairs:")
print(f"  Submissions: {len(submissions_expanded_df):,} rows from {submissions_expanded_df['submission_id'].nunique():,} unique submissions")
print(f"  Comments: {len(comments_expanded_df):,} rows from {comments_expanded_df['comment_id'].nunique():,} unique comments")
print(f"  Total pairs: {len(submissions_expanded_df) + len(comments_expanded_df):,}")

Expanding submissions:   0%|          | 0/433973 [00:00<?, ?it/s]

Expanding comments:   0%|          | 0/8624040 [00:00<?, ?it/s]


âœ“ Expanded to (text, topic) pairs:
  Submissions: 549,962 rows from 433,973 unique submissions
  Comments: 11,710,114 rows from 8,624,040 unique comments
  Total pairs: 12,260,076


# 3. Save Topics

In [7]:
# Persist both expanded (text, topic) tables next to the other embedding artefacts.
output_path = topics_path / 'embeddings'
output_path.mkdir(parents=True, exist_ok=True)

# Destination files
submissions_output = output_path / 'submissions_expanded_with_topics.parquet'
comments_output = output_path / 'comments_expanded_with_topics.parquet'

# Submissions: write, then report location and row count
write_parquet(submissions_expanded_df, submissions_output)
print(
    f"âœ“ Saved submissions: {submissions_output}",
    f"  Rows: {len(submissions_expanded_df):,}",
    sep="\n",
)

# Comments: write, then report location and row count
write_parquet(comments_expanded_df, comments_output)
print(
    f"âœ“ Saved comments: {comments_output}",
    f"  Rows: {len(comments_expanded_df):,}",
    sep="\n",
)

print(f"\nâœ“ All files saved to: {output_path}")

âœ“ Wrote 549,962 rows to submissions_expanded_with_topics.parquet (29.6 MB)
âœ“ Saved submissions: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/02_topics/reddit/embeddings/submissions_expanded_with_topics.parquet
  Rows: 549,962
âœ“ Wrote 11,710,114 rows to comments_expanded_with_topics.parquet (1327.3 MB)
âœ“ Saved comments: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/02_topics/reddit/embeddings/comments_expanded_with_topics.parquet
  Rows: 11,710,114

âœ“ All files saved to: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/02_topics/reddit/embeddings
