## 1. Load Politosphere Subreddit Filter

Load the list of political subreddits from politosphere dataset to filter Pushshift submissions.

**Output**: Silver layer submissions in `data/01_corpus/01_silver/reddit/submissions/`

In [4]:
# Environment setup
import sys
from pathlib import Path
from datetime import datetime
import json
import zstandard as zstd
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

workspace_root = Path.cwd()
sys.path.insert(0, str(workspace_root / 'src'))

from thesis_pipeline.io.paths import get_data_path
from thesis_pipeline.io.parquet import write_parquet

print(f"Submission extraction started: {datetime.now().isoformat()}")
print(f"Workspace: {workspace_root}")
print("✓ Imports complete")

Submission extraction started: 2025-12-19T16:11:24.346671
Workspace: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment
✓ Imports complete


In [5]:
# Load subreddit filter list
subreddits_file = get_data_path('raw') / 'reddit' / 'subreddits.txt'

print(f"Loading subreddit filter from: {subreddits_file}")

with open(subreddits_file, 'r') as f:
    political_subreddits = set(line.strip().lower() for line in f if line.strip())

print(f"\n✓ Loaded {len(political_subreddits)} political subreddits")
print(f"Sample subreddits: {sorted(list(political_subreddits))[:10]}")

Loading subreddit filter from: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/00_raw/reddit/subreddits.txt

✓ Loaded 605 political subreddits
Sample subreddits: ['2012elections', '2016_elections', '2016elections', '2aliberals', 'abetterworldnews', 'abortiondebate', 'accidentallycommunist', 'acteuropa', 'activemeasures', 'actualconspiracies']


## 2. Setup Paths and Check Files

In [None]:
# Paths
raw_dir = workspace_root / 'data/01_corpus/00_raw/reddit/politosphere_2016-09_2016-10'
output_dir = workspace_root / 'data/01_corpus/01_silver/reddit/submissions'
output_dir.mkdir(parents=True, exist_ok=True)

# Submission archive files
submission_files = [
    raw_dir / 'RS_2016-09.zst',
    raw_dir / 'RS_2016-10.zst'
]

print("Input files:")
for f in submission_files:
    if f.exists():
        size_mb = f.stat().st_size / (1024**2)
        print(f"  ✓ {f.name}: {size_mb:.1f} MB")
    else:
        print(f"  ✗ {f.name}: NOT FOUND")

print(f"\nOutput directory: {output_dir.relative_to(workspace_root)}")

Input files:
  ✓ RS_2016-09.zst: 2011.1 MB
  ✓ RS_2016-10.zst: 2173.9 MB

Output directory: data/01_corpus/00_raw/reddit/submissions


## 3. Inspect Submission File Structure

In [7]:
# Peek at first few submissions to understand structure
print("Sample submissions from RS_2016-09.zst:")
print("=" * 80)

with open(submission_files[0], 'rb') as fh:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(fh) as reader:
        text_stream = reader.read(1024 * 1024).decode('utf-8')  # Read first 1MB
        lines = text_stream.split('\n')
        
        for i, line in enumerate(lines[:3]):
            if not line.strip():
                continue
            try:
                sub = json.loads(line)
                print(f"\nSubmission {i+1}:")
                print(f"  ID: {sub.get('id', 'N/A')}")
                print(f"  Title: {sub.get('title', '')[:80]}...")
                print(f"  Subreddit: {sub.get('subreddit', 'N/A')}")
                print(f"  Author: {sub.get('author', 'N/A')}")
                print(f"  Created UTC: {sub.get('created_utc', 'N/A')}")
                print(f"  Selftext: {sub.get('selftext', '')[:100]}...")
                
                if i == 0:
                    print(f"\n  All fields: {list(sub.keys())}")
                    
            except json.JSONDecodeError:
                continue

print("\n" + "=" * 80)

Sample submissions from RS_2016-09.zst:

Submission 1:
  ID: 50kc60
  Title: Server Owners/Operators, how have your servers been doing in terms of traffic si...
  Subreddit: tf2
  Author: Herpsties
  Created UTC: 1472688000
  Selftext: It's been awhile since MyM hit and a lot of people were discussing the impact the update may have on...

  All fields: ['title', 'stickied', 'created_utc', 'over_18', 'selftext', 'contest_mode', 'subreddit_id', 'media_embed', 'distinguished', 'gilded', 'author_flair_text', 'thumbnail', 'media', 'url', 'secure_media', 'is_self', 'archived', 'downs', 'ups', 'permalink', 'name', 'locked', 'link_flair_css_class', 'retrieved_on', 'subreddit', 'hide_score', 'secure_media_embed', 'domain', 'num_comments', 'saved', 'author_flair_css_class', 'link_flair_text', 'id', 'score', 'author', 'edited', 'quarantine']

Submission 2:
  ID: 50kc61
  Title: Crashed Presto site leaves customers, staff scrambling | Metro News...
  Subreddit: ottawa
  Author: neoCanuck
  Created

## 4. Extract and Filter Submissions

Process the Pushshift archives and extract submissions from political subreddits for Sep-Oct 2016.

In [8]:
def process_submission_file(file_path, political_subreddits, period_start, period_end):
    """
    Extract submissions from a Pushshift .zst file.
    Returns dict mapping date -> list of submission records.
    """
    submissions_by_date = defaultdict(list)
    found_count = 0
    total_count = 0
    
    print(f"\nProcessing: {file_path.name}")
    print(f"Filtering for {len(political_subreddits)} political subreddits")
    print(f"Date range: {period_start.date()} to {period_end.date()}")
    
    with open(file_path, 'rb') as fh:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(fh) as reader:
            text_reader = reader.read()
            
            # Process line by line
            for line in tqdm(text_reader.decode('utf-8', errors='ignore').split('\n'), 
                           desc=f"Scanning {file_path.name}"):
                if not line.strip():
                    continue
                    
                total_count += 1
                
                try:
                    sub = json.loads(line)
                    
                    # Check subreddit filter
                    subreddit = sub.get('subreddit', '').lower()
                    if subreddit not in political_subreddits:
                        continue
                    
                    # Check date range
                    created_utc = sub.get('created_utc', 0)
                    created_dt = datetime.fromtimestamp(created_utc)
                    if not (period_start <= created_dt <= period_end):
                        continue
                    
                    # Extract relevant fields
                    record = {
                        'thread_id': sub.get('id'),
                        'title': sub.get('title', ''),
                        'selftext': sub.get('selftext', ''),
                        'author': sub.get('author', ''),
                        'subreddit': sub.get('subreddit', ''),
                        'subreddit_id': sub.get('subreddit_id', ''),
                        'created_utc': created_utc,
                        'score': sub.get('score', 0),
                        'num_comments': sub.get('num_comments', 0),
                        'url': sub.get('url', ''),
                        'domain': sub.get('domain', ''),
                    }
                    
                    # Group by date
                    date = created_dt.strftime('%Y-%m-%d')
                    submissions_by_date[date].append(record)
                    found_count += 1
                        
                except json.JSONDecodeError:
                    continue
                except Exception as e:
                    # Skip malformed records
                    continue
    
    print(f"\n✓ Processed {total_count:,} total submissions")
    print(f"✓ Found {found_count:,} matching submissions ({found_count/total_count*100:.2f}%)")
    print(f"✓ Covering {len(submissions_by_date)} dates")
    
    return submissions_by_date

In [9]:
# Define date range for Sep-Oct 2016
period_start = datetime(2016, 9, 1)
period_end = datetime(2016, 10, 31, 23, 59, 59)

# Process both submission files
print("Extracting submissions from Pushshift archives...")
print("=" * 80)

all_submissions_by_date = defaultdict(list)

for sub_file in submission_files:
    if not sub_file.exists():
        print(f"⚠️  Skipping {sub_file.name} - file not found")
        continue
        
    file_submissions = process_submission_file(sub_file, political_subreddits, period_start, period_end)
    
    # Merge into main dict
    for date, subs in file_submissions.items():
        all_submissions_by_date[date].extend(subs)

print("\n" + "=" * 80)
print(f"Total unique dates with submissions: {len(all_submissions_by_date)}")
print(f"Total submissions extracted: {sum(len(v) for v in all_submissions_by_date.values()):,}")
print(f"Coverage: {100*len([k for k in all_submissions_by_date if k])/61:.1f}% of 61 days")

Extracting submissions from Pushshift archives...

Processing: RS_2016-09.zst
Filtering for 605 political subreddits
Date range: 2016-09-01 to 2016-10-31


Scanning RS_2016-09.zst: 100%|██████████| 7437863/7437863 [00:58<00:00, 127585.59it/s]



✓ Processed 7,437,862 total submissions
✓ Found 387,373 matching submissions (5.21%)
✓ Covering 31 dates

Processing: RS_2016-10.zst
Filtering for 605 political subreddits
Date range: 2016-09-01 to 2016-10-31


Scanning RS_2016-10.zst: 100%|██████████| 8286760/8286760 [01:33<00:00, 89029.63it/s] 



✓ Processed 8,286,759 total submissions
✓ Found 536,058 matching submissions (6.47%)
✓ Covering 31 dates

Total unique dates with submissions: 61
Total submissions extracted: 923,431
Coverage: 100.0% of 61 days


## 5. Save Daily Submission Files

Save as parquet files, one per day, matching the comment data structure.

In [10]:
# Save each day's submissions
print("Saving daily submission files...")
print("=" * 80)

saved_files = []
total_submissions = 0

for date in sorted(all_submissions_by_date.keys()):
    subs = all_submissions_by_date[date]
    
    if not subs:
        continue
    
    # Convert to DataFrame
    df = pd.DataFrame(subs)
    
    # Add date column for consistency
    df['date'] = date
    
    # Reorder columns
    column_order = ['date', 'created_utc', 'thread_id', 'title', 'selftext', 
                    'author', 'subreddit', 'subreddit_id', 'score', 'num_comments',
                    'url', 'domain']
    df = df[column_order]
    
    # Save
    output_file = output_dir / f'{date}.parquet'
    write_parquet(df, output_file)
    
    saved_files.append(output_file)
    total_submissions += len(df)
    
    print(f"  {date}: {len(df):,} submissions -> {output_file.name}")

print("\n" + "=" * 80)
print(f"✓ Saved {len(saved_files)} daily files")
print(f"✓ Total submissions: {total_submissions:,}")
print(f"\nOutput location: {output_dir.relative_to(workspace_root)}")

Saving daily submission files...
✓ Wrote 11,525 rows to 2016-09-01.parquet (1.7 MB)
  2016-09-01: 11,525 submissions -> 2016-09-01.parquet
✓ Wrote 11,323 rows to 2016-09-02.parquet (1.6 MB)
  2016-09-02: 11,323 submissions -> 2016-09-02.parquet
✓ Wrote 8,746 rows to 2016-09-03.parquet (1.3 MB)
  2016-09-03: 8,746 submissions -> 2016-09-03.parquet
✓ Wrote 7,939 rows to 2016-09-04.parquet (1.2 MB)
  2016-09-04: 7,939 submissions -> 2016-09-04.parquet
✓ Wrote 9,248 rows to 2016-09-05.parquet (1.3 MB)
  2016-09-05: 9,248 submissions -> 2016-09-05.parquet
✓ Wrote 12,184 rows to 2016-09-06.parquet (1.7 MB)
  2016-09-06: 12,184 submissions -> 2016-09-06.parquet
✓ Wrote 12,363 rows to 2016-09-07.parquet (1.8 MB)
  2016-09-07: 12,363 submissions -> 2016-09-07.parquet
✓ Wrote 12,783 rows to 2016-09-08.parquet (1.8 MB)
  2016-09-08: 12,783 submissions -> 2016-09-08.parquet
✓ Wrote 12,062 rows to 2016-09-09.parquet (1.7 MB)
  2016-09-09: 12,062 submissions -> 2016-09-09.parquet
✓ Wrote 9,838 rows 