In [12]:
import sys
from pathlib import Path
from datetime import datetime
import json
import requests
from tqdm import tqdm

workspace_root = Path.cwd()
sys.path.insert(0, str(workspace_root / 'src'))

from thesis_pipeline.io.config import load_all_configs

print(f"Reddit download started: {datetime.now().isoformat()}")
print(f"Workspace: {workspace_root}")

Reddit download started: 2025-12-18T14:35:57.626263
Workspace: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment


## 1. Load Configuration and Setup Paths

In [13]:
configs = load_all_configs(workspace_root / 'configs')
reddit_cfg = configs['reddit']
global_cfg = configs['global']

# Setup output directory
raw_reddit_dir = workspace_root / 'data/00_raw/reddit/politosphere_2016-09_2016-10'
raw_reddit_dir.mkdir(parents=True, exist_ok=True)

print(f"Source: {reddit_cfg['source']['name']}")
print(f"Zenodo DOI: {reddit_cfg['source']['zenodo_doi']}")
print(f"Output directory: {raw_reddit_dir.relative_to(workspace_root)}")

Source: Politosphere
Zenodo DOI: 10.5281/zenodo.5851729
Output directory: data/00_raw/reddit/politosphere_2016-09_2016-10


## 2. File Structure (from manual inspection)

Based on the Zenodo repository structure:
- **Comment files by month:** `comments_YYYY-MM.bz2` (bzip2 compressed)
- **Network files by year:** `networks_YYYY.csv`
- **Metadata files:** `subreddits_metadata.json`, `users_metadata.json`

**For Sep-Oct 2016, we need:**
1. `comments_2016-09.bz2`
2. `comments_2016-10.bz2`
3. `networks_2016.csv` (thread relationships)
4. `subreddits_metadata.json`
5. `users_metadata.json`

In [14]:
# Zenodo record information
zenodo_doi = reddit_cfg['source']['zenodo_doi']
record_id = zenodo_doi.split('.')[-1]  # Extract record ID: 5851729

# Direct download URLs (bypassing API due to rate limits)
zenodo_base_url = f"https://zenodo.org/records/{record_id}/files"

# Files we need for Sep-Oct 2016
files_to_download = [
    {
        'filename': 'comments_2016-09.bz2',
        'url': f"{zenodo_base_url}/comments_2016-09.bz2",
        'type': 'comments',
        'period': '2016-09'
    },
    {
        'filename': 'comments_2016-10.bz2',
        'url': f"{zenodo_base_url}/comments_2016-10.bz2",
        'type': 'comments',
        'period': '2016-10'
    },
    {
        'filename': 'networks_2016.csv',
        'url': f"{zenodo_base_url}/networks_2016.csv",
        'type': 'networks',
        'period': '2016'
    },
    {
        'filename': 'subreddits_metadata.json',
        'url': f"{zenodo_base_url}/subreddits_metadata.json",
        'type': 'metadata',
        'period': 'all'
    },
    {
        'filename': 'users_metadata.json',
        'url': f"{zenodo_base_url}/users_metadata.json",
        'type': 'metadata',
        'period': 'all'
    }
]

print(f"Zenodo DOI: {zenodo_doi}")
print(f"Record ID: {record_id}")
print(f"\nFiles to download: {len(files_to_download)}")
for f in files_to_download:
    print(f"  - {f['filename']} ({f['type']})")

Zenodo DOI: 10.5281/zenodo.5851729
Record ID: 5851729

Files to download: 5
  - comments_2016-09.bz2 (comments)
  - comments_2016-10.bz2 (comments)
  - networks_2016.csv (networks)
  - subreddits_metadata.json (metadata)
  - users_metadata.json (metadata)


## 3. Download Function

In [15]:
def download_file(url, destination, filename=None):
    """
    Download a file from URL with progress bar.
    
    Args:
        url: Download URL
        destination: Directory to save file
        filename: Optional filename override
    
    Returns:
        Path to downloaded file
    """
    if filename is None:
        filename = url.split('/')[-1]
    
    filepath = Path(destination) / filename
    
    # Skip if already exists
    if filepath.exists():
        print(f"âœ“ File already exists: {filename}")
        return filepath
    
    print(f"Downloading: {filename}")
    
    response = requests.get(url, stream=True)
    response.raise_for_status()
    
    total_size = int(response.headers.get('content-length', 0))
    
    with open(filepath, 'wb') as f, tqdm(
        total=total_size,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as pbar:
        for chunk in response.iter_content(chunk_size=8192):
            size = f.write(chunk)
            pbar.update(size)
    
    print(f"âœ“ Downloaded: {filename} ({filepath.stat().st_size / (1024**2):.1f} MB)")
    return filepath

print("âœ“ Download function ready")

âœ“ Download function ready


## 4. Download Files

In [18]:
# Download all required files
import time

downloaded_files = []
download_errors = []

print("Starting downloads...")
print("=" * 80)

for i, file_info in enumerate(files_to_download):
    try:
        url = file_info['url']
        filename = file_info['filename']
        
        filepath = download_file(url, raw_reddit_dir, filename)
        downloaded_files.append(filepath)
        
        # Add delay between downloads to avoid rate limiting (skip for last file)
        if i < len(files_to_download) - 1:
            time.sleep(5)  # 2 second delay
        
    except Exception as e:
        error_msg = f"Failed to download {filename}: {e}"
        print(f"âœ— {error_msg}")
        download_errors.append(error_msg)

print("=" * 80)
print(f"\nâœ“ Successfully downloaded: {len(downloaded_files)}/{len(files_to_download)} files")

if download_errors:
    print(f"âœ— Errors: {len(download_errors)}")
    for err in download_errors:
        print(f"  - {err}")
    print("\nðŸ’¡ Tip: Rerun this cell to retry failed downloads (existing files will be skipped)")

Starting downloads...
âœ“ File already exists: comments_2016-09.bz2
âœ“ File already exists: comments_2016-10.bz2
âœ“ File already exists: networks_2016.csv
âœ“ File already exists: subreddits_metadata.json
Downloading: users_metadata.json
âœ— Failed to download users_metadata.json: 429 Client Error: Too Many Requests for url: https://zenodo.org/records/5851729/files/users_metadata.json

âœ“ Successfully downloaded: 4/5 files
âœ— Errors: 1
  - Failed to download users_metadata.json: 429 Client Error: Too Many Requests for url: https://zenodo.org/records/5851729/files/users_metadata.json

ðŸ’¡ Tip: Rerun this cell to retry failed downloads (existing files will be skipped)


## 5. Verify Downloaded Files

In [19]:
print("Downloaded files summary:")
print("=" * 80)

total_size = 0
file_breakdown = {'comments': 0, 'networks': 0, 'metadata': 0}

for filepath in downloaded_files:
    size_mb = filepath.stat().st_size / (1024**2)
    total_size += size_mb
    
    # Categorize file
    if 'comments' in filepath.name:
        file_breakdown['comments'] += size_mb
    elif 'networks' in filepath.name:
        file_breakdown['networks'] += size_mb
    elif 'metadata' in filepath.name:
        file_breakdown['metadata'] += size_mb
    
    print(f"âœ“ {filepath.name}: {size_mb:.1f} MB")

print(f"\nTotal downloaded: {total_size:.1f} MB ({total_size/1024:.2f} GB)")
print(f"  Comments: {file_breakdown['comments']:.1f} MB")
print(f"  Networks: {file_breakdown['networks']:.1f} MB")
print(f"  Metadata: {file_breakdown['metadata']:.1f} MB")
print("=" * 80)

Downloaded files summary:
âœ“ comments_2016-09.bz2: 500.1 MB
âœ“ comments_2016-10.bz2: 616.0 MB
âœ“ networks_2016.csv: 0.3 MB
âœ“ subreddits_metadata.json: 0.1 MB

Total downloaded: 1116.5 MB (1.09 GB)
  Comments: 1116.1 MB
  Networks: 0.3 MB
  Metadata: 0.1 MB


## 6. Inspect File Contents (Quick Peek)

In [20]:
import bz2

# Quick peek at comment file structure (first 5 lines)
comment_file = raw_reddit_dir / 'comments_2016-09.bz2'
if comment_file.exists():
    print("Sample from comments_2016-09.bz2 (first 5 lines):")
    print("=" * 80)
    with bz2.open(comment_file, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 5:
                break
            # Truncate long lines for display
            display_line = line[:200] + '...' if len(line) > 200 else line
            print(f"{i+1}: {display_line.strip()}")
    print("=" * 80)
else:
    print("âš  Comment file not found for inspection")

# Check metadata file structure
metadata_file = raw_reddit_dir / 'subreddits_metadata.json'
if metadata_file.exists():
    with open(metadata_file, 'r') as f:
        # Read first 1000 chars to see structure
        sample = f.read(1000)
        print(f"\nSample from subreddits_metadata.json (first 1000 chars):")
        print("=" * 80)
        print(sample)
        print("=" * 80)
else:
    print("âš  Subreddits metadata file not found")

Sample from comments_2016-09.bz2 (first 5 lines):
1: {"author":"krb7H","body":"trump seems to be gaining supporters at an increasing rate. I remember when he had single-digit chances of winning. Now it almost seems like it could be a competition. The cu...
2: {"author":"OQcjv","body":"Hi `alictrmods`. Thank you for participating in \/r\/Politics. However, [your submission](https:\/\/www.reddit.com\/r\/politics\/comments\/50juq5\/press_ignores_kaepernicks_h...
3: {"author":"mQu7y","body":"The Mistakes of the Obama...","body_cleaned":"the mistakes of the obama ...","controversiality":0,"created_utc":1472688002,"distinguished":null,"edited":false,"gilded":0,"id"...
4: {"author":"yEoqZ","body":"Right. Same difference. The mexican's pay.","body_cleaned":"right . same difference . the mexican 's pay .","controversiality":0,"created_utc":1472688003,"distinguished":null...
5: {"author":"b5Wub","body":"These were stipulations for some of the progressive policies on the Democratic platform. Unfo

In [23]:
with bz2.open(comment_file, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 2:
                  break;
            print(f"{i+1}: {line.strip()}")

1: {"author":"krb7H","body":"trump seems to be gaining supporters at an increasing rate. I remember when he had single-digit chances of winning. Now it almost seems like it could be a competition. The current trend of the graph on 538 terrifies me","body_cleaned":"trump seems to be gaining supporters at an increasing rate . i remember when he had single-digit chances of winning . now it almost seems like it could be a competition . the current trend of the graph on 538 terrifies me","controversiality":0,"created_utc":1472688001,"distinguished":null,"edited":false,"gilded":0,"id":"d74qmz3","language":"en","link_id":"t3_50grgt","parent_id":"t1_d74ft4z","retrieved_on":1475215923,"score":3,"subreddit":"politics","subreddit_id":"t5_2cneq"}
2: {"author":"OQcjv","body":"Hi `alictrmods`. Thank you for participating in \/r\/Politics. However, [your submission](https:\/\/www.reddit.com\/r\/politics\/comments\/50juq5\/press_ignores_kaepernicks_hillary_for_prison\/) has been removed for the follow

## 7. Save Download Metadata

In [21]:
# Record download details
download_metadata = {
    'timestamp': datetime.now().isoformat(),
    'source': {
        'name': reddit_cfg['source']['name'],
        'zenodo_doi': zenodo_doi,
        'github_repo': reddit_cfg['source']['github_repo'],
        'record_id': record_id
    },
    'period': {
        'start': global_cfg['validation_run']['period_start'],
        'end': global_cfg['validation_run']['period_end']
    },
    'files': [
        {
            'filename': f.name,
            'path': str(f.relative_to(workspace_root)),
            'size_mb': round(f.stat().st_size / (1024**2), 2),
            'type': 'comments' if 'comments' in f.name else 'networks' if 'networks' in f.name else 'metadata'
        }
        for f in downloaded_files
    ],
    'summary': {
        'total_files': len(downloaded_files),
        'total_size_mb': round(total_size, 2),
        'total_size_gb': round(total_size/1024, 2),
        'errors': download_errors if download_errors else None
    },
    'output_directory': str(raw_reddit_dir.relative_to(workspace_root))
}

metadata_file = raw_reddit_dir / 'download_metadata.json'
with open(metadata_file, 'w') as f:
    json.dump(download_metadata, f, indent=2)

print(f"âœ“ Download metadata saved: {metadata_file.relative_to(workspace_root)}")

âœ“ Download metadata saved: data/00_raw/reddit/politosphere_2016-09_2016-10/download_metadata.json


## Summary

**Download Complete!**

Files downloaded:
- `comments_2016-09.bz2` (Sep 2016 comments)
- `comments_2016-10.bz2` (Oct 2016 comments)
- `networks_2016.csv` (thread relationships)
- `subreddits_metadata.json` (subreddit information)
- `users_metadata.json` (user information)

**Next Step:** Proceed to notebook 11 to extract, filter, and create daily silver layer files.