# 🦋 Bluesky Social Justice Data Collection & Analysis
## DFP F25 Social Media Blue Team

This notebook provides an interactive interface for collecting and analyzing social justice data from Bluesky with rich author influence metrics.

**Key Features:**
- ✅ **Real-time data collection** from the Bluesky firehose
- ✅ **Author influence metrics** (follower counts, verification status)
- ✅ **Session-based organization** with alltime datasets
- ✅ **2-minute batching** with automatic alltime updates
- ✅ **Rich content analysis** (hashtags, media, emotions)
- ✅ **Secure authentication** from external credentials file

**Social Justice Keywords:**
- Food insecurity
- Housing crisis
- Homelessness
- Unemployment
- Gender inequality


## 🔧 Setup and Dependencies

First, let's import all required libraries and check our setup.


In [None]:
# Import required libraries
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timezone
from collections import defaultdict, Counter
import re

# Probe for the atproto client library and report whether it can be imported
try:
    from atproto import Client
except ImportError:
    print("❌ atproto not found. Install with: pip install atproto")
else:
    print("✅ atproto library available")

# Plotting defaults: reset matplotlib to its stock style first, then apply
# the seaborn "husl" palette on top (the order matters, since resetting the
# style afterwards would discard the palette)
plt.style.use('default')
sns.set_palette("husl")

print("🔧 Setup complete!")


## 📊 Load and Explore Collected Data

Let's examine the social justice data we've collected from Bluesky.


In [None]:
# Load all alltime data for analysis
keywords = ["food_insecurity", "housing", "homeless", "unemployment", "gender_inequality"]
all_data = []
keyword_counts = {}


def load_jsonl_posts(path):
    """Parse a JSON Lines file and return its records as a list.

    Blank lines and lines that are not valid JSON are skipped, so a single
    bad line does not abort the whole load.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per valid line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    posts = []
    # Decode explicitly as UTF-8 instead of the platform default (which is
    # e.g. cp1252 on Windows) so non-ASCII post text loads the same way
    # everywhere. NOTE(review): assumes the collector writes UTF-8 -- confirm.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                posts.append(json.loads(line))
            except json.JSONDecodeError:
                # Best-effort load: ignore malformed lines
                continue
    return posts


print("📊 Loading Alltime Data:")
print("=" * 40)

for keyword in keywords:
    label = keyword.replace('_', ' ').title()
    alltime_file = f"data/alltime/{keyword}_alltime.jsonl"
    if os.path.exists(alltime_file):
        keyword_posts = load_jsonl_posts(alltime_file)
        all_data.extend(keyword_posts)
        keyword_counts[keyword] = len(keyword_posts)
        print(f"✅ {label}: {len(keyword_posts):,} posts")
    else:
        # A missing file simply means nothing was collected for this keyword
        keyword_counts[keyword] = 0
        print(f"❌ {label}: No data")

print(f"\n📈 Total Posts Loaded: {len(all_data):,}")

# Convert to DataFrame for analysis
if all_data:
    df = pd.DataFrame(all_data)
    print(f"✅ DataFrame created with {len(df)} rows and {len(df.columns)} columns")

    # Show data structure (a fixed description, not derived from df.columns)
    print("\n🔍 Data Structure:")
    print("   Post fields: uri, text, created_at, author_handle")
    print("   Author fields: followers_count, influence_score, verified")
    print("   Content fields: word_count, hashtags, emotion_score")
    print("   Session fields: session_name, collected_at")
else:
    # Keep `df` defined so later cells can test `df.empty` instead of failing
    df = pd.DataFrame()
    print("❌ No data available - run collection first")


## 🚀 Data Collection Execution

### Collection Parameters

Set your collection duration and session name below, then run the execution cell.


In [None]:
# 🔧 COLLECTION PARAMETERS - Edit these values
DURATION_SECONDS = 300  # 5 minutes (change to 30, 600, 1200, 1800, 3600 as needed)
SESSION_NAME = "notebook_test"  # Custom session name (optional)

# Same duration expressed in minutes, used in the summary below
duration_minutes = DURATION_SECONDS / 60

# Echo the active settings so the run configuration is visible in the output
parameter_summary = (
    "📊 Collection Parameters:",
    f"   Duration: {DURATION_SECONDS} seconds ({duration_minutes:.1f} minutes)",
    f"   Session: {SESSION_NAME}",
    "   Batching: Every 2 minutes (120 seconds)",
    f"   Output: data/sessions/{SESSION_NAME}/ + data/alltime/",
)
print("\n".join(parameter_summary))

# Reference list of commonly used durations (informational only)
duration_options = (
    "\n⏱️ Common Duration Options:",
    "   30 seconds (quick test): DURATION_SECONDS = 30",
    "   10 minutes (short): DURATION_SECONDS = 600",
    "   20 minutes (medium): DURATION_SECONDS = 1200",
    "   30 minutes (long): DURATION_SECONDS = 1800",
    "   60 minutes (extended): DURATION_SECONDS = 3600",
)
print("\n".join(duration_options))


### ▶️ Execute Collection

Run the cell below to start data collection with your parameters.


In [None]:
import subprocess
import sys
import time
from datetime import datetime

# Check current alltime data BEFORE collection
print("📊 Current Alltime Data BEFORE Collection:")
print("=" * 50)

keywords = ["food_insecurity", "housing", "homeless", "unemployment", "gender_inequality"]
before_counts = {}

for keyword in keywords:
    alltime_file = f"data/alltime/{keyword}_alltime.jsonl"
    if os.path.exists(alltime_file):
        # Only the number of lines matters here, so decode leniently: an
        # explicit encoding with errors='replace' keeps the count independent
        # of the platform's default encoding and immune to stray bytes.
        with open(alltime_file, 'r', encoding='utf-8', errors='replace') as f:
            count = sum(1 for _ in f)
    else:
        count = 0
    before_counts[keyword] = count
    print(f"   {keyword.replace('_', ' ').title()}: {count} posts")

total_before = sum(before_counts.values())
print(f"\n📈 Total Before: {total_before} posts")

print("\n🚀 Starting Collection...")
print(f"   Duration: {DURATION_SECONDS} seconds ({DURATION_SECONDS/60:.1f} minutes)")
print(f"   Session: {SESSION_NAME}")
print("   Batching: Every 2 minutes")

# Run the collector with the interpreter that is running this notebook, so
# it sees the same installed packages; a bare 'python' is resolved via PATH
# and may be a different environment. Fall back to 'python' only when the
# interpreter path is unavailable.
collector_cmd = [
    sys.executable or 'python', 'bluesky_social_justice_collector.py',
    '--duration', str(DURATION_SECONDS),
    '--session_name', SESSION_NAME,
]

start_time = time.time()
try:
    # Allow 60 seconds of slack beyond the requested duration before the
    # collector process is killed.
    result = subprocess.run(
        collector_cmd,
        capture_output=True,
        text=True,
        timeout=DURATION_SECONDS + 60,
    )
except subprocess.TimeoutExpired:
    print("⏰ Collection timed out (normal for long runs)")
except Exception as e:
    # Best-effort cell: report launch failures instead of raising
    print(f"❌ Collection error: {e}")
else:
    # subprocess.run does not raise on a non-zero exit status, so the return
    # code must be inspected to tell success from a crashed collector.
    if result.returncode == 0:
        print("✅ Collection completed!")
    else:
        print(f"❌ Collection failed (exit code {result.returncode})")

    print("\n📋 Collection Output:")
    print("-" * 30)
    print(result.stdout[-1000:])  # Show last 1000 characters

    if result.returncode != 0 and result.stderr:
        print("\n⚠️ Collection Errors (stderr):")
        print("-" * 30)
        print(result.stderr[-1000:])  # Show last 1000 characters

actual_duration = time.time() - start_time
print(f"\n⏱️ Actual runtime: {actual_duration/60:.1f} minutes")


### 📊 Collection Results & Output Directory

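Check the results and see what data was collected. Run this cell after the execution cell above: it compares the current counts against the counts recorded before collection.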


In [None]:
# Check AFTER collection results
# Relies on `keywords`, `before_counts`, `total_before` and `SESSION_NAME`
# already being defined in the notebook namespace.
print("📊 Collection Results:")
print("=" * 50)

# Check alltime data AFTER collection
after_counts = {}
for keyword in keywords:
    alltime_file = f"data/alltime/{keyword}_alltime.jsonl"
    if os.path.exists(alltime_file):
        # Lenient decoding: only the line count is needed, so the result must
        # not depend on the platform's default encoding.
        with open(alltime_file, 'r', encoding='utf-8', errors='replace') as f:
            count = sum(1 for _ in f)
        after_counts[keyword] = count
        growth = count - before_counts.get(keyword, 0)
        # '+d' renders a correct sign even if a file shrank (e.g. "-3", not "+-3")
        print(f"   {keyword.replace('_', ' ').title()}: {count} posts ({growth:+d} new)")
    else:
        after_counts[keyword] = 0
        print(f"   {keyword.replace('_', ' ').title()}: 0 posts")

total_after = sum(after_counts.values())
total_growth = total_after - total_before

print(f"\n📈 Total Growth: {total_before} → {total_after} ({total_growth:+d} new posts)")

# Show output directories
print("\n📁 Output Directories:")
print(f"   Session data: data/sessions/{SESSION_NAME}/")
print("   Alltime data: data/alltime/")

# Check session directory
session_dir = f"data/sessions/{SESSION_NAME}"
if os.path.exists(session_dir):
    session_files = [f for f in os.listdir(session_dir) if f.endswith('.jsonl')]
    print("\n📂 Session Files Created:")
    for file in session_files:
        file_path = os.path.join(session_dir, file)
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            count = sum(1 for _ in f)
        print(f"   {file}: {count} posts")

# Show sample of latest collected data: the newest post of the first keyword
# whose alltime file grew during this collection.
print("\n📝 Sample of Latest Data:")
print("-" * 30)

for keyword in keywords:
    alltime_file = f"data/alltime/{keyword}_alltime.jsonl"
    grew = after_counts[keyword] > before_counts.get(keyword, 0)
    if not (os.path.exists(alltime_file) and grew):
        continue

    label = keyword.replace('_', ' ').title()
    try:
        # Stream the file and remember only the last non-blank line instead
        # of loading every line into memory; a trailing blank line must not
        # hide the final post.
        last_line = ""
        with open(alltime_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    last_line = line
        if not last_line:
            continue

        last_post = json.loads(last_line)
        # Build the full sample before printing so a malformed record cannot
        # leave a half-printed entry behind.
        sample_lines = [
            f"\n🎯 Latest {label} post:",
            f"   Author: @{last_post.get('author_handle', 'unknown')}",
            f"   Followers: {last_post.get('author_followers_count', 0):,}",
            f"   Text: {last_post.get('text', '')[:100]}...",
            f"   Session: {last_post.get('session_name', 'unknown')}",
        ]
    except (OSError, ValueError, TypeError, AttributeError) as e:
        # ValueError covers invalid JSON and invalid UTF-8; TypeError and
        # AttributeError cover records whose fields have unexpected types.
        # Stay best-effort, but say why this keyword was skipped.
        print(f"   ⚠️ Could not read latest {label} post: {e}")
        continue

    print("\n".join(sample_lines))
    break
