# Bluesky Homelessness Data Analysis
## DFP F25 Social Media Blue Team

This notebook loads the homelessness-related posts collected from Bluesky, prints a quick overview of the data, and shows how to generate the comprehensive EDA report.

### Prerequisites:
1. Run data collection: `python main.py` or `python demo.py`
2. Install requirements: `pip install -r requirements.txt`
3. Generate EDA report: `python improved_eda.py`


## 1. Setup and Data Loading


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json

# Find every CSV export in the data directory and load the one modified last.
data_dir = Path("data/alltime_socmed")
csv_files = [*data_dir.glob("*.csv")]

if not csv_files:
    # Nothing to load; later cells check `df is not None` before using it.
    print("❌ No data found. Run: python main.py")
    df = None
else:
    # The newest file is chosen by filesystem modification time, not by name.
    latest_file = max(csv_files, key=lambda csv_path: csv_path.stat().st_mtime)
    df = pd.read_csv(latest_file)
    print(f"✅ Loaded {len(df)} posts from {latest_file.name}")


## 2. Basic Data Overview


In [None]:
def _numeric_column(frame, name):
    """Return ``frame[name]``, or an all-zero Series when the column is absent.

    ``DataFrame.get(name, 0)`` returns the bare scalar ``0`` for a missing
    column, and calling ``.mean()`` on that scalar raises ``AttributeError``.
    Returning a Series aligned to ``frame.index`` keeps the aggregation valid.
    """
    if name in frame.columns:
        return frame[name]
    return pd.Series(0, index=frame.index)


def _flag_column(frame, name):
    """Return ``frame[name]`` as a boolean Series.

    A missing column, or a missing value inside the column, counts as False.
    Assumes the dtypes produced by a default ``pd.read_csv`` (bool, numeric,
    or object holding True/False/NaN) — TODO confirm against the collector's
    CSV schema.
    """
    if name not in frame.columns:
        return pd.Series(False, index=frame.index, dtype=bool)
    column = frame[name]
    # astype(bool) turns NaN into True, so mask missing entries out explicitly.
    return column.notna() & column.astype(bool)


# Summarise the loaded posts. `text` and `author_handle` are treated as
# required columns (a KeyError is raised if they are absent); the engagement
# and media columns are optional and default to zero / False.
if df is not None:
    media_mask = _flag_column(df, 'has_images') | _flag_column(df, 'has_media')

    print("📊 Data Overview:")
    print(f"  Total posts: {len(df)}")
    print(f"  Unique authors: {df['author_handle'].nunique()}")
    print(f"  Average text length: {df['text'].str.len().mean():.1f} characters")
    print(f"  Average word count: {df['text'].str.split().str.len().mean():.1f} words")
    print(f"  Posts with media: {media_mask.sum()}")
    print(f"  Average likes: {_numeric_column(df, 'like_count').mean():.1f}")
    print(f"  Average reposts: {_numeric_column(df, 'repost_count').mean():.1f}")

    # Show sample post
    if len(df) > 0:
        sample = df.iloc[0]
        # An empty text cell is read from CSV as float NaN, which cannot be
        # sliced; fall back to an empty string in that case.
        sample_text = sample['text'] if isinstance(sample['text'], str) else ""
        print(f"\n📝 Sample Post:")
        print(f"  Text: {sample_text[:100]}...")
        print(f"  Author: @{sample['author_handle']}")
        print(f"  Likes: {sample.get('like_count', 0)}")
        print(f"  Reposts: {sample.get('repost_count', 0)}")
        print(f"  Created: {sample.get('created_at', 'Unknown')}")


## 3. Generate EDA Report


In [None]:
# Instructions for producing the standalone HTML EDA report.
# Each entry is printed on its own line; empty strings become blank lines.
report_instructions = [
    "📊 Generate comprehensive EDA report:",
    "",
    "# Generate full EDA analysis:",
    "python improved_eda.py",
    "",
    "📁 This will create: improved_eda_report.html",
    "🌐 Open the HTML file in your browser to view the analysis!",
    "",
    "📈 The report includes:",
    "  • Key metrics and statistics",
    "  • Keywords searched",
    "  • Geographic distribution (word cloud + world map)",
    "  • Content analysis and word clouds",
    "  • Engagement analysis",
    "  • Author analysis",
    "  • Top posts feeds with clickable links",
]
print(*report_instructions, sep="\n")


## 4. Data Collection Commands


In [None]:
# Cheat sheet of the shell commands used to collect and merge new data.
# Each entry is printed on its own line; empty strings become blank lines.
collection_help = (
    "🚀 To collect new data, run one of these commands:",
    "",
    "# Interactive demo (recommended):",
    "python demo.py",
    "",
    "# Direct commands:",
    "python main.py --duration 60 --keywords homelessness  # 1 minute test",
    "python main.py --duration 300 --keywords all          # 5 minutes",
    "python main.py --duration 900 --keywords all          # 15 minutes",
    "",
    "# Merge all session data:",
    "python main.py --merge-data",
    "",
    "📁 Data will be saved to: data/alltime_socmed/",
    "📊 Then re-run this notebook to analyze the new data!",
)
for help_line in collection_help:
    print(help_line)
