# NGO Network Analysis - Exploratory Data Analysis

This notebook provides examples of how to analyze the scraped data from Czech climate NGOs.

## Contents
1. Loading scraped data
2. Link network analysis
3. Document analysis
4. Cross-NGO analysis (summary statistics and comparison plots)
5. Inter-NGO network analysis
6. Export for further analysis

In [None]:
# Import required libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from pathlib import Path
from collections import Counter
from datetime import datetime

# Notebook-wide plotting defaults: seaborn grid theme and a larger default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Loading Scraped Data

In [None]:
# Define data directory
data_dir = Path('../data')

# List all NGOs that have been scraped.
# Only directories count as NGOs: stray files in raw/ (e.g. .DS_Store) would
# otherwise inflate the count and be passed to the loaders as NGO names.
# Sorting makes the listing order reproducible between runs.
ngo_dirs = sorted(p for p in (data_dir / 'raw').glob('*') if p.is_dir())
print(f"Found {len(ngo_dirs)} scraped NGOs:")
for ngo_dir in ngo_dirs:
    print(f"  - {ngo_dir.name}")

In [None]:
def _read_json(path, default):
    """Return the parsed JSON content of *path*, or *default* if the file is absent."""
    if not path.exists():
        return default
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_ngo_data(ngo_name, base_dir=None):
    """
    Load all data for a specific NGO from its latest scraping session.

    Args:
        ngo_name: Name of the NGO's folder under ``<base_dir>/raw``.
        base_dir: Root data directory. Defaults to the notebook-level
            ``data_dir`` when omitted.

    Returns:
        A ``(links, metadata, session_info)`` tuple. ``links`` is the content
        of ``links.json`` (``[]`` if the file is missing), ``metadata`` is the
        content of ``metadata.json`` (``{}`` if missing) and ``session_info``
        is a dict with the keys ``ngo_name``, ``session_date``, ``n_pages``,
        ``n_documents`` and ``n_links``. Returns ``(None, None, None)`` when
        no session directory exists for the NGO.
    """
    root = data_dir if base_dir is None else Path(base_dir)
    ngo_path = root / 'raw' / ngo_name

    # Sessions are sub-directories; ignore stray files so that something like
    # 'notes.txt' can never be mistaken for the latest session.
    # Picking the last name in sort order assumes session folders are named so
    # that lexicographic order equals chronological order - TODO confirm.
    sessions = sorted((p for p in ngo_path.glob('*') if p.is_dir()), reverse=True)

    if not sessions:
        print(f"No data found for {ngo_name}")
        return None, None, None

    latest_session = sessions[0]
    print(f"Loading data from: {latest_session}")

    links = _read_json(latest_session / 'links.json', [])
    metadata = _read_json(latest_session / 'metadata.json', {})

    # Count pages and documents
    pages_dir = latest_session / 'pages'
    docs_dir = latest_session / 'documents'

    n_pages = len(list(pages_dir.glob('*.html'))) if pages_dir.exists() else 0
    n_docs = len(list(docs_dir.glob('*'))) if docs_dir.exists() else 0

    session_info = {
        'ngo_name': ngo_name,
        'session_date': latest_session.name,
        'n_pages': n_pages,
        'n_documents': n_docs,
        'n_links': len(links)
    }

    return links, metadata, session_info

In [None]:
# Example: Load data for a specific NGO
# The name must match the NGO's folder name under data/raw exactly.
ngo_name = 'Hnutí DUHA'  # Change this to any NGO name
links, metadata, session_info = load_ngo_data(ngo_name)

# session_info is None when no scraping session was found for this NGO
if session_info:
    print("\nSession Summary:")
    print(f"  NGO: {session_info['ngo_name']}")
    print(f"  Session Date: {session_info['session_date']}")
    print(f"  Pages Scraped: {session_info['n_pages']}")
    print(f"  Documents Downloaded: {session_info['n_documents']}")
    print(f"  Links Extracted: {session_info['n_links']}")

## 2. Link Network Analysis

In [None]:
# Convert links to DataFrame
if links:
    links_df = pd.DataFrame(links)
    print(f"Total links: {len(links_df)}")
    print(f"\nLink types:")
    print(links_df['link_type'].value_counts())

    # Display sample links.
    # A bare expression inside an `if` block is not rendered by the notebook
    # (only a top-level last expression is), so print it explicitly.
    print("\nSample links:")
    print(links_df.head())

In [None]:
# Rank internal pages by how many links point at them
if links:
    is_internal = links_df['link_type'].eq('internal')
    internal_links = links_df.loc[is_internal]

    # Ten most frequently targeted internal URLs
    print("Most linked internal pages:")
    incoming_per_page = internal_links['target_url'].value_counts()
    target_counts = incoming_per_page.head(10)
    print(target_counts)

    # Horizontal bar chart of the same ranking
    plt.figure(figsize=(12, 6))
    target_counts.plot.barh()
    plt.xlabel('Number of Incoming Links')
    plt.title(f'Top 10 Most Linked Pages - {ngo_name}')
    plt.tight_layout()
    plt.show()

In [None]:
# Analyze external links
if links:
    from urllib.parse import urlparse

    # Take an explicit copy: adding a column to a filtered slice of links_df
    # triggers pandas' SettingWithCopyWarning and may not behave as intended.
    external_links = links_df[links_df['link_type'] == 'external'].copy()

    # Extract the network location (host[:port]) of every external target
    external_links['domain'] = external_links['target_url'].apply(
        lambda x: urlparse(x).netloc
    )

    print("Top external domains linked to:")
    domain_counts = external_links['domain'].value_counts().head(15)
    print(domain_counts)

    # Plot
    plt.figure(figsize=(12, 8))
    domain_counts.plot(kind='barh')
    plt.xlabel('Number of Links')
    plt.title(f'Top 15 External Domains - {ngo_name}')
    plt.tight_layout()
    plt.show()

In [None]:
# Build a directed page-to-page graph from the internal links
if links:
    # One edge per (source page -> target page); repeated pairs collapse into one edge
    G = nx.DiGraph()
    G.add_edges_from(
        zip(internal_links['source_url'], internal_links['target_url'])
    )

    print("Network statistics:")
    print(f"  Nodes: {G.number_of_nodes()}")
    print(f"  Edges: {G.number_of_edges()}")
    print(f"  Density: {nx.density(G):.4f}")

    # Per-page link counts in both directions
    in_degree = dict(G.in_degree())
    out_degree = dict(G.out_degree())

    # Five pages with the most incoming links (ties keep insertion order)
    print("\nMost central pages (by incoming links):")
    sorted_in_degree = sorted(in_degree.items(), key=lambda item: -item[1])[:5]
    for page_url, n_incoming in sorted_in_degree:
        print(f"  {n_incoming}: {page_url}")

## 3. Document Analysis

In [None]:
# Load document metadata
def load_document_metadata(ngo_name, base_dir=None):
    """
    Load metadata about downloaded documents from the latest session.

    Args:
        ngo_name: Name of the NGO's folder under ``<base_dir>/metadata``.
        base_dir: Root data directory. Defaults to the notebook-level
            ``data_dir`` when omitted.

    Returns:
        A DataFrame with one row per record in ``documents_metadata.jsonl``.
        The DataFrame is empty when the NGO has no session directory or the
        session has no metadata file.
    """
    root = data_dir if base_dir is None else Path(base_dir)
    ngo_path = root / 'metadata' / ngo_name

    # Only directories are sessions; stray files must not be picked as "latest".
    # Assumes session folder names sort chronologically - TODO confirm.
    sessions = sorted((p for p in ngo_path.glob('*') if p.is_dir()), reverse=True)

    if not sessions:
        return pd.DataFrame()

    metadata_file = sessions[0] / 'documents_metadata.jsonl'

    if not metadata_file.exists():
        return pd.DataFrame()

    # Read JSONL file, skipping blank lines (json.loads fails on empty input)
    with open(metadata_file, 'r', encoding='utf-8') as f:
        docs = [json.loads(line) for line in f if line.strip()]

    return pd.DataFrame(docs)

docs_df = load_document_metadata(ngo_name)

if not docs_df.empty:
    print(f"Total documents: {len(docs_df)}")
    print(f"\nDocument types:")
    print(docs_df['content_type'].value_counts())

    # Document sizes
    print(f"\nDocument size statistics (bytes):")
    print(docs_df['size_bytes'].describe())

    # Display sample.
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so print it explicitly.
    print("\nSample documents:")
    print(docs_df.head())

## 4. Cross-NGO Analysis

In [None]:
# Load data for all NGOs
all_ngo_stats = []

for ngo_dir in ngo_dirs:
    # Stray files in raw/ are not NGOs
    if not ngo_dir.is_dir():
        continue
    # Use loop-local names so the notebook-level `ngo_name` and `session_info`
    # selected in section 1 are not silently overwritten by this loop.
    _, _, ngo_session_info = load_ngo_data(ngo_dir.name)
    if ngo_session_info:
        all_ngo_stats.append(ngo_session_info)

# Create DataFrame (one row per NGO with a scraping session)
ngo_stats_df = pd.DataFrame(all_ngo_stats)

if not ngo_stats_df.empty:
    print("Summary statistics across all NGOs:")
    print(ngo_stats_df[['n_pages', 'n_documents', 'n_links']].describe())

    # Display all NGOs.
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so print it explicitly.
    print("\nAll NGOs (sorted by pages scraped):")
    print(ngo_stats_df.sort_values('n_pages', ascending=False))

In [None]:
# Side-by-side horizontal bar charts comparing NGOs on pages, documents and links
if not ngo_stats_df.empty:
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # (column, panel title, x-axis label, extra plot options) for each panel;
    # the first panel keeps the default bar colour.
    panels = [
        ('n_pages', 'Pages Scraped', 'Number of Pages', {}),
        ('n_documents', 'Documents Downloaded', 'Number of Documents', {'color': 'orange'}),
        ('n_links', 'Links Extracted', 'Number of Links', {'color': 'green'}),
    ]

    for panel_ax, (column, title, xlabel, plot_options) in zip(axes, panels):
        # Ascending sort puts the largest bar at the top of a barh chart
        ordered = ngo_stats_df.sort_values(column, ascending=True)
        ordered.plot(
            x='ngo_name', y=column, kind='barh', ax=panel_ax, legend=False, **plot_options
        )
        panel_ax.set_title(title)
        panel_ax.set_xlabel(xlabel)

    plt.tight_layout()
    plt.show()

## 5. Inter-NGO Network Analysis

In [None]:
# Build inter-NGO network based on external links
# This shows which NGOs link to each other
from urllib.parse import urlparse


def _host_of(value):
    """Return the lower-cased host of a URL or bare domain without a leading 'www.'.

    Returns '' for missing (non-string), empty or unparseable values.
    """
    if not isinstance(value, str) or not value.strip():
        return ''
    text = value.strip().lower()
    if '//' not in text:
        text = '//' + text  # lets urlparse treat a bare domain as a network location
    try:
        host = urlparse(text).hostname or ''
    except ValueError:  # e.g. malformed IPv6 literal
        return ''
    return host[4:] if host.startswith('www.') else host


inter_ngo_edges = []

# Load NGO domains
ngo_config = pd.read_csv('../config/ngo_list.csv')
ngo_domains = dict(zip(ngo_config['canonical_name'], ngo_config['website_domain']))

# Normalised host per NGO; NGOs with a missing/empty domain are left out so
# they can never match (a NaN domain would otherwise raise a TypeError).
ngo_hosts = {name: _host_of(domain) for name, domain in ngo_domains.items()}
ngo_hosts = {name: host for name, host in ngo_hosts.items() if host}

# For each NGO, check external links to other NGOs.
# Loop-local names are used so the notebook-level `ngo_name`, `links` and
# `links_df` from the earlier sections are not overwritten.
for source_ngo in ngo_domains:
    source_links, _, _ = load_ngo_data(source_ngo)
    if not source_links:
        continue

    source_links_df = pd.DataFrame(source_links)
    external = source_links_df[source_links_df['link_type'] == 'external']

    for target_url in external['target_url']:
        target_host = _host_of(target_url)
        if not target_host:
            continue

        # Match on the parsed host (exact or sub-domain) rather than a
        # substring of the whole URL, which would also match query strings,
        # paths and look-alike hosts such as 'notexample.org'.
        for target_ngo, ngo_host in ngo_hosts.items():
            if target_ngo == source_ngo:
                continue
            if target_host == ngo_host or target_host.endswith('.' + ngo_host):
                inter_ngo_edges.append({
                    'source': source_ngo,
                    'target': target_ngo
                })
                break

if inter_ngo_edges:
    inter_ngo_df = pd.DataFrame(inter_ngo_edges)
    print(f"Found {len(inter_ngo_df)} inter-NGO links")

    # Count links between NGOs
    link_counts = inter_ngo_df.groupby(['source', 'target']).size().reset_index(name='count')
    print("\nTop inter-NGO connections:")
    print(link_counts.sort_values('count', ascending=False).head(10))
else:
    print("No inter-NGO links found (need to scrape more NGOs)")

In [None]:
# Draw the NGO-to-NGO link graph, with edge thickness proportional to link count
if inter_ngo_edges:
    # One weighted, directed edge per (source NGO, target NGO) pair
    G_inter = nx.DiGraph()
    for src, dst, n_links in zip(
        link_counts['source'], link_counts['target'], link_counts['count']
    ):
        G_inter.add_edge(src, dst, weight=n_links)

    plt.figure(figsize=(14, 14))
    pos = nx.spring_layout(G_inter, k=2, iterations=50)

    # Nodes
    nx.draw_networkx_nodes(G_inter, pos, node_size=1000, node_color='lightblue', alpha=0.9)

    # Edges: width is half the number of links between the two NGOs
    edge_widths = [attrs['weight'] * 0.5 for _, _, attrs in G_inter.edges(data=True)]
    nx.draw_networkx_edges(
        G_inter, pos, width=edge_widths, alpha=0.5, arrows=True, arrowsize=20
    )

    # NGO names
    nx.draw_networkx_labels(G_inter, pos, font_size=8)

    plt.title('Inter-NGO Link Network')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

## 6. Export for Further Analysis

In [None]:
# Write the summary tables to CSV so they can be reused outside this notebook
summary_csv = '../data/ngo_summary_statistics.csv'
inter_ngo_csv = '../data/inter_ngo_links.csv'

# Per-NGO statistics (skipped when nothing was scraped)
if len(ngo_stats_df.index) > 0 and len(ngo_stats_df.columns) > 0:
    ngo_stats_df.to_csv(summary_csv, index=False)
    print("Exported summary statistics to: data/ngo_summary_statistics.csv")

# Aggregated NGO-to-NGO link counts (only defined when at least one edge was found)
if inter_ngo_edges:
    link_counts.to_csv(inter_ngo_csv, index=False)
    print("Exported inter-NGO links to: data/inter_ngo_links.csv")

## Next Steps

1. **Content Analysis**: Extract and analyze text content from HTML pages
2. **LLM Analysis**: Use LLMs to extract structured information about relationships
3. **Comparison**: Compare with survey-based COMPON data
4. **Temporal Analysis**: Track changes over multiple scraping sessions
5. **Topic Modeling**: Identify main themes and topics across NGOs