# Swiss Voting Data Aggregation

This notebook reads all individual Swiss federal voting JSON files and combines them into a single comprehensive JSON file.

In [None]:
import json
import os
from pathlib import Path
from datetime import datetime
import glob

In [None]:
# Locations of the per-vote input files and of the combined output file
data_dir = Path('data') / 'votes'
output_file = Path('data') / 'all_votings_combined.json'

# Glob pattern matched against the file names inside data_dir
file_pattern = 'sd-t-17-02-*-eidgAbstimmung.json'

# Echo the configuration so the notebook output records what was used
print(f"Data directory: {data_dir}")
print(f"Output file: {output_file}")

In [None]:
# Collect every file in data_dir whose name matches file_pattern, in sorted order
voting_files = list(data_dir.glob(file_pattern))
voting_files.sort()
print(f"Found {len(voting_files)} voting files")

# Preview the names of (at most) the first five matches
print("\nFirst 5 files:")
for file in voting_files[0:5]:
    print(f"  - {file.name}")

In [None]:
# Container for the combined output: a metadata header plus the list that the
# reading cell fills with one entry per successfully parsed file
all_votings = dict(
    metadata=dict(
        created_at=datetime.now().isoformat(),
        source_files_count=len(voting_files),
        description="Combined Swiss federal voting data from 2000-2025",
        note="Geographic structures (Gemeinde, Bezirke, Kantone) have changed over time",
    ),
    votings=list(),
)

print("Initialized data structure for combined votings")

In [None]:
# Read each voting file and collect its parsed content in all_votings['votings'].
# A file that cannot be read or parsed is reported and skipped; the loop continues.
errors = []
successful_reads = 0

# Start from an empty list so that re-running this cell does not append every
# record a second time.
all_votings['votings'] = []

for idx, file_path in enumerate(voting_files, 1):
    # Bound before the try block so the error message can always name the file
    filename = file_path.name
    try:
        # Read JSON file
        with open(file_path, 'r', encoding='utf-8') as f:
            voting_data = json.load(f)

        # Record which file the entry came from
        voting_data['source_file'] = filename

        # Add to combined structure
        all_votings['votings'].append(voting_data)
        successful_reads += 1

        # Progress indicator (every tenth file)
        if idx % 10 == 0:
            print(f"Processed {idx}/{len(voting_files)} files...")

    except Exception as e:
        # Deliberately broad: any failure for one file is logged and skipped
        error_msg = f"Error reading {filename}: {str(e)}"
        errors.append(error_msg)
        print(f"  ⚠ {error_msg}")

print(f"\n✓ Successfully read {successful_reads}/{len(voting_files)} files")
if errors:
    print(f"⚠ Encountered {len(errors)} errors")

In [None]:
# Sort votings by their 'abstimmtag' value.
# `or ''` maps a missing key and an explicit null to '', so such records sort
# first instead of raising TypeError when compared with a string.
all_votings['votings'] = sorted(
    all_votings['votings'],
    key=lambda x: x.get('abstimmtag') or ''
)

# Display summary statistics
print("Summary Statistics:")
print(f"- Total votings: {len(all_votings['votings'])}")

if all_votings['votings']:
    print(f"- Date range: {all_votings['votings'][0].get('abstimmtag', 'N/A')} to {all_votings['votings'][-1].get('abstimmtag', 'N/A')}")

    # Collect the distinct (spatial_unit, spatial_date) pairs over all votings
    spatial_dates = set()
    for voting in all_votings['votings']:
        # A missing or null 'spatial_reference' contributes nothing
        for ref in voting.get('spatial_reference') or []:
            spatial_dates.add((ref.get('spatial_unit'), ref.get('spatial_date')))

    print(f"- Unique spatial reference combinations: {len(spatial_dates)}")
    print("\nSpatial units found:")
    # .get() can yield None, and None cannot be ordered against a string.
    # Pairing each value with an "is None" flag keeps the normal ordering for
    # real values and places None entries last.
    ordered_pairs = sorted(
        spatial_dates,
        key=lambda pair: tuple((value is None, value) for value in pair)
    )
    for unit, date in ordered_pairs:
        print(f"  - {unit}: {date}")

In [None]:
# Write the combined data to output_file as indented, non-ASCII-escaped JSON
print(f"\nSaving combined data to {output_file}...")

try:
    with output_file.open('w', encoding='utf-8') as f:
        json.dump(all_votings, f, ensure_ascii=False, indent=2)

    # Report the size of the file that was just written, in MB
    file_size_mb = output_file.stat().st_size / 1024 / 1024
    print("✓ Successfully saved combined JSON file")
    print(f"  File size: {file_size_mb:.2f} MB")

except Exception as e:
    # Any failure is reported here and the notebook keeps running
    print(f"✗ Error saving file: {str(e)}")

In [None]:
# Read the saved file back and parse it to confirm that it is valid JSON
print("\nValidating saved file...")

try:
    with output_file.open('r', encoding='utf-8') as f:
        validation_data = json.loads(f.read())

    print("✓ File is valid JSON")
    print(f"  Contains {len(validation_data['votings'])} voting records")
    print(f"  Metadata: {validation_data['metadata']}")

except Exception as e:
    # Covers a missing file, invalid JSON and missing top-level keys alike
    print(f"✗ Validation failed: {str(e)}")

In [None]:
# Display sample structure of first voting
def print_structure(obj, indent=0):
    """Recursively print the keys and value types of a parsed JSON value.

    Only the first five keys of each dict and the first element of each
    non-empty list are shown, and nesting is followed only while ``indent``
    is below 6. Scalars passed directly as ``obj`` print nothing.

    Args:
        obj: Parsed JSON value (dict, list or scalar).
        indent: Number of leading spaces for the lines printed at this level.
    """
    pad = " " * indent
    if isinstance(obj, dict):
        for key in list(obj)[:5]:  # Show first 5 keys
            value = obj[key]
            type_name = type(value).__name__
            if isinstance(value, (dict, list)):
                print(f"{pad}{key}: {type_name}")
                if indent < 6:  # Limit recursion depth
                    print_structure(value, indent + 2)
            else:
                # Truncate long values first, then add the indent, so that
                # short and long values are indented alike. Previously the
                # conditional expression bound looser than `+`, which left
                # short values unindented.
                text = str(value)
                shown = f"{text[:50]}..." if len(text) > 50 else text
                print(f"{pad}{key}: {type_name} = {shown}")
    elif isinstance(obj, list) and obj:
        print(f"{pad}[List with {len(obj)} items]")
        if indent < 6:  # Limit recursion depth
            print_structure(obj[0], indent + 2)


if all_votings['votings']:
    print("\nSample structure of first voting record:")
    first_voting = all_votings['votings'][0]
    print_structure(first_voting)

In [None]:
# Show a preview (first 2000 characters) of the extracted structure as JSON.
# NOTE(review): this reads `structure`, which is assigned by the cell that
# calls extract_structure(); that cell must have been run before this one.
print()
print("=" * 50)
print("JSON STRUCTURE (for LLM understanding):")
print("=" * 50)
print()
print(
    json.dumps(structure, indent=2, ensure_ascii=False)[:2000],
    "...\n[truncated for display]",
    sep="",
)

# Fixed usage notes about the files this notebook writes
print()
print("=" * 50)
print("IMPORTANT NOTES:")
print("=" * 50)
print(
    "1. Full data file size: 300+ MB (all_votings_combined.json)",
    "2. Structure file size: < 50 KB (voting_structure.json)",
    "3. ⚠️  ALWAYS use the structure files for LLM analysis, NOT the full data file!",
    "4. Files created:",
    "   - data/all_votings_combined.json (300+ MB) - Full data",
    "   - data/voting_structure.json (~KB) - JSON structure for LLMs",
    "   - data/voting_structure_documentation.md (~KB) - Human-readable documentation",
    sep="\n",
)

In [None]:
# Target path of the human-readable (markdown) structure documentation
structure_doc_file = Path('data') / 'voting_structure_documentation.md'

def generate_markdown_doc(structure, indent_level=0):
    """Render a structure description as a list of markdown bullet lines.

    Args:
        structure: A structure description. A dict whose "_type" is "array"
            is rendered as an array entry, using its "_count" and either
            "_sample" or "_samples"/"_note". Any other dict is rendered key by
            key. Any other value becomes a single bullet.
        indent_level: Nesting level; each level adds two spaces of indent.

    Returns:
        list[str]: The markdown lines, without trailing newlines.
    """
    pad = "  " * indent_level

    # Anything that is not a dict is rendered as one plain bullet
    if not isinstance(structure, dict):
        return [f"{pad}- {structure}"]

    # Array description: header line, then the sample(s) two levels deeper
    if structure.get("_type") == "array":
        out = [f"{pad}- **Array** ({structure.get('_count', 'unknown')} items)"]
        if "_sample" in structure:
            out.append(f"{pad}  Sample item structure:")
            out += generate_markdown_doc(structure["_sample"], indent_level + 2)
        elif "_samples" in structure:
            out.append(f"{pad}  Note: {structure.get('_note', '')}")
            for number, sample in enumerate(structure["_samples"], start=1):
                out.append(f"{pad}  Sample {number}:")
                out += generate_markdown_doc(sample, indent_level + 2)
        return out

    # Regular dictionary: one bullet per key, nested dicts one level deeper
    out = []
    for key, value in structure.items():
        if isinstance(value, dict):
            out.append(f"{pad}- **{key}**: Object")
            out += generate_markdown_doc(value, indent_level + 1)
        else:
            out.append(f"{pad}- **{key}**: {str(value)}")
    return out

# Assemble the documentation: fixed header, generated structure lines, fixed notes.
# NOTE(review): this reads `structure`, which is assigned by the cell that
# calls extract_structure(); that cell must have been run before this one.
doc_content = [
    "# Swiss Voting Data Structure Documentation",
    "",
    "This document describes the structure of the combined Swiss voting JSON data.",
    "The actual data file is 300+ MB, but this structure document helps understand the schema.",
    "",
    "## Data Structure",
    ""
]

doc_content.extend(generate_markdown_doc(structure))

doc_content.extend([
    "",
    "## Notes",
    "",
    "- The full dataset (`all_votings_combined.json`) is over 300 MB",
    "- Use this structure file (`voting_structure.json`) for LLM analysis instead of the full file",
    "- Geographic structures (Gemeinde, Bezirke, Kantone) have changed over time (2000-2025)",
    "- Each voting contains nested hierarchical data: Switzerland → Cantons → Districts → Municipalities",
    ""
])

try:
    with open(structure_doc_file, 'w', encoding='utf-8') as f:
        # Join with a real newline. The previous separator was an escaped
        # backslash followed by "n", which put the whole document on one line.
        f.write("\n".join(doc_content))

    print(f"✓ Structure documentation saved to {structure_doc_file}")
    print(f"  File size: {structure_doc_file.stat().st_size / 1024:.2f} KB")

except Exception as e:
    # Any failure is reported here and the notebook keeps running
    print(f"✗ Error saving documentation: {str(e)}")

In [None]:
# Persist the extracted structure as JSON.
# NOTE(review): this reads `structure`, which is assigned by the cell that
# calls extract_structure(); that cell must have been run before this one.
structure_file = Path('data') / 'voting_structure.json'

try:
    with structure_file.open('w', encoding='utf-8') as f:
        json.dump(structure, f, ensure_ascii=False, indent=2)

    # Report the structure file size in KB next to the full data file size in MB
    file_size_kb = structure_file.stat().st_size / 1024
    print(f"✓ Structure saved to {structure_file}")
    print(f"  File size: {file_size_kb:.2f} KB (vs {output_file.stat().st_size / (1024*1024):.2f} MB for full data)")

except Exception as e:
    # Also reached when output_file is missing, since its size is read above
    print(f"✗ Error saving structure file: {str(e)}")

In [None]:
def extract_structure(obj, max_array_sample=2, depth=0, max_depth=10):
    """
    Extract a compact description of the structure of a parsed JSON value.

    Dicts are described key by key. Lists are described by their length and
    the structure of at most ``max_array_sample`` leading items. Scalars are
    described by a short string naming the type and a sample value.

    Args:
        obj: Parsed JSON value (dict, list, str, int, float, bool or None).
        max_array_sample: Number of leading list items to inspect.
        depth: Current nesting depth; callers normally leave this at 0.
        max_depth: Values nested deeper than this are replaced by a marker.

    Returns:
        A dict (for dicts and non-empty lists) or a descriptive string.
    """
    if depth > max_depth:
        return "...[max depth reached]"

    if isinstance(obj, dict):
        return {
            key: extract_structure(value, max_array_sample, depth + 1, max_depth)
            for key, value in obj.items()
        }

    if isinstance(obj, list):
        if not obj:
            return "[]"

        # Describe only the leading items to keep the result small
        sample_items = obj[:max_array_sample]
        structures = [
            extract_structure(item, max_array_sample, depth + 1, max_depth)
            for item in sample_items
        ]

        # Items count as "same structure" when their descriptions have the
        # same string form. Descriptions embed sample values, so scalars that
        # differ only in value are reported as varying.
        if len({str(s) for s in structures}) == 1:
            return {
                "_type": "array",
                "_count": len(obj),
                "_sample": structures[0]
            }
        return {
            "_type": "array",
            "_count": len(obj),
            "_samples": structures,
            "_note": "Items have varying structures"
        }

    if isinstance(obj, str):
        # Return type and a sample value (truncated if long)
        if len(obj) > 50:
            return f"string (sample: '{obj[:47]}...')"
        return f"string ('{obj}')"

    # bool must be tested before int: bool is a subclass of int, so the
    # numeric check would otherwise catch True/False and this branch would
    # never run.
    if isinstance(obj, bool):
        return f"boolean ({obj})"

    if isinstance(obj, (int, float)):
        return f"{type(obj).__name__} ({obj})"

    if obj is None:
        return "null"

    return f"{type(obj).__name__}"

# Extract structure from the combined data.
# `structure` is always assigned, so the cells that save, document and display
# it do not fail with NameError when no votings were loaded. In that case the
# structure simply describes an empty 'votings' list.
print("Extracting JSON structure...")
if not (all_votings and all_votings.get('votings')):
    print("⚠ No votings loaded; the extracted structure describes empty data")
structure = extract_structure(all_votings, max_array_sample=1)
print("✓ Structure extracted successfully")

## Extract JSON Schema/Structure

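The cells in this section create a schema file that describes the structure of the JSON data. The schema file is much smaller than the actual data (KB instead of 300+ MB), so LLMs can use it to understand the data structure without loading the entire dataset. Run the cell that defines `extract_structure` first: the cells that save, document, and display the structure all read the `structure` variable it assigns.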