# European Citizens' Initiative (ECI) Data Analysis
## Exploratory Data Analysis for ECI Initiative Organizers

This notebook analyzes 121 ECI initiatives registered between 2012 and 2025,
examining success patterns, barriers, and key performance indicators.

**Enhanced with interactive Plotly visualizations** 📊

## Setup: Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import json
from collections import Counter
import warnings
import os 
warnings.filterwarnings('ignore')

# Plotly for interactive visualizations
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set plotly default template
import plotly.io as pio
pio.templates.default = "plotly_white"

# Color scale reused by the charts further down
viridis_colors = px.colors.sequential.Viridis

# Read the ECI initiatives CSV into the main working DataFrame
data_folder = "../data/2025-09-18_16-33-57"
csv_path = f'{data_folder}/eci_initiatives_2025-11-04_11-59-38.csv'
df = pd.read_csv(csv_path)

# Quick sanity check on what was loaded
print(f"Dataset loaded: {len(df)} initiatives")
print(f"Columns: {len(df.columns)}")

## Data Cleaning and Feature Engineering

In [None]:
# Parse date strings from DD/MM/YYYY to datetime
def parse_date(date_str):
    """Convert a ``DD/MM/YYYY`` string into a pandas ``Timestamp``.

    Args:
        date_str: Raw cell value, normally a string such as ``"18/09/2025"``.
            Missing values (``NaN``/``None``) and blank strings are accepted.

    Returns:
        The parsed ``Timestamp``, or ``pd.NaT`` when the value is missing,
        blank, or does not match the ``DD/MM/YYYY`` format.
    """
    if isinstance(date_str, str):
        # Trim padding so " 18/09/2025 " is treated like the clean value.
        date_str = date_str.strip()
    if pd.isna(date_str) or date_str == '':
        return pd.NaT
    try:
        return pd.to_datetime(date_str, format='%d/%m/%Y')
    except (ValueError, TypeError):
        # Only parse failures are swallowed; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return pd.NaT

# Parse main timeline dates: parsed column name -> raw DD/MM/YYYY column
timeline_sources = {
    'registered_date': 'timeline_registered',
    'collection_start_date': 'timeline_collection_start_date',
    'collection_closed_date': 'timeline_collection_closed',
    'verification_start_date': 'timeline_verification_start',
    'verification_end_date': 'timeline_verification_end',
    'commission_response_date': 'timeline_response_commission_date',
}
for parsed_col, raw_col in timeline_sources.items():
    df[parsed_col] = df[raw_col].apply(parse_date)

# Extract year from registration
df['registration_year'] = df['registered_date'].dt.year

# Calculate durations: duration column -> (start column, end column);
# each value is (end - start) expressed in whole days
duration_spans = {
    'collection_duration_days': ('collection_start_date', 'collection_closed_date'),
    'verification_duration_days': ('verification_start_date', 'verification_end_date'),
    'time_to_commission_response_days': ('registered_date', 'commission_response_date'),
    'registration_to_collection_days': ('registered_date', 'collection_start_date'),
}
for duration_col, (start_col, end_col) in duration_spans.items():
    df[duration_col] = (df[end_col] - df[start_col]).dt.days

# Parse signatures (handle commas and convert to numeric)
def parse_signatures(sig):
    """Convert a raw signature count into a float.

    Args:
        sig: Raw cell value: a number, a string that may contain thousands
            separators (``"1,234,567"``), or a missing value.

    Returns:
        The count as a ``float``. ``NaN`` is returned for missing values and
        for strings that are blank or not numeric, so one malformed cell
        does not abort the whole ``.apply`` (the sibling threshold column is
        converted with ``errors='coerce'`` for the same reason).
    """
    if pd.isna(sig):
        return np.nan
    if isinstance(sig, str):
        cleaned = sig.replace(',', '').strip()
        if not cleaned:
            return np.nan
        try:
            return float(cleaned)
        except ValueError:
            return np.nan
    return float(sig)

# Numeric signature total, parsed value by value from the raw column
raw_signatures = df['signatures_collected']
df['signatures_numeric'] = raw_signatures.map(parse_signatures)

# Number of countries over their threshold; unparseable entries become NaN
df['signatures_threshold_met_numeric'] = pd.to_numeric(
    df['signatures_threshold_met'],
    errors='coerce',
)

# Parse funding (handle commas and convert to numeric)
def parse_funding(funding):
    """Convert a raw funding amount into a float.

    Args:
        funding: Raw cell value: a number, a string that may contain
            thousands separators (``"12,500.50"``), or a missing value.

    Returns:
        The amount as a ``float``. Missing values and blank strings both
        yield ``0.0``.

    Raises:
        ValueError: If a non-blank string is not a valid number.
    """
    if pd.isna(funding):
        return 0.0
    if isinstance(funding, str):
        cleaned = funding.replace(',', '').strip()
        # A blank cell carries no amount: treat it like a missing value
        # instead of letting float('') raise.
        return float(cleaned) if cleaned else 0.0
    return float(funding)

df['funding_numeric'] = df['funding_total'].map(parse_funding)

# Define success categories
# - at least 1,000,000 signatures overall
df['reached_signatures'] = df['signatures_numeric'].ge(1000000)
# - threshold met in at least 7 countries
df['met_country_threshold'] = df['signatures_threshold_met_numeric'].ge(7)
# - an ECI counts as successful only when both conditions hold
df['successful_eci'] = df['reached_signatures'] & df['met_country_threshold']
# - the recorded final outcome is a Commission response
df['commission_responded'] = df['final_outcome'].eq('Commission Response')

# Create enhanced outcome category
def categorize_outcome(row):
    """Map one initiative row to its enhanced outcome label.

    The recorded ``final_outcome`` wins when it is ``'Withdrawn'`` or
    ``'Commission Response'``. Otherwise the label depends on whether the
    row is flagged ``successful_eci``.

    Args:
        row: Mapping-like row exposing ``'final_outcome'`` and
            ``'successful_eci'``.

    Returns:
        One of ``'Withdrawn'``, ``'Commission Response'``,
        ``'Collected Signatures'`` or ``'Unsuccessful Collection'``.
    """
    recorded = row['final_outcome']
    if recorded == 'Withdrawn':
        return 'Withdrawn'
    if recorded == 'Commission Response':
        return 'Commission Response'
    return 'Collected Signatures' if row['successful_eci'] else 'Unsuccessful Collection'

df['final_outcome_enhanced'] = df.apply(categorize_outcome, axis=1)

print("\n=== Data Cleaning Complete ===")
print(f"Total initiatives: {len(df)}")
print(f"Reached 1M signatures: {df['reached_signatures'].sum()}")
print(f"Met country threshold (7+): {df['met_country_threshold'].sum()}")
print(f"Successful ECIs (both criteria): {df['successful_eci'].sum()}")
print(f"Commission responded: {df['commission_responded'].sum()}")
# categorize_outcome labels successful ECIs that have no Commission response
# yet as 'Collected Signatures'. It never emits 'Waiting Response' (that
# name is only applied later, when the pie chart relabels the category), so
# the count must be taken on 'Collected Signatures'.
print(f"Waiting for response: {(df['final_outcome_enhanced'] == 'Collected Signatures').sum()}")

In [None]:
# ============================================================================
# CATEGORY ASSIGNMENT (moved here for early availability)
# ============================================================================

import numpy as np

# --- 1) Load predefined categories and merge into df ---
# NOTE: Manual categorization chosen for simplicity and speed of development.
# Dataset updates are infrequent, so manual maintenance remains practical.
# Alternative: automated classification (ML model) not justified for this scale.
df_cats = pd.read_csv(
    "eci_categories.csv",
    dtype={"registration_number": "string", "categories": "string"},
    usecols=["registration_number", "categories"],
)

# A registration number listed twice in the category file would duplicate
# that initiative in the left join below and silently inflate every count
# computed afterwards, so keep a single category row per initiative.
df_cats = df_cats.drop_duplicates(subset="registration_number", keep="first")

# Ensure merge keys are consistent types
df["registration_number"] = df["registration_number"].astype("string")

# Re-running this cell would otherwise merge onto an existing "categories"
# column, producing categories_x / categories_y and breaking the lookup below.
df = df.drop(columns=["categories"], errors="ignore")

# Left-join: keep all initiatives; attach predefined category when available
df = df.merge(df_cats, on="registration_number", how="left")

# Treat empty or whitespace-only strings as missing (defensive)
df["categories"] = df["categories"].str.strip().replace("", pd.NA)

# --- 2) Keyword fallback logic (only used if no predefined category exists) ---
# Maps each policy-area label to the keywords that signal it. The dict order
# matters: categorize_initiative returns matches in this order and the first
# match becomes the fallback primary policy area. "Other" has no keywords and
# is only used when nothing else matches.
policy_keywords = {
    "Education & Culture": [
        "education", "school", "university", "student", "teacher", "learning",
        "culture", "cultural", "heritage", "art", "museum", "language",
        "erasmus", "youth", "training", "literacy", "academic", "curriculum"
    ],
    
    "Digital & Communications": [
        "digital", "internet", "online", "cyber", "data protection", "privacy",
        "technology", "telecommunication", "broadband", "connectivity",
        "artificial intelligence", "ai", "platform", "social media", "network",
        "electronic", "software", "digital rights", "tech"
    ],
    
    "Social Policy": [
        "social", "poverty", "inequality", "welfare", "housing", "homeless",
        "employment", "unemployment", "worker", "labour", "disability",
        "pension", "retirement", "family", "child", "children", "elderly",
        "inclusion", "discrimination", "gender", "equal", "minimum wage"
    ],
    
    # NOTE(review): "diary" sits next to "dairy" in this list; it looks like a
    # typo, but may be deliberate to catch a misspelled title - confirm.
    "Agriculture & Fisheries & Animal rights": [
        "agriculture", "farming", "farm", "farmer", "crop", "livestock",
        "fisheries", "fishing", "fish", "aquaculture", "animal", "animals",
        "animal welfare", "animal rights", "wildlife", "bee", "bees",
        "pesticide", "herbicide", "rural", "veterinary", "cattle", "meat",
        "dairy", "fur", "hunting", "animal testing", "vegan", "slaughter", "diary"
    ],
    
    "Health": [
        "health", "healthcare", "medical", "medicine", "hospital", "patient",
        "disease", "pandemic", "epidemic", "vaccine", "vaccination",
        "pharmaceutical", "drug", "mental health", "healthcare system",
        "doctor", "nurse", "cancer", "diabetes", "public health", "covid",
        "tobacco", "smoking", "alcohol", "addiction", "nutrition", "cannabis",
        "psychedelics"
    ],
    
    "Democracy & Citizens' rights": [
        "democracy", "democratic", "citizen", "citizenship", "rights",
        "human rights", "freedom", "vote", "voting", "election", "referendum",
        "participation", "transparency", "accountability", "rule of law",
        "justice", "court", "legal", "civil rights", "fundamental rights",
        "constitution", "treaty", "sovereignty", "parliament", "representation"
    ],
    
    "Environment & Climate": [
        "environment", "environmental", "climate", "climate change", "global warming",
        "pollution", "air quality", "water", "ocean", "sea", "plastic",
        "waste", "recycling", "biodiversity", "ecosystem", "nature",
        "conservation", "deforestation", "forest", "renewable", "energy",
        "sustainability", "sustainable", "carbon", "emission", "green deal",
        "fossil fuel", "coal", "oil", "gas", "nuclear", "solar", "wind"
    ],
    
    "Transport": [
        "transport", "transportation", "traffic", "road", "highway", "railway",
        "train", "aviation", "aircraft", "flight", "airport", "vehicle",
        "car", "automobile", "bus", "truck", "shipping", "maritime",
        "mobility", "infrastructure", "public transport", "metro", "cycling"
    ],
    
    "Economy & Finance": [
        "economy", "economic", "finance", "financial", "tax", "taxation",
        "budget", "fiscal", "bank", "banking", "investment", "currency",
        "euro", "trade", "market", "business", "industry", "growth",
        "inflation", "debt", "deficit", "monetary", "corporate", "vat",
        "subsidy", "funding", "revenue", "economic policy"
    ],
    
    "Consumer protection": [
        "consumer", "consumer rights", "consumer protection", "product safety",
        "food safety", "labelling", "label", "quality", "warranty",
        "advertising", "marketing", "price", "competition", "monopoly",
        "contract", "purchase", "refund", "complaint", "fraud", "scam"
    ],
    
    # Matched on one exact phrase rather than on general keywords
    "Joke ECI": [
        "european day of whatever it takes"
    ],
    
    "Other": [],  # Fallback
}


def categorize_initiative(title, objective, keywords=None):
    """Return the policy areas whose keywords occur in an initiative's text.

    Keywords are matched case-insensitively as whole words or phrases, with
    an optional plural ``s``/``es``. Plain substring matching produced false
    positives: "ai" matched "maintain", "art" matched "party", and "car"
    matched "care".

    Args:
        title: Initiative title; converted with ``str()``.
        objective: Initiative objective text; converted with ``str()``.
        keywords: Optional mapping of category name to keyword list. Defaults
            to the notebook-level ``policy_keywords``.

    Returns:
        Matching category names in mapping order, or ``["Other"]`` when no
        category matches.
    """
    # Local import: the notebook's import cell does not load ``re``.
    import re

    if keywords is None:
        keywords = policy_keywords

    text = (str(title) + " " + str(objective)).lower()
    matched = []
    for category, terms in keywords.items():
        for term in terms:
            # Lookarounds rather than \b so that terms starting or ending
            # with a non-word character are still anchored correctly.
            pattern = r"(?<!\w)" + re.escape(term.lower()) + r"(?:s|es)?(?!\w)"
            if re.search(pattern, text):
                matched.append(category)
                break
    return matched if matched else ["Other"]


# Compute fallback categories for all rows
df["policy_areas_fallback"] = df.apply(
    lambda row: categorize_initiative(row["title"], row["objective"]),
    axis=1,
)
# The first matched category acts as the fallback primary area
df["primary_policy_area_fallback"] = df["policy_areas_fallback"].apply(lambda x: x[0] if x else "Other")

# --- 3) Final policy area: predefined when available, else fallback ---
df["primary_policy_area"] = df["categories"].fillna(df["primary_policy_area_fallback"]).fillna("Other")

# Optional: keep a unified list-form column too
# (a predefined category is wrapped in a one-element list so both sources
# share the same list shape)
df["policy_areas"] = np.where(
    df["categories"].notna(),
    df["categories"].apply(lambda x: [x]),
    df["policy_areas_fallback"],
)

# "\u2713" is the check mark; written as an escape so the message cannot be
# corrupted again when the notebook is re-encoded.
print("\u2713 Categories assigned to all initiatives")


## QUESTION 1: Status and Outcome Distribution

In [None]:
print("\n" + "="*80)
# This cell belongs to the "QUESTION 1" section; the banner used to say
# "QUESTION 2", duplicating the funnel analysis banner.
print("QUESTION 1: STATUS AND OUTCOME DISTRIBUTION")
print("="*80)

# Share of initiatives per raw current status
print("\nCurrent Status Distribution:")
status_dist = df['current_status'].value_counts().reset_index()
status_dist.columns = ['Status', 'Count']
status_dist['Percentage'] = (status_dist['Count'] / len(df) * 100).round(2)
print(status_dist.to_string(index=False))

# Share of initiatives per enhanced outcome (missing values kept as a row)
print("\n\nFinal Outcome Distribution:")
outcome_dist = df['final_outcome_enhanced'].value_counts(dropna=False).reset_index()
outcome_dist.columns = ['Outcome', 'Count']
outcome_dist['Percentage'] = (outcome_dist['Count'] / len(df) * 100).round(2)
print(outcome_dist.to_string(index=False))

# How many withdrawn initiatives still carry collection / signature data
print("\n\nWithdrawal Analysis:")
withdrawn = df[df['final_outcome_enhanced'] == 'Withdrawn']
print(f"Total withdrawn: {len(withdrawn)}")
print(f"Percentage of all initiatives: {(len(withdrawn)/len(df)*100):.2f}%")
print(f"Had collection data: {withdrawn['collection_start_date'].notna().sum()}")
print(f"Had signatures data: {withdrawn['signatures_numeric'].notna().sum()}")

In [None]:
# 📊 VISUALIZATION: Pie Chart of Final Outcomes
outcome_clean = outcome_dist[outcome_dist['Outcome'].notna()].copy()

# Bullet character for the hover lists, written as an escape so it cannot be
# corrupted by a re-encoding of the notebook (it previously rendered as
# mojibake in the tooltip).
bullet = "\u2022"
max_titles = 15  # longer lists are truncated to keep the tooltip readable

# Prepare ECI lists BEFORE renaming (the lookup uses the original labels)
outcome_eci_lists = []
for outcome in outcome_clean['Outcome']:
    ecis = df[df['final_outcome_enhanced'] == outcome]['title'].tolist()
    if not ecis:
        eci_text = "No ECIs"
    else:
        eci_text = '<br>'.join(f"{bullet} {title}" for title in ecis[:max_titles])
        if len(ecis) > max_titles:
            eci_text += f"<br><i>... (and {len(ecis) - max_titles} more)</i>"
    outcome_eci_lists.append(eci_text)

outcome_clean['ECI_List'] = outcome_eci_lists

# Rename AFTER preparing ECI lists
outcome_clean['Outcome'] = outcome_clean['Outcome'].replace('Collected Signatures', 'Waiting Response')

# Define custom colors
custom_colors = {
    'Unsuccessful Collection': '#C34242',
    'Commission Response': '#3CA371',
    'Waiting Response': '#F5A623',
    'Withdrawn': '#909090'
}

# One color per row, in row order; unknown outcomes fall back to light grey
color_sequence = [custom_colors.get(outcome, '#CCCCCC') for outcome in outcome_clean['Outcome']]

fig = px.pie(outcome_clean, 
             values='Count', 
             names='Outcome',
             title='Final Outcome Distribution of All ECI Initiatives',
             hole=0.1,
             color_discrete_sequence=color_sequence,
             custom_data=['ECI_List'])

fig.update_traces(
    textposition='inside', 
    textinfo='percent+label+value',
    hovertemplate='<b>%{label}</b><br>' +
                  'Count: %{value}<br>' +
                  'Percentage: %{percent}<br><br>' +
                  '<b>ECIs:</b><br>%{customdata[0]}' +
                  '<extra></extra>',
    textfont=dict(size=16, family='Arial Black', color='white')
)
fig.update_layout(height=600)
fig.show()

In [None]:
# Remove column width limit for full title display
with pd.option_context('display.max_colwidth', None):
    # Statuses to list; the raw status values include one "Verification"
    # variant with embedded whitespace and an asterisk, matched verbatim.
    shown_statuses = ['Answered initiative', 
                      'Unsuccessful collection',
                      'Verification\n                  \n                   *',
                      'Verification',
                      'Withdrawn']
    selected = df[(df['current_status'].isin(shown_statuses)) & 
                  (df['signatures_numeric'] >= 500000)]
    result = (
        selected
        # Sort on the parsed numeric columns. The raw columns may hold
        # strings such as "1,234,567", which order lexicographically
        # ("999,999" would rank above "1,234,567").
        .sort_values(['successful_eci', 'signatures_numeric', 'signatures_threshold_met_numeric'], 
                     ascending=[False, False, False])
        [['title', 'signatures_collected', 'signatures_threshold_met', 'primary_policy_area', 'successful_eci']]
        .rename(columns={'signatures_threshold_met': 'countries_threshold_met'})
        .reset_index(drop=True)
    )
    # 1-based row numbers for display
    result.index = result.index + 1
    display(result)

## QUESTION 2: Overall Success Funnel Analysis

In [None]:
# Calculate conversion rates at each stage of the ECI process
total_registered = len(df)
started_collection = df['collection_start_date'].notna().sum()
completed_collection = df['collection_closed_date'].notna().sum()
reached_1m = df['reached_signatures'].sum()
met_threshold = df['met_country_threshold'].sum()
both_criteria = df['successful_eci'].sum()
commission_response = df['commission_responded'].sum()

# Create funnel dataframe (the 'Count' list follows the same order as 'Stage')
funnel_data = pd.DataFrame({
    'Stage': [
        '1. Registered',
        '2. Collection Started',
        '3. Collection Completed',
        '4. Met Country Threshold (7+)',
        '5. Reached 1M Signatures',
        '6. Successful (Both Criteria)',
        '7. Commission Response'
    ],
    'Count': [
        total_registered,
        started_collection,
        completed_collection,
        met_threshold,
        reached_1m,
        both_criteria,
        commission_response
    ]
})

funnel_data['Percentage of Registered'] = (funnel_data['Count'] / total_registered * 100).round(2)
# shift(1) lines each stage up with the one before it; the first row has no
# previous stage and therefore stays NaN
funnel_data['Conversion from Previous Stage'] = (funnel_data['Count'] / funnel_data['Count'].shift(1) * 100).round(2)

print("\n" + "="*80)
print("QUESTION 2: ECI SUCCESS FUNNEL ANALYSIS")
print("="*80)
print("\nSuccess Funnel:")
print(funnel_data.to_string(index=False))

# "\u2192" is the right arrow; written as an escape so the printed insights
# cannot be corrupted by a re-encoding of the notebook.
print(f"\n\nKey Insights:")
print(f"- Overall success rate (registered \u2192 commission response): {(commission_response/total_registered*100):.2f}%")
print(f"- Success rate (registered \u2192 met both criteria): {(both_criteria/total_registered*100):.2f}%")
print(f"- Response rate (successful \u2192 commission response): {(commission_response/both_criteria*100):.2f}%")

In [None]:
# 📊 VISUALIZATION: Interactive Funnel Chart
# One Viridis shade per funnel stage (seven stages)
colors = [viridis_colors[i] for i in [0, 2, 4, 6, 7, 8, 9]]

# Create stage names without numbers for hover: strip the leading "N. "
# prefix (the previous code reused the numbered labels unchanged).
stage_names_no_numbers = funnel_data['Stage'].str.replace(r'^\d+\.\s*', '', regex=True)

fig = go.Figure(go.Funnel(
    y = funnel_data['Stage'],
    x = funnel_data['Count'],
    textposition = "inside",
    textinfo = "value+percent previous",
    marker = dict(
        color = colors
    ),
    connector = {
        "fillcolor": "#CEDFF6",  # Light steel blue for connector fill
        "visible": True
    },
    customdata = stage_names_no_numbers,
    hovertemplate = '<b>%{customdata}:</b><br>' +
                    'Count: %{x}<br>' +
                    'Percent of initial: %{percentInitial}<br>' +
                    'Percent of previous: %{percentPrevious}<br>' +
                    '<extra></extra>'
))

fig.update_layout(
    title="ECI Success Funnel: From Registration to Commission Response",
    height=600,
    font=dict(size=14)
)

fig.show()

## QUESTION 3: Signature Collection Performance Analysis

In [None]:
print("\n" + "="*80)
print("QUESTION 3: SIGNATURE COLLECTION PERFORMANCE ANALYSIS")
print("="*80)

# Work on a copy so the bracket column added below does not touch df
initiatives_with_sigs = df[df['signatures_numeric'].notna()].copy()

print(f"\nTotal initiatives with signature data: {len(initiatives_with_sigs)}")
print(f"\nSignature collection statistics:")
sig_stats = initiatives_with_sigs['signatures_numeric'].describe()
print(sig_stats)

# Break down by success
print("\n\nSignature distribution by outcome:")
sig_by_outcome = initiatives_with_sigs.groupby('final_outcome').agg({
    'signatures_numeric': ['count', 'mean', 'median', 'min', 'max']
}).round(0)
print(sig_by_outcome)

# Country threshold analysis
print("\n\nCountry threshold analysis:")
threshold_stats = initiatives_with_sigs.groupby('met_country_threshold').agg({
    'signatures_numeric': ['count', 'mean', 'median'],
    'signatures_threshold_met_numeric': ['mean', 'max']
}).round(2)
print(threshold_stats)

# Correlation between collection duration and signatures
print("\n\nCorrelation: Collection Duration vs Signatures Collected")
initiatives_with_both = initiatives_with_sigs[initiatives_with_sigs['collection_duration_days'].notna()]
if len(initiatives_with_both) > 0:
    correlation = initiatives_with_both[['collection_duration_days', 'signatures_numeric']].corr()
    print(correlation)
    
print("\n\nSuccess rate by signature brackets:")
# Brackets are left-closed / right-open and the last one is unbounded:
#   [0, 100K), [100K, 250K), ..., [750K, 1M), [1M, inf)
# With pandas' default right-closed bins, exactly 1,000,000 signatures fell
# into "750K-1M" (although reached_signatures uses >= 1,000,000), a count of
# 0 fell outside every bracket, and anything above 10M was dropped.
initiatives_with_sigs['sig_bracket'] = pd.cut(
    initiatives_with_sigs['signatures_numeric'],
    bins=[0, 100000, 250000, 500000, 750000, 1000000, np.inf],
    labels=['<100K', '100K-250K', '250K-500K', '500K-750K', '750K-1M', '1M+'],
    right=False,
)
sig_bracket_analysis = initiatives_with_sigs.groupby('sig_bracket', observed=True).agg({
    'registration_number': 'count',
    'successful_eci': 'sum',
    'commission_responded': 'sum'
}).reset_index()
sig_bracket_analysis.columns = ['Signature Bracket', 'Count', 'Successful', 'Commission Response']
print(sig_bracket_analysis.to_string(index=False))

In [None]:
# 📊 VISUALIZATION: Histogram of Signature Distribution with Gradient Colors

# Create bins manually
num_bins = 50
bins = np.linspace(0, initiatives_with_sigs['signatures_numeric'].max(), num_bins + 1)

threshold = 1000000

# Split the edges at the threshold and add the threshold itself as an edge on
# both sides. Splitting the linspace edges alone leaves the bin that
# straddles the threshold in neither set, so every initiative between the
# last edge below 1M and the first edge above it vanished from the chart.
below_bins = np.append(bins[bins < threshold], threshold)
upper_edges = bins[bins > threshold]
if upper_edges.size == 0:
    # Nothing exceeds the threshold: keep one nominal bin of regular width
    # so the "above" histogram still receives two edges.
    upper_edges = np.array([threshold + (bins[1] - bins[0])])
above_bins = np.concatenate(([threshold], upper_edges))

# Create histograms (one per side of the threshold)
hist_below, bin_edges_below = np.histogram(
    initiatives_with_sigs[initiatives_with_sigs['signatures_numeric'] < threshold]['signatures_numeric'],
    bins=below_bins
)

hist_above, bin_edges_above = np.histogram(
    initiatives_with_sigs[initiatives_with_sigs['signatures_numeric'] >= threshold]['signatures_numeric'],
    bins=above_bins
)

# Function to get ECI titles for a bin
def get_bin_ecis(bin_start, bin_end):
    """Build the hover text listing the ECIs whose signatures fall in a bin.

    Reads the notebook-level ``initiatives_with_sigs`` DataFrame.

    Args:
        bin_start: Lower signature bound (inclusive).
        bin_end: Upper signature bound (inclusive). Because both bounds are
            inclusive, a value lying exactly on an edge shared by two bins
            is listed in both.

    Returns:
        ``"No ECIs"`` for an empty bin. Otherwise an HTML string with one
        bulleted title per line, limited to 15 titles and followed by an
        "... (and N more)" note when the bin holds more.
    """
    # Escape sequence for the bullet so it cannot be corrupted by a
    # re-encoding of the notebook (it previously rendered as mojibake).
    bullet = "\u2022"
    max_titles = 15

    signatures = initiatives_with_sigs['signatures_numeric']
    in_bin = (signatures >= bin_start) & (signatures <= bin_end)
    bin_ecis = initiatives_with_sigs[in_bin]['title'].tolist()

    if not bin_ecis:
        return "No ECIs"

    titles_text = '<br>'.join(f"{bullet} {title}" for title in bin_ecis[:max_titles])
    if len(bin_ecis) > max_titles:
        titles_text += f"<br><i>... (and {len(bin_ecis) - max_titles} more)</i>"
    return titles_text

# Prepare ECI lists for each bin (consecutive edge pairs delimit one bin)
eci_lists_below = [
    get_bin_ecis(left_edge, right_edge)
    for left_edge, right_edge in zip(bin_edges_below[:-1], bin_edges_below[1:])
]
eci_lists_above = [
    get_bin_ecis(left_edge, right_edge)
    for left_edge, right_edge in zip(bin_edges_above[:-1], bin_edges_above[1:])
]

# Bars are drawn at the midpoint of each bin
bin_centers_below = (bin_edges_below[:-1] + bin_edges_below[1:]) / 2
bin_centers_above = (bin_edges_above[:-1] + bin_edges_above[1:]) / 2

# Create color arrays
# Below the threshold: interpolate from rgb(195,66,66) towards rgb(255,244,79)
# as the bin center approaches the threshold
colors_below = []
for ratio in bin_centers_below / threshold:
    red = int(195 + (255 - 195) * ratio)
    green = int(66 + (244 - 66) * ratio)
    blue = int(66 + (79 - 66) * ratio)
    colors_below.append(f'rgb({red},{green},{blue})')

# Above the threshold: interpolate from rgb(184,216,127) towards
# rgb(60,163,113); the ratio is capped at 1.0 (reached at twice the threshold)
colors_above = []
max_sig = initiatives_with_sigs['signatures_numeric'].max()
for ratio in np.minimum((bin_centers_above - threshold) / threshold, 1.0):
    red = int(184 - (184 - 60) * ratio)
    green = int(216 - (216 - 163) * ratio)
    blue = int(127 - (127 - 113) * ratio)
    colors_above.append(f'rgb({red},{green},{blue})')

fig = go.Figure()

# Both traces share the same hover layout
bar_hover = ('<b>Signatures Range:</b> %{x:,.0f}<br>' +
             '<b>Count:</b> %{y}<br><br>' +
             '<b>ECIs:</b><br>%{customdata}' +
             '<extra></extra>')

# Add bars for below threshold, then for above threshold
bar_specs = [
    ('Below 1M Threshold', bin_centers_below, hist_below, bin_edges_below, colors_below, eci_lists_below),
    ('Above 1M Threshold', bin_centers_above, hist_above, bin_edges_above, colors_above, eci_lists_above),
]
for trace_name, centers, counts, edges, bar_colors, hover_lists in bar_specs:
    fig.add_trace(go.Bar(
        x=centers,
        y=counts,
        name=trace_name,
        marker=dict(color=bar_colors, line=dict(color='white', width=0.5)),
        width=np.diff(edges),
        customdata=hover_lists,
        hovertemplate=bar_hover
    ))

# Dashed marker at the 1M-signature line
fig.add_vline(x=1000000, line_dash="dash", line_color="#3AB23F", line_width=3,
              annotation_text="1M Threshold", annotation_position="top right",
              annotation_font_size=14)

fig.update_layout(
    title='Distribution of Signature Counts (All ECIs with Data)',
    xaxis_title="Signatures Collected",
    yaxis_title="Number of Initiatives",
    height=500,
    showlegend=True,
    font=dict(size=14),
    bargap=0.05
)

fig.show()

## QUESTION 4: Member State Participation Analysis

In [None]:
# Section banner: a title framed by two 80-character rules
banner_rule = "="*80
print("\n" + banner_rule)
print("QUESTION 4: MEMBER STATE PARTICIPATION ANALYSIS")
print(banner_rule)

# Function to extract countries that met threshold
def extract_countries_met_threshold(row):
    """List the countries whose signature percentage reached 100%.

    Args:
        row: Mapping-like row whose ``'signatures_collected_by_country'``
            entry is a JSON object string of the form
            ``{country: {"percentage": "123.4%", ...}, ...}``, or a missing
            value.

    Returns:
        Country keys with a percentage of at least 100, in JSON order. An
        empty list is returned when the value is missing, is not valid JSON,
        or is not a JSON object. Entries without a usable percentage are
        skipped individually and no longer discard the whole row.
    """
    raw = row['signatures_collected_by_country']
    # Missing values (NaN / None / NA) are not strings, so this single check
    # covers them as well as any other non-text payload.
    if not isinstance(raw, (str, bytes, bytearray)):
        return []

    try:
        country_data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return []
    if not isinstance(country_data, dict):
        return []

    countries_met = []
    for country, data in country_data.items():
        if not isinstance(data, dict) or 'percentage' not in data:
            continue
        try:
            # str() lets numeric percentages (e.g. 150) through as well
            pct = float(str(data['percentage']).strip().rstrip('%'))
        except ValueError:
            continue
        if pct >= 100.0:
            countries_met.append(country)
    return countries_met

df['countries_met_threshold_list'] = df.apply(extract_countries_met_threshold, axis=1)

# Count countries in successful ECIs
successful_rows = df[df['successful_eci']]
all_countries = [
    name
    for met_list in successful_rows['countries_met_threshold_list']
    for name in met_list
]

country_counts = Counter(all_countries)
country_participation = pd.DataFrame(
    country_counts.items(),
    columns=['Country', 'Times Met Threshold'],
).sort_values('Times Met Threshold', ascending=False)

# Rate = share of successful initiatives in which the country met its threshold
n_successful = len(successful_rows)
print(f"\nCountries that met threshold in successful ECIs (n={n_successful} successful initiatives):")
country_participation['Participation Rate (%)'] = (
    country_participation['Times Met Threshold'] / n_successful * 100
).round(2)
print(country_participation.to_string(index=False))

# Organizer country analysis
def extract_organizer_countries(row):
    """Return the organizers' countries of residence for one initiative.

    Args:
        row: Mapping-like row whose ``'organizer_representative'`` entry is a
            JSON object string that may contain a ``'countries_of_residence'``
            object keyed by country.

    Returns:
        The keys of ``'countries_of_residence'`` in JSON order. An empty list
        is returned when the column is absent, the value is missing or not
        valid JSON, or the expected object is not present.
    """
    try:
        org_data = json.loads(row['organizer_representative'])
    except (KeyError, TypeError, ValueError):
        # KeyError: column absent; TypeError: NaN/None payload;
        # ValueError: malformed JSON (JSONDecodeError subclasses it).
        return []

    if not isinstance(org_data, dict):
        return []
    residence = org_data.get('countries_of_residence')
    return list(residence.keys()) if isinstance(residence, dict) else []

df['organizer_countries'] = df.apply(extract_organizer_countries, axis=1)

# Count organizer countries across all initiatives
all_org_countries = [
    name
    for residence_list in df['organizer_countries']
    for name in residence_list
]

org_country_counts = Counter(all_org_countries)
org_participation = pd.DataFrame(
    org_country_counts.items(),
    columns=['Country', 'Initiatives Organized'],
).sort_values('Initiatives Organized', ascending=False)

print("\n\nTop 15 countries by number of initiatives organized:")
print(org_participation.head(15).to_string(index=False))

# Successful organizer countries: same tally, restricted to successful ECIs
successful_org_countries = [
    name
    for residence_list in df[df['successful_eci']]['organizer_countries']
    for name in residence_list
]

successful_org_counts = Counter(successful_org_countries)
successful_org_participation = pd.DataFrame(
    successful_org_counts.items(),
    columns=['Country', 'Successful Initiatives'],
).sort_values('Successful Initiatives', ascending=False)

print("\n\nTop 10 countries organizing successful initiatives:")
print(successful_org_participation.head(10).to_string(index=False))

In [None]:
# 📊 VISUALIZATION: Country Participation Charts

def _format_eci_titles(titles, limit=15):
    """Render titles as an HTML bullet list for a Plotly hover box.

    At most ``limit`` titles are shown; when there are more, an
    "... (and N more)" line is appended. An empty list yields ``''``.
    """
    # Escape sequence for the bullet so it cannot be corrupted by a
    # re-encoding of the notebook (it previously rendered as mojibake).
    bullet = "\u2022"
    text = '<br>'.join(f"{bullet} {title}" for title in titles[:limit])
    if len(titles) > limit:
        text += f"<br><i>... (and {len(titles) - limit} more)</i>"
    return text


# Reverse so the largest bar ends up at the top of the horizontal chart
top_countries = country_participation.head(15).iloc[::-1]

# Filter once instead of once per country
successful_df = df[df['successful_eci']]

# Prepare ECI lists for each country: successful ECIs where it met threshold
country_eci_lists = []
for country in top_countries['Country']:
    country_ecis = [
        title
        for title, met_list in zip(successful_df['title'], successful_df['countries_met_threshold_list'])
        if country in met_list
    ]
    country_eci_lists.append(_format_eci_titles(country_ecis))

fig = px.bar(top_countries,
             x='Times Met Threshold',
             y='Country',
             orientation='h',
             title='Top 15 Countries Leading ECI Signature Thresholds',
             color='Participation Rate (%)',
             color_continuous_scale='Viridis',
             text='Times Met Threshold',
             custom_data=[country_eci_lists])

fig.update_traces(
    textposition='outside',
    hovertemplate='<b>%{y}</b><br>' +
                  'Times Met Threshold: %{x}<br>' +
                  'Participation Rate: %{marker.color:.2f}%<br><br>' +
                  '<b>ECIs:</b><br>%{customdata[0]}' +
                  '<extra></extra>'
)
fig.update_layout(height=600)
fig.show()

# Organizer countries comparison
fig2 = make_subplots(rows=1, cols=2,
                     subplot_titles=('All Initiatives Organized', 'Successful Initiatives Organized'),
                     horizontal_spacing=0.15)

org_top10 = org_participation.head(10).iloc[::-1]
successful_top10 = successful_org_participation.head(10).iloc[::-1]

# Prepare ECI lists for organizer countries (all initiatives)
org_eci_lists = []
for country in org_top10['Country']:
    country_ecis = [
        title
        for title, residence_list in zip(df['title'], df['organizer_countries'])
        if country in residence_list
    ]
    org_eci_lists.append(_format_eci_titles(country_ecis))

# Same, restricted to successful initiatives
successful_eci_lists = []
for country in successful_top10['Country']:
    country_ecis = [
        title
        for title, residence_list in zip(successful_df['title'], successful_df['organizer_countries'])
        if country in residence_list
    ]
    successful_eci_lists.append(_format_eci_titles(country_ecis))

fig2.add_trace(
    go.Bar(x=org_top10['Initiatives Organized'],
           y=org_top10['Country'],
           orientation='h',
           marker_color='lightblue',
           showlegend=False,
           customdata=org_eci_lists,
           hovertemplate='<b>%{y}</b><br>' +
                         'Initiatives: %{x}<br><br>' +
                         '<b>ECIs:</b><br>%{customdata}' +
                         '<extra></extra>'),
    row=1, col=1
)

fig2.add_trace(
    go.Bar(x=successful_top10['Successful Initiatives'],
           y=successful_top10['Country'],
           orientation='h',
           marker_color='#3CA371',
           showlegend=False,
           customdata=successful_eci_lists,
           hovertemplate='<b>%{y}</b><br>' +
                         'Successful: %{x}<br><br>' +
                         '<b>ECIs:</b><br>%{customdata}' +
                         '<extra></extra>'),
    row=1, col=2
)

fig2.update_layout(height=500, title_text='Countries Organizing ECIs')
fig2.show()

## QUESTION 5: Temporal Trends in Success Rates

In [None]:
banner_rule = "="*80
print("\n" + banner_rule)
print("QUESTION 5: TEMPORAL TRENDS IN ECI SUCCESS RATES")
print(banner_rule)

# One row per registration year: number registered plus how many reached
# each milestone
yearly_aggregations = {
    'registration_number': 'count',
    'reached_signatures': 'sum',
    'met_country_threshold': 'sum',
    'successful_eci': 'sum',
    'commission_responded': 'sum'
}
yearly_stats = df.groupby('registration_year').agg(yearly_aggregations).reset_index()
yearly_stats.columns = ['Year', 'Total Registered', 'Reached 1M', 'Met Country Threshold', 'Successful', 'Commission Response']

# Rates are expressed relative to everything registered that year
registered_per_year = yearly_stats['Total Registered']
yearly_stats['Success Rate (%)'] = (yearly_stats['Successful'] / registered_per_year * 100).round(2)
yearly_stats['Response Rate (%)'] = (yearly_stats['Commission Response'] / registered_per_year * 100).round(2)

# Calculate Failed count (everything registered that year that is not successful)
yearly_stats['Failed'] = registered_per_year - yearly_stats['Successful']
yearly_stats['Successful No Response'] = yearly_stats['Successful'] - yearly_stats['Commission Response']

print("\nYearly Success Rates:")
print(yearly_stats.to_string(index=False))

# Best / worst year by success rate, plus the unweighted mean of yearly rates
success_rate = yearly_stats['Success Rate (%)']
best_year = yearly_stats.loc[success_rate.idxmax(), 'Year']
worst_year = yearly_stats.loc[success_rate.idxmin(), 'Year']

print(f"\n\nTrend Analysis:")
print(f"Best year for success rate: {best_year:.0f} ({success_rate.max():.2f}%)")
print(f"Worst year for success rate: {worst_year:.0f} ({success_rate.min():.2f}%)")
print(f"Average success rate: {success_rate.mean():.2f}%")

In [None]:
# Prepare custom hover data with ECI titles for each category and year

def _bullet_titles(titles):
    """Join titles into an HTML bullet list, or return 'None' when empty."""
    # Escape sequence for the bullet so it cannot be corrupted by a
    # re-encoding of the notebook (it previously rendered as mojibake).
    return '<br>'.join(f'\u2022 {title}' for title in titles) if titles else 'None'


# Initiatives without a parsed registration date have a NaN year. groupby
# drops them, so they are dropped here too: otherwise `years` (and the hover
# lists built from it) would be longer than the yearly table.
years = sorted(df['registration_year'].dropna().unique())

# Calculate yearly stats including in-progress initiatives
yearly_stats_detailed = df.groupby('registration_year').agg({
    'registration_number': 'count',
    'reached_signatures': 'sum',
    'met_country_threshold': 'sum',
    'successful_eci': 'sum',
    'commission_responded': 'sum'
}).reset_index()

yearly_stats_detailed.columns = ['Year', 'Total Registered', 'Reached 1M', 'Met Country Threshold', 'Successful', 'Commission Response']

# Calculate in-progress initiatives (not failed, not successful yet)
# In progress: no final outcome has been recorded
in_progress_by_year = df[df['final_outcome'].isna()].groupby('registration_year').size().reset_index(name='In Progress')
yearly_stats_detailed = yearly_stats_detailed.merge(in_progress_by_year, left_on='Year', right_on='registration_year', how='left').drop('registration_year', axis=1)

# Calculate truly failed (unsuccessful or withdrawn)
failed_by_year = df[df['final_outcome'].isin(['Unsuccessful Collection', 'Withdrawn'])].groupby('registration_year').size().reset_index(name='Failed')
yearly_stats_detailed = yearly_stats_detailed.merge(failed_by_year, left_on='Year', right_on='registration_year', how='left').drop('registration_year', axis=1)

# The left merges leave NaN for years with no matching initiatives; those
# are real zero counts.
yearly_stats_detailed[['In Progress', 'Failed']] = (
    yearly_stats_detailed[['In Progress', 'Failed']].fillna(0).astype(int)
)

# Calculate successful without response
yearly_stats_detailed['Successful No Response'] = yearly_stats_detailed['Successful'] - yearly_stats_detailed['Commission Response']

# Create hover text for each category (one entry per year, in `years` order)
failed_hover = []
in_progress_hover = []
successful_no_response_hover = []
commission_response_hover = []

for year in years:
    year_df = df[df['registration_year'] == year]
    
    # Failed ECIs
    failed_ecis = year_df[year_df['final_outcome'].isin(['Unsuccessful Collection', 'Withdrawn'])]['title'].tolist()
    failed_hover.append(_bullet_titles(failed_ecis))
    
    # In Progress ECIs
    in_progress_ecis = year_df[year_df['final_outcome'].isna()]['title'].tolist()
    in_progress_hover.append(_bullet_titles(in_progress_ecis))
    
    # Successful No Response ECIs
    successful_no_resp = year_df[(year_df['successful_eci']) & (~year_df['commission_responded'])]['title'].tolist()
    successful_no_response_hover.append(_bullet_titles(successful_no_resp))
    
    # Commission Responded ECIs
    commission_resp = year_df[year_df['commission_responded'] == True]['title'].tolist()
    commission_response_hover.append(_bullet_titles(commission_resp))

# 📊 VISUALIZATION: Stacked Bar Chart with ECI Titles in Tooltip
fig = go.Figure()

# One stacked segment per outcome, bottom to top:
# (data column, legend name, bar colour, tooltip heading, tooltip titles)
_outcome_trace_specs = [
    ('Failed', 'Failed', '#C34242', 'Failed', failed_hover),
    ('In Progress', 'In Progress', '#6C9BD1', 'In Progress', in_progress_hover),
    ('Successful No Response', 'Successful', '#F0B840', 'Successful (No Response)', successful_no_response_hover),
    ('Commission Response', 'Commission Responded', '#3CA371', 'Commission Responded', commission_response_hover),
]

for _column, _legend_name, _colour, _heading, _titles in _outcome_trace_specs:
    fig.add_trace(go.Bar(
        x=yearly_stats_detailed['Year'],
        y=yearly_stats_detailed[_column],
        name=_legend_name,
        marker_color=_colour,
        customdata=_titles,
        hovertemplate='<b>' + _heading + '</b><br>Year: %{x}<br>Count: %{y}<br><br>ECIs:<br>%{customdata}<extra></extra>',
    ))

# Horizontal legend anchored just above the top-right corner of the plot
_legend_above_plot = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)

fig.update_layout(
    title='ECI Outcomes by Registration Year',
    xaxis_title='Registration Year',
    yaxis_title='Number of Initiatives',
    barmode='stack',
    height=600,
    font=dict(size=14),
    legend=_legend_above_plot,
)

fig.show()

## QUESTION 6: Duration Analysis - Time Waiting at Each Step

In [None]:
print("\n" + "="*80)
print("QUESTION 6: DURATION ANALYSIS - TIME WAITING AT EACH STEP")
print("="*80)

# Calculate time between key milestones
df['registration_to_collection_days'] = (df['collection_start_date'] - df['registered_date']).dt.days
df['collection_to_response_days'] = (df['commission_response_date'] - df['collection_closed_date']).dt.days

# describe() skips missing values on its own, so no notna() pre-filter is needed.
print("\nCollection Period Duration (days):")
collection_stats = df['collection_duration_days'].describe()
print(collection_stats)

print("\n\nTime from Registration to Collection Start (days):")
reg_to_coll = df['registration_to_collection_days'].describe()
print(reg_to_coll)

print("\n\nTime from Collection End to Commission Response (days):")
coll_to_resp = df['collection_to_response_days'].describe()
print(coll_to_resp)

print("\n\nTime from Registration to Commission Response (days) [for responded initiatives]:")
responded = df[df['commission_responded']]
time_to_response = responded['time_to_commission_response_days'].describe()
print(time_to_response)

# Summary table: one row per milestone, built from a single list so the
# labels and the two statistics cannot drift out of step.
print("\n\nBreakdown by milestone (median and mean days):")
milestone_durations = [
    ('Registration → Collection Start', df['registration_to_collection_days']),
    ('Collection Period Duration', df['collection_duration_days']),
    ('Collection End → Commission Response', df['collection_to_response_days']),
    ('Registration → Commission Response', responded['time_to_commission_response_days']),
]
milestone_df = pd.DataFrame({
    'Milestone': [label for label, _ in milestone_durations],
    'Median Days': [durations.median() for _, durations in milestone_durations],
    'Mean Days': [durations.mean() for _, durations in milestone_durations],
})
print(milestone_df.to_string(index=False))

In [None]:
# 📊 VISUALIZATION 1: Registration → Collection Start
import numpy as np

data1 = df[df['registration_to_collection_days'].notna()].copy()
days1 = data1['registration_to_collection_days']
hist1, edges1 = np.histogram(days1, bins=30)
bin_centers1 = (edges1[:-1] + edges1[1:]) / 2
n_bins1 = len(hist1)

# Group ECI titles by histogram bin so each bar can list its initiatives
hover_text1 = []
for i in range(n_bins1):
    lower, upper = edges1[i], edges1[i + 1]
    # np.histogram bins are half-open [lower, upper), except the last bin,
    # which also includes its right edge.
    below_upper = (days1 <= upper) if i == n_bins1 - 1 else (days1 < upper)
    bin_ecis = data1.loc[(days1 >= lower) & below_upper, 'title'].tolist()

    # Limit to 15 titles so the tooltip stays readable
    if not bin_ecis:
        titles_text = "No ECIs"
    else:
        titles_text = '<br>'.join(f"• {title}" for title in bin_ecis[:15])
        if len(bin_ecis) > 15:
            titles_text += f"<br><i>... (and {len(bin_ecis) - 15} more)</i>"

    hover_text1.append(titles_text)

# Colour gradient across the bins, from very light towards dark teal
colors1 = []
for i in range(n_bins1):
    ratio = i / n_bins1
    r = int(200 + (15 - 200) * ratio)
    g = int(240 + (100 - 240) * ratio)
    b = int(220 + (120 - 220) * ratio)
    colors1.append(f'rgb({r},{g},{b})')

fig1 = go.Figure(go.Bar(
    x=bin_centers1,
    y=hist1,
    marker=dict(color=colors1, line=dict(color='white', width=0.5)),
    width=np.diff(edges1),
    customdata=hover_text1,
    hovertemplate='<b>Registration → Collection Start</b><br><b>Days:</b> %{x:.0f}<br><b>Count:</b> %{y}<br><br><b>ECIs:</b><br>%{customdata}<extra></extra>'
))

fig1.update_layout(
    title='Registration → Collection Start',
    xaxis_title='Days',
    yaxis_title='Number of Initiatives',
    height=400,
    font=dict(size=14)
)
fig1.show()

In [None]:


# 📊 VISUALIZATION 2: Collection Period Duration
data2 = df[df['collection_duration_days'].notna()].copy()
days2 = data2['collection_duration_days']
hist2, edges2 = np.histogram(days2, bins=30)
bin_centers2 = (edges2[:-1] + edges2[1:]) / 2
n_bins2 = len(hist2)

# Group ECI titles by histogram bin so each bar can list its initiatives
hover_text2 = []
for i in range(n_bins2):
    lower, upper = edges2[i], edges2[i + 1]
    # np.histogram bins are half-open [lower, upper), except the last bin,
    # which also includes its right edge.
    below_upper = (days2 <= upper) if i == n_bins2 - 1 else (days2 < upper)
    bin_ecis = data2.loc[(days2 >= lower) & below_upper, 'title'].tolist()

    # Limit to 15 titles so the tooltip stays readable
    if not bin_ecis:
        titles_text = "No ECIs"
    else:
        titles_text = '<br>'.join(f"• {title}" for title in bin_ecis[:15])
        if len(bin_ecis) > 15:
            titles_text += f"<br><i>... (and {len(bin_ecis) - 15} more)</i>"

    hover_text2.append(titles_text)

# Colour gradient across the bins, from light towards dark orange
colors2 = []
for i in range(n_bins2):
    ratio = i / n_bins2
    r = int(255 + (180 - 255) * ratio)
    g = int(200 + (80 - 200) * ratio)
    b = int(150 + (20 - 150) * ratio)
    colors2.append(f'rgb({r},{g},{b})')

fig2 = go.Figure(go.Bar(
    x=bin_centers2,
    y=hist2,
    marker=dict(color=colors2, line=dict(color='white', width=0.5)),
    width=np.diff(edges2),
    customdata=hover_text2,
    hovertemplate='<b>Collection Period Duration</b><br><b>Days:</b> %{x:.0f}<br><b>Count:</b> %{y}<br><br><b>ECIs:</b><br>%{customdata}<extra></extra>'
))

fig2.update_layout(
    title='Collection Period Duration',
    xaxis_title='Days',
    yaxis_title='Number of Initiatives',
    height=400,
    font=dict(size=14)
)
fig2.show()

In [None]:


# 📊 VISUALIZATION 3: Collection End → Commission Response
data3 = df[df['collection_to_response_days'].notna()].copy()
days3 = data3['collection_to_response_days']
hist3, edges3 = np.histogram(days3, bins=30)
bin_centers3 = (edges3[:-1] + edges3[1:]) / 2
n_bins3 = len(hist3)

# Group ECI titles by histogram bin so each bar can list its initiatives.
# The loop is driven by this histogram's own bin count (not another
# figure's edges), so the cell works on its own and with any bin count.
hover_text3 = []
for i in range(n_bins3):
    lower, upper = edges3[i], edges3[i + 1]
    # np.histogram bins are half-open [lower, upper), except the last bin,
    # which also includes its right edge.
    below_upper = (days3 <= upper) if i == n_bins3 - 1 else (days3 < upper)
    bin_ecis = data3.loc[(days3 >= lower) & below_upper, 'title'].tolist()

    # Limit to 15 titles so the tooltip stays readable
    if not bin_ecis:
        titles_text = "No ECIs"
    else:
        titles_text = '<br>'.join(f"• {title}" for title in bin_ecis[:15])
        if len(bin_ecis) > 15:
            titles_text += f"<br><i>... (and {len(bin_ecis) - 15} more)</i>"

    hover_text3.append(titles_text)

# Colour gradient across the bins, from light towards dark green
colors3 = []
for i in range(n_bins3):
    ratio = i / n_bins3
    r = int(200 + (20 - 200) * ratio)
    g = int(250 + (120 - 250) * ratio)
    b = int(150 + (50 - 150) * ratio)
    colors3.append(f'rgb({r},{g},{b})')

fig3 = go.Figure(go.Bar(
    x=bin_centers3,
    y=hist3,
    marker=dict(color=colors3, line=dict(color='white', width=0.5)),
    width=np.diff(edges3),
    customdata=hover_text3,
    hovertemplate='<b>Collection End → Commission Response</b><br><b>Days:</b> %{x:.0f}<br><b>Count:</b> %{y}<br><br><b>ECIs:</b><br>%{customdata}<extra></extra>'
))

fig3.update_layout(
    title='Collection End → Commission Response',
    xaxis_title='Days',
    yaxis_title='Number of Initiatives',
    height=400,
    font=dict(size=14)
)
fig3.show()

In [None]:


# 📊 VISUALIZATION 4: Registration → Commission Response
data4 = responded[responded['time_to_commission_response_days'].notna()].copy()
days4 = data4['time_to_commission_response_days']
hist4, edges4 = np.histogram(days4, bins=30)
bin_centers4 = (edges4[:-1] + edges4[1:]) / 2
n_bins4 = len(hist4)

# Group ECI titles by histogram bin so each bar can list its initiatives
hover_text4 = []
for i in range(n_bins4):
    lower, upper = edges4[i], edges4[i + 1]
    # np.histogram bins are half-open [lower, upper), except the last bin,
    # which also includes its right edge.
    below_upper = (days4 <= upper) if i == n_bins4 - 1 else (days4 < upper)
    bin_ecis = data4.loc[(days4 >= lower) & below_upper, 'title'].tolist()

    # Limit to 15 titles so the tooltip stays readable
    if not bin_ecis:
        titles_text = "No ECIs"
    else:
        titles_text = '<br>'.join(f"• {title}" for title in bin_ecis[:15])
        if len(bin_ecis) > 15:
            titles_text += f"<br><i>... (and {len(bin_ecis) - 15} more)</i>"

    hover_text4.append(titles_text)

# Colour gradient across the bins, from pale yellow towards olive
colors4 = []
for i in range(n_bins4):
    ratio = i / n_bins4
    r = int(255 + (100 - 255) * ratio)
    g = int(255 + (140 - 255) * ratio)
    b = int(150 + (20 - 150) * ratio)
    colors4.append(f'rgb({r},{g},{b})')

fig4 = go.Figure(go.Bar(
    x=bin_centers4,
    y=hist4,
    marker=dict(color=colors4, line=dict(color='white', width=0.5)),
    width=np.diff(edges4),
    customdata=hover_text4,
    hovertemplate='<b>Registration → Commission Response</b><br><b>Days:</b> %{x:.0f}<br><b>Count:</b> %{y}<br><br><b>ECIs:</b><br>%{customdata}<extra></extra>'
))

fig4.update_layout(
    title='Registration → Commission Response',
    xaxis_title='Days',
    yaxis_title='Number of Initiatives',
    height=400,
    font=dict(size=14),
)
fig4.show()

## QUESTION 7: Topic and Policy Area Analysis

In [None]:
print("\n" + "="*80)
print("QUESTION 7: TOPIC AND POLICY AREA ANALYSIS")
print("="*80)

# Share of all initiatives falling into each primary policy area
print("\nInitiatives by Primary Policy Area:")
policy_dist = df["primary_policy_area"].value_counts().reset_index()
policy_dist.columns = ["Policy Area", "Count"]
policy_dist["Percentage"] = (policy_dist["Count"] / len(df) * 100).round(2)
print(policy_dist.to_string(index=False))

# Per-area totals, success/response counts and signature statistics
print("\n\nSuccess Rate by Policy Area:")
policy_success = df.groupby("primary_policy_area").agg({
    "registration_number": "count",
    "successful_eci": "sum",
    "commission_responded": "sum",
    "signatures_numeric": ["mean", "median"]
}).reset_index()
# Flatten the two-level column index produced by the mixed aggregation
policy_success.columns = ["Policy Area", "Total", 'Successful', "Commission Response", "Avg Signatures", "Median Signatures"]
policy_success["Success Rate (%)"] = (policy_success['Successful'] / policy_success["Total"] * 100).round(2)

# Round signature columns for readability. A policy area with no recorded
# signature counts yields NaN for both statistics; NaN cannot be cast to
# int, so both columns are filled with 0 before the cast.
policy_success["Avg Signatures"] = policy_success["Avg Signatures"].fillna(0).round(0).astype(int)
policy_success["Median Signatures"] = policy_success["Median Signatures"].fillna(0).astype(int)

policy_success = policy_success.sort_values("Success Rate (%)", ascending=False)
print(policy_success.to_string(index=False))

In [None]:
import plotly.graph_objects as go

# 📊 VISUALIZATION: Grouped Bar Chart - Policy Area Performance

# Prepare custom hover data for each trace separately
def prepare_hover_data(policy_area, trace_type, max_items=15):
    """Build the tooltip ECI list for one policy area and one bar trace.

    Reads the notebook-level ``df``.

    Args:
        policy_area: value of ``primary_policy_area`` to select.
        trace_type: 'Total', 'Collected Signatures', or 'Commission Response'.
            'Collected Signatures' keeps only rows with ``successful_eci``
            set, 'Commission Response' only rows with
            ``commission_responded`` set; any other value applies no
            extra filter.
        max_items: maximum number of titles listed before the remainder is
            summarised as "... (and N more)".

    Returns:
        A tuple ``(eci_list, count)``: the titles as an HTML bullet list
        joined with ``<br>`` (empty string when nothing matches) and the
        total number of matching initiatives.
    """
    selected = df['primary_policy_area'] == policy_area

    # Narrow the selection according to the trace being annotated
    if trace_type == 'Collected Signatures':
        selected = selected & (df['successful_eci'] == True)
    elif trace_type == 'Commission Response':
        selected = selected & (df['commission_responded'] == True)
    # For 'Total', no filtering needed

    titles = df.loc[selected, 'title']
    total = len(titles)

    # List only the first max_items titles to keep the tooltip readable
    eci_list = '<br>'.join(f"• {title}" for title in titles.head(max_items))

    # Add "and X more" if applicable
    if total > max_items:
        eci_list += f'<br><i>... (and {total - max_items} more)</i>'

    return eci_list, total

# Build the grouped bar chart: three bars per policy area
fig = go.Figure()


def _policy_hover_rows(trace_type, type_label):
    """Return one (eci_list, count, type_label) tuple per policy area, in table order."""
    return [
        (*prepare_hover_data(area, trace_type), type_label)
        for area in policy_success['Policy Area']
    ]


# Tooltip payload for each trace
total_hover = _policy_hover_rows('Total', 'Total Initiatives')
waiting_hover = _policy_hover_rows('Collected Signatures', 'Collected Signatures')
commission_hover = _policy_hover_rows('Commission Response', 'Commission Response')

# All three traces share one tooltip layout; customdata holds
# (ECI list, count, type label) for each bar.
_policy_hovertemplate = ''.join([
    '<b>%{x}</b><br>',
    '<b>Count:</b> %{customdata[1]}<br>',
    '<b>Type:</b> %{customdata[2]}<br><br>',
    '<b>ECIs:</b><br>%{customdata[0]}',
    '<extra></extra>',
])

# (data column, legend name, bar colour, tooltip payload)
_policy_trace_specs = (
    ('Total', 'Total Initiatives', 'lightblue', total_hover),
    ('Successful', 'Collected Signatures', '#F5A623', waiting_hover),
    ('Commission Response', 'Commission Response', '#3CA371', commission_hover),
)

for _column, _trace_name, _colour, _hover_rows in _policy_trace_specs:
    fig.add_trace(go.Bar(
        x=policy_success['Policy Area'],
        y=policy_success[_column],
        name=_trace_name,
        marker_color=_colour,
        customdata=_hover_rows,
        hovertemplate=_policy_hovertemplate,
    ))

fig.update_layout(
    title='ECI Performance by Policy Area',
    xaxis_title='Policy Area',
    yaxis_title='Number of Initiatives',
    barmode='group',
    height=600,
    xaxis=dict(categoryorder='total descending'),
)

fig.show()

# 📊 VISUALIZATION: Success Rate bar chart by Policy Area
# Ascending order puts the highest success rate at the top of the horizontal chart.
policy_sorted = policy_success.sort_values('Success Rate (%)', ascending=True)

# Prepare ECI lists for each policy area (first 15 titles, then a summary line)
policy_eci_lists = []
for policy in policy_sorted['Policy Area']:
    ecis = df[df['primary_policy_area'] == policy]['title'].tolist()
    eci_text = '<br>'.join(f"• {title}" for title in ecis[:15])
    if len(ecis) > 15:
        eci_text += f"<br><i>... (and {len(ecis) - 15} more)</i>"
    policy_eci_lists.append(eci_text)

# NOTE: this rebinds the name fig2, which the duration histograms also use.
fig2 = px.bar(policy_sorted,
              x='Success Rate (%)', y='Policy Area',
              orientation='h',
              title='Success Rate (%) by Policy Area',
              color='Success Rate (%)',
              color_continuous_scale='RdYlGn',
              text='Success Rate (%)',
              custom_data=[policy_eci_lists])

fig2.update_traces(
    texttemplate='%{text:.1f}%',
    textposition='outside',
    hovertemplate='<b>%{y}</b><br>' +
                  '<b>Success Rate:</b> %{x:.2f}%<br><br>' +
                  '<b>ECIs:</b><br>%{customdata[0]}' +
                  '<extra></extra>'
)
fig2.update_layout(height=500)
fig2.show()

## QUESTION 8: Detailed Analysis of Successful ECIs

In [None]:
_banner = "=" * 80
print("\n" + _banner)
print("QUESTION 8: DETAILED ANALYSIS OF SUCCESSFUL ECIs")
print(_banner)

# Work on an independent copy of the successful initiatives only
successful = df[df['successful_eci'] == True].copy()

print(f"\nTotal successful ECIs: {len(successful)}")

# Counts per registration year
print("\n\nSuccessful ECIs by year:")
successful_by_year = successful.groupby('registration_year').size().reset_index()
successful_by_year.columns = ['Year', 'Count']
print(successful_by_year.to_string(index=False))

# Counts per policy area, most frequent first
print("\n\nSuccessful ECIs by policy area:")
successful_by_policy = successful['primary_policy_area'].value_counts().reset_index()
successful_by_policy.columns = ['Policy Area', 'Count']
print(successful_by_policy.to_string(index=False))

# Summary statistics for the main numeric measures
for _heading, _measure in (
    ("\n\nSignature statistics for successful ECIs:", 'signatures_numeric'),
    ("\n\nCountry threshold statistics for successful ECIs:", 'signatures_threshold_met_numeric'),
    ("\n\nCollection duration for successful ECIs:", 'collection_duration_days'),
):
    print(_heading)
    print(successful[_measure].describe())

# Full listing, largest signature count first. The mapping's insertion
# order fixes both the column selection and the display labels.
print("\n\nDetailed list of successful ECIs:")
_successful_detail_labels = {
    'registration_number': 'Reg #',
    'title': 'Title',
    'registration_year': 'Year',
    'signatures_numeric': 'Signatures',
    'signatures_threshold_met_numeric': 'Countries',
    'primary_policy_area': 'Policy Area',
    'commission_responded': 'Commission Responded',
}
successful_detail = successful[list(_successful_detail_labels)].copy()
successful_detail = successful_detail.sort_values('signatures_numeric', ascending=False)
successful_detail.columns = list(_successful_detail_labels.values())
print(successful_detail.to_string(index=False))

In [None]:
# 📊 VISUALIZATION: Scatter Plot of Successful ECIs
successful_plot = successful.copy()
successful_plot['title_short'] = successful_plot['title'].str.slice(0, 50) + '...'

# Exaggerate marker-size differences by raising signatures to the power 4.3
successful_plot['size_scaled'] = successful_plot['signatures_numeric'].pow(4.3)

# Legend order: policy areas from most to fewest successful ECIs
policy_order = list(successful_plot['primary_policy_area'].value_counts().index)

# Fixed colour per policy area
color_map = {
    'Agriculture & Fisheries & Animal rights': '#8bc34a',  # Green (nature/plants)
    "Democracy & Citizens' rights": '#5c6bc0',  # Indigo (official/formal)
    'Social Policy': '#ec407a',  # Pink (community/people)
    'Health': '#ef5350',  # Red (medical/health)
    'Environment & Climate': '#66bb6a',  # Forest green (nature)
    'Consumer protection': '#ffa726',  # Orange (warning/safety)
    'Transport': '#42a5f5',  # Blue (sky/movement)
    'Economy & Finance': '#fdd835',  # Yellow (gold/money)
    'Digital & Communications': '#7e57c2',  # Purple (technology)
    'Education & Culture': '#26c6da',  # Cyan (knowledge/creativity)
    'Joke ECI': '#bdbdbd',  # Grey (neutral)
    'Other': '#9e9e9e',  # Grey (neutral)
}

# Human-readable axis and legend titles
_scatter_labels = {
    'signatures_threshold_met_numeric': 'Number of Countries Meeting Threshold',
    'signatures_numeric': 'Total Signatures Collected',
    'primary_policy_area': 'Policy Area',
}

fig = px.scatter(
    successful_plot,
    x='signatures_threshold_met_numeric',
    y='signatures_numeric',
    size='size_scaled',
    color='primary_policy_area',
    # Columns exposed to the hover template as customdata[0..2]
    custom_data=['title', 'registration_year', 'primary_policy_area'],
    title='Successful ECIs: Signatures vs Countries Meeting Threshold',
    labels=_scatter_labels,
    category_orders={'primary_policy_area': policy_order},
    color_discrete_map=color_map,
    size_max=50,
)

# Tooltip: title, policy area, year, then the two plotted measures
_scatter_hovertemplate = ''.join([
    '<b>%{customdata[0]}:</b><br>',
    '<b>%{customdata[2]}</b><br>',
    '<b>Year:</b> %{customdata[1]}<br>',
    '<b>Signatures Collected:</b> %{y:,.0f}<br>',
    '<b>Countries Meeting Threshold:</b> %{x}<br>',
    '<extra></extra>',
])
fig.update_traces(hovertemplate=_scatter_hovertemplate)

fig.update_layout(height=600)
fig.show()


## QUESTION 9: Commission Response Analysis

In [None]:
_banner = "=" * 80
print("\n" + _banner)
print("QUESTION 9: COMMISSION RESPONSE ANALYSIS")
print(_banner)

# Initiatives that have received a Commission response
responded = df[df['commission_responded'] == True].copy()

_n_responded = len(responded)
_n_successful = df['successful_eci'].sum()
print(f"\nTotal initiatives with Commission response: {_n_responded}")
print(f"Response rate (of successful ECIs): {(_n_responded / _n_successful * 100):.2f}%")

# Waiting time, counted from registration
_days_to_response = responded['time_to_commission_response_days']
print("\n\nTime to receive Commission response (from registration):")
print(_days_to_response.describe())

# Years are approximated as 365 days
print(f"\nIn years: Mean = {_days_to_response.mean()/365:.2f}, Median = {_days_to_response.median()/365:.2f}")

print("\n\nResponded initiatives by registration year:")
responded_by_year = responded.groupby('registration_year').size().reset_index()
responded_by_year.columns = ['Year', 'Count']
print(responded_by_year.to_string(index=False))

print("\n\nResponded initiatives by policy area:")
responded_by_policy = responded['primary_policy_area'].value_counts().reset_index()
responded_by_policy.columns = ['Policy Area', 'Count']
print(responded_by_policy.to_string(index=False))

_responded_signatures = responded['signatures_numeric']
print("\n\nAverage signatures for responded initiatives:")
print(f"Mean: {_responded_signatures.mean():.0f}")
print(f"Median: {_responded_signatures.median():.0f}")

# Listing ordered from the fastest to the slowest response. The mapping's
# insertion order fixes both the column selection and the display labels.
print("\n\nDetailed list of initiatives with Commission response:")
_responded_detail_labels = {
    'registration_number': 'Reg #',
    'title': 'Title',
    'registration_year': 'Year',
    'signatures_numeric': 'Signatures',
    'Years to Response': 'Years to Response',
    'primary_policy_area': 'Policy Area',
}
responded_detail = responded.sort_values('time_to_commission_response_days').copy()
responded_detail['Years to Response'] = (responded_detail['time_to_commission_response_days'] / 365).round(2)
responded_detail = responded_detail[list(_responded_detail_labels)]
responded_detail.columns = list(_responded_detail_labels.values())
print(responded_detail.to_string(index=False))

In [None]:
# 📊 VISUALIZATION: Timeline to Commission Response
responded_plot = responded.copy()
_elapsed_days = responded_plot['time_to_commission_response_days']

responded_plot['title_short'] = responded_plot['title'].str.slice(0, 40) + '...'
responded_plot['years_to_response'] = _elapsed_days / 365

# Split the waiting time into whole years, months and days, using
# 365-day years and 30-day months as approximations.
_days_past_full_years = _elapsed_days % 365
responded_plot['years_full'] = (_elapsed_days // 365).astype(int)
responded_plot['months_full'] = (_days_past_full_years // 30).astype(int)
responded_plot['days_full'] = (_days_past_full_years % 30).astype(int)

# Render the year/month/day components as text with correct singular/plural
def format_time(row):
    """Return e.g. '2 years 3 months 1 day' from a row's time components.

    Reads ``row['years_full']``, ``row['months_full']`` and
    ``row['days_full']``. Zero-valued years and months are omitted; days are
    omitted when zero unless nothing else would be shown.
    """
    def _count_with_unit(count, unit):
        return f"{count} {unit}{'' if count == 1 else 's'}"

    pieces = [
        _count_with_unit(row[key], unit)
        for key, unit in (('years_full', 'year'), ('months_full', 'month'))
        if row[key] > 0
    ]

    days = row['days_full']
    if days > 0 or not pieces:
        pieces.append(_count_with_unit(days, 'day'))

    return ' '.join(pieces)

# Wrap the objective text for tooltips: one <br> after every 11 words
def format_objective(objective):
    """Return *objective* re-flowed into lines of at most 11 words.

    Lines are joined with ``<br>``. Missing values (as judged by
    ``pd.isna``) yield the placeholder "No objective provided".
    """
    if pd.isna(objective):
        return "No objective provided"

    tokens = str(objective).split()
    words_per_line = 11
    return '<br>'.join(
        ' '.join(tokens[start:start + words_per_line])
        for start in range(0, len(tokens), words_per_line)
    )

# Tooltip-ready text columns
responded_plot['time_formatted'] = responded_plot.apply(format_time, axis=1)
responded_plot['objective_formatted'] = responded_plot['objective'].apply(format_objective)

# One horizontal bar per initiative, ordered by waiting time
_responded_sorted = responded_plot.sort_values('time_to_commission_response_days')
_response_bar_labels = {'years_to_response': 'Years to Response', 'title_short': 'Initiative'}

fig = px.bar(
    _responded_sorted,
    y='title_short',
    x='years_to_response',
    orientation='h',
    title='Time to Commission Response (Years) for Each Initiative',
    labels=_response_bar_labels,
    color='years_to_response',
    color_continuous_scale='Viridis_r',
    # Exposed to the hover template as customdata[0..2]
    custom_data=['title', 'time_formatted', 'objective_formatted'],
)

# Tooltip: full title, formatted waiting time, wrapped objective
_response_hovertemplate = ''.join([
    '<b>%{customdata[0]}</b><br>',
    '<b>Time to Response:</b> %{customdata[1]}<br>',
    '<b>Objective:</b><br>',
    '%{customdata[2]}<br>',
    '<extra></extra>',
])
fig.update_traces(hovertemplate=_response_hovertemplate)

fig.update_layout(height=600, showlegend=False)
fig.show()


## QUESTION 10: Funding Analysis

In [None]:
_banner = "=" * 80
print("\n" + _banner)
print("QUESTION 10: FUNDING ANALYSIS")
print(_banner)

print(f"\nInitiatives with funding data: {df['funding_numeric'].notna().sum()}")

print("\n\nFunding statistics:")
print(df['funding_numeric'].describe())

# Funding summary per final outcome
print("\n\nFunding by outcome:")
_funding_aggregations = {'funding_numeric': ['count', 'mean', 'median', 'min', 'max']}
funding_by_outcome = df.groupby('final_outcome').agg(_funding_aggregations).round(2)
print(funding_by_outcome)

# Split on the successful_eci flag; "unsuccessful" is every row where the
# flag is False.
print("\n\nFunding comparison: Successful vs Unsuccessful:")
successful_data = df[df['successful_eci'] == True]
unsuccessful_data = df[df['successful_eci'] == False]
successful_funding = successful_data['funding_numeric']
unsuccessful_funding = unsuccessful_data['funding_numeric']

_funding_groups = {
    'Successful ECIs': successful_funding,
    'Unsuccessful ECIs': unsuccessful_funding,
}
funding_comparison = pd.DataFrame({
    'Category': list(_funding_groups),
    'Count': [funding.notna().sum() for funding in _funding_groups.values()],
    'Mean Funding': [funding.mean() for funding in _funding_groups.values()],
    'Median Funding': [funding.median() for funding in _funding_groups.values()],
    'Max Funding': [funding.max() for funding in _funding_groups.values()],
})
print(funding_comparison.to_string(index=False))

# Pairwise correlation, restricted to rows where both values are known
print("\n\nCorrelation: Funding vs Signatures")
_has_funding = df['funding_numeric'].notna()
_has_signatures = df['signatures_numeric'].notna()
initiatives_with_both = df[_has_funding & _has_signatures]
if not initiatives_with_both.empty:
    correlation = initiatives_with_both[['funding_numeric', 'signatures_numeric']].corr()
    print(correlation)
    print(f"\nCorrelation coefficient: {correlation.loc['funding_numeric', 'signatures_numeric']:.4f}")

In [None]:
# 📊 VISUALIZATION: Scatter Plot - Funding vs Signatures
# Keep only initiatives where both funding and signature counts are known.
plot_data = df.dropna(subset=['funding_numeric', 'signatures_numeric']).copy()

# Format a euro amount compactly for tooltips
def format_currency(value):
    """Return *value* as a short euro string.

    Amounts of one million or more are shown in millions with two decimals
    ("€2.50M"), amounts of one thousand or more in thousands with one
    decimal ("€1.5K"), and smaller amounts in full with two decimals
    ("€500.00") rather than the raw float representation.
    """
    if value >= 1_000_000:
        return f"€{value/1_000_000:.2f}M"
    if value >= 1_000:
        return f"€{value/1_000:.1f}K"
    return f"€{value:.2f}"

# Display-only helper columns that feed the hover tooltip.
plot_data['funding_formatted'] = plot_data['funding_numeric'].map(format_currency)
yes_no = {True: 'Yes', False: 'No'}
plot_data['successful_text'] = plot_data['successful_eci'].map(yes_no)

# Figure configuration, named up front so the px.scatter call stays short.
hover_fields = ['title', 'registration_year', 'funding_formatted', 'successful_text']
axis_labels = {
    'funding_numeric': 'Funding (EUR)',
    'signatures_numeric': 'Signatures Collected',
    'successful_eci': 'Successful',
}
outcome_colors = {True: '#2ecc71', False: '#e74c3c'}

fig = px.scatter(
    plot_data,
    x='signatures_numeric',
    y='funding_numeric',
    color='successful_eci',
    size='funding_numeric',
    size_max=50,
    custom_data=hover_fields,
    labels=axis_labels,
    color_discrete_map=outcome_colors,
    trendline='ols',
    title='Funding vs Signatures Collected (Correlation Analysis)',
)

# Tooltip layout; customdata indices follow the order of hover_fields.
hover_lines = [
    '<b>%{customdata[0]}</b><br>',
    '<b>Year:</b> %{customdata[1]}<br>',
    '<b>EUR:</b> %{customdata[2]}<br>',
    '<b>Signatures:</b> %{x:,.0f}<br>',
    '<b>Successful:</b> %{customdata[3]}<br>',
    '<extra></extra>',
]

# The selector restricts the template to marker traces, leaving the
# trendline traces with their default hover.
fig.update_traces(
    hovertemplate=''.join(hover_lines),
    selector={'mode': 'markers'},
)

fig.update_layout(height=600)
fig.show()


## QUESTION 11: Correlation Analysis - Key Success Factors

In [None]:
print("\n" + "="*80)
print("QUESTION 11: CORRELATION ANALYSIS - KEY SUCCESS FACTORS")
print("="*80)

# Work on a copy limited to the columns under study, so the helper columns
# added below are never written into df.
analysis_df = df[[
    'successful_eci',
    'commission_responded',
    'signatures_numeric',
    'signatures_threshold_met_numeric',
    'funding_numeric',
    'collection_duration_days',
    'registration_to_collection_days',
    'registration_year'
]].copy()

# Convert boolean to numeric so the flags can take part in corr()
analysis_df['successful_numeric'] = analysis_df['successful_eci'].astype(int)
analysis_df['responded_numeric'] = analysis_df['commission_responded'].astype(int)

print("\nCorrelation matrix of key metrics with success:")
corr_columns = [
    'successful_numeric',
    'signatures_numeric',
    'signatures_threshold_met_numeric',
    'funding_numeric',
    'collection_duration_days',
    'registration_to_collection_days',
    'registration_year'
]

# correlation_matrix is indexed by the raw column names listed above.
correlation_matrix = analysis_df[corr_columns].corr()
print(correlation_matrix['successful_numeric'].sort_values(ascending=False).to_string())

# Same ranking against the response flag. corr_columns[:-1] leaves
# registration_year (the last entry) out of this second ranking.
print("\n\nCorrelation with Commission response:")
response_columns = corr_columns[:-1] + ['responded_numeric']
response_corr = analysis_df[response_columns].corr()['responded_numeric']
print(response_corr.sort_values(ascending=False).to_string())

# Comparison for key metrics
print("\n\nDetailed comparison:")
successful_data = df[df['successful_eci'] == True]
unsuccessful_data = df[df['successful_eci'] == False]

# Display label -> source column. A single mapping keeps the label list and
# the two value lists aligned instead of three hand-maintained lists.
comparison_metrics = {
    'Avg Signatures': 'signatures_numeric',
    'Avg Funding (€)': 'funding_numeric',
    'Avg Collection Duration (days)': 'collection_duration_days',
    'Avg Countries Met Threshold': 'signatures_threshold_met_numeric',
    'Avg Time to Collection Start (days)': 'registration_to_collection_days',
}

comparison_detail = pd.DataFrame({
    'Metric': list(comparison_metrics),
    'Successful ECIs': [
        successful_data[column].mean() for column in comparison_metrics.values()
    ],
    'Unsuccessful ECIs': [
        unsuccessful_data[column].mean() for column in comparison_metrics.values()
    ],
})

comparison_detail['Difference'] = comparison_detail['Successful ECIs'] - comparison_detail['Unsuccessful ECIs']
comparison_detail['Ratio'] = (comparison_detail['Successful ECIs'] / comparison_detail['Unsuccessful ECIs']).round(2)
print(comparison_detail.to_string(index=False))

In [None]:
# 📊 VISUALIZATION: Correlation Heatmap
# Human-readable axis labels for the raw column names in correlation_matrix.
corr_labels = {
    'successful_numeric': 'Success',
    'signatures_numeric': 'Signatures',
    'signatures_threshold_met_numeric': 'Countries Thresholds Met',
    'funding_numeric': 'Funding',
    'collection_duration_days': 'Collection Days',
    'registration_to_collection_days': 'Reg → Collection Days',
    'registration_year': 'Year'
}

# Rename for display (returns a new frame; correlation_matrix keeps its
# original labels)
corr_display = correlation_matrix.rename(columns=corr_labels, index=corr_labels)

# Keep only lower triangle (remove diagonal and upper triangle): np.triu marks
# the diagonal and everything above it, and mask() blanks those cells to NaN.
mask = np.triu(np.ones_like(corr_display, dtype=bool))
corr_display_masked = corr_display.mask(mask)

# Diverging colour scale centred on 0 and pinned to the full [-1, 1] range.
fig = px.imshow(corr_display_masked,
                text_auto='.2f',
                aspect='auto',
                color_continuous_scale='RdBu_r',
                color_continuous_midpoint=0,
                title='Correlation Matrix: Key Success Factors',
                zmin=-1, zmax=1)

fig.update_layout(height=600)
fig.show()


## QUESTION 12: Executive Summary for ECI Organizers

In [None]:
# Executive summary. Besides df, this cell reads objects it does not define
# itself: successful_data, unsuccessful_data, correlation_matrix, responded,
# policy_success, yearly_stats, country_participation and
# successful_org_participation must already exist from earlier cells.
print("\n" + "="*80)
print("QUESTION 12: EXECUTIVE SUMMARY FOR ECI ORGANIZERS")
print("="*80)

print("\n### OVERALL ECI LANDSCAPE ###")
first_year = df['registration_year'].min()
last_year = df['registration_year'].max()
print(f"Total ECIs registered (all time): {len(df)}")
print(f"Time period: {first_year:.0f} - {last_year:.0f}")
# +1 because both the first and the last year count.
print(f"Average initiatives per year: {len(df) / (last_year - first_year + 1):.1f}")

print("\n\n### SUCCESS RATES ###")
print(f"Initiatives reaching 1M signatures: {df['reached_signatures'].sum()} ({(df['reached_signatures'].sum()/len(df)*100):.1f}%)")
print(f"Initiatives meeting country threshold: {df['met_country_threshold'].sum()} ({(df['met_country_threshold'].sum()/len(df)*100):.1f}%)")
print(f"Successful ECIs (both criteria): {df['successful_eci'].sum()} ({(df['successful_eci'].sum()/len(df)*100):.1f}%)")
print(f"Commission responses received: {df['commission_responded'].sum()} ({(df['commission_responded'].sum()/len(df)*100):.1f}%)")
print(f"Response rate for successful ECIs: {(df['commission_responded'].sum()/df['successful_eci'].sum()*100):.1f}%")

print("\n\n### KEY BARRIERS ###")
unsuccessful = df[df['final_outcome'] == 'Unsuccessful Collection']
withdrawn = df[df['final_outcome'] == 'Withdrawn']
print(f"Unsuccessful collections: {len(unsuccessful)} ({(len(unsuccessful)/len(df)*100):.1f}%)")
print(f"Withdrawn initiatives: {len(withdrawn)} ({(len(withdrawn)/len(df)*100):.1f}%)")
print(f"Attrition rate (did not complete): {((len(unsuccessful) + len(withdrawn))/len(df)*100):.1f}%")

print("\n\n### SIGNATURES REQUIRED ###")
successful_sigs = df[df['successful_eci']]['signatures_numeric']
print(f"Minimum signatures among successful: {successful_sigs.min():,.0f}")
print(f"Average signatures for successful: {successful_sigs.mean():,.0f}")
print(f"Median signatures for successful: {successful_sigs.median():,.0f}")
print(f"Maximum signatures achieved: {successful_sigs.max():,.0f}")

print("\n\n### COUNTRY THRESHOLD PATTERNS ###")
successful_countries = df[df['successful_eci']]['signatures_threshold_met_numeric']
print("Minimum countries needed: 7")
print(f"Average countries met in successful: {successful_countries.mean():.1f}")
print(f"Maximum countries met: {int(successful_countries.max())}")

print("\n\n### TIME EXPECTATIONS ###")
print(f"Average collection period: {df['collection_duration_days'].mean():.0f} days ({(df['collection_duration_days'].mean()/365):.1f} years)")
print(f"Median collection period: {df['collection_duration_days'].median():.0f} days ({(df['collection_duration_days'].median()/365):.1f} years)")
successful_collection = df[df['successful_eci']]['collection_duration_days']
print(f"Average for successful: {successful_collection.mean():.0f} days ({(successful_collection.mean()/365):.1f} years)")
print(f"Time to Commission response: {responded['time_to_commission_response_days'].mean():.0f} days ({(responded['time_to_commission_response_days'].mean()/365):.2f} years)")

print("\n\n### FUNDING INSIGHTS ###")
avg_funding_successful = successful_data['funding_numeric'].mean()
avg_funding_unsuccessful = unsuccessful_data['funding_numeric'].mean()
print(f"Successful ECIs avg funding: €{avg_funding_successful:,.0f}")
print(f"Unsuccessful ECIs avg funding: €{avg_funding_unsuccessful:,.0f}")
print(f"Funding advantage ratio: {(avg_funding_successful / avg_funding_unsuccessful):.1f}x")
# Look the coefficient up by label: position [0, 1] of correlation_matrix is
# success vs signatures, not funding vs signatures.
funding_signature_corr = correlation_matrix.loc['funding_numeric', 'signatures_numeric']
print(f"Correlation (funding vs signatures): {funding_signature_corr:.3f}")

print("\n\n### TOPIC AREAS WITH HIGHEST SUCCESS ###")
# Areas with fewer than 5 initiatives are excluded from the ranking.
top_topics = policy_success[policy_success['Total'] >= 5].sort_values('Success Rate (%)', ascending=False).head(5)
print(top_topics[['Policy Area', 'Total', 'Successful', 'Success Rate (%)']].to_string(index=False))

print("\n\n### TEMPORAL TRENDS ###")
print("Best performing years:")
best_years = yearly_stats.nlargest(3, 'Success Rate (%)')
print(best_years[['Year', 'Total Registered', 'Successful', 'Success Rate (%)']].to_string(index=False))

print("\n\nWorst performing years (with >= 5 registrations):")
worst_years = yearly_stats[yearly_stats['Total Registered'] >= 5].nsmallest(3, 'Success Rate (%)')
print(worst_years[['Year', 'Total Registered', 'Successful', 'Success Rate (%)']].to_string(index=False))

print("\n\n### GEOGRAPHIC INSIGHTS ###")
print("Top 5 countries by threshold achievement in successful ECIs:")
print(country_participation.head(5).to_string(index=False))

print("\n\nTop 5 countries organizing successful initiatives:")
print(successful_org_participation.head(5).to_string(index=False))

## Export Analysis Results to CSV

In [None]:
# Export the analysis artefacts as CSV files. Besides df, this cell reads
# objects it does not define itself: successful_data, responded, withdrawn,
# unsuccessful, policy_success, yearly_stats and country_participation must
# already exist from earlier cells.

# Create output directory if it doesn't exist
output_dir = 'eda_data_output'
os.makedirs(output_dir, exist_ok=True)

# Export enhanced dataset with calculated fields.
# Source column -> exported header. One mapping keeps the selection and the
# renaming in step, where two parallel lists could silently drift apart.
export_columns = {
    'registration_number': 'Registration_Number',
    'title': 'Title',
    'registration_year': 'Year',
    'current_status': 'Current_Status',
    'final_outcome': 'Final_Outcome',
    'primary_policy_area': 'Policy_Area',
    'signatures_numeric': 'Signatures',
    'signatures_threshold_met_numeric': 'Countries_Met_Threshold',
    'funding_numeric': 'Funding_EUR',
    'collection_duration_days': 'Collection_Duration_Days',
    'registration_to_collection_days': 'Registration_to_Collection_Days',
    'time_to_commission_response_days': 'Time_to_Commission_Response_Days',
    'reached_signatures': 'Reached_1M_Signatures',
    'met_country_threshold': 'Met_Country_Threshold_7plus',
    'successful_eci': 'Successful_ECI',
    'commission_responded': 'Commission_Responded',
}
export_df = df[list(export_columns)].rename(columns=export_columns)

export_df.to_csv(f'{output_dir}/eci_analysis_enhanced.csv', index=False)
print("✓ Exported: eci_analysis_enhanced.csv")

# Export summary statistics. Counts and day/signature figures are stored as
# integer strings; rates and funding amounts stay numeric, rounded to 2 places.
summary_stats = pd.DataFrame({
    'Metric': [
        'Total ECIs Registered',
        'Successful ECIs',
        'Commission Responses',
        'Success Rate (%)',
        'Response Rate of Successful (%)',
        'Avg Signatures (Successful)',
        'Median Signatures (Successful)',
        'Max Signatures (Successful)',
        'Avg Funding EUR (Successful)',
        'Median Funding EUR (Successful)',
        'Max Funding EUR (Successful)',
        'Min Funding EUR (Successful)',
        'Avg Collection Duration Days',
        'Median Collection Duration Days',
        'Max Collection Duration Days',
        'Min Collection Duration Days',
        'Avg Time to Response Days',
        'Median Time to Response Days',
        'Max Time to Response Days',
        'Min Time to Response Days',
        'Withdrawn Rate (%)',
        'Unsuccessful Rate (%)'
    ],
    'Value': [
        str(len(df)),
        str(int(df['successful_eci'].sum())),
        str(int(df['commission_responded'].sum())),
        round(df['successful_eci'].sum() / len(df) * 100, 2),
        round(df['commission_responded'].sum() / df['successful_eci'].sum() * 100, 2),
        str(int(round(successful_data['signatures_numeric'].mean()))),
        str(int(successful_data['signatures_numeric'].median())),
        str(int(successful_data['signatures_numeric'].max())),
        round(successful_data['funding_numeric'].mean(), 2),
        round(successful_data['funding_numeric'].median(), 2),
        round(successful_data['funding_numeric'].max(), 2),
        round(successful_data['funding_numeric'].min(), 2),
        str(int(round(df['collection_duration_days'].mean()))),
        str(int(df['collection_duration_days'].median())),
        str(int(df['collection_duration_days'].max())),
        str(int(df['collection_duration_days'].min())),
        str(int(round(responded['time_to_commission_response_days'].mean()))),
        str(int(responded['time_to_commission_response_days'].median())),
        str(int(responded['time_to_commission_response_days'].max())),
        str(int(responded['time_to_commission_response_days'].min())),
        round(len(withdrawn) / len(df) * 100, 2),
        round(len(unsuccessful) / len(df) * 100, 2)
    ]
})

# Format Value column properly (no .0 for integers). float.is_integer() is
# False for NaN and infinity, so such values pass through unchanged instead
# of making int() raise.
summary_stats['Value'] = summary_stats['Value'].apply(
    lambda x: int(x) if isinstance(x, (int, float)) and float(x).is_integer() else x
)

summary_stats.to_csv(f'{output_dir}/eci_summary_statistics.csv', index=False)
print("✓ Exported: eci_summary_statistics.csv")

# Export policy area analysis
policy_success.to_csv(f'{output_dir}/eci_policy_area_analysis.csv', index=False)
print("✓ Exported: eci_policy_area_analysis.csv")

# Export yearly trends
yearly_stats.to_csv(f'{output_dir}/eci_yearly_trends.csv', index=False)
print("✓ Exported: eci_yearly_trends.csv")

# Export country participation
country_participation.to_csv(f'{output_dir}/eci_country_threshold_achievement.csv', index=False)
print("✓ Exported: eci_country_threshold_achievement.csv")

print("\n" + "="*80)
print("ANALYSIS COMPLETE - All outputs exported")
print("="*80)