# European Citizens' Initiative (ECI) Data Analysis
## Exploratory Data Analysis for ECI Initiative Organizers

This notebook analyzes 121 ECI initiatives registered between 2012 and 2025,
examining success patterns, barriers, and key performance indicators.

## Setup: Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import json
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Directory (relative to the notebook) that holds the data snapshot.
data_folder = "../data/2025-09-18_16-33-57"

# Load the dataset
# NOTE(review): the folder timestamp (2025-09-18) differs from the timestamp in
# the file name (2025-11-04) — confirm this is the intended snapshot.
df = pd.read_csv(f'{data_folder}/eci_initiatives_2025-11-04_11-59-38.csv')
# Quick sanity check on the size of the loaded table.
print(f"Dataset loaded: {len(df)} initiatives")
print(f"Columns: {len(df.columns)}")

## Data Cleaning and Feature Engineering

In [None]:
# Parse date strings from DD/MM/YYYY to datetime
def parse_date(date_str):
    """Convert a ``DD/MM/YYYY`` string to a pandas ``Timestamp``.

    Args:
        date_str: Raw cell value; may be a string, NaN/None, or ''.

    Returns:
        The parsed ``Timestamp``, or ``pd.NaT`` when the value is missing,
        empty, or does not match the ``DD/MM/YYYY`` format.
    """
    if pd.isna(date_str) or date_str == '':
        return pd.NaT
    try:
        return pd.to_datetime(date_str, format='%d/%m/%Y')
    except (ValueError, TypeError, OverflowError):
        # Only parsing failures are mapped to NaT; unrelated exceptions
        # (e.g. KeyboardInterrupt) are no longer swallowed.
        return pd.NaT

# Parse main timeline dates
# Each raw timeline_* column is run through parse_date, so values that are
# missing or fail to parse become NaT in the derived *_date columns.
df['registered_date'] = df['timeline_registered'].apply(parse_date)
df['collection_start_date'] = df['timeline_collection_start_date'].apply(parse_date)
df['collection_closed_date'] = df['timeline_collection_closed'].apply(parse_date)
df['verification_start_date'] = df['timeline_verification_start'].apply(parse_date)
df['verification_end_date'] = df['timeline_verification_end'].apply(parse_date)
df['commission_response_date'] = df['timeline_response_commission_date'].apply(parse_date)

# Extract year from registration
df['registration_year'] = df['registered_date'].dt.year

# Calculate durations
# Each duration is the difference of two datetime columns expressed in whole
# days; it is NaN whenever either endpoint is NaT.
df['collection_duration_days'] = (df['collection_closed_date'] - df['collection_start_date']).dt.days
df['verification_duration_days'] = (df['verification_end_date'] - df['verification_start_date']).dt.days
df['time_to_commission_response_days'] = (df['commission_response_date'] - df['registered_date']).dt.days
df['registration_to_collection_days'] = (df['collection_start_date'] - df['registered_date']).dt.days

# Parse signatures (handle commas and convert to numeric)
def parse_signatures(sig):
    """Turn a raw signature count into a float.

    Strings have their thousands-separator commas removed before
    conversion; missing values (NaN/None) come back as ``np.nan``; any
    other value is passed straight to ``float``.
    """
    if isinstance(sig, str):
        without_commas = sig.replace(',', '')
        return float(without_commas)
    return np.nan if pd.isna(sig) else float(sig)

# Total signatures as floats (NaN where the raw value is missing).
df['signatures_numeric'] = df['signatures_collected'].apply(parse_signatures)
# Used below as the number of countries that met their threshold;
# errors='coerce' turns non-numeric values into NaN instead of raising.
df['signatures_threshold_met_numeric'] = pd.to_numeric(df['signatures_threshold_met'], errors='coerce')

# Parse funding (handle commas and convert to numeric)
def parse_funding(funding):
    """Turn a raw funding amount into a float.

    Missing values (NaN/None) yield ``np.nan``. String amounts are stripped
    of comma separators first; everything else goes directly to ``float``.
    """
    if pd.isna(funding):
        return np.nan
    amount = funding.replace(',', '') if isinstance(funding, str) else funding
    return float(amount)

# Total funding as floats (NaN where the raw value is missing).
df['funding_numeric'] = df['funding_total'].apply(parse_funding)

# Define success categories
# Comparisons against NaN evaluate to False, so initiatives without signature
# or threshold data are counted as not having met the respective criterion.
df['reached_signatures'] = df['signatures_numeric'] >= 1000000
df['met_country_threshold'] = df['signatures_threshold_met_numeric'] >= 7
# "Successful" here means both criteria are satisfied.
df['successful_eci'] = df['reached_signatures'] & df['met_country_threshold']
df['commission_responded'] = df['final_outcome'] == 'Commission Response'

# Summary of the derived flags (sum of a boolean column = number of True rows).
print("\n=== Data Cleaning Complete ===")
print(f"Total initiatives: {len(df)}")
print(f"Reached 1M signatures: {df['reached_signatures'].sum()}")
print(f"Met country threshold (7+): {df['met_country_threshold'].sum()}")
print(f"Successful ECIs (both criteria): {df['successful_eci'].sum()}")
print(f"Commission responded: {df['commission_responded'].sum()}")

## QUESTION 1: Overall Success Funnel Analysis

In [None]:
# Calculate conversion rates at each stage of the ECI process
total_registered = len(df)
# A stage counts as reached when its date column is non-null.
started_collection = df['collection_start_date'].notna().sum()
completed_collection = df['collection_closed_date'].notna().sum()
reached_1m = df['reached_signatures'].sum()
met_threshold = df['met_country_threshold'].sum()
both_criteria = df['successful_eci'].sum()
commission_response = df['commission_responded'].sum()

# Create funnel dataframe
funnel_data = pd.DataFrame({
    'Stage': [
        '1. Registered',
        '2. Collection Started',
        '3. Collection Completed',
        '4. Reached 1M Signatures',
        '5. Met Country Threshold (7+)',
        '6. Successful (Both Criteria)',
        '7. Commission Response'
    ],
    'Count': [
        total_registered,
        started_collection,
        completed_collection,
        reached_1m,
        met_threshold,
        both_criteria,
        commission_response
    ]
})

funnel_data['Percentage of Registered'] = (funnel_data['Count'] / total_registered * 100).round(2)
# shift(1) aligns each stage with the one before it; the first stage has no
# predecessor and therefore gets NaN.
# NOTE(review): the stage counts are computed independently of each other
# (e.g. stage 5 is not restricted to rows in stage 4), so a "conversion" above
# 100% is possible.
funnel_data['Conversion from Previous Stage'] = (funnel_data['Count'] / funnel_data['Count'].shift(1) * 100).round(2)

print("\n" + "="*80)
print("QUESTION 1: ECI SUCCESS FUNNEL ANALYSIS")
print("="*80)
print("\nSuccess Funnel:")
print(funnel_data.to_string(index=False))

print(f"\n\nKey Insights:")
print(f"- Overall success rate (registered → commission response): {(commission_response/total_registered*100):.2f}%")
print(f"- Success rate (registered → met both criteria): {(both_criteria/total_registered*100):.2f}%")
print(f"- Response rate (successful → commission response): {(commission_response/both_criteria*100):.2f}%")

## QUESTION 2: Status and Outcome Distribution

In [None]:
print("\n" + "="*80)
print("QUESTION 2: STATUS AND OUTCOME DISTRIBUTION")
print("="*80)

# Frequency table of current_status; percentages are relative to all rows.
print("\nCurrent Status Distribution:")
status_dist = df['current_status'].value_counts().reset_index()
status_dist.columns = ['Status', 'Count']
status_dist['Percentage'] = (status_dist['Count'] / len(df) * 100).round(2)
print(status_dist.to_string(index=False))

# dropna=False keeps initiatives without a final outcome as their own row.
print("\n\nFinal Outcome Distribution:")
outcome_dist = df['final_outcome'].value_counts(dropna=False).reset_index()
outcome_dist.columns = ['Outcome', 'Count']
outcome_dist['Percentage'] = (outcome_dist['Count'] / len(df) * 100).round(2)
print(outcome_dist.to_string(index=False))

# How many withdrawn initiatives have a collection start date / signature count.
print("\n\nWithdrawal Analysis:")
withdrawn = df[df['final_outcome'] == 'Withdrawn']
print(f"Total withdrawn: {len(withdrawn)}")
print(f"Percentage of all initiatives: {(len(withdrawn)/len(df)*100):.2f}%")
print(f"Had collection data: {withdrawn['collection_start_date'].notna().sum()}")
print(f"Had signatures data: {withdrawn['signatures_numeric'].notna().sum()}")

## QUESTION 3: Temporal Trends in Success Rates

In [None]:
print("\n" + "="*80)
print("QUESTION 3: TEMPORAL TRENDS IN ECI SUCCESS RATES")
print("="*80)

# Per registration year: number of initiatives (non-null registration numbers)
# and the number of True values for each success flag.
yearly_stats = df.groupby('registration_year').agg({
    'registration_number': 'count',
    'reached_signatures': 'sum',
    'met_country_threshold': 'sum',
    'successful_eci': 'sum',
    'commission_responded': 'sum'
}).reset_index()

yearly_stats.columns = ['Year', 'Total Registered', 'Reached 1M', 'Met Country Threshold', 'Successful', 'Commission Response']
# Both rates are relative to the number of initiatives registered that year.
yearly_stats['Success Rate (%)'] = (yearly_stats['Successful'] / yearly_stats['Total Registered'] * 100).round(2)
yearly_stats['Response Rate (%)'] = (yearly_stats['Commission Response'] / yearly_stats['Total Registered'] * 100).round(2)

print("\nYearly Success Rates:")
print(yearly_stats.to_string(index=False))

# idxmax/idxmin return the row label of the first extreme value; the average
# is the unweighted mean of the yearly rates.
print(f"\n\nTrend Analysis:")
print(f"Best year for success rate: {yearly_stats.loc[yearly_stats['Success Rate (%)'].idxmax(), 'Year']:.0f} ({yearly_stats['Success Rate (%)'].max():.2f}%)")
print(f"Worst year for success rate: {yearly_stats.loc[yearly_stats['Success Rate (%)'].idxmin(), 'Year']:.0f} ({yearly_stats['Success Rate (%)'].min():.2f}%)")
print(f"Average success rate: {yearly_stats['Success Rate (%)'].mean():.2f}%")

## QUESTION 4: Duration Analysis - Time Waiting at Each Step

In [None]:
print("\n" + "="*80)
print("QUESTION 4: DURATION ANALYSIS - TIME WAITING AT EACH STEP")
print("="*80)

# Calculate time between key milestones
# registration_to_collection_days repeats the formula already used in the
# data-cleaning cell; the other two columns are new.
df['registration_to_collection_days'] = (df['collection_start_date'] - df['registered_date']).dt.days
df['collection_to_verification_days'] = (df['verification_start_date'] - df['collection_closed_date']).dt.days
df['verification_to_response_days'] = (df['commission_response_date'] - df['verification_end_date']).dt.days

print("\nCollection Period Duration (days):")
collection_stats = df[df['collection_duration_days'].notna()]['collection_duration_days'].describe()
print(collection_stats)

print("\n\nTime from Registration to Collection Start (days):")
reg_to_coll = df[df['registration_to_collection_days'].notna()]['registration_to_collection_days'].describe()
print(reg_to_coll)

# Restricted to initiatives flagged successful_eci that have both dates.
print("\n\nTime from Collection End to Verification Start (days) [for successful initiatives]:")
successful_coll_to_verif = df[(df['successful_eci']) & (df['collection_to_verification_days'].notna())]['collection_to_verification_days'].describe()
print(successful_coll_to_verif)

# `responded` is a notebook-level name that later cells also read.
print("\n\nTime from Registration to Commission Response (days) [for responded initiatives]:")
responded = df[df['commission_responded']]
time_to_response = responded['time_to_commission_response_days'].describe()
print(time_to_response)

# Median and mean per milestone; the last two rows use the successful and
# responded subsets respectively, the first two use all initiatives.
print("\n\nBreakdown by milestone (median days):")
milestone_df = pd.DataFrame({
    'Milestone': [
        'Registration → Collection Start',
        'Collection Period Duration',
        'Collection End → Verification Start',
        'Registration → Commission Response'
    ],
    'Median Days': [
        df['registration_to_collection_days'].median(),
        df['collection_duration_days'].median(),
        df[df['successful_eci']]['collection_to_verification_days'].median(),
        responded['time_to_commission_response_days'].median()
    ],
    'Mean Days': [
        df['registration_to_collection_days'].mean(),
        df['collection_duration_days'].mean(),
        df[df['successful_eci']]['collection_to_verification_days'].mean(),
        responded['time_to_commission_response_days'].mean()
    ]
})
print(milestone_df.to_string(index=False))

## QUESTION 5: Signature Collection Performance Analysis

In [None]:
print("\n" + "="*80)
print("QUESTION 5: SIGNATURE COLLECTION PERFORMANCE ANALYSIS")
print("="*80)

# Work on a copy so that columns added later do not touch df.
initiatives_with_sigs = df[df['signatures_numeric'].notna()].copy()

print(f"\nTotal initiatives with signature data: {len(initiatives_with_sigs)}")
print(f"\nSignature collection statistics:")
sig_stats = initiatives_with_sigs['signatures_numeric'].describe()
print(sig_stats)

# Break down by success
print("\n\nSignature distribution by outcome:")
sig_by_outcome = initiatives_with_sigs.groupby('final_outcome').agg({
    'signatures_numeric': ['count', 'mean', 'median', 'min', 'max']
}).round(0)
print(sig_by_outcome)

# Country threshold analysis
# Groups are the two values of the boolean met_country_threshold flag.
print("\n\nCountry threshold analysis:")
threshold_stats = initiatives_with_sigs.groupby('met_country_threshold').agg({
    'signatures_numeric': ['count', 'mean', 'median'],
    'signatures_threshold_met_numeric': ['mean', 'max']
}).round(2)
print(threshold_stats)

# Correlation between collection duration and signatures
# Only rows with both values present are used.
print("\n\nCorrelation: Collection Duration vs Signatures Collected")
initiatives_with_both = initiatives_with_sigs[initiatives_with_sigs['collection_duration_days'].notna()]
if len(initiatives_with_both) > 0:
    correlation = initiatives_with_both[['collection_duration_days', 'signatures_numeric']].corr()
    print(correlation)
    
print("\n\nSuccess rate by signature brackets:")
# Bins are left-closed ([low, high)) and open-ended at the top so that:
#   * exactly 1,000,000 signatures falls in '1M+', matching the
#     `>= 1000000` rule behind reached_signatures;
#   * initiatives with 0 signatures land in '<100K' instead of being dropped;
#   * totals above 10,000,000 are still bucketed.
initiatives_with_sigs['sig_bracket'] = pd.cut(
    initiatives_with_sigs['signatures_numeric'],
    bins=[0, 100000, 250000, 500000, 750000, 1000000, np.inf],
    labels=['<100K', '100K-250K', '250K-500K', '500K-750K', '750K-1M', '1M+'],
    right=False
)
# observed=True lists only brackets that actually contain initiatives.
sig_bracket_analysis = initiatives_with_sigs.groupby('sig_bracket', observed=True).agg({
    'registration_number': 'count',
    'successful_eci': 'sum',
    'commission_responded': 'sum'
}).reset_index()
sig_bracket_analysis.columns = ['Signature Bracket', 'Count', 'Successful', 'Commission Response']
print(sig_bracket_analysis.to_string(index=False))

## QUESTION 6: Topic and Policy Area Analysis

In [None]:
print("\n" + "="*80)
print("QUESTION 6: TOPIC AND POLICY AREA ANALYSIS")
print("="*80)

# Define policy area keywords
policy_keywords = {
    'Environment': ['climate', 'environment', 'pollution', 'waste', 'nature', 'biodiversity', 'water', 'emissions', 'green', 'sustainability'],
    'Animal Welfare': ['animal', 'vivisection', 'fur', 'hunting', 'wildlife', 'welfare'],
    'Health': ['health', 'healthcare', 'medical', 'medicine', 'covid', 'pandemic', 'disease'],
    'Rights & Democracy': ['rights', 'democracy', 'freedom', 'liberty', 'justice', 'vote', 'citizenship', 'equality'],
    'Economy & Finance': ['economy', 'tax', 'finance', 'budget', 'trade', 'economic', 'financial', 'income'],
    'Education & Culture': ['education', 'culture', 'art', 'heritage', 'learning', 'school', 'university'],
    'Agriculture & Food': ['agriculture', 'food', 'farming', 'dairy', 'rural', 'pesticide', 'organic'],
    'Social Policy': ['social', 'poverty', 'welfare', 'employment', 'worker', 'housing', 'family'],
    'Migration': ['migration', 'immigrant', 'refugee', 'asylum', 'border'],
    'Digital & Tech': ['digital', 'internet', 'technology', 'data', 'privacy', 'cyber'],
    'Energy': ['energy', 'nuclear', 'renewable', 'fossil', 'electricity'],
    'Transport': ['transport', 'mobility', 'traffic', 'railway', 'vehicle']
}

# Function to categorize initiatives
def categorize_initiative(title, objective):
    text = (str(title) + ' ' + str(objective)).lower()
    categories = []
    for category, keywords in policy_keywords.items():
        if any(keyword in text for keyword in keywords):
            categories.append(category)
    return categories if categories else ['Other']

# policy_areas holds every matching area per initiative; primary_policy_area is
# the first entry of that list (categorize_initiative never returns an empty
# list, it falls back to ['Other']).
df['policy_areas'] = df.apply(lambda row: categorize_initiative(row['title'], row['objective']), axis=1)
df['primary_policy_area'] = df['policy_areas'].apply(lambda x: x[0] if x else 'Other')

# Count by policy area
print("\nInitiatives by Primary Policy Area:")
policy_dist = df['primary_policy_area'].value_counts().reset_index()
policy_dist.columns = ['Policy Area', 'Count']
policy_dist['Percentage'] = (policy_dist['Count'] / len(df) * 100).round(2)
print(policy_dist.to_string(index=False))

# Success rate by policy area
# policy_success is read again by the executive-summary and export cells.
print("\n\nSuccess Rate by Policy Area:")
policy_success = df.groupby('primary_policy_area').agg({
    'registration_number': 'count',
    'successful_eci': 'sum',
    'commission_responded': 'sum',
    'signatures_numeric': 'mean'
}).reset_index()
policy_success.columns = ['Policy Area', 'Total', 'Successful', 'Commission Response', 'Avg Signatures']
policy_success['Success Rate (%)'] = (policy_success['Successful'] / policy_success['Total'] * 100).round(2)
policy_success = policy_success.sort_values('Success Rate (%)', ascending=False)
print(policy_success.to_string(index=False))

## QUESTION 7: Detailed Analysis of Successful ECIs

In [None]:
print("\n" + "="*80)
print("QUESTION 7: DETAILED ANALYSIS OF SUCCESSFUL ECIs")
print("="*80)

# Independent copy of the rows flagged successful_eci.
successful = df[df['successful_eci'] == True].copy()

print(f"\nTotal successful ECIs: {len(successful)}")

print("\n\nSuccessful ECIs by year:")
successful_by_year = successful.groupby('registration_year').size().reset_index()
successful_by_year.columns = ['Year', 'Count']
print(successful_by_year.to_string(index=False))

print("\n\nSuccessful ECIs by policy area:")
successful_by_policy = successful['primary_policy_area'].value_counts().reset_index()
successful_by_policy.columns = ['Policy Area', 'Count']
print(successful_by_policy.to_string(index=False))

print("\n\nSignature statistics for successful ECIs:")
print(successful['signatures_numeric'].describe())

print("\n\nCountry threshold statistics for successful ECIs:")
print(successful['signatures_threshold_met_numeric'].describe())

print("\n\nCollection duration for successful ECIs:")
print(successful['collection_duration_days'].describe())

# One row per successful initiative, largest signature count first, with
# display-friendly column names.
print("\n\nDetailed list of successful ECIs:")
successful_detail = successful[['registration_number', 'title', 'registration_year', 
                                  'signatures_numeric', 'signatures_threshold_met_numeric',
                                  'primary_policy_area', 'commission_responded']].copy()
successful_detail = successful_detail.sort_values('signatures_numeric', ascending=False)
successful_detail.columns = ['Reg #', 'Title', 'Year', 'Signatures', 'Countries', 'Policy Area', 'Commission Responded']
print(successful_detail.to_string(index=False))

## QUESTION 8: Commission Response Analysis

In [None]:
print("\n" + "="*80)
print("QUESTION 8: COMMISSION RESPONSE ANALYSIS")
print("="*80)

# Rebinds the notebook-level `responded` to an independent copy; later cells
# (executive summary, export) read this name.
responded = df[df['commission_responded'] == True].copy()

print(f"\nTotal initiatives with Commission response: {len(responded)}")
# Ratio of responded initiatives to initiatives flagged successful_eci.
print(f"Response rate (of successful ECIs): {(len(responded)/df['successful_eci'].sum()*100):.2f}%")

print("\n\nTime to receive Commission response (from registration):")
print(responded['time_to_commission_response_days'].describe())

# Days are converted to years by dividing by 365 (leap days ignored).
print(f"\nIn years: Mean = {responded['time_to_commission_response_days'].mean()/365:.2f}, Median = {responded['time_to_commission_response_days'].median()/365:.2f}")

print("\n\nResponded initiatives by registration year:")
responded_by_year = responded.groupby('registration_year').size().reset_index()
responded_by_year.columns = ['Year', 'Count']
print(responded_by_year.to_string(index=False))

print("\n\nResponded initiatives by policy area:")
responded_by_policy = responded['primary_policy_area'].value_counts().reset_index()
responded_by_policy.columns = ['Policy Area', 'Count']
print(responded_by_policy.to_string(index=False))

print("\n\nAverage signatures for responded initiatives:")
print(f"Mean: {responded['signatures_numeric'].mean():.0f}")
print(f"Median: {responded['signatures_numeric'].median():.0f}")

# Sort by the raw day count first, then replace it with a rounded
# years column for display.
print("\n\nDetailed list of initiatives with Commission response:")
responded_detail = responded[['registration_number', 'title', 'registration_year', 
                               'signatures_numeric', 'time_to_commission_response_days',
                               'primary_policy_area']].copy()
responded_detail = responded_detail.sort_values('time_to_commission_response_days')
responded_detail['Years to Response'] = (responded_detail['time_to_commission_response_days'] / 365).round(2)
responded_detail = responded_detail[['registration_number', 'title', 'registration_year', 
                                      'signatures_numeric', 'Years to Response', 'primary_policy_area']]
responded_detail.columns = ['Reg #', 'Title', 'Year', 'Signatures', 'Years to Response', 'Policy Area']
print(responded_detail.to_string(index=False))

## QUESTION 9: Funding Analysis

In [None]:
print("\n" + "="*80)
print("QUESTION 9: FUNDING ANALYSIS")
print("="*80)

print(f"\nInitiatives with funding data: {df['funding_numeric'].notna().sum()}")

print("\n\nFunding statistics:")
print(df['funding_numeric'].describe())

print("\n\nFunding by outcome:")
funding_by_outcome = df.groupby('final_outcome').agg({
    'funding_numeric': ['count', 'mean', 'median', 'min', 'max']
}).round(2)
print(funding_by_outcome)

# successful_data / unsuccessful_data are notebook-level names that later
# cells also read.
print("\n\nFunding comparison: Successful vs Unsuccessful:")
successful_data = df[df['successful_eci'] == True]
unsuccessful_data = df[df['successful_eci'] == False]
successful_funding = successful_data['funding_numeric']
unsuccessful_funding = unsuccessful_data['funding_numeric']

# 'Count' is the number of initiatives in each group with funding data.
funding_comparison = pd.DataFrame({
    'Category': ['Successful ECIs', 'Unsuccessful ECIs'],
    'Count': [successful_funding.notna().sum(), unsuccessful_funding.notna().sum()],
    'Mean Funding': [successful_funding.mean(), unsuccessful_funding.mean()],
    'Median Funding': [successful_funding.median(), unsuccessful_funding.median()],
    'Max Funding': [successful_funding.max(), unsuccessful_funding.max()]
})
print(funding_comparison.to_string(index=False))

# Correlation over rows that have both funding and signature values;
# iloc[0, 1] of the 2x2 matrix is the funding-vs-signatures coefficient.
print("\n\nCorrelation: Funding vs Signatures")
initiatives_with_both = df[(df['funding_numeric'].notna()) & (df['signatures_numeric'].notna())]
if len(initiatives_with_both) > 0:
    correlation = initiatives_with_both[['funding_numeric', 'signatures_numeric']].corr()
    print(correlation)
    print(f"\nCorrelation coefficient: {correlation.iloc[0, 1]:.4f}")

## QUESTION 10: Member State Participation Analysis

In [None]:
print("\n" + "="*80)
print("QUESTION 10: MEMBER STATE PARTICIPATION ANALYSIS")
print("="*80)

# Function to extract countries that met threshold
def extract_countries_met_threshold(row):
    """Return the countries whose signature percentage reached 100.

    Args:
        row: A DataFrame row (or any mapping) whose
            ``signatures_collected_by_country`` entry is a JSON object
            mapping each country to a dict with a ``percentage`` field,
            e.g. ``{"DE": {"percentage": "123.4%"}}``.

    Returns:
        Country keys with ``percentage >= 100`` in JSON order. An empty list
        when the cell is missing, is not valid JSON, or is not a JSON object.
    """
    raw = row['signatures_collected_by_country']
    if pd.isna(raw):
        return []

    try:
        country_data = json.loads(raw)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string cells.
        return []
    if not isinstance(country_data, dict):
        return []

    countries_met = []
    for country, data in country_data.items():
        if not (isinstance(data, dict) and 'percentage' in data):
            continue
        try:
            # str() first so a numeric percentage (e.g. 120) is handled like
            # "120%" instead of aborting the whole row.
            pct = float(str(data['percentage']).rstrip('%'))
        except ValueError:
            # An unparseable percentage only skips this one country.
            continue
        if pct >= 100.0:
            countries_met.append(country)
    return countries_met

# Per-initiative list of countries that met their threshold.
df['countries_met_threshold_list'] = df.apply(extract_countries_met_threshold, axis=1)

# Count countries in successful ECIs
all_countries = []
for countries in df[df['successful_eci']]['countries_met_threshold_list']:
    all_countries.extend(countries)

# Counter gives, per country, the number of successful initiatives in which it
# met the threshold. country_participation is read again by later cells.
country_counts = Counter(all_countries)
country_participation = pd.DataFrame(country_counts.items(), columns=['Country', 'Times Met Threshold'])
country_participation = country_participation.sort_values('Times Met Threshold', ascending=False)

print(f"\nCountries that met threshold in successful ECIs (n={len(df[df['successful_eci']])} successful initiatives):")
# Participation rate = share of successful initiatives in which the country met the threshold.
country_participation['Participation Rate (%)'] = (country_participation['Times Met Threshold'] / len(df[df['successful_eci']]) * 100).round(2)
print(country_participation.to_string(index=False))

# Organizer country analysis
def extract_organizer_countries(row):
    """Return the organizers' countries of residence for one initiative.

    Args:
        row: A DataFrame row (or any mapping) whose
            ``organizer_representative`` entry is a JSON object that may
            contain a ``countries_of_residence`` object keyed by country.

    Returns:
        The keys of ``countries_of_residence`` in JSON order, or an empty
        list when the cell is missing, is not valid JSON, or does not have
        the expected object shape.

    Raises:
        KeyError: If ``row`` has no ``organizer_representative`` entry; a
            missing column is reported instead of silently producing empty
            results for every initiative.
    """
    raw = row['organizer_representative']
    try:
        org_data = json.loads(raw)
    except (TypeError, ValueError):
        # TypeError: NaN/None cell; ValueError: malformed JSON text.
        return []
    if not isinstance(org_data, dict):
        return []
    residence = org_data.get('countries_of_residence')
    return list(residence) if isinstance(residence, dict) else []

# Per-initiative list of organizer countries of residence.
df['organizer_countries'] = df.apply(extract_organizer_countries, axis=1)

# Count organizer countries
all_org_countries = []
for countries in df['organizer_countries']:
    all_org_countries.extend(countries)

# Each initiative contributes one count per country listed for it.
org_country_counts = Counter(all_org_countries)
org_participation = pd.DataFrame(org_country_counts.items(), columns=['Country', 'Initiatives Organized'])
org_participation = org_participation.sort_values('Initiatives Organized', ascending=False)

print("\n\nTop 15 countries by number of initiatives organized:")
print(org_participation.head(15).to_string(index=False))

# Successful organizer countries
# Same tally, restricted to initiatives flagged successful_eci.
successful_org_countries = []
for countries in df[df['successful_eci']]['organizer_countries']:
    successful_org_countries.extend(countries)

successful_org_counts = Counter(successful_org_countries)
successful_org_participation = pd.DataFrame(successful_org_counts.items(), columns=['Country', 'Successful Initiatives'])
successful_org_participation = successful_org_participation.sort_values('Successful Initiatives', ascending=False)

print("\n\nTop 10 countries organizing successful initiatives:")
print(successful_org_participation.head(10).to_string(index=False))

## QUESTION 11: Correlation Analysis - Key Success Factors

In [None]:
print("\n" + "="*80)
print("QUESTION 11: CORRELATION ANALYSIS - KEY SUCCESS FACTORS")
print("="*80)

# Create analysis dataset
analysis_df = df[[
    'successful_eci',
    'commission_responded',
    'signatures_numeric',
    'signatures_threshold_met_numeric',
    'funding_numeric',
    'collection_duration_days',
    'registration_to_collection_days',
    'registration_year'
]].copy()

# Convert boolean to numeric
# True/False become 1/0 so the flags can take part in the correlation.
analysis_df['successful_numeric'] = analysis_df['successful_eci'].astype(int)
analysis_df['responded_numeric'] = analysis_df['commission_responded'].astype(int)

print("\nCorrelation matrix of key metrics with success:")
# The order of this list fixes the row/column order of correlation_matrix
# (row/column 0 is successful_numeric, 1 is signatures_numeric, ...).
corr_columns = [
    'successful_numeric',
    'signatures_numeric',
    'signatures_threshold_met_numeric',
    'funding_numeric',
    'collection_duration_days',
    'registration_to_collection_days',
    'registration_year'
]

# correlation_matrix is read again by the executive-summary cell.
correlation_matrix = analysis_df[corr_columns].corr()
print(correlation_matrix['successful_numeric'].sort_values(ascending=False).to_string())

# corr_columns[:-1] drops 'registration_year' (and keeps successful_numeric)
# before adding the responded flag.
print("\n\nCorrelation with Commission response:")
print(analysis_df[corr_columns[:-1] + ['responded_numeric']].corr()['responded_numeric'].sort_values(ascending=False).to_string())

# Comparison for key metrics
print("\n\nDetailed comparison:")
successful_data = df[df['successful_eci'] == True]
unsuccessful_data = df[df['successful_eci'] == False]

# Group means for the successful and unsuccessful subsets, row by row.
comparison_detail = pd.DataFrame({
    'Metric': [
        'Avg Signatures',
        'Avg Funding (€)',
        'Avg Collection Duration (days)',
        'Avg Countries Met Threshold',
        'Avg Time to Collection Start (days)'
    ],
    'Successful ECIs': [
        successful_data['signatures_numeric'].mean(),
        successful_data['funding_numeric'].mean(),
        successful_data['collection_duration_days'].mean(),
        successful_data['signatures_threshold_met_numeric'].mean(),
        successful_data['registration_to_collection_days'].mean()
    ],
    'Unsuccessful ECIs': [
        unsuccessful_data['signatures_numeric'].mean(),
        unsuccessful_data['funding_numeric'].mean(),
        unsuccessful_data['collection_duration_days'].mean(),
        unsuccessful_data['signatures_threshold_met_numeric'].mean(),
        unsuccessful_data['registration_to_collection_days'].mean()
    ]
})

comparison_detail['Difference'] = comparison_detail['Successful ECIs'] - comparison_detail['Unsuccessful ECIs']
comparison_detail['Ratio'] = (comparison_detail['Successful ECIs'] / comparison_detail['Unsuccessful ECIs']).round(2)
print(comparison_detail.to_string(index=False))

## QUESTION 12: Cohort Analysis - Performance by Registration Year

In [None]:
print("\n" + "="*80)
print("QUESTION 12: COHORT ANALYSIS - PERFORMANCE BY REGISTRATION YEAR")
print("="*80)

# One row per registration year with several aggregates per column.
cohort_analysis = df.groupby('registration_year').agg({
    'registration_number': 'count',
    'successful_eci': 'sum',
    'commission_responded': 'sum',
    'signatures_numeric': ['mean', 'median', 'count'],
    'collection_duration_days': 'mean',
    'funding_numeric': 'mean',
    'time_to_commission_response_days': 'mean'
}).round(2)

# The aggregation yields (column, aggregate) label pairs; flatten them to
# '<column>_<aggregate>' names such as 'signatures_numeric_mean'.
cohort_analysis.columns = ['_'.join(col).strip('_') for col in cohort_analysis.columns.values]
cohort_analysis = cohort_analysis.reset_index()
cohort_analysis['success_rate_%'] = (cohort_analysis['successful_eci_sum'] / cohort_analysis['registration_number_count'] * 100).round(2)
cohort_analysis['response_rate_%'] = (cohort_analysis['commission_responded_sum'] / cohort_analysis['registration_number_count'] * 100).round(2)

print("\nCohort performance by registration year:")
display_cols = [
    'registration_year',
    'registration_number_count',
    'successful_eci_sum',
    'commission_responded_sum',
    'success_rate_%',
    'response_rate_%',
    'signatures_numeric_mean',
    'funding_numeric_mean'
]
print(cohort_analysis[display_cols].to_string(index=False))

# Time to response by year
# Only initiatives with a Commission response are included.
print("\n\nAverage time to Commission response by registration year:")
response_by_year = df[df['commission_responded']].groupby('registration_year').agg({
    'time_to_commission_response_days': ['count', 'mean', 'min', 'max']
}).round(0)
print(response_by_year)

# Current status of recent initiatives (2023-2025)
# Rows are statuses, columns are years; missing combinations are shown as 0.
print("\n\nStatus of recent initiatives (2023-2025):")
recent = df[df['registration_year'] >= 2023]
recent_status = recent.groupby(['registration_year', 'current_status']).size().reset_index(name='Count')
recent_pivot = recent_status.pivot(index='current_status', columns='registration_year', values='Count').fillna(0)
print(recent_pivot)

## QUESTION 13: Executive Summary for ECI Organizers

In [None]:
print("\n" + "="*80)
print("QUESTION 13: EXECUTIVE SUMMARY FOR ECI ORGANIZERS")
print("="*80)

print("\n### OVERALL ECI LANDSCAPE ###")
print(f"Total ECIs registered (all time): {len(df)}")
print(f"Time period: {df['registration_year'].min():.0f} - {df['registration_year'].max():.0f}")
# Number of years is counted inclusively (max - min + 1).
print(f"Average initiatives per year: {len(df) / (df['registration_year'].max() - df['registration_year'].min() + 1):.1f}")

# All percentages are relative to the total number of initiatives, except the
# last line, which divides responses by the number of successful ECIs.
print("\n\n### SUCCESS RATES ###")
print(f"Initiatives reaching 1M signatures: {df['reached_signatures'].sum()} ({(df['reached_signatures'].sum()/len(df)*100):.1f}%)")
print(f"Initiatives meeting country threshold: {df['met_country_threshold'].sum()} ({(df['met_country_threshold'].sum()/len(df)*100):.1f}%)")
print(f"Successful ECIs (both criteria): {df['successful_eci'].sum()} ({(df['successful_eci'].sum()/len(df)*100):.1f}%)")
print(f"Commission responses received: {df['commission_responded'].sum()} ({(df['commission_responded'].sum()/len(df)*100):.1f}%)")
print(f"Response rate for successful ECIs: {(df['commission_responded'].sum()/df['successful_eci'].sum()*100):.1f}%")

# `unsuccessful` and `withdrawn` are read again by the export cell.
print("\n\n### KEY BARRIERS ###")
unsuccessful = df[df['final_outcome'] == 'Unsuccessful Collection']
withdrawn = df[df['final_outcome'] == 'Withdrawn']
print(f"Unsuccessful collections: {len(unsuccessful)} ({(len(unsuccessful)/len(df)*100):.1f}%)")
print(f"Withdrawn initiatives: {len(withdrawn)} ({(len(withdrawn)/len(df)*100):.1f}%)")
print(f"Attrition rate (did not complete): {((len(unsuccessful) + len(withdrawn))/len(df)*100):.1f}%")

print("\n\n### SIGNATURES REQUIRED ###")
successful_sigs = df[df['successful_eci']]['signatures_numeric']
print(f"Minimum signatures among successful: {successful_sigs.min():,.0f}")
print(f"Average signatures for successful: {successful_sigs.mean():,.0f}")
print(f"Median signatures for successful: {successful_sigs.median():,.0f}")
print(f"Maximum signatures achieved: {successful_sigs.max():,.0f}")

print("\n\n### COUNTRY THRESHOLD PATTERNS ###")
successful_countries = df[df['successful_eci']]['signatures_threshold_met_numeric']
# The literal 7 mirrors the cut-off used for met_country_threshold.
print(f"Minimum countries needed: 7")
print(f"Average countries met in successful: {successful_countries.mean():.1f}")
print(f"Maximum countries met: {int(successful_countries.max())}")

print("\n\n### TIME EXPECTATIONS ###")
print(f"Average collection period: {df['collection_duration_days'].mean():.0f} days ({(df['collection_duration_days'].mean()/365):.1f} years)")
print(f"Median collection period: {df['collection_duration_days'].median():.0f} days ({(df['collection_duration_days'].median()/365):.1f} years)")
successful_collection = df[df['successful_eci']]['collection_duration_days']
print(f"Average for successful: {successful_collection.mean():.0f} days ({(successful_collection.mean()/365):.1f} years)")
# `responded` is not defined in this cell; it comes from an earlier cell.
print(f"Time to Commission response: {responded['time_to_commission_response_days'].mean():.0f} days ({(responded['time_to_commission_response_days'].mean()/365):.2f} years)")

# successful_data, unsuccessful_data and correlation_matrix are not defined in
# this cell; they come from earlier cells.
print("\n\n### FUNDING INSIGHTS ###")
print(f"Successful ECIs avg funding: €{successful_data['funding_numeric'].mean():,.0f}")
print(f"Unsuccessful ECIs avg funding: €{unsuccessful_data['funding_numeric'].mean():,.0f}")
print(f"Funding advantage ratio: {(successful_data['funding_numeric'].mean() / unsuccessful_data['funding_numeric'].mean()):.1f}x")
# Look the coefficient up by label: position [0, 1] of correlation_matrix is
# successful_numeric vs signatures_numeric, not funding vs signatures.
print(f"Correlation (funding vs signatures): {correlation_matrix.loc['funding_numeric', 'signatures_numeric']:.3f}")

# policy_success, yearly_stats, country_participation and
# successful_org_participation are not defined in this cell; they come from
# earlier cells.
print("\n\n### TOPIC AREAS WITH HIGHEST SUCCESS ###")
# Only policy areas with at least 5 initiatives are ranked.
top_topics = policy_success[policy_success['Total'] >= 5].sort_values('Success Rate (%)', ascending=False).head(5)
print(top_topics[['Policy Area', 'Total', 'Successful', 'Success Rate (%)']].to_string(index=False))

print("\n\n### TEMPORAL TRENDS ###")
print("Best performing years:")
# Unlike the "worst" list below, no minimum number of registrations is applied.
best_years = yearly_stats.nlargest(3, 'Success Rate (%)')
print(best_years[['Year', 'Total Registered', 'Successful', 'Success Rate (%)']].to_string(index=False))

print("\n\nWorst performing years (with >= 5 registrations):")
worst_years = yearly_stats[yearly_stats['Total Registered'] >= 5].nsmallest(3, 'Success Rate (%)')
print(worst_years[['Year', 'Total Registered', 'Successful', 'Success Rate (%)']].to_string(index=False))

print("\n\n### GEOGRAPHIC INSIGHTS ###")
print("Top 5 countries by threshold achievement in successful ECIs:")
print(country_participation.head(5).to_string(index=False))

print("\n\nTop 5 countries organizing successful initiatives:")
print(successful_org_participation.head(5).to_string(index=False))

## Export Analysis Results to CSV

In [None]:
# Export enhanced dataset with calculated fields
# Select the identifying columns plus the derived numeric/boolean fields.
export_df = df[[
    'registration_number',
    'title',
    'registration_year',
    'current_status',
    'final_outcome',
    'primary_policy_area',
    'signatures_numeric',
    'signatures_threshold_met_numeric',
    'funding_numeric',
    'collection_duration_days',
    'registration_to_collection_days',
    'time_to_commission_response_days',
    'reached_signatures',
    'met_country_threshold',
    'successful_eci',
    'commission_responded'
]].copy()

# Rename positionally: this list must stay in the same order as the
# selection above.
export_df.columns = [
    'Registration_Number',
    'Title',
    'Year',
    'Current_Status',
    'Final_Outcome',
    'Policy_Area',
    'Signatures',
    'Countries_Met_Threshold',
    'Funding_EUR',
    'Collection_Duration_Days',
    'Registration_to_Collection_Days',
    'Time_to_Commission_Response_Days',
    'Reached_1M_Signatures',
    'Met_Country_Threshold_7plus',
    'Successful_ECI',
    'Commission_Responded'
]

# Written to the current working directory.
export_df.to_csv('eci_analysis_enhanced.csv', index=False)
print("✓ Exported: eci_analysis_enhanced.csv")

# Export summary statistics
# successful_data, responded, withdrawn and unsuccessful are not defined in
# this cell; they come from earlier cells.
# 'Metric' and 'Value' are parallel lists: entry i of one describes entry i of
# the other.
summary_stats = pd.DataFrame({
    'Metric': [
        'Total ECIs Registered',
        'Successful ECIs',
        'Commission Responses',
        'Success Rate (%)',
        'Response Rate of Successful (%)',
        'Avg Signatures (Successful)',
        'Avg Funding EUR (Successful)',
        'Avg Collection Duration Days',
        'Avg Time to Response Days',
        'Withdrawn Rate (%)',
        'Unsuccessful Rate (%)'
    ],
    'Value': [
        len(df),
        df['successful_eci'].sum(),
        df['commission_responded'].sum(),
        round(df['successful_eci'].sum() / len(df) * 100, 2),
        round(df['commission_responded'].sum() / df['successful_eci'].sum() * 100, 2),
        round(successful_data['signatures_numeric'].mean(), 0),
        round(successful_data['funding_numeric'].mean(), 0),
        round(df['collection_duration_days'].mean(), 0),
        round(responded['time_to_commission_response_days'].mean(), 0),
        round(len(withdrawn) / len(df) * 100, 2),
        round(len(unsuccessful) / len(df) * 100, 2)
    ]
})

# Written to the current working directory.
summary_stats.to_csv('eci_summary_statistics.csv', index=False)
print("✓ Exported: eci_summary_statistics.csv")

# The three tables below were built in earlier cells; each is written as-is
# (without the index) to the current working directory.

# Export policy area analysis
policy_success.to_csv('eci_policy_area_analysis.csv', index=False)
print("✓ Exported: eci_policy_area_analysis.csv")

# Export yearly trends
yearly_stats.to_csv('eci_yearly_trends.csv', index=False)
print("✓ Exported: eci_yearly_trends.csv")

# Export country participation
country_participation.to_csv('eci_country_threshold_achievement.csv', index=False)
print("✓ Exported: eci_country_threshold_achievement.csv")

print("\n" + "="*80)
print("ANALYSIS COMPLETE - All outputs exported")
print("="*80)