# European Citizens' Initiative (ECI) Data Analysis
## Exploratory Data Analysis for ECI Initiative Organizers

This notebook analyzes 121 ECI initiatives registered between 2012-2025,
examining success patterns, barriers, and key performance indicators.

**Enhanced with interactive Plotly visualizations** ðŸ“Š

## Setup: Import Libraries and Load Data

In [258]:
import pandas as pd
import numpy as np
from datetime import datetime
import json
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Plotly for interactive visualizations
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set plotly default template
import plotly.io as pio
pio.templates.default = "plotly_white"

# Load the dataset
data_folder = "../data/2025-09-18_16-33-57"
df = pd.read_csv(f'{data_folder}/eci_initiatives_2025-11-04_11-59-38.csv')
print(f"Dataset loaded: {len(df)} initiatives")
print(f"Columns: {len(df.columns)}")

# Set colors
viridis_colors = px.colors.sequential.Viridis

Dataset loaded: 121 initiatives
Columns: 26


## Data Cleaning and Feature Engineering

In [259]:
# Parse date strings from DD/MM/YYYY to datetime
def parse_date(date_str):
    if pd.isna(date_str) or date_str == '':
        return pd.NaT
    try:
        return pd.to_datetime(date_str, format='%d/%m/%Y')
    except:
        return pd.NaT

# Parse main timeline dates
df['registered_date'] = df['timeline_registered'].apply(parse_date)
df['collection_start_date'] = df['timeline_collection_start_date'].apply(parse_date)
df['collection_closed_date'] = df['timeline_collection_closed'].apply(parse_date)
df['verification_start_date'] = df['timeline_verification_start'].apply(parse_date)
df['verification_end_date'] = df['timeline_verification_end'].apply(parse_date)
df['commission_response_date'] = df['timeline_response_commission_date'].apply(parse_date)

# Extract year from registration
df['registration_year'] = df['registered_date'].dt.year

# Calculate durations
df['collection_duration_days'] = (df['collection_closed_date'] - df['collection_start_date']).dt.days
df['verification_duration_days'] = (df['verification_end_date'] - df['verification_start_date']).dt.days
df['time_to_commission_response_days'] = (df['commission_response_date'] - df['registered_date']).dt.days
df['registration_to_collection_days'] = (df['collection_start_date'] - df['registered_date']).dt.days

# Parse signatures (handle commas and convert to numeric)
def parse_signatures(sig):
    if pd.isna(sig):
        return np.nan
    if isinstance(sig, str):
        return float(sig.replace(',', ''))
    return float(sig)

df['signatures_numeric'] = df['signatures_collected'].apply(parse_signatures)
df['signatures_threshold_met_numeric'] = pd.to_numeric(df['signatures_threshold_met'], errors='coerce')

# Parse funding (handle commas and convert to numeric)
def parse_funding(funding):
    if pd.isna(funding):
        return np.nan
    if isinstance(funding, str):
        return float(funding.replace(',', ''))
    return float(funding)

df['funding_numeric'] = df['funding_total'].apply(parse_funding)

# Define success categories
df['reached_signatures'] = df['signatures_numeric'] >= 1000000
df['met_country_threshold'] = df['signatures_threshold_met_numeric'] >= 7
df['successful_eci'] = df['reached_signatures'] & df['met_country_threshold']
df['commission_responded'] = df['final_outcome'] == 'Commission Response'

print("\n=== Data Cleaning Complete ===")
print(f"Total initiatives: {len(df)}")
print(f"Reached 1M signatures: {df['reached_signatures'].sum()}")
print(f"Met country threshold (7+): {df['met_country_threshold'].sum()}")
print(f"Successful ECIs (both criteria): {df['successful_eci'].sum()}")
print(f"Commission responded: {df['commission_responded'].sum()}")


=== Data Cleaning Complete ===
Total initiatives: 121
Reached 1M signatures: 16
Met country threshold (7+): 17
Successful ECIs (both criteria): 16
Commission responded: 11


## QUESTION 1: Status and Outcome Distribution

In [260]:
print("\n" + "="*80)
print("QUESTION 2: STATUS AND OUTCOME DISTRIBUTION")
print("="*80)

print("\nCurrent Status Distribution:")
status_dist = df['current_status'].value_counts().reset_index()
status_dist.columns = ['Status', 'Count']
status_dist['Percentage'] = (status_dist['Count'] / len(df) * 100).round(2)
print(status_dist.to_string(index=False))

print("\n\nFinal Outcome Distribution:")
outcome_dist = df['final_outcome'].value_counts(dropna=False).reset_index()
outcome_dist.columns = ['Outcome', 'Count']
outcome_dist['Percentage'] = (outcome_dist['Count'] / len(df) * 100).round(2)
print(outcome_dist.to_string(index=False))

print("\n\nWithdrawal Analysis:")
withdrawn = df[df['final_outcome'] == 'Withdrawn']
print(f"Total withdrawn: {len(withdrawn)}")
print(f"Percentage of all initiatives: {(len(withdrawn)/len(df)*100):.2f}%")
print(f"Had collection data: {withdrawn['collection_start_date'].notna().sum()}")
print(f"Had signatures data: {withdrawn['signatures_numeric'].notna().sum()}")


QUESTION 2: STATUS AND OUTCOME DISTRIBUTION

Current Status Distribution:
                                                Status  Count  Percentage
                               Unsuccessful collection     71       58.68
                                             Withdrawn     27       22.31
                                   Answered initiative     11        9.09
                                    Collection ongoing      7        5.79
                                          Verification      2        1.65
Verification\n                  \n                   *      1        0.83
                                      Valid initiative      1        0.83
                                            Registered      1        0.83


Final Outcome Distribution:
                Outcome  Count  Percentage
Unsuccessful Collection     71       58.68
              Withdrawn     27       22.31
                    NaN     12        9.92
    Commission Response     11        9.09


Withdrawal A

In [261]:
# ðŸ“Š VISUALIZATION: Pie Chart of Final Outcomes
outcome_clean = outcome_dist[outcome_dist['Outcome'].notna()].copy()


# Define custom colors: green for success, reddish tones for unsuccessful/withdrawn
custom_colors = {
    'Unsuccessful Collection': '#C34242',  # Muted red
    'Commission Response': '#3CA371',      # Muted green
    'Withdrawn': '#909090'                 # Grey
}


# Create color sequence based on outcome order
color_sequence = [custom_colors.get(outcome, '#CCCCCC') for outcome in outcome_clean['Outcome']]


fig = px.pie(outcome_clean, 
             values='Count', 
             names='Outcome',
             title='Final Outcome Distribution of All ECI Initiatives',
             hole=0.1,
             color_discrete_sequence=color_sequence)



fig.update_traces(
    textposition='inside', 
    textinfo='percent+label+value', 
    hoverinfo='none', 
    # insidetextorientation='horizontal',
    textfont=dict(size=16, family='Arial Black', color='white')
)
fig.update_layout(height=600)
fig.show()

## QUESTION 2: Overall Success Funnel Analysis

In [262]:
# Calculate conversion rates at each stage of the ECI process
total_registered = len(df)
started_collection = df['collection_start_date'].notna().sum()
completed_collection = df['collection_closed_date'].notna().sum()
reached_1m = df['reached_signatures'].sum()
met_threshold = df['met_country_threshold'].sum()
both_criteria = df['successful_eci'].sum()
commission_response = df['commission_responded'].sum()

# Create funnel dataframe
funnel_data = pd.DataFrame({
    'Stage': [
        '1. Registered',
        '2. Collection Started',
        '3. Collection Completed',
        '4. Met Country Threshold (7+)',
        '5. Reached 1M Signatures',
        '6. Successful (Both Criteria)',
        '7. Commission Response'
    ],
    'Count': [
        total_registered,
        started_collection,
        completed_collection,
        met_threshold,
        reached_1m,
        both_criteria,
        commission_response
    ]
})

funnel_data['Percentage of Registered'] = (funnel_data['Count'] / total_registered * 100).round(2)
funnel_data['Conversion from Previous Stage'] = (funnel_data['Count'] / funnel_data['Count'].shift(1) * 100).round(2)

print("\n" + "="*80)
print("QUESTION 2: ECI SUCCESS FUNNEL ANALYSIS")
print("="*80)
print("\nSuccess Funnel:")
print(funnel_data.to_string(index=False))

print(f"\n\nKey Insights:")
print(f"- Overall success rate (registered â†’ commission response): {(commission_response/total_registered*100):.2f}%")
print(f"- Success rate (registered â†’ met both criteria): {(both_criteria/total_registered*100):.2f}%")
print(f"- Response rate (successful â†’ commission response): {(commission_response/both_criteria*100):.2f}%")


QUESTION 2: ECI SUCCESS FUNNEL ANALYSIS

Success Funnel:
                        Stage  Count  Percentage of Registered  Conversion from Previous Stage
                1. Registered    121                    100.00                             NaN
        2. Collection Started    110                     90.91                           90.91
      3. Collection Completed     86                     71.07                           78.18
4. Met Country Threshold (7+)     17                     14.05                           19.77
     5. Reached 1M Signatures     16                     13.22                           94.12
6. Successful (Both Criteria)     16                     13.22                          100.00
       7. Commission Response     11                      9.09                           68.75


Key Insights:
- Overall success rate (registered â†’ commission response): 9.09%
- Success rate (registered â†’ met both criteria): 13.22%
- Response rate (successful â†’ commissio

In [263]:
# ðŸ“Š VISUALIZATION: Interactive Funnel Chart
colors = [viridis_colors[i] for i in [0, 2, 4, 6, 7, 8, 9]]

# Create stage names without numbers for hover
stage_names_no_numbers = funnel_data['Stage']

fig = go.Figure(go.Funnel(
    y = funnel_data['Stage'],
    x = funnel_data['Count'],
    textposition = "inside",
    textinfo = "value+percent previous",
    marker = dict(
        color = colors
    ),
    connector = {
        "fillcolor": "#CEDFF6",  # Light steel blue for connector fill
        "visible": True
    },
    customdata = stage_names_no_numbers,
    hovertemplate = '<b>%{customdata}:</b><br>' +
                    'Count: %{x}<br>' +
                    'Percent of initial: %{percentInitial}<br>' +
                    'Percent of previous: %{percentPrevious}<br>' +
                    '<extra></extra>'
))

fig.update_layout(
    title="ECI Success Funnel: From Registration to Commission Response",
    height=600,
    font=dict(size=14)
)

fig.show()

## QUESTION 3: Signature Collection Performance Analysis

In [264]:
print("\n" + "="*80)
print("QUESTION 3: SIGNATURE COLLECTION PERFORMANCE ANALYSIS")
print("="*80)

initiatives_with_sigs = df[df['signatures_numeric'].notna()].copy()

print(f"\nTotal initiatives with signature data: {len(initiatives_with_sigs)}")
print(f"\nSignature collection statistics:")
sig_stats = initiatives_with_sigs['signatures_numeric'].describe()
print(sig_stats)

# Break down by success
print("\n\nSignature distribution by outcome:")
sig_by_outcome = initiatives_with_sigs.groupby('final_outcome').agg({
    'signatures_numeric': ['count', 'mean', 'median', 'min', 'max']
}).round(0)
print(sig_by_outcome)

# Country threshold analysis
print("\n\nCountry threshold analysis:")
threshold_stats = initiatives_with_sigs.groupby('met_country_threshold').agg({
    'signatures_numeric': ['count', 'mean', 'median'],
    'signatures_threshold_met_numeric': ['mean', 'max']
}).round(2)
print(threshold_stats)

# Correlation between collection duration and signatures
print("\n\nCorrelation: Collection Duration vs Signatures Collected")
initiatives_with_both = initiatives_with_sigs[initiatives_with_sigs['collection_duration_days'].notna()]
if len(initiatives_with_both) > 0:
    correlation = initiatives_with_both[['collection_duration_days', 'signatures_numeric']].corr()
    print(correlation)
    
print("\n\nSuccess rate by signature brackets:")
initiatives_with_sigs['sig_bracket'] = pd.cut(
    initiatives_with_sigs['signatures_numeric'],
    bins=[0, 100000, 250000, 500000, 750000, 1000000, 10000000],
    labels=['<100K', '100K-250K', '250K-500K', '500K-750K', '750K-1M', '1M+']
)
sig_bracket_analysis = initiatives_with_sigs.groupby('sig_bracket', observed=True).agg({
    'registration_number': 'count',
    'successful_eci': 'sum',
    'commission_responded': 'sum'
}).reset_index()
sig_bracket_analysis.columns = ['Signature Bracket', 'Count', 'Successful', 'Commission Response']
print(sig_bracket_analysis.to_string(index=False))


QUESTION 3: SIGNATURE COLLECTION PERFORMANCE ANALYSIS

Total initiatives with signature data: 60

Signature collection statistics:
count    6.000000e+01
mean     4.091707e+05
std      5.649858e+05
min      2.540000e+02
25%      5.578750e+03
50%      3.851450e+04
75%      1.058428e+06
max      1.721626e+06
Name: signatures_numeric, dtype: float64


Signature distribution by outcome:
                        signatures_numeric                                   \
                                     count       mean     median        min   
final_outcome                                                                 
Commission Response                     11  1300932.0  1217916.0  1054973.0   
Unsuccessful Collection                 33    93363.0    15364.0      254.0   
Withdrawn                                5     8395.0     2248.0      291.0   

                                    
                               max  
final_outcome                       
Commission Response      172

In [265]:
# ðŸ“Š VISUALIZATION: Histogram of Signature Distribution with Gradient Colors
import numpy as np

# Create bins manually for better control
num_bins = 50
bins = np.linspace(0, initiatives_with_sigs['signatures_numeric'].max(), num_bins + 1)

# Split at 1M threshold
threshold = 1000000
below_bins = bins[bins < threshold]
above_bins = bins[bins >= threshold]

# Create histograms for below threshold
hist_below, bin_edges_below = np.histogram(
    initiatives_with_sigs[initiatives_with_sigs['signatures_numeric'] < threshold]['signatures_numeric'],
    bins=below_bins
)

# Create histograms for above threshold  
hist_above, bin_edges_above = np.histogram(
    initiatives_with_sigs[initiatives_with_sigs['signatures_numeric'] >= threshold]['signatures_numeric'],
    bins=above_bins
)

# Calculate bin centers for coloring
bin_centers_below = (bin_edges_below[:-1] + bin_edges_below[1:]) / 2
bin_centers_above = (bin_edges_above[:-1] + bin_edges_above[1:]) / 2

# Create color arrays based on bin positions
# Below threshold: red to yellow gradient (red=0, yellow=1M)
colors_below = []
for center in bin_centers_below:
    ratio = center / threshold
    # Interpolate from red (#C34242) to yellow (#FFF44F)
    r = int(195 + (255 - 195) * ratio)
    g = int(66 + (244 - 66) * ratio)
    b = int(66 + (79 - 66) * ratio)
    colors_below.append(f'rgb({r},{g},{b})')

# Above threshold: yellow-green to deep green gradient (1M to 2M+)
colors_above = []
max_sig = initiatives_with_sigs['signatures_numeric'].max()
for center in bin_centers_above:
    ratio = min((center - threshold) / threshold, 1.0)  # Normalize to 1M range
    # Interpolate from yellow-green (#B8D87F) to deep green (#3CA371)
    r = int(184 - (184 - 60) * ratio)
    g = int(216 - (216 - 163) * ratio)
    b = int(127 - (127 - 113) * ratio)
    colors_above.append(f'rgb({r},{g},{b})')

fig = go.Figure()

# Add bars for below threshold
fig.add_trace(go.Bar(
    x=bin_centers_below,
    y=hist_below,
    name='Below 1M Threshold',
    marker=dict(
        color=colors_below,
        line=dict(color='white', width=0.5)
    ),
    width=np.diff(bin_edges_below),
    hovertemplate='Signatures to: %{x:,.0f}<br>Count: %{y}<extra></extra>'
))

# Add bars for above threshold
fig.add_trace(go.Bar(
    x=bin_centers_above,
    y=hist_above,
    name='Above 1M Threshold',
    marker=dict(
        color=colors_above,
        line=dict(color='white', width=0.5)
    ),
    width=np.diff(bin_edges_above),
    hovertemplate='Signatures to: %{x:,.0f}<br>Count: %{y}<extra></extra>'
))

# Add vertical line at 1M threshold
fig.add_vline(x=1000000, line_dash="dash", line_color="#3AB23F", line_width=3,
              annotation_text="1M Threshold", annotation_position="top right",
              annotation_font_size=14)

fig.update_layout(
    title='Distribution of Signature Counts (All ECIs with Data)',
    xaxis_title="Signatures Collected",
    yaxis_title="Number of Initiatives",
    height=500,
    showlegend=True,
    font=dict(size=14),
    bargap=0.05
)

fig.show()


## QUESTION 4: Member State Participation Analysis

In [266]:
print("\n" + "="*80)
print("QUESTION 4: MEMBER STATE PARTICIPATION ANALYSIS")
print("="*80)

# Function to extract countries that met threshold
def extract_countries_met_threshold(row):
    if pd.isna(row['signatures_collected_by_country']):
        return []
    
    try:
        country_data = json.loads(row['signatures_collected_by_country'])
        countries_met = []
        for country, data in country_data.items():
            if isinstance(data, dict) and 'percentage' in data:
                pct_str = data['percentage'].rstrip('%')
                try:
                    pct = float(pct_str)
                    if pct >= 100.0:
                        countries_met.append(country)
                except:
                    pass
        return countries_met
    except:
        return []

df['countries_met_threshold_list'] = df.apply(extract_countries_met_threshold, axis=1)

# Count countries in successful ECIs
all_countries = []
for countries in df[df['successful_eci']]['countries_met_threshold_list']:
    all_countries.extend(countries)

country_counts = Counter(all_countries)
country_participation = pd.DataFrame(country_counts.items(), columns=['Country', 'Times Met Threshold'])
country_participation = country_participation.sort_values('Times Met Threshold', ascending=False)

print(f"\nCountries that met threshold in successful ECIs (n={len(df[df['successful_eci']])} successful initiatives):")
country_participation['Participation Rate (%)'] = (country_participation['Times Met Threshold'] / len(df[df['successful_eci']]) * 100).round(2)
print(country_participation.to_string(index=False))

# Organizer country analysis
def extract_organizer_countries(row):
    try:
        org_data = json.loads(row['organizer_representative'])
        if 'countries_of_residence' in org_data:
            return list(org_data['countries_of_residence'].keys())
        return []
    except:
        return []

df['organizer_countries'] = df.apply(extract_organizer_countries, axis=1)

# Count organizer countries
all_org_countries = []
for countries in df['organizer_countries']:
    all_org_countries.extend(countries)

org_country_counts = Counter(all_org_countries)
org_participation = pd.DataFrame(org_country_counts.items(), columns=['Country', 'Initiatives Organized'])
org_participation = org_participation.sort_values('Initiatives Organized', ascending=False)

print("\n\nTop 15 countries by number of initiatives organized:")
print(org_participation.head(15).to_string(index=False))

# Successful organizer countries
successful_org_countries = []
for countries in df[df['successful_eci']]['organizer_countries']:
    successful_org_countries.extend(countries)

successful_org_counts = Counter(successful_org_countries)
successful_org_participation = pd.DataFrame(successful_org_counts.items(), columns=['Country', 'Successful Initiatives'])
successful_org_participation = successful_org_participation.sort_values('Successful Initiatives', ascending=False)

print("\n\nTop 10 countries organizing successful initiatives:")
print(successful_org_participation.head(10).to_string(index=False))


QUESTION 4: MEMBER STATE PARTICIPATION ANALYSIS

Countries that met threshold in successful ECIs (n=16 successful initiatives):
       Country  Times Met Threshold  Participation Rate (%)
       Germany                   13                   81.25
         Spain                   13                   81.25
   Netherlands                   13                   81.25
       Belgium                   12                   75.00
        France                   11                   68.75
      Slovakia                   10                   62.50
       Ireland                   10                   62.50
       Denmark                   10                   62.50
       Croatia                   10                   62.50
       Austria                   10                   62.50
         Italy                   10                   62.50
       Finland                   10                   62.50
       Hungary                    8                   50.00
        Greece                 

In [267]:
# ðŸ“Š VISUALIZATION: Country Participation Charts
# Top countries meeting thresholds (reversed order - highest at top)
top_countries = country_participation.head(15).iloc[::-1]  # Reverse the order

fig = px.bar(top_countries,
             x='Times Met Threshold',
             y='Country',
             orientation='h',
             title='Top 15 Countries Leading ECI Signature Thresholds',
             color='Participation Rate (%)',
             color_continuous_scale='Viridis',
             text='Times Met Threshold')

fig.update_traces(textposition='outside')
fig.update_layout(height=600)
fig.show()

# Organizer countries comparison (reversed order - highest at top)
fig2 = make_subplots(rows=1, cols=2,
                     subplot_titles=('All Initiatives Organized', 'Successful Initiatives Organized'),
                     horizontal_spacing=0.15)

org_top10 = org_participation.head(10).iloc[::-1]  # Reverse the order
successful_top10 = successful_org_participation.head(10).iloc[::-1]  # Reverse the order

fig2.add_trace(
    go.Bar(x=org_top10['Initiatives Organized'],
           y=org_top10['Country'],
           orientation='h',
           marker_color='lightblue',
           showlegend=False),
    row=1, col=1
)

fig2.add_trace(
    go.Bar(x=successful_top10['Successful Initiatives'],
           y=successful_top10['Country'],
           orientation='h',
           marker_color='#3CA371',
           showlegend=False),
    row=1, col=2
)

fig2.update_layout(height=500, title_text='Countries Organizing ECIs')
fig2.show()

## QUESTION 5: Temporal Trends in Success Rates

In [268]:
print("\n" + "="*80)
print("QUESTION 5: TEMPORAL TRENDS IN ECI SUCCESS RATES")
print("="*80)

yearly_stats = df.groupby('registration_year').agg({
    'registration_number': 'count',
    'reached_signatures': 'sum',
    'met_country_threshold': 'sum',
    'successful_eci': 'sum',
    'commission_responded': 'sum'
}).reset_index()

yearly_stats.columns = ['Year', 'Total Registered', 'Reached 1M', 'Met Country Threshold', 'Successful', 'Commission Response']
yearly_stats['Success Rate (%)'] = (yearly_stats['Successful'] / yearly_stats['Total Registered'] * 100).round(2)
yearly_stats['Response Rate (%)'] = (yearly_stats['Commission Response'] / yearly_stats['Total Registered'] * 100).round(2)

# Calculate Failed count
yearly_stats['Failed'] = yearly_stats['Total Registered'] - yearly_stats['Successful']
yearly_stats['Successful No Response'] = yearly_stats['Successful'] - yearly_stats['Commission Response']

print("\nYearly Success Rates:")
print(yearly_stats.to_string(index=False))

print(f"\n\nTrend Analysis:")
print(f"Best year for success rate: {yearly_stats.loc[yearly_stats['Success Rate (%)'].idxmax(), 'Year']:.0f} ({yearly_stats['Success Rate (%)'].max():.2f}%)")
print(f"Worst year for success rate: {yearly_stats.loc[yearly_stats['Success Rate (%)'].idxmin(), 'Year']:.0f} ({yearly_stats['Success Rate (%)'].min():.2f}%)")
print(f"Average success rate: {yearly_stats['Success Rate (%)'].mean():.2f}%")


QUESTION 5: TEMPORAL TRENDS IN ECI SUCCESS RATES

Yearly Success Rates:
 Year  Total Registered  Reached 1M  Met Country Threshold  Successful  Commission Response  Success Rate (%)  Response Rate (%)  Failed  Successful No Response
 2012                16           3                      3           3                    3             18.75              18.75      13                       0
 2013                 9           0                      0           0                    0              0.00               0.00       9                       0
 2014                 5           0                      0           0                    0              0.00               0.00       5                       0
 2015                 6           0                      0           0                    0              0.00               0.00       6                       0
 2016                 3           0                      0           0                    0              0.00             

In [269]:
# Prepare custom hover data with ECI titles for each category and year
years = sorted(df['registration_year'].unique())

# Calculate yearly stats including in-progress initiatives
yearly_stats_detailed = df.groupby('registration_year').agg({
    'registration_number': 'count',
    'reached_signatures': 'sum',
    'met_country_threshold': 'sum',
    'successful_eci': 'sum',
    'commission_responded': 'sum'
}).reset_index()

yearly_stats_detailed.columns = ['Year', 'Total Registered', 'Reached 1M', 'Met Country Threshold', 'Successful', 'Commission Response']

# Calculate in-progress initiatives (not failed, not successful yet)
# In progress: current_status is 'Collection ongoing', 'Verification', 'Valid initiative', etc.
in_progress_by_year = df[df['final_outcome'].isna()].groupby('registration_year').size().reset_index(name='In Progress')
yearly_stats_detailed = yearly_stats_detailed.merge(in_progress_by_year, left_on='Year', right_on='registration_year', how='left').drop('registration_year', axis=1)

# Calculate truly failed (unsuccessful or withdrawn)
failed_by_year = df[df['final_outcome'].isin(['Unsuccessful Collection', 'Withdrawn'])].groupby('registration_year').size().reset_index(name='Failed')
yearly_stats_detailed = yearly_stats_detailed.merge(failed_by_year, left_on='Year', right_on='registration_year', how='left').drop('registration_year', axis=1)

# Calculate successful without response
yearly_stats_detailed['Successful No Response'] = yearly_stats_detailed['Successful'] - yearly_stats_detailed['Commission Response']

# Create hover text for each category
failed_hover = []
in_progress_hover = []
successful_no_response_hover = []
commission_response_hover = []

for year in years:
    year_df = df[df['registration_year'] == year]
    
    # Failed ECIs
    failed_ecis = year_df[year_df['final_outcome'].isin(['Unsuccessful Collection', 'Withdrawn'])]['title'].tolist()
    failed_hover.append('<br>'.join([f'â€¢ {title}' for title in failed_ecis]) if failed_ecis else 'None')
    
    # In Progress ECIs
    in_progress_ecis = year_df[year_df['final_outcome'].isna()]['title'].tolist()
    in_progress_hover.append('<br>'.join([f'â€¢ {title}' for title in in_progress_ecis]) if in_progress_ecis else 'None')
    
    # Successful No Response ECIs
    successful_no_resp = year_df[(year_df['successful_eci']) & (~year_df['commission_responded'])]['title'].tolist()
    successful_no_response_hover.append('<br>'.join([f'â€¢ {title}' for title in successful_no_resp]) if successful_no_resp else 'None')
    
    # Commission Responded ECIs
    commission_resp = year_df[year_df['commission_responded'] == True]['title'].tolist()
    commission_response_hover.append('<br>'.join([f'â€¢ {title}' for title in commission_resp]) if commission_resp else 'None')

# ðŸ“Š VISUALIZATION: Stacked Bar Chart with ECI Titles in Tooltip
fig = go.Figure()

# Add stacked bars with custom hover data
fig.add_trace(go.Bar(
    x=yearly_stats_detailed['Year'],
    y=yearly_stats_detailed['Failed'],
    name='Failed',
    marker_color='#C34242',
    customdata=failed_hover,
    hovertemplate='<b>Failed</b><br>Year: %{x}<br>Count: %{y}<br><br>ECIs:<br>%{customdata}<extra></extra>'
))

fig.add_trace(go.Bar(
    x=yearly_stats_detailed['Year'],
    y=yearly_stats_detailed['In Progress'],
    name='In Progress',
    marker_color='#6C9BD1',
    customdata=in_progress_hover,
    hovertemplate='<b>In Progress</b><br>Year: %{x}<br>Count: %{y}<br><br>ECIs:<br>%{customdata}<extra></extra>'
))

fig.add_trace(go.Bar(
    x=yearly_stats_detailed['Year'],
    y=yearly_stats_detailed['Successful No Response'],
    name='Successful (No Response)',
    marker_color='#F0B840',
    customdata=successful_no_response_hover,
    hovertemplate='<b>Successful (No Response)</b><br>Year: %{x}<br>Count: %{y}<br><br>ECIs:<br>%{customdata}<extra></extra>'
))

fig.add_trace(go.Bar(
    x=yearly_stats_detailed['Year'],
    y=yearly_stats_detailed['Commission Response'],
    name='Commission Responded',
    marker_color='#3CA371',
    customdata=commission_response_hover,
    hovertemplate='<b>Commission Responded</b><br>Year: %{x}<br>Count: %{y}<br><br>ECIs:<br>%{customdata}<extra></extra>'
))

fig.update_layout(
    title='ECI Outcomes by Registration Year',
    xaxis_title='Registration Year',
    yaxis_title='Number of Initiatives',
    barmode='stack',
    height=600,
    font=dict(size=14),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

fig.show()

## QUESTION 6: Duration Analysis - Time Waiting at Each Step

In [270]:
print("\n" + "="*80)
print("QUESTION 6: DURATION ANALYSIS - TIME WAITING AT EACH STEP")
print("="*80)

# Calculate time between key milestones
df['registration_to_collection_days'] = (df['collection_start_date'] - df['registered_date']).dt.days
df['collection_to_response_days'] = (df['commission_response_date'] - df['collection_closed_date']).dt.days

print("\nCollection Period Duration (days):")
collection_stats = df[df['collection_duration_days'].notna()]['collection_duration_days'].describe()
print(collection_stats)

print("\n\nTime from Registration to Collection Start (days):")
reg_to_coll = df[df['registration_to_collection_days'].notna()]['registration_to_collection_days'].describe()
print(reg_to_coll)

print("\n\nTime from Collection End to Commission Response (days):")
coll_to_resp = df[df['collection_to_response_days'].notna()]['collection_to_response_days'].describe()
print(coll_to_resp)

print("\n\nTime from Registration to Commission Response (days) [for responded initiatives]:")
responded = df[df['commission_responded']]
time_to_response = responded['time_to_commission_response_days'].describe()
print(time_to_response)

print("\n\nBreakdown by milestone (median days):")
milestone_df = pd.DataFrame({
    'Milestone': [
        'Registration â†’ Collection Start',
        'Collection Period Duration',
        'Collection End â†’ Commission Response',
        'Registration â†’ Commission Response'
    ],
    'Median Days': [
        df['registration_to_collection_days'].median(),
        df['collection_duration_days'].median(),
        df['collection_to_response_days'].median(),
        responded['time_to_commission_response_days'].median()
    ],
    'Mean Days': [
        df['registration_to_collection_days'].mean(),
        df['collection_duration_days'].mean(),
        df['collection_to_response_days'].mean(),
        responded['time_to_commission_response_days'].mean()
    ]
})
print(milestone_df.to_string(index=False))


QUESTION 6: DURATION ANALYSIS - TIME WAITING AT EACH STEP

Collection Period Duration (days):
count     86.000000
mean     428.383721
std      126.835156
min      158.000000
25%      365.000000
50%      365.000000
75%      445.500000
max      731.000000
Name: collection_duration_days, dtype: float64


Time from Registration to Collection Start (days):
count    110.000000
mean      34.818182
std       58.857811
min        0.000000
25%        0.000000
50%        0.000000
75%       42.000000
max      184.000000
Name: registration_to_collection_days, dtype: float64


Time from Collection End to Commission Response (days):
count      11.000000
mean      547.636364
std       430.169565
min       138.000000
25%       244.500000
50%       520.000000
75%       618.500000
max      1580.000000
Name: collection_to_response_days, dtype: float64


Time from Registration to Commission Response (days) [for responded initiatives]:
count      11.000000
mean     1044.272727
std       532.240376
min     

In [271]:
# ðŸ“Š VISUALIZATION 1: Registration â†’ Collection Start
import numpy as np

data1 = df[df['registration_to_collection_days'].notna()].copy()
hist1, edges1 = np.histogram(data1['registration_to_collection_days'], bins=30)
bin_centers1 = (edges1[:-1] + edges1[1:]) / 2

# Group ECIs by bin
hover_text1 = []
for i in range(len(edges1) - 1):
    if i == len(edges1) - 2:  # Last bin
        bin_ecis = data1[(data1['registration_to_collection_days'] >= edges1[i]) & 
                         (data1['registration_to_collection_days'] <= edges1[i+1])]['title'].tolist()
    else:
        bin_ecis = data1[(data1['registration_to_collection_days'] >= edges1[i]) & 
                         (data1['registration_to_collection_days'] < edges1[i+1])]['title'].tolist()
    titles_text = '<br>'.join(f"â€¢ {title}" for title in bin_ecis) if bin_ecis else "No ECIs"
    hover_text1.append(titles_text)

# Create aggressive gradient from very light to very dark teal
colors1 = []
for i, center in enumerate(bin_centers1):
    ratio = i / len(bin_centers1)
    r = int(200 + (15 - 200) * ratio)
    g = int(240 + (100 - 240) * ratio)
    b = int(220 + (120 - 220) * ratio)
    colors1.append(f'rgb({r},{g},{b})')

fig1 = go.Figure(go.Bar(
    x=bin_centers1,
    y=hist1,
    marker=dict(color=colors1, line=dict(color='white', width=0.5)),
    width=np.diff(edges1),
    customdata=hover_text1,
    hovertemplate='<b>Registration â†’ Collection Start</b><br>Days: %{x:.0f}<br>Count: %{y}<br><br>ECIs:<br>%{customdata}<extra></extra>'
))

fig1.update_layout(
    title='Registration â†’ Collection Start',
    xaxis_title='Days',
    yaxis_title='Number of Initiatives',
    height=400,
    font=dict(size=14)
)
fig1.show()

In [None]:


# ðŸ“Š VISUALIZATION 2: Collection Period Duration
data2 = df[df['collection_duration_days'].notna()].copy()
hist2, edges2 = np.histogram(data2['collection_duration_days'], bins=30)
bin_centers2 = (edges2[:-1] + edges2[1:]) / 2

# Group ECIs by bin
hover_text2 = []
    
for i in range(len(edges2) - 1):
    if i == len(edges2) - 2:
        bin_ecis = data2[(data2['collection_duration_days'] >= edges2[i]) & 
                         (data2['collection_duration_days'] <= edges2[i+1])]['title'].tolist()
    else:
        bin_ecis = data2[(data2['collection_duration_days'] >= edges2[i]) & 
                         (data2['collection_duration_days'] < edges2[i+1])]['title'].tolist()
    titles_text = '<br>'.join(f"â€¢ {title}" for title in bin_ecis) if bin_ecis else "No ECIs"
    hover_text2.append(titles_text)


colors2 = []
for i, center in enumerate(bin_centers2):
    ratio = i / len(bin_centers2)
    r = int(255 + (180 - 255) * ratio)
    g = int(200 + (80 - 200) * ratio)
    b = int(150 + (20 - 150) * ratio)
    colors2.append(f'rgb({r},{g},{b})')

fig2 = go.Figure(go.Bar(
    x=bin_centers2,
    y=hist2,
    marker=dict(color=colors2, line=dict(color='white', width=0.5)),
    width=np.diff(edges2),
    customdata=hover_text2,
    hovertemplate='<b>Collection Period Duration</b><br>Days: %{x:.0f}<br>Count: %{y}<br><br>ECIs:<br>%{customdata}<extra></extra>'
))

fig2.update_layout(
    title='Collection Period Duration',
    xaxis_title='Days',
    yaxis_title='Number of Initiatives',
    height=400,
    font=dict(size=14)
)
fig2.show()

In [308]:


# ðŸ“Š VISUALIZATION 3: Collection End â†’ Commission Response
data3 = df[df['collection_to_response_days'].notna()].copy()
hist3, edges3 = np.histogram(data3['collection_to_response_days'], bins=30)
bin_centers3 = (edges3[:-1] + edges3[1:]) / 2

# Group ECIs by bin
hover_text3 = []

for i in range(len(edges2) - 1):
    if i == len(edges3) - 2:
        bin_ecis = data3[(data3['collection_to_response_days'] >= edges3[i]) & 
                         (data3['collection_to_response_days'] <= edges3[i+1])]['title'].tolist()
    else:
        bin_ecis = data3[(data3['collection_to_response_days'] >= edges3[i]) & 
                         (data3['collection_to_response_days'] < edges3[i+1])]['title'].tolist()
    titles_text = '<br>'.join(f"â€¢ {title}" for title in bin_ecis) if bin_ecis else "No ECIs"
    hover_text3.append(titles_text)


colors3 = []
for i, center in enumerate(bin_centers3):
    ratio = i / len(bin_centers3)
    r = int(200 + (20 - 200) * ratio)
    g = int(250 + (120 - 250) * ratio)
    b = int(150 + (50 - 150) * ratio)
    colors3.append(f'rgb({r},{g},{b})')

fig3 = go.Figure(go.Bar(
    x=bin_centers3,
    y=hist3,
    marker=dict(color=colors3, line=dict(color='white', width=0.5)),
    width=np.diff(edges3),
    customdata=hover_text3,
    hovertemplate='<b>Collection End â†’ Commission Response</b><br>Days: %{x:.0f}<br>Count: %{y}<br><br>ECIs:<br>%{customdata}<extra></extra>'
))

fig3.update_layout(
    title='Collection End â†’ Commission Response',
    xaxis_title='Days',
    yaxis_title='Number of Initiatives',
    height=400,
    font=dict(size=14)
)
fig3.show()

In [310]:


# ðŸ“Š VISUALIZATION 4: Registration â†’ Commission Response
data4 = responded[responded['time_to_commission_response_days'].notna()].copy()
hist4, edges4 = np.histogram(data4['time_to_commission_response_days'], bins=30)
bin_centers4 = (edges4[:-1] + edges4[1:]) / 2

# Group ECIs by bin
hover_text4 = []
for i in range(len(edges4) - 1):
    if i == len(edges4) - 2:
        bin_ecis = data4[(data4['time_to_commission_response_days'] >= edges4[i]) & 
                         (data4['time_to_commission_response_days'] <= edges4[i+1])]['title'].tolist()
    else:
        bin_ecis = data4[(data4['time_to_commission_response_days'] >= edges4[i]) & 
                         (data4['time_to_commission_response_days'] < edges4[i+1])]['title'].tolist()
    titles_text = '<br>'.join(f"â€¢ {title}" for title in bin_ecis) if bin_ecis else "No ECIs"
    hover_text4.append(titles_text)

colors4 = []
for i, center in enumerate(bin_centers4):
    ratio = i / len(bin_centers4)
    r = int(255 + (100 - 255) * ratio)
    g = int(255 + (140 - 255) * ratio)
    b = int(150 + (20 - 150) * ratio)
    colors4.append(f'rgb({r},{g},{b})')

fig4 = go.Figure(go.Bar(
    x=bin_centers4,
    y=hist4,
    marker=dict(color=colors4, line=dict(color='white', width=0.5)),
    width=np.diff(edges4),
    customdata=hover_text4,
    hovertemplate='<b>Registration â†’ Commission Response</b><br>Days: %{x:.0f}<br>Count: %{y}<br><br>ECIs:<br>%{customdata}<extra></extra>'
))

fig4.update_layout(
    title='Registration â†’ Commission Response',
    xaxis_title='Days',
    yaxis_title='Number of Initiatives',
    height=400,
    font=dict(size=14)
)
fig4.show()

## QUESTION 7: Topic and Policy Area Analysis

In [275]:
print("\n" + "="*80)
print("QUESTION 7: TOPIC AND POLICY AREA ANALYSIS")
print("="*80)

# Define policy area keywords
policy_keywords = {
    'Environment': ['climate', 'environment', 'pollution', 'waste', 'nature', 'biodiversity', 'water', 'emissions', 'green', 'sustainability'],
    'Animal Welfare': ['animal', 'vivisection', 'fur', 'hunting', 'wildlife', 'welfare'],
    'Health': ['health', 'healthcare', 'medical', 'medicine', 'covid', 'pandemic', 'disease'],
    'Rights & Democracy': ['rights', 'democracy', 'freedom', 'liberty', 'justice', 'vote', 'citizenship', 'equality'],
    'Economy & Finance': ['economy', 'tax', 'finance', 'budget', 'trade', 'economic', 'financial', 'income'],
    'Education & Culture': ['education', 'culture', 'art', 'heritage', 'learning', 'school', 'university'],
    'Agriculture & Food': ['agriculture', 'food', 'farming', 'dairy', 'rural', 'pesticide', 'organic'],
    'Social Policy': ['social', 'poverty', 'welfare', 'employment', 'worker', 'housing', 'family'],
    'Migration': ['migration', 'immigrant', 'refugee', 'asylum', 'border'],
    'Digital & Tech': ['digital', 'internet', 'technology', 'data', 'privacy', 'cyber'],
    'Energy': ['energy', 'nuclear', 'renewable', 'fossil', 'electricity'],
    'Transport': ['transport', 'mobility', 'traffic', 'railway', 'vehicle']
}

# Function to categorize initiatives
def categorize_initiative(title, objective):
    text = (str(title) + ' ' + str(objective)).lower()
    categories = []
    for category, keywords in policy_keywords.items():
        if any(keyword in text for keyword in keywords):
            categories.append(category)
    return categories if categories else ['Other']

df['policy_areas'] = df.apply(lambda row: categorize_initiative(row['title'], row['objective']), axis=1)
df['primary_policy_area'] = df['policy_areas'].apply(lambda x: x[0] if x else 'Other')

# Count by policy area
print("\nInitiatives by Primary Policy Area:")
policy_dist = df['primary_policy_area'].value_counts().reset_index()
policy_dist.columns = ['Policy Area', 'Count']
policy_dist['Percentage'] = (policy_dist['Count'] / len(df) * 100).round(2)
print(policy_dist.to_string(index=False))

# Success rate by policy area
print("\n\nSuccess Rate by Policy Area:")
policy_success = df.groupby('primary_policy_area').agg({
    'registration_number': 'count',
    'successful_eci': 'sum',
    'commission_responded': 'sum',
    'signatures_numeric': 'mean'
}).reset_index()
policy_success.columns = ['Policy Area', 'Total', 'Successful', 'Commission Response', 'Avg Signatures']
policy_success['Success Rate (%)'] = (policy_success['Successful'] / policy_success['Total'] * 100).round(2)
policy_success = policy_success.sort_values('Success Rate (%)', ascending=False)
print(policy_success.to_string(index=False))


QUESTION 7: TOPIC AND POLICY AREA ANALYSIS

Initiatives by Primary Policy Area:
        Policy Area  Count  Percentage
        Environment     50       41.32
 Rights & Democracy     26       21.49
Education & Culture     11        9.09
  Economy & Finance      9        7.44
     Animal Welfare      9        7.44
             Health      8        6.61
              Other      5        4.13
      Social Policy      2        1.65
          Transport      1        0.83


Success Rate by Policy Area:
        Policy Area  Total  Successful  Commission Response  Avg Signatures  Success Rate (%)
     Animal Welfare      9           3                    2   773492.000000             33.33
  Economy & Finance      9           2                    0   733089.000000             22.22
        Environment     50           9                    8   432235.424242             18.00
             Health      8           1                    1   673257.333333             12.50
 Rights & Democracy     26  

In [276]:
# ðŸ“Š VISUALIZATION: Grouped Bar Chart - Policy Area Performance
fig = go.Figure()

fig.add_trace(go.Bar(
    x=policy_success['Policy Area'],
    y=policy_success['Total'],
    name='Total Initiatives',
    marker_color='lightblue'
))

fig.add_trace(go.Bar(
    x=policy_success['Policy Area'],
    y=policy_success['Successful'],
    name='Successful',
    marker_color='green'
))

fig.add_trace(go.Bar(
    x=policy_success['Policy Area'],
    y=policy_success['Commission Response'],
    name='Commission Response',
    marker_color='orange'
))

fig.update_layout(
    title='ECI Performance by Policy Area',
    xaxis_title='Policy Area',
    yaxis_title='Number of Initiatives',
    barmode='group',
    height=600,
    xaxis={'categoryorder':'total descending'}
)

fig.show()

# Success Rate bar chart
fig2 = px.bar(policy_success.sort_values('Success Rate (%)', ascending=True),
              x='Success Rate (%)', y='Policy Area',
              orientation='h',
              title='Success Rate (%) by Policy Area',
              color='Success Rate (%)',
              color_continuous_scale='RdYlGn',
              text='Success Rate (%)')

fig2.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig2.update_layout(height=500)
fig2.show()

## QUESTION 8: Detailed Analysis of Successful ECIs

In [277]:
print("\n" + "="*80)
print("QUESTION 8: DETAILED ANALYSIS OF SUCCESSFUL ECIs")
print("="*80)

successful = df[df['successful_eci'] == True].copy()

print(f"\nTotal successful ECIs: {len(successful)}")

print("\n\nSuccessful ECIs by year:")
successful_by_year = successful.groupby('registration_year').size().reset_index()
successful_by_year.columns = ['Year', 'Count']
print(successful_by_year.to_string(index=False))

print("\n\nSuccessful ECIs by policy area:")
successful_by_policy = successful['primary_policy_area'].value_counts().reset_index()
successful_by_policy.columns = ['Policy Area', 'Count']
print(successful_by_policy.to_string(index=False))

print("\n\nSignature statistics for successful ECIs:")
print(successful['signatures_numeric'].describe())

print("\n\nCountry threshold statistics for successful ECIs:")
print(successful['signatures_threshold_met_numeric'].describe())

print("\n\nCollection duration for successful ECIs:")
print(successful['collection_duration_days'].describe())

print("\n\nDetailed list of successful ECIs:")
successful_detail = successful[['registration_number', 'title', 'registration_year', 
                                  'signatures_numeric', 'signatures_threshold_met_numeric',
                                  'primary_policy_area', 'commission_responded']].copy()
successful_detail = successful_detail.sort_values('signatures_numeric', ascending=False)
successful_detail.columns = ['Reg #', 'Title', 'Year', 'Signatures', 'Countries', 'Policy Area', 'Commission Responded']
print(successful_detail.to_string(index=False))


QUESTION 8: DETAILED ANALYSIS OF SUCCESSFUL ECIs

Total successful ECIs: 16


Successful ECIs by year:
 Year  Count
 2012      3
 2017      3
 2018      1
 2019      2
 2020      1
 2021      1
 2022      1
 2024      4


Successful ECIs by policy area:
       Policy Area  Count
       Environment      9
    Animal Welfare      3
 Economy & Finance      2
            Health      1
Rights & Democracy      1


Signature statistics for successful ECIs:
count    1.600000e+01
mean     1.279965e+06
std      2.098219e+05
min      1.054973e+06
25%      1.122566e+06
50%      1.231878e+06
75%      1.409902e+06
max      1.721626e+06
Name: signatures_numeric, dtype: float64


Country threshold statistics for successful ECIs:
count    16.000000
mean     13.812500
std       5.114929
min       7.000000
25%       9.750000
50%      12.000000
75%      18.000000
max      24.000000
Name: signatures_threshold_met_numeric, dtype: float64


Collection duration for successful ECIs:
count     15.000000
mean  

In [278]:
# ðŸ“Š VISUALIZATION: Scatter Plot of Successful ECIs
successful_plot = successful.copy()
successful_plot['title_short'] = successful_plot['title'].str[:50] + '...'

fig = px.scatter(successful_plot, 
                 x='signatures_threshold_met_numeric', 
                 y='signatures_numeric',
                 size='signatures_numeric',
                 color='primary_policy_area',
                 hover_data=['title', 'registration_year'],
                 title='Successful ECIs: Signatures vs Countries Meeting Threshold',
                 labels={
                     'signatures_threshold_met_numeric': 'Number of Countries Meeting Threshold',
                     'signatures_numeric': 'Total Signatures Collected',
                     'primary_policy_area': 'Policy Area'
                 })

fig.update_layout(height=600)
fig.show()

## QUESTION 9: Commission Response Analysis

In [279]:
print("\n" + "="*80)
print("QUESTION 9: COMMISSION RESPONSE ANALYSIS")
print("="*80)

responded = df[df['commission_responded'] == True].copy()

print(f"\nTotal initiatives with Commission response: {len(responded)}")
print(f"Response rate (of successful ECIs): {(len(responded)/df['successful_eci'].sum()*100):.2f}%")

print("\n\nTime to receive Commission response (from registration):")
print(responded['time_to_commission_response_days'].describe())

print(f"\nIn years: Mean = {responded['time_to_commission_response_days'].mean()/365:.2f}, Median = {responded['time_to_commission_response_days'].median()/365:.2f}")

print("\n\nResponded initiatives by registration year:")
responded_by_year = responded.groupby('registration_year').size().reset_index()
responded_by_year.columns = ['Year', 'Count']
print(responded_by_year.to_string(index=False))

print("\n\nResponded initiatives by policy area:")
responded_by_policy = responded['primary_policy_area'].value_counts().reset_index()
responded_by_policy.columns = ['Policy Area', 'Count']
print(responded_by_policy.to_string(index=False))

print("\n\nAverage signatures for responded initiatives:")
print(f"Mean: {responded['signatures_numeric'].mean():.0f}")
print(f"Median: {responded['signatures_numeric'].median():.0f}")

print("\n\nDetailed list of initiatives with Commission response:")
responded_detail = responded[['registration_number', 'title', 'registration_year', 
                               'signatures_numeric', 'time_to_commission_response_days',
                               'primary_policy_area']].copy()
responded_detail = responded_detail.sort_values('time_to_commission_response_days')
responded_detail['Years to Response'] = (responded_detail['time_to_commission_response_days'] / 365).round(2)
responded_detail = responded_detail[['registration_number', 'title', 'registration_year', 
                                      'signatures_numeric', 'Years to Response', 'primary_policy_area']]
responded_detail.columns = ['Reg #', 'Title', 'Year', 'Signatures', 'Years to Response', 'Policy Area']
print(responded_detail.to_string(index=False))


QUESTION 9: COMMISSION RESPONSE ANALYSIS

Total initiatives with Commission response: 11
Response rate (of successful ECIs): 68.75%


Time to receive Commission response (from registration):
count      11.000000
mean     1044.272727
std       532.240376
min       321.000000
25%       712.500000
50%      1023.000000
75%      1281.500000
max      2311.000000
Name: time_to_commission_response_days, dtype: float64

In years: Mean = 2.86, Median = 2.80


Responded initiatives by registration year:
 Year  Count
 2012      3
 2017      2
 2018      1
 2019      2
 2020      1
 2021      1
 2022      1


Responded initiatives by policy area:
   Policy Area  Count
   Environment      8
Animal Welfare      2
        Health      1


Average signatures for responded initiatives:
Mean: 1300932
Median: 1217916


Detailed list of initiatives with Commission response:
      Reg #                                                                                       Title  Year  Signatures  Years to Re

In [280]:
# ðŸ“Š VISUALIZATION: Timeline to Commission Response
responded_plot = responded.copy()
responded_plot['title_short'] = responded_plot['title'].str[:40] + '...'
responded_plot['years_to_response'] = responded_plot['time_to_commission_response_days'] / 365

fig = px.bar(responded_plot.sort_values('time_to_commission_response_days'),
             y='title_short',
             x='years_to_response',
             orientation='h',
             title='Time to Commission Response (Years) for Each Initiative',
             labels={'years_to_response': 'Years to Response', 'title_short': 'Initiative'},
             color='years_to_response',
             color_continuous_scale='Reds')

fig.update_layout(height=600, showlegend=False)
fig.show()

## QUESTION 10: Funding Analysis

In [281]:
print("\n" + "="*80)
print("QUESTION 10: FUNDING ANALYSIS")
print("="*80)

print(f"\nInitiatives with funding data: {df['funding_numeric'].notna().sum()}")

print("\n\nFunding statistics:")
print(df['funding_numeric'].describe())

print("\n\nFunding by outcome:")
funding_by_outcome = df.groupby('final_outcome').agg({
    'funding_numeric': ['count', 'mean', 'median', 'min', 'max']
}).round(2)
print(funding_by_outcome)

print("\n\nFunding comparison: Successful vs Unsuccessful:")
successful_data = df[df['successful_eci'] == True]
unsuccessful_data = df[df['successful_eci'] == False]
successful_funding = successful_data['funding_numeric']
unsuccessful_funding = unsuccessful_data['funding_numeric']

funding_comparison = pd.DataFrame({
    'Category': ['Successful ECIs', 'Unsuccessful ECIs'],
    'Count': [successful_funding.notna().sum(), unsuccessful_funding.notna().sum()],
    'Mean Funding': [successful_funding.mean(), unsuccessful_funding.mean()],
    'Median Funding': [successful_funding.median(), unsuccessful_funding.median()],
    'Max Funding': [successful_funding.max(), unsuccessful_funding.max()]
})
print(funding_comparison.to_string(index=False))

print("\n\nCorrelation: Funding vs Signatures")
initiatives_with_both = df[(df['funding_numeric'].notna()) & (df['signatures_numeric'].notna())]
if len(initiatives_with_both) > 0:
    correlation = initiatives_with_both[['funding_numeric', 'signatures_numeric']].corr()
    print(correlation)
    print(f"\nCorrelation coefficient: {correlation.iloc[0, 1]:.4f}")


QUESTION 10: FUNDING ANALYSIS

Initiatives with funding data: 62


Funding statistics:
count    6.200000e+01
mean     1.719314e+05
std      3.874476e+05
min      5.000000e+02
25%      5.313000e+03
50%      2.793800e+04
75%      1.645726e+05
max      2.160615e+06
Name: funding_numeric, dtype: float64


Funding by outcome:
                        funding_numeric                                 \
                                  count       mean     median      min   
final_outcome                                                            
Commission Response                  11  524466.42  282858.84  11933.0   
Unsuccessful Collection              33   84379.30   17000.00    500.0   
Withdrawn                            11   57568.43   19100.00   2000.0   

                                     
                                max  
final_outcome                        
Commission Response      2160614.83  
Unsuccessful Collection   730516.00  
Withdrawn                 345567.00  


F

In [282]:
# ðŸ“Š VISUALIZATION: Scatter Plot - Funding vs Signatures
plot_data = df[(df['funding_numeric'].notna()) & (df['signatures_numeric'].notna())].copy()

fig = px.scatter(plot_data,
                 y='funding_numeric',
                 x='signatures_numeric',
                 color='successful_eci',
                 size='signatures_numeric',
                 hover_data=['title', 'registration_year'],
                 title='Funding vs Signatures Collected (Correlation Analysis)',
                 labels={
                     'funding_numeric': 'Funding (EUR)',
                     'signatures_numeric': 'Signatures Collected',
                     'successful_eci': 'Successful'
                 },
                 color_discrete_map={True: '#2ecc71', False: '#e74c3c'},
                 trendline='ols')

fig.update_layout(height=600)
fig.show()

## QUESTION 11: Correlation Analysis - Key Success Factors

In [283]:
print("\n" + "="*80)
print("QUESTION 11: CORRELATION ANALYSIS - KEY SUCCESS FACTORS")
print("="*80)

# Create analysis dataset
analysis_df = df[[
    'successful_eci',
    'commission_responded',
    'signatures_numeric',
    'signatures_threshold_met_numeric',
    'funding_numeric',
    'collection_duration_days',
    'registration_to_collection_days',
    'registration_year'
]].copy()

# Convert boolean to numeric
analysis_df['successful_numeric'] = analysis_df['successful_eci'].astype(int)
analysis_df['responded_numeric'] = analysis_df['commission_responded'].astype(int)

print("\nCorrelation matrix of key metrics with success:")
corr_columns = [
    'successful_numeric',
    'signatures_numeric',
    'signatures_threshold_met_numeric',
    'funding_numeric',
    'collection_duration_days',
    'registration_to_collection_days',
    'registration_year'
]

correlation_matrix = analysis_df[corr_columns].corr()
print(correlation_matrix['successful_numeric'].sort_values(ascending=False).to_string())

print("\n\nCorrelation with Commission response:")
print(analysis_df[corr_columns[:-1] + ['responded_numeric']].corr()['responded_numeric'].sort_values(ascending=False).to_string())

# Comparison for key metrics
print("\n\nDetailed comparison:")
successful_data = df[df['successful_eci'] == True]
unsuccessful_data = df[df['successful_eci'] == False]

comparison_detail = pd.DataFrame({
    'Metric': [
        'Avg Signatures',
        'Avg Funding (â‚¬)',
        'Avg Collection Duration (days)',
        'Avg Countries Met Threshold',
        'Avg Time to Collection Start (days)'
    ],
    'Successful ECIs': [
        successful_data['signatures_numeric'].mean(),
        successful_data['funding_numeric'].mean(),
        successful_data['collection_duration_days'].mean(),
        successful_data['signatures_threshold_met_numeric'].mean(),
        successful_data['registration_to_collection_days'].mean()
    ],
    'Unsuccessful ECIs': [
        unsuccessful_data['signatures_numeric'].mean(),
        unsuccessful_data['funding_numeric'].mean(),
        unsuccessful_data['collection_duration_days'].mean(),
        unsuccessful_data['signatures_threshold_met_numeric'].mean(),
        unsuccessful_data['registration_to_collection_days'].mean()
    ]
})

comparison_detail['Difference'] = comparison_detail['Successful ECIs'] - comparison_detail['Unsuccessful ECIs']
comparison_detail['Ratio'] = (comparison_detail['Successful ECIs'] / comparison_detail['Unsuccessful ECIs']).round(2)
print(comparison_detail.to_string(index=False))


QUESTION 11: CORRELATION ANALYSIS - KEY SUCCESS FACTORS

Correlation matrix of key metrics with success:
successful_numeric                  1.000000
signatures_numeric                  0.937263
signatures_threshold_met_numeric    0.888032
funding_numeric                     0.484459
collection_duration_days            0.083412
registration_year                   0.045373
registration_to_collection_days    -0.089647


Correlation with Commission response:
responded_numeric                   1.000000
successful_numeric                  0.810093
signatures_numeric                  0.754152
signatures_threshold_met_numeric    0.698153
funding_numeric                     0.426021
collection_duration_days            0.164773
registration_to_collection_days    -0.118441


Detailed comparison:
                             Metric  Successful ECIs  Unsuccessful ECIs    Difference  Ratio
                     Avg Signatures     1.279965e+06       92518.250000  1.187447e+06  13.83
               

In [284]:
# ðŸ“Š VISUALIZATION: Correlation Heatmap
corr_labels = {
    'successful_numeric': 'Success',
    'signatures_numeric': 'Signatures',
    'signatures_threshold_met_numeric': 'Countries',
    'funding_numeric': 'Funding',
    'collection_duration_days': 'Collection Days',
    'registration_to_collection_days': 'Regâ†’Collection',
    'registration_year': 'Year'
}

# Rename for display
corr_display = correlation_matrix.rename(columns=corr_labels, index=corr_labels)

fig = px.imshow(corr_display,
                text_auto='.2f',
                aspect='auto',
                color_continuous_scale='RdBu_r',
                color_continuous_midpoint=0,
                title='Correlation Matrix: Key Success Factors')

fig.update_layout(height=600)
fig.show()

## QUESTION 12: Executive Summary for ECI Organizers

In [285]:
print("\n" + "="*80)
print("QUESTION 13: EXECUTIVE SUMMARY FOR ECI ORGANIZERS")
print("="*80)

print("\n### OVERALL ECI LANDSCAPE ###")
print(f"Total ECIs registered (all time): {len(df)}")
print(f"Time period: {df['registration_year'].min():.0f} - {df['registration_year'].max():.0f}")
print(f"Average initiatives per year: {len(df) / (df['registration_year'].max() - df['registration_year'].min() + 1):.1f}")

print("\n\n### SUCCESS RATES ###")
print(f"Initiatives reaching 1M signatures: {df['reached_signatures'].sum()} ({(df['reached_signatures'].sum()/len(df)*100):.1f}%)")
print(f"Initiatives meeting country threshold: {df['met_country_threshold'].sum()} ({(df['met_country_threshold'].sum()/len(df)*100):.1f}%)")
print(f"Successful ECIs (both criteria): {df['successful_eci'].sum()} ({(df['successful_eci'].sum()/len(df)*100):.1f}%)")
print(f"Commission responses received: {df['commission_responded'].sum()} ({(df['commission_responded'].sum()/len(df)*100):.1f}%)")
print(f"Response rate for successful ECIs: {(df['commission_responded'].sum()/df['successful_eci'].sum()*100):.1f}%")

print("\n\n### KEY BARRIERS ###")
unsuccessful = df[df['final_outcome'] == 'Unsuccessful Collection']
withdrawn = df[df['final_outcome'] == 'Withdrawn']
print(f"Unsuccessful collections: {len(unsuccessful)} ({(len(unsuccessful)/len(df)*100):.1f}%)")
print(f"Withdrawn initiatives: {len(withdrawn)} ({(len(withdrawn)/len(df)*100):.1f}%)")
print(f"Attrition rate (did not complete): {((len(unsuccessful) + len(withdrawn))/len(df)*100):.1f}%")

print("\n\n### SIGNATURES REQUIRED ###")
successful_sigs = df[df['successful_eci']]['signatures_numeric']
print(f"Minimum signatures among successful: {successful_sigs.min():,.0f}")
print(f"Average signatures for successful: {successful_sigs.mean():,.0f}")
print(f"Median signatures for successful: {successful_sigs.median():,.0f}")
print(f"Maximum signatures achieved: {successful_sigs.max():,.0f}")

print("\n\n### COUNTRY THRESHOLD PATTERNS ###")
successful_countries = df[df['successful_eci']]['signatures_threshold_met_numeric']
print(f"Minimum countries needed: 7")
print(f"Average countries met in successful: {successful_countries.mean():.1f}")
print(f"Maximum countries met: {int(successful_countries.max())}")

print("\n\n### TIME EXPECTATIONS ###")
print(f"Average collection period: {df['collection_duration_days'].mean():.0f} days ({(df['collection_duration_days'].mean()/365):.1f} years)")
print(f"Median collection period: {df['collection_duration_days'].median():.0f} days ({(df['collection_duration_days'].median()/365):.1f} years)")
successful_collection = df[df['successful_eci']]['collection_duration_days']
print(f"Average for successful: {successful_collection.mean():.0f} days ({(successful_collection.mean()/365):.1f} years)")
print(f"Time to Commission response: {responded['time_to_commission_response_days'].mean():.0f} days ({(responded['time_to_commission_response_days'].mean()/365):.2f} years)")

print("\n\n### FUNDING INSIGHTS ###")
print(f"Successful ECIs avg funding: â‚¬{successful_data['funding_numeric'].mean():,.0f}")
print(f"Unsuccessful ECIs avg funding: â‚¬{unsuccessful_data['funding_numeric'].mean():,.0f}")
print(f"Funding advantage ratio: {(successful_data['funding_numeric'].mean() / unsuccessful_data['funding_numeric'].mean()):.1f}x")
print(f"Correlation (funding vs signatures): {correlation_matrix.iloc[0, 1]:.3f}")

print("\n\n### TOPIC AREAS WITH HIGHEST SUCCESS ###")
top_topics = policy_success[policy_success['Total'] >= 5].sort_values('Success Rate (%)', ascending=False).head(5)
print(top_topics[['Policy Area', 'Total', 'Successful', 'Success Rate (%)']].to_string(index=False))

print("\n\n### TEMPORAL TRENDS ###")
print("Best performing years:")
best_years = yearly_stats.nlargest(3, 'Success Rate (%)')
print(best_years[['Year', 'Total Registered', 'Successful', 'Success Rate (%)']].to_string(index=False))

print("\n\nWorst performing years (with >= 5 registrations):")
worst_years = yearly_stats[yearly_stats['Total Registered'] >= 5].nsmallest(3, 'Success Rate (%)')
print(worst_years[['Year', 'Total Registered', 'Successful', 'Success Rate (%)']].to_string(index=False))

print("\n\n### GEOGRAPHIC INSIGHTS ###")
print("Top 5 countries by threshold achievement in successful ECIs:")
print(country_participation.head(5).to_string(index=False))

print("\n\nTop 5 countries organizing successful initiatives:")
print(successful_org_participation.head(5).to_string(index=False))


QUESTION 13: EXECUTIVE SUMMARY FOR ECI ORGANIZERS

### OVERALL ECI LANDSCAPE ###
Total ECIs registered (all time): 121
Time period: 2012 - 2025
Average initiatives per year: 8.6


### SUCCESS RATES ###
Initiatives reaching 1M signatures: 16 (13.2%)
Initiatives meeting country threshold: 17 (14.0%)
Successful ECIs (both criteria): 16 (13.2%)
Commission responses received: 11 (9.1%)
Response rate for successful ECIs: 68.8%


### KEY BARRIERS ###
Unsuccessful collections: 71 (58.7%)
Withdrawn initiatives: 27 (22.3%)
Attrition rate (did not complete): 81.0%


### SIGNATURES REQUIRED ###
Minimum signatures among successful: 1,054,973
Average signatures for successful: 1,279,965
Median signatures for successful: 1,231,878
Maximum signatures achieved: 1,721,626


### COUNTRY THRESHOLD PATTERNS ###
Minimum countries needed: 7
Average countries met in successful: 13.8
Maximum countries met: 24


### TIME EXPECTATIONS ###
Average collection period: 428 days (1.2 years)
Median collection period:

## Export Analysis Results to CSV

In [286]:
# # Export enhanced dataset with calculated fields
# export_df = df[[
#     'registration_number',
#     'title',
#     'registration_year',
#     'current_status',
#     'final_outcome',
#     'primary_policy_area',
#     'signatures_numeric',
#     'signatures_threshold_met_numeric',
#     'funding_numeric',
#     'collection_duration_days',
#     'registration_to_collection_days',
#     'time_to_commission_response_days',
#     'reached_signatures',
#     'met_country_threshold',
#     'successful_eci',
#     'commission_responded'
# ]].copy()

# export_df.columns = [
#     'Registration_Number',
#     'Title',
#     'Year',
#     'Current_Status',
#     'Final_Outcome',
#     'Policy_Area',
#     'Signatures',
#     'Countries_Met_Threshold',
#     'Funding_EUR',
#     'Collection_Duration_Days',
#     'Registration_to_Collection_Days',
#     'Time_to_Commission_Response_Days',
#     'Reached_1M_Signatures',
#     'Met_Country_Threshold_7plus',
#     'Successful_ECI',
#     'Commission_Responded'
# ]

# export_df.to_csv('eci_analysis_enhanced.csv', index=False)
# print("âœ“ Exported: eci_analysis_enhanced.csv")

# # Export summary statistics
# summary_stats = pd.DataFrame({
#     'Metric': [
#         'Total ECIs Registered',
#         'Successful ECIs',
#         'Commission Responses',
#         'Success Rate (%)',
#         'Response Rate of Successful (%)',
#         'Avg Signatures (Successful)',
#         'Avg Funding EUR (Successful)',
#         'Avg Collection Duration Days',
#         'Avg Time to Response Days',
#         'Withdrawn Rate (%)',
#         'Unsuccessful Rate (%)'
#     ],
#     'Value': [
#         len(df),
#         df['successful_eci'].sum(),
#         df['commission_responded'].sum(),
#         round(df['successful_eci'].sum() / len(df) * 100, 2),
#         round(df['commission_responded'].sum() / df['successful_eci'].sum() * 100, 2),
#         round(successful_data['signatures_numeric'].mean(), 0),
#         round(successful_data['funding_numeric'].mean(), 0),
#         round(df['collection_duration_days'].mean(), 0),
#         round(responded['time_to_commission_response_days'].mean(), 0),
#         round(len(withdrawn) / len(df) * 100, 2),
#         round(len(unsuccessful) / len(df) * 100, 2)
#     ]
# })

# summary_stats.to_csv('eci_summary_statistics.csv', index=False)
# print("âœ“ Exported: eci_summary_statistics.csv")

# # Export policy area analysis
# policy_success.to_csv('eci_policy_area_analysis.csv', index=False)
# print("âœ“ Exported: eci_policy_area_analysis.csv")

# # Export yearly trends
# yearly_stats.to_csv('eci_yearly_trends.csv', index=False)
# print("âœ“ Exported: eci_yearly_trends.csv")

# # Export country participation
# country_participation.to_csv('eci_country_threshold_achievement.csv', index=False)
# print("âœ“ Exported: eci_country_threshold_achievement.csv")

# print("\n" + "="*80)
# print("ANALYSIS COMPLETE - All outputs exported")
# print("="*80)