In [13]:
import io
import os
from collections import defaultdict
from urllib.parse import quote_plus

import pandas as pd
import requests
from pymongo import MongoClient

In [14]:
# Eurostat SDMX 3.0 data query for the gov_10a_exp dataflow.
# Each c[...] parameter restricts one dimension of the dataset; the trailing
# parameters request a compressed CSV with both codes and labels.
url = (
    "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/"
    "ESTAT/gov_10a_exp/1.0/*.*.*.*.*.*"
    "?c[freq]=A"
    "&c[unit]=PC_GDP"
    "&c[sector]=S13"
    "&c[cofog99]=GF09,GF1002"
    "&c[na_item]=P2_D29_D5_D8,TE"
    "&c[geo]=EU27_2020,DE,ES,IT,AT,SE"
    # Years are listed newest first, 2023 down to 1990 inclusive.
    "&c[TIME_PERIOD]=" + ",".join(str(year) for year in range(2023, 1989, -1)) +
    "&compress=true"
    "&format=csvdata"
    "&formatVersion=2.0"
    "&lang=en"
    "&labels=both"
)

In [15]:
# Download the dataset once and parse it from memory.
# (Previously the URL was fetched twice: once with requests only to check the
# status code, and again by pandas; `df_raw` also briefly held a Response.)
response = requests.get(url, timeout=60)  # timeout so a stalled server cannot hang the kernel
response.raise_for_status()

payload = response.content
# The query asks for compress=true, so the body is expected to be a gzip file.
# Check the gzip magic bytes in case the HTTP layer already decompressed it.
is_gzipped = payload[:2] == b'\x1f\x8b'

df_raw = pd.read_csv(
    io.BytesIO(payload),
    sep=',',
    compression='gzip' if is_gzipped else None,
    dtype=str,        # keep every column as text; numeric conversion happens later
    na_values=[':']   # treat ':' as a missing value
)

In [17]:
# Preview the first rows to check the "code: label" column names and values
# that the filtering step relies on.
df_raw.head()

Unnamed: 0,STRUCTURE,STRUCTURE_ID,freq: Time frequency,unit: Unit of measure,sector: Sector,cofog99: Classification of the functions of government (COFOG 1999),na_item: National accounts indicator (ESA 2010),geo: Geopolitical entity (reporting),TIME_PERIOD: Time,OBS_VALUE: Observation value,OBS_FLAG: Observation status (Flag) V2 structure,CONF_STATUS: Confidentiality status (flag)
0,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF09: Education,P2_D29_D5_D8: Intermediate consumption; other ...,AT: Austria,1995,0.8,,
1,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF09: Education,P2_D29_D5_D8: Intermediate consumption; other ...,AT: Austria,1996,0.9,,
2,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF09: Education,P2_D29_D5_D8: Intermediate consumption; other ...,AT: Austria,1997,0.9,,
3,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF09: Education,P2_D29_D5_D8: Intermediate consumption; other ...,AT: Austria,1998,0.9,,
4,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF09: Education,P2_D29_D5_D8: Intermediate consumption; other ...,AT: Austria,1999,0.9,,


In [18]:
def process_country(df_raw, country_label):
    """
    Filter ``df_raw`` down to a single country and reshape it to wide format.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Frame whose columns use the "code: label" headers referenced below
        (geo, cofog99, na_item, sector, unit, freq, TIME_PERIOD, OBS_VALUE).
    country_label : str
        Value to match in the geo column, e.g. ``"ES: Spain"``.

    Returns
    -------
    pandas.DataFrame
        One row per year with the columns ``year``, ``education_pct_gdp``,
        ``pensions_pct_gdp`` and ``country``. Both value columns are always
        present; a category with no rows for the country is filled with NaN,
        so frames for different countries share the same schema.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` if the filtered data contains more than
        one row for the same (year, COFOG category) pair.
    """
    geo_col = 'geo: Geopolitical entity (reporting)'
    cofog_col = 'cofog99: Classification of the functions of government (COFOG 1999)'
    na_item_col = 'na_item: National accounts indicator (ESA 2010)'

    # COFOG label in the raw data -> column name in the output.
    cofog_to_column = {
        'GF09: Education': 'education_pct_gdp',
        'GF1002: Old age': 'pensions_pct_gdp',
    }

    # 1. Keep the fixed dimensions plus the requested country.
    df = df_raw[
        (df_raw[geo_col] == country_label) &
        (df_raw[cofog_col].isin(list(cofog_to_column))) &
        (df_raw[na_item_col] == 'TE: Total general government expenditure') &
        (df_raw['sector: Sector'] == 'S13: General government') &
        (df_raw['unit: Unit of measure'] == 'PC_GDP: Percentage of gross domestic product (GDP)') &
        (df_raw['freq: Time frequency'] == 'A: Annual')
    ].copy()

    # 2. Rename the key columns and drop everything else.
    df = df.rename(columns={
        'TIME_PERIOD: Time': 'year',
        cofog_col: 'cofog',
        'OBS_VALUE: Observation value': 'value'
    })[['year', 'cofog', 'value']]

    # 3. Convert types; values that cannot be parsed become NaN.
    df['year'] = df['year'].astype(int)
    df['value'] = pd.to_numeric(df['value'], errors='coerce')

    # 4. Pivot to wide. The reindex guarantees one column per expected COFOG
    #    category even when the country has no rows for it; without it the
    #    rename below would silently leave that column out.
    df_wide = (
        df.pivot(index='year', columns='cofog', values='value')
          .reindex(columns=list(cofog_to_column))
          .reset_index()
    )
    df_wide.columns.name = None
    df_wide = df_wide.rename(columns=cofog_to_column)

    # 5. Tag the rows with the country code, i.e. the text before the first ':'
    #    (e.g. "ES", "AT", "EU27_2020").
    df_wide['country'] = country_label.split(':', 1)[0].strip()
    return df_wide

In [19]:
# 1) Countries to process, written exactly as they appear in the `geo` column.
#    Append further labels to extend the comparison.
countries = [
    "ES: Spain",
    "AT: Austria",
    "EU27_2020: European Union - 27 countries (from 2020)",
]

# 2) Build one wide frame per country from the already-loaded df_raw ...
df_list = []
for label in countries:
    df_list.append(process_country(df_raw, label))

# 3) ... and stack them into a single frame with a fresh 0..n-1 index.
df_all = pd.concat(df_list, ignore_index=True)

# 4) Final result: one row per (country, year).
print(df_all.head())


   year  education_pct_gdp  pensions_pct_gdp country
0  1995                4.3               6.6      ES
1  1996                4.3               6.7      ES
2  1997                4.2               6.6      ES
3  1998                4.1               6.5      ES
4  1999                4.2               6.4      ES


In [22]:
# Spot-check five randomly chosen rows of the combined frame.
df_all.sample(5)

Unnamed: 0,year,education_pct_gdp,pensions_pct_gdp,country
22,2017,4.0,9.1,ES
78,2015,4.8,10.5,EU27_2020
11,2006,4.0,6.1,ES
75,2012,4.9,10.5,EU27_2020
25,2020,4.6,10.9,ES


In [23]:
# Connection settings.
# Values are read from environment variables so credentials do not have to live
# in the notebook. The fallbacks keep the local development setup working.
# NOTE(review): remove the fallback user/password before sharing or committing.
usuario = os.environ.get("MONGO_USER", "jalope")
contrasena = os.environ.get("MONGO_PASSWORD", "admin")
host = os.environ.get("MONGO_HOST", "127.0.0.1")
puerto = os.environ.get("MONGO_PORT", "27250")

# quote_plus escapes characters in the credentials that are reserved in a URI.
uri = f"mongodb://{quote_plus(usuario)}:{quote_plus(contrasena)}@{host}:{puerto}/?directConnection=true"
client = MongoClient(uri)
db = client["tfm_db"]

In [24]:
# Replace the contents of the target collection with the rows of df_all.
coll = db['EUROSTAT_GOV10A_EXP_GDP_COMPARATIVA']

# Build the documents first and refuse to continue if there are none, so an
# empty df_all cannot wipe the existing collection (insert_many also rejects
# an empty list, but only after the drop would already have happened).
records = df_all.to_dict('records')
if not records:
    raise ValueError("df_all está vacío; no se reemplaza la colección")

coll.drop()
coll.insert_many(records)

# Sanity check: both counts should match.
print("Registros en la colección: ", coll.count_documents({}))
# The label previously said df_raw although the value printed is len(df_all).
print("Número de filas de df_all: ", len(df_all))

Registros en la colección:  87
Número de filas de df_raw:  87
